[Refactor] Refactor configs and metafile (#1369)

* update base datasets

* update base

* update barlowtwins

* update with new convention

* update

* update

* update

* add schedule

* add densecl

* add eva

* add mae

* add maskfeat

* add milan and mixmim

* add moco

* add swav simclr

* add simmim and simsiam

* refine

* update

* add to model index

* update config inheritance

* fix error in metafile

* Update pre-commit and metafile check script

* update metafile

* fix name error

* Fix classification model name and config name

---------

Co-authored-by: mzr1996 <mzr1996@163.com>
Yixiao Fang 2023-02-23 11:17:16 +08:00 committed by GitHub
parent 36bea13fca
commit 89000c10eb
158 changed files with 9137 additions and 256 deletions

View File

@@ -1,4 +1,7 @@
import argparse
import logging
import re
import sys
from pathlib import Path
import yaml
@@ -7,6 +10,31 @@ from modelindex.models.Collection import Collection
from modelindex.models.Model import Model
from modelindex.models.ModelIndex import ModelIndex
class ContextFilter(logging.Filter):
metafile = None
name = None
failed = False
def filter(self, record: logging.LogRecord):
record.color = {
logging.WARNING: '\x1b[33;20m',
logging.ERROR: '\x1b[31;1m',
}.get(record.levelno, '')
self.failed = self.failed or (record.levelno >= logging.ERROR)
record.metafile = self.metafile or ''
record.name = ('' if self.name is None else '\x1b[32m' + self.name +
'\x1b[0m: ')
return True
context = ContextFilter()
logging.basicConfig(
format='[%(metafile)s] %(color)s%(levelname)s\x1b[0m - %(name)s%(message)s'
)
logger = logging.getLogger()
logger.addFilter(context)
prog_description = """\
Check the format of metafile.
"""
@@ -23,61 +51,116 @@ def parse_args():
'-w',
action='store_true',
help='Whether to enable all warnings.')
parser.add_argument('--skip', action='append', help='Rules to skip check.')
args = parser.parse_args()
args.skip = args.skip or []
return args
def check_collection(modelindex: ModelIndex):
if len(modelindex.collections) != 1:
return 'One metafile should have only one collection.'
def check_collection(modelindex: ModelIndex, skip=[]):
if len(modelindex.collections) == 0:
return ['No collection field.']
elif len(modelindex.collections) > 1:
logger.error('One metafile should have only one collection.')
collection: Collection = modelindex.collections[0]
if collection.name is None:
return 'The collection should have `Name` field.'
logger.error('The collection should have `Name` field.')
if collection.readme is None:
return 'The collection should have `README` field.'
logger.error('The collection should have `README` field.')
if not (MMCLS_ROOT / collection.readme).exists():
return f'The README {collection.readme} is not found.'
logger.error(f'The README {collection.readme} is not found.')
if not isinstance(collection.paper, dict):
return ('The collection should have `Paper` field with '
'`Title` and `URL`.')
if 'Title' not in collection.paper:
logger.error('The collection should have `Paper` field with '
'`Title` and `URL`.')
elif 'Title' not in collection.paper:
# URL is not necessary.
return "The collection's paper should have `Paper` field."
logger.error("The collection's paper should have `Paper` field.")
def check_model(model: Model, wall=True):
def check_model_name(name):
fields = name.split('_')
if len(fields) > 5:
logger.warning('Too many fields.')
return
elif len(fields) < 3:
logger.warning('Too few fields.')
return
elif len(fields) == 5:
algo, model, pre, train, data = fields
elif len(fields) == 3:
model, train, data = fields
algo, pre = None, None
elif len(fields) == 4 and fields[1].endswith('-pre'):
model, pre, train, data = fields
algo = None
else:
algo, model, train, data = fields
pre = None
if pre is not None and not pre.endswith('-pre'):
logger.warning(f'The position of `{pre}` should be '
'pre-training information, and ends with `-pre`.')
if '3rdparty' not in train and re.match(r'\d+xb\d+', train) is None:
logger.warning(f'The position of `{train}` should be training '
'information, and starts with `3rdparty` or '
'`{num_device}xb{batch_per_device}`')
def check_model(model: Model, skip=[]):
context.name = None
if model.name is None:
return "A model doesn't have `Name` field."
if model.metadata is None:
return f'{model.name}: No `Metadata` field.'
if model.metadata.parameters is None or model.metadata.flops is None:
return (
f'{model.name}: Metadata should have `Parameters` and '
'`FLOPs` fields. You can use `tools/analysis_tools/get_flops.py` '
'to calculate them.')
if model.results is not None:
logger.error("A model doesn't have `Name` field.")
return
context.name = model.name
check_model_name(model.name)
if model.name.endswith('.py'):
logger.error("Don't add `.py` suffix in model name.")
if model.metadata is None and 'metadata' not in skip:
logger.error('No `Metadata` field.')
if (model.metadata.parameters is None
or model.metadata.flops is None) and 'flops-param' not in skip:
logger.error('Metadata should have `Parameters` and `FLOPs` fields. '
'You can use `tools/analysis_tools/get_flops.py` '
'to calculate them.')
if model.results is not None and 'result' not in skip:
result = model.results[0]
if not isinstance(result.dataset, str):
return (
f'{model.name}: Dataset field of Results should be a string. '
'If you want to specify the training dataset, please use '
'`Metadata.Training Data` field.')
if model.config is None:
return f'{model.name}: No `Config` field.'
if not (MMCLS_ROOT / model.config).exists():
return f'{model.name}: The config {model.config} is not found.'
if model.in_collection is None:
return f'{model.name}: No `In Collection` field.'
logger.error('Dataset field of Results should be a string. '
'If you want to specify the training dataset, '
'please use `Metadata.Training Data` field.')
if wall and model.data.get(
'Converted From') is not None and '3rdparty' not in model.name:
print(f'WARN: The model name {model.name} should include '
"'3rdparty' since it's converted from other repository.")
if wall and model.weights is not None and model.weights.endswith('.pth'):
if 'config' not in skip:
if model.config is None:
logger.error('No `Config` field.')
elif not (MMCLS_ROOT / model.config).exists():
logger.error(f'The config {model.config} is not found.')
if model.in_collection is None:
logger.error('No `In Collection` field.')
if (model.data.get('Converted From') is not None
and '3rdparty' not in model.name):
logger.warning("The model name should include '3rdparty' "
"since it's converted from other repository.")
if (model.weights is not None and model.weights.endswith('.pth')
and 'ckpt-name' not in skip):
basename = model.weights.rsplit('/', 1)[-1]
if not basename.startswith(model.name):
print(f'WARN: The checkpoint name {basename} is not the '
f'same as the model name {model.name}.')
logger.warning(f'The checkpoint name {basename} is not the '
'same as the model name.')
context.name = None
def main(metafile: Path, args):
@@ -87,28 +170,38 @@ def main(metafile: Path, args):
elif metafile.samefile(MMCLS_ROOT / 'model-index.yml'):
return
context.metafile = metafile
with open(MMCLS_ROOT / 'model-index.yml', 'r') as f:
metafile_list = yaml.load(f, yaml.Loader)['Import']
if not any(
metafile.samefile(MMCLS_ROOT / file)
for file in metafile_list):
raise ValueError(f'The metafile {metafile} is not imported in '
'the `model-index.yml`.')
logger.error(
'The metafile is not imported in the `model-index.yml`.')
modelindex = load(str(metafile))
modelindex.build_models_with_collections()
collection_err = check_collection(modelindex)
if collection_err is not None:
raise ValueError(f'The `Collections` in the {metafile} is wrong:'
f'\n\t{collection_err}')
check_collection(modelindex, args.skip)
names = {model.name for model in modelindex.models}
for model in modelindex.models:
model_err = check_model(model, args.Wall)
if model_err is not None:
raise ValueError(
f'The `Models` in the {metafile} is wrong:\n\t{model_err}')
check_model(model, args.skip)
for downstream in model.data.get('Downstream', []):
if downstream not in names:
context.name = model.name
logger.error(
f"The downstream model {downstream} doesn't exist.")
if __name__ == '__main__':
args = parse_args()
if args.Wall:
logger.setLevel(logging.WARNING)
else:
logger.setLevel(logging.ERROR)
for metafile in args.metafile:
main(metafile, args)
sys.exit(int(context.failed))
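
For reference, the model-name convention that `check_model_name` enforces can be exercised in isolation. A minimal sketch (not part of this PR), assuming only the rules visible above — names split into 3-5 underscore-separated fields, and the training-info field must contain `3rdparty` or match `{num_device}xb{batch_per_device}`:

```python
import re

# Hypothetical helper mirroring check_model_name's training-info rule.
def is_valid_training_field(train: str) -> bool:
    return '3rdparty' in train or re.match(r'\d+xb\d+', train) is not None

assert is_valid_training_field('8xb256-coslr-300e')  # 8 GPUs x 256 per GPU
assert is_valid_training_field('3rdparty')           # converted checkpoint
assert not is_valid_training_field('coslr-300e')     # missing device info
```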

View File

@@ -289,7 +289,7 @@ def fill_model_by_prompt(model: dict, defaults: dict):
def update_model_by_dict(model: dict, update_dict: dict, defaults: dict):
# Name
if 'name override' in update_dict:
model['Name'] = update_dict['name override']
model['Name'] = update_dict['name override'].strip()
# In Collection
model['In Collection'] = defaults.get('In Collection')
@@ -306,7 +306,7 @@ def update_model_by_dict(model: dict, update_dict: dict, defaults: dict):
# Metadata.Flops, Metadata.Parameters
flops = model.get('Metadata', {}).get('FLOPs')
params = model.get('Metadata', {}).get('Parameters')
if config_updated and (flops is None or params is None):
if config_updated or (flops is None or params is None):
print(f'Automatically compute FLOPs and Parameters of {model["Name"]}')
flops, params = get_flops(str(MMCLS_ROOT / model['Config']))

View File

@@ -51,6 +51,7 @@ repos:
- repo: local
hooks:
- id: metafile
args: ['--skip', 'flops-param']
name: metafile
description: Check the format of metafile
entry: python .dev_scripts/check_metafile.py

View File

@@ -0,0 +1,51 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='TwoNormDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
second_mean=[127.5, 127.5, 127.5],
second_std=[127.5, 127.5, 127.5],
bgr_to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.4,
hue=0.),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandomResizedCropAndInterpolationWithTwoPic',
size=224,
second_size=224,
interpolation='bicubic',
second_interpolation='bicubic',
scale=(0.2, 1.0)),
dict(
type='BEiTMaskGenerator',
input_size=(14, 14),
num_masking_patches=75,
max_num_patches=75,
min_num_patches=16),
dict(
type='PackSelfSupInputs',
algorithm_keys=['mask'],
meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=256,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))
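
As a quick sanity check, a base dataset config like the one above can be loaded and inspected with MMEngine. A hedged sketch; the file path is an assumption for illustration:

```python
from mmengine.config import Config

# Path assumed; this appears to be the BEiT-style base dataset config
# referenced elsewhere in this PR.
cfg = Config.fromfile('configs/_base_/datasets/imagenet_bs256_beitv2.py')
print(cfg.data_preprocessor.type)       # TwoNormDataPreprocessor
print(cfg.train_dataloader.batch_size)  # 256
```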

View File

@@ -0,0 +1,57 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
size=192,
scale=(0.67, 1.0),
ratio=(3. / 4., 4. / 3.)),
dict(type='RandomFlip', prob=0.5),
dict(
type='SimMIMMaskGenerator',
input_size=192,
mask_patch_size=32,
model_patch_size=4,
mask_ratio=0.6),
dict(
type='PackSelfSupInputs',
algorithm_keys=['mask'],
meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=256,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))
# for visualization
vis_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=(192, 192), backend='pillow'),
dict(
type='SimMIMMaskGenerator',
input_size=192,
mask_patch_size=32,
model_patch_size=4,
mask_ratio=0.6),
dict(
type='PackSelfSupInputs',
algorithm_keys=['mask'],
meta_keys=['img_path'])
]
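
The mask settings above pin down a small grid of mask patches. A back-of-the-envelope sketch, assuming `SimMIMMaskGenerator` tiles the 192x192 input into 32x32 mask cells:

```python
# Sanity check of the SimMIM mask geometry (assumed tiling behaviour).
input_size, mask_patch_size, mask_ratio = 192, 32, 0.6
num_mask_cells = (input_size // mask_patch_size) ** 2  # 6 * 6 = 36
num_masked = round(num_mask_cells * mask_ratio)        # about 22 cells masked
print(num_mask_cells, num_masked)
```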

View File

@@ -0,0 +1,83 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
num_classes=1000,
# RGB format normalization parameters
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
# convert image from BGR to RGB
to_rgb=True,
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=192,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=1 / 3,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=219,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=192),
dict(type='PackClsInputs'),
]
train_dataloader = dict(
batch_size=256,
num_workers=8,
collate_fn=dict(type='default_collate'),
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix='train',
pipeline=train_pipeline),
)
val_dataloader = dict(
batch_size=64,
num_workers=5,
collate_fn=dict(type='default_collate'),
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/val.txt',
data_prefix='val',
pipeline=test_pipeline),
)
val_evaluator = dict(type='Accuracy', topk=(1, 5))
# If you want standard test, please manually configure the test dataset
test_dataloader = val_dataloader
test_evaluator = val_evaluator
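
The `ResizeEdge` scale of 219 for a 192 crop looks arbitrary, but it presumably preserves the conventional 256/224 short-edge-to-crop ratio; a quick check of that assumption:

```python
# 219 ~= 192 * 256 / 224, i.e. the usual short-edge-to-crop ratio.
crop_size = 192
resize_edge = round(crop_size * 256 / 224)  # 219
```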

View File

@@ -0,0 +1,82 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True)
view_pipeline1 = [
dict(
type='RandomResizedCrop',
size=224,
interpolation='bicubic',
backend='pillow'),
dict(type='RandomFlip', prob=0.5),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(type='RandomGaussianBlur', sigma_min=0.1, sigma_max=2.0, prob=1.),
dict(type='RandomSolarize', prob=0.),
]
view_pipeline2 = [
dict(
type='RandomResizedCrop',
size=224,
interpolation='bicubic',
backend='pillow'),
dict(type='RandomFlip', prob=0.5),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(type='RandomGaussianBlur', sigma_min=0.1, sigma_max=2.0, prob=0.1),
dict(type='RandomSolarize', prob=0.2)
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiView',
num_views=[1, 1],
transforms=[view_pipeline1, view_pipeline2]),
dict(type='PackSelfSupInputs', meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=32,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))
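
For readers unfamiliar with `MultiView`, a hedged sketch of the semantics assumed here (not MMSelfSup's actual implementation): with `num_views=[1, 1]`, each view pipeline runs once, producing two differently augmented views per image.

```python
# Assumed MultiView behaviour: apply pipelines[i] num_views[i] times each.
def multi_view(img, num_views, pipelines):
    views = []
    for n, pipeline in zip(num_views, pipelines):
        for _ in range(n):
            out = img
            for transform in pipeline:  # each transform is a callable here
                out = transform(out)
            views.append(out)
    return views  # e.g. [view1, view2] for num_views=[1, 1]
```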

View File

@@ -0,0 +1,52 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True)
# The difference between mocov2 and mocov1 is the transforms in the pipeline
view_pipeline = [
dict(
type='RandomResizedCrop', size=224, scale=(0.2, 1.), backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.4,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(type='RandomGaussianBlur', sigma_min=0.1, sigma_max=2.0, prob=0.5),
dict(type='RandomFlip', prob=0.5),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='MultiView', num_views=2, transforms=[view_pipeline]),
dict(type='PackSelfSupInputs', meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=32,
num_workers=8,
drop_last=True,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))

View File

@@ -0,0 +1,57 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
num_classes=1000,
# RGB format normalization parameters
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
# convert image from BGR to RGB
to_rgb=True,
)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackClsInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ResizeEdge', scale=256, edge='short', backend='pillow'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs'),
]
train_dataloader = dict(
batch_size=32,
num_workers=4,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix='train',
pipeline=train_pipeline),
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
persistent_workers=True,
pin_memory=True,
)
val_dataloader = dict(
batch_size=32,
num_workers=4,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/val.txt',
data_prefix='val',
pipeline=test_pipeline),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(type='Accuracy', topk=(1, 5))
# If you want standard test, please manually configure the test dataset
test_dataloader = val_dataloader
test_evaluator = val_evaluator

View File

@@ -0,0 +1,49 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True)
view_pipeline = [
dict(type='RandomResizedCrop', size=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.8,
contrast=0.8,
saturation=0.8,
hue=0.2)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(type='RandomGaussianBlur', sigma_min=0.1, sigma_max=2.0, prob=0.5),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='MultiView', num_views=2, transforms=[view_pipeline]),
dict(type='PackSelfSupInputs', meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=32,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))

View File

@@ -0,0 +1,33 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
size=224,
scale=(0.2, 1.0),
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5),
dict(type='PackSelfSupInputs', meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=512,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))

View File

@@ -0,0 +1,77 @@
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True)
view_pipeline1 = [
dict(
type='RandomResizedCrop', size=224, scale=(0.2, 1.), backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(type='RandomGaussianBlur', sigma_min=0.1, sigma_max=2.0, prob=1.),
dict(type='RandomSolarize', prob=0.),
dict(type='RandomFlip', prob=0.5),
]
view_pipeline2 = [
dict(
type='RandomResizedCrop', size=224, scale=(0.2, 1.), backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(type='RandomGaussianBlur', sigma_min=0.1, sigma_max=2.0, prob=0.1),
dict(type='RandomSolarize', prob=0.2),
dict(type='RandomFlip', prob=0.5),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiView',
num_views=[1, 1],
transforms=[view_pipeline1, view_pipeline2]),
dict(type='PackSelfSupInputs', meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=512,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))

View File

@@ -0,0 +1,23 @@
# model settings
model = dict(
type='MAE',
backbone=dict(type='MAEViT', arch='b', patch_size=16, mask_ratio=0.75),
neck=dict(
type='MAEPretrainDecoder',
patch_size=16,
in_chans=3,
embed_dim=768,
decoder_embed_dim=512,
decoder_depth=8,
decoder_num_heads=16,
mlp_ratio=4.,
),
head=dict(
type='MAEPretrainHead',
norm_pix=True,
patch_size=16,
loss=dict(type='MAEReconstructionLoss')),
init_cfg=[
dict(type='Xavier', layer='Linear', distribution='uniform'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
])
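
The `mask_ratio=0.75` above translates directly into token counts; a quick sketch of the arithmetic, assuming MAE masks a random subset of the 14x14 patch grid and feeds only the visible tokens to the encoder:

```python
# MAE masking arithmetic for ViT-B/16 at 224x224 (assumed behaviour).
img_size, patch_size, mask_ratio = 224, 16, 0.75
num_patches = (img_size // patch_size) ** 2  # 14 * 14 = 196
num_masked = int(num_patches * mask_ratio)   # 147 masked, 49 visible
print(num_patches, num_masked)
```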

View File

@@ -0,0 +1,20 @@
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='LARS', lr=4.8, weight_decay=1e-6, momentum=0.9))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR', T_max=190, by_epoch=True, begin=10, end=200)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=200)
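
The schedule above pairs a 10-epoch linear warmup with cosine decay to epoch 200; a sketch of the warmup arithmetic:

```python
# LinearLR ramps from lr * start_factor to lr over epochs [0, 10).
base_lr, start_factor = 4.8, 1e-4
first_iter_lr = base_lr * start_factor  # 4.8e-4 at the first iteration
```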

View File

@@ -0,0 +1,14 @@
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='LARS', lr=1.6, momentum=0.9, weight_decay=0.))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=90, by_epoch=True, begin=0, end=90)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=90)
val_cfg = dict()
test_cfg = dict()

View File

@@ -0,0 +1,14 @@
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.3, momentum=0.9, weight_decay=1e-6))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=100, by_epoch=True, begin=0, end=100)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100)
val_cfg = dict()
test_cfg = dict()

View File

@@ -0,0 +1,12 @@
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.03, weight_decay=1e-4, momentum=0.9))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=200, by_epoch=True, begin=0, end=200)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=200)

View File

@@ -0,0 +1,14 @@
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=1e-4))
# learning rate scheduler
param_scheduler = [
dict(type='MultiStepLR', by_epoch=True, milestones=[60, 80], gamma=0.1)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100)
val_cfg = dict()
test_cfg = dict()

View File

@@ -13,7 +13,7 @@ Collections:
URL: https://github.com/open-mmlab/mmclassification/blob/v1.0.0rc3/mmcls/models/heads/margin_head.py
Models:
- Name: resnet50-arcface_inshop
- Name: resnet50-arcface_8xb32_inshop
Metadata:
FLOPs: 16571226112
Parameters: 31693888

View File

@@ -0,0 +1,81 @@
# BarlowTwins
> [Barlow Twins: Self-Supervised Learning via Redundancy Reduction](https://arxiv.org/abs/2103.03230)
<!-- [ALGORITHM] -->
## Abstract
Self-supervised learning (SSL) is rapidly closing the gap with supervised methods on large computer vision benchmarks. A successful approach to SSL is to learn embeddings which are invariant to distortions of the input sample. However, a recurring issue with this approach is the existence of trivial constant solutions. Most current methods avoid such solutions by careful implementation details. We propose an objective function that naturally avoids collapse by measuring the cross-correlation matrix between the outputs of two identical networks fed with distorted versions of a sample, and making it as close to the identity matrix as possible. This causes the embedding vectors of distorted versions of a sample to be similar, while minimizing the redundancy between the components of these vectors. The method is called Barlow Twins, owing to neuroscientist H. Barlow's redundancy-reduction principle applied to a pair of identical networks. Barlow Twins does not require large batches nor asymmetry between the network twins such as a predictor network, gradient stopping, or a moving average on the weight updates. Intriguingly it benefits from very high-dimensional output vectors. Barlow Twins outperforms previous methods on ImageNet for semi-supervised classification in the low-data regime, and is on par with current state of the art for ImageNet classification with a linear classifier head, and for transfer tasks of classification and object detection.
<div align="center">
<img src="https://user-images.githubusercontent.com/36138628/163914714-082de804-0b5f-4024-94f9-880e6ef334fa.png" width="800" />
</div>
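
To make the objective in the abstract concrete, here is a hedged sketch of a Barlow Twins loss (not MMSelfSup's `CrossCorrelationLoss`; `lambd` is an assumed trade-off weight):

```python
import torch

def barlow_twins_loss(z1: torch.Tensor, z2: torch.Tensor, lambd=5e-3):
    """z1, z2: (N, D) embeddings of two distorted views of the same batch."""
    z1 = (z1 - z1.mean(0)) / z1.std(0)  # standardize each embedding dimension
    z2 = (z2 - z2.mean(0)) / z2.std(0)
    n = z1.size(0)
    c = z1.T @ z2 / n  # (D, D) cross-correlation matrix
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()              # pull toward identity
    off_diag = c.pow(2).sum() - torch.diagonal(c).pow(2).sum()  # decorrelate
    return on_diag + lambd * off_diag
```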
## Models and Benchmarks
On this page, we provide as many benchmarks as possible to evaluate our pre-trained models. Unless otherwise mentioned, all models are pre-trained on the ImageNet-1k dataset.
### Classification
The classification benchmark includes 1 downstream task dataset, **ImageNet**. If not specified, the results are Top-1 accuracy (%).
#### ImageNet Linear Evaluation
**Feature1 - Feature5** do not use GlobalAveragePooling; each feature map is pooled to a specific dimension and then fed to a linear layer for classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of the config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 |
| ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- |
| [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | 15.51 | 33.98 | 45.96 | 61.90 | 71.01 |
<table class="docutils">
<thead>
<tr>
<th rowspan="2">Algorithm</th>
<th rowspan="2">Backbone</th>
<th rowspan="2">Epoch</th>
<th rowspan="2">Batch Size</th>
<th colspan="2" align="center">Results (Top-1 %)</th>
<th colspan="3" align="center">Links</th>
</tr>
<tr>
<th>Linear Eval</th>
<th>Fine-tuning</th>
<th>Pretrain</th>
<th>Linear Eval</th>
<th>Fine-tuning</th>
</tr>
</thead>
<tbody>
<tr>
<td>BarlowTwins</td>
<td>ResNet50</td>
<td>300</td>
<td>2048</td>
<td>71.8</td>
<td>/</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k/barlowtwins_resnet50_8xb256-coslr-300e_in1k_20220825-57307488.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k/barlowtwins_resnet50_8xb256-coslr-300e_in1k_20220726_033718.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-coslr-100e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k/resnet50_linear-8xb32-coslr-100e_in1k/resnet50_linear-8xb32-coslr-100e_in1k_20220825-52fde35f.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k/resnet50_linear-8xb32-coslr-100e_in1k/resnet50_linear-8xb32-coslr-100e_in1k_20220730_093018.json'>log</a></td>
<td>/</td>
</tr>
</tbody>
</table>
#### ImageNet Nearest-Neighbor Classification
The results are obtained from the features after GlobalAveragePooling. Here, k=10 to 200 indicates different numbers of nearest neighbors.
| Self-Supervised Config | k=10 | k=20 | k=100 | k=200 |
| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ---- | ----- | ----- |
| [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | 63.6 | 63.8 | 62.7 | 61.9 |
## Citation
```bibtex
@inproceedings{zbontar2021barlow,
title={Barlow twins: Self-supervised learning via redundancy reduction},
author={Zbontar, Jure and Jing, Li and Misra, Ishan and LeCun, Yann and Deny, St{\'e}phane},
booktitle={International Conference on Machine Learning},
year={2021},
}
```

View File

@@ -0,0 +1,72 @@
_base_ = [
'../_base_/datasets/imagenet_bs32_byol.py',
'../_base_/default_runtime.py',
]
# datasets
train_dataloader = dict(batch_size=256)
# model settings
model = dict(
type='BarlowTwins',
backbone=dict(
type='ResNet',
depth=50,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='SyncBN'),
zero_init_residual=True),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=8192,
out_channels=8192,
num_layers=3,
with_last_bn=False,
with_last_bn_affine=False,
with_avg_pool=True,
init_cfg=dict(
type='Kaiming', distribution='uniform', layer=['Linear'])),
head=dict(
type='LatentCrossCorrelationHead',
in_channels=8192,
loss=dict(type='CrossCorrelationLoss')))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='LARS', lr=1.6, momentum=0.9, weight_decay=1e-6),
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lr_mult=0.024, lars_exclude=True),
'bias': dict(decay_mult=0, lr_mult=0.024, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(
decay_mult=0, lr_mult=0.024, lars_exclude=True),
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.6e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=990,
eta_min=0.0016,
by_epoch=True,
begin=10,
end=1000,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=1000)
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
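
The `auto_scale_lr` setting above relies on the linear LR scaling rule; a sketch of the arithmetic, assuming MMEngine multiplies the LR by `actual_batch / base_batch` when scaling is enabled:

```python
base_lr, base_batch = 1.6, 2048
actual_batch = 8 * 256  # 8 GPUs x 256 images per GPU, as in this config
scaled_lr = base_lr * actual_batch / base_batch  # 1.6, i.e. unchanged here
```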

View File

@@ -0,0 +1,72 @@
_base_ = [
'../_base_/datasets/imagenet_bs32_byol.py',
'../_base_/default_runtime.py',
]
# datasets
train_dataloader = dict(batch_size=256)
# model settings
model = dict(
type='BarlowTwins',
backbone=dict(
type='ResNet',
depth=50,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='SyncBN'),
zero_init_residual=True),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=8192,
out_channels=8192,
num_layers=3,
with_last_bn=False,
with_last_bn_affine=False,
with_avg_pool=True,
init_cfg=dict(
type='Kaiming', distribution='uniform', layer=['Linear'])),
head=dict(
type='LatentCrossCorrelationHead',
in_channels=8192,
loss=dict(type='CrossCorrelationLoss')))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='LARS', lr=1.6, momentum=0.9, weight_decay=1e-6),
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lr_mult=0.024, lars_exclude=True),
'bias': dict(decay_mult=0, lr_mult=0.024, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(
decay_mult=0, lr_mult=0.024, lars_exclude=True),
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.6e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=290,
eta_min=0.0016,
by_epoch=True,
begin=10,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)

View File

@@ -0,0 +1,12 @@
_base_ = [
'../../_base_/models/resnet50.py',
'../../_base_/datasets/imagenet_bs32_pillow.py',
'../../_base_/schedules/imagenet_sgd_coslr_100e.py',
'../../_base_/default_runtime.py',
]
model = dict(backbone=dict(frozen_stages=4))
# runtime settings
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))

View File

@@ -0,0 +1,38 @@
Collections:
- Name: BarlowTwins
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- LARS
Training Resources: 8x A100 GPUs
Architecture:
- ResNet
- BarlowTwins
Paper:
URL: https://arxiv.org/abs/2103.03230
Title: "Barlow Twins: Self-Supervised Learning via Redundancy Reduction"
README: configs/barlowtwins/README.md
Models:
- Name: barlowtwins_resnet50_8xb256-coslr-300e_in1k
In Collection: BarlowTwins
Metadata:
Epochs: 300
Batch Size: 2048
Results: null
Config: configs/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k/barlowtwins_resnet50_8xb256-coslr-300e_in1k_20220825-57307488.pth
Downstream:
- resnet50_barlowtwins-pre_8xb32-linear-coslr-100e_in1k
- Name: resnet50_barlowtwins-pre_8xb32-linear-coslr-100e_in1k
In Collection: BarlowTwins
Metadata:
Epochs: 100
Batch Size: 256
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 71.8
Config: configs/barlowtwins/benchmarks/resnet50_8xb32-linear-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k/resnet50_linear-8xb32-coslr-100e_in1k/resnet50_linear-8xb32-coslr-100e_in1k_20220825-52fde35f.pth

View File

@@ -12,7 +12,43 @@ We introduce a self-supervised vision representation model BEiT, which stands fo
<img src="https://user-images.githubusercontent.com/36138628/203688351-adac7146-4e71-4ab6-8958-5cfe643a2dc5.png" width="70%"/>
</div>
## Results and models
## Self-supervised Learning
### ImageNet-1k
<table class="docutils">
<thead>
<tr>
<th rowspan="2">Algorithm</th>
<th rowspan="2">Backbone</th>
<th rowspan="2">Epoch</th>
<th rowspan="2">Batch Size</th>
<th colspan="2" align="center">Results (Top-1 %)</th>
<th colspan="3" align="center">Links</th>
</tr>
<tr>
<th>Linear Eval</th>
<th>Fine-tuning</th>
<th>Pretrain</th>
<th>Linear Eval</th>
<th>Fine-tuning</th>
</tr>
</thead>
<tbody>
<tr>
<td>BEiT</td>
<td>ViT-base</td>
<td>300</td>
<td>2048</td>
<td>/</td>
<td>83.1</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/beit/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/beit/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k_20221128-ab79e626.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/beit/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k_20221123_103802.json'>log</a></td>
<td>/</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/beit/classification/vit-base-p16_ft-8xb128-coslr-100e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/beit/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20221128-0ca393e9.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/beit/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20221127_162126.json'>log</a></td>
</tr>
</tbody>
</table>
## Classification
### ImageNet-1k
@@ -22,17 +58,13 @@ We introduce a self-supervised vision representation model BEiT, which stands fo
*Models with * are converted from the [official repo](https://github.com/microsoft/unilm/tree/master/beit). The config files of these models are only for inference.*
For BEiT self-supervised learning algorithm, welcome to [MMSelfSup page](https://github.com/open-mmlab/mmselfsup/tree/dev-1.x/configs/selfsup/beit) to get more information.
## Citation
```bibtex
@article{beit,
title={{BEiT}: {BERT} Pre-Training of Image Transformers},
author={Hangbo Bao and Li Dong and Furu Wei},
year={2021},
eprint={2106.08254},
archivePrefix={arXiv},
primaryClass={cs.CV}
@inproceedings{bao2022beit,
title={{BE}iT: {BERT} Pre-Training of Image Transformers},
author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei},
booktitle={International Conference on Learning Representations},
year={2022},
}
```

View File

@@ -0,0 +1,132 @@
_base_ = '../_base_/default_runtime.py'
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='TwoNormDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
second_mean=[-20.4, -20.4, -20.4],
second_std=[204., 204., 204.],
bgr_to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.4,
hue=0.),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandomResizedCropAndInterpolationWithTwoPic',
size=224,
second_size=112,
interpolation='bicubic',
second_interpolation='lanczos',
scale=(0.08, 1.0)),
dict(
type='BEiTMaskGenerator',
input_size=(14, 14),
num_masking_patches=75,
max_num_patches=None,
min_num_patches=16),
dict(
type='PackSelfSupInputs',
algorithm_keys=['mask'],
meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=256,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))
# model settings
model = dict(
type='BEiT',
backbone=dict(
type='BEiTViT',
arch='base',
patch_size=16,
drop_path_rate=0.1,
final_norm=True,
layer_scale_init_value=0.1,
init_cfg=[
dict(type='TruncNormal', std=0.02, layer='Linear'),
dict(type='TruncNormal', std=0.02, layer='Conv2d'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
neck=None,
head=dict(
type='BEiTV1Head',
embed_dims=768,
num_embed=8192,
loss=dict(type='BEiTLoss')),
target_generator=dict(
type='DALL-E',
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa: E251
'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/dalle_encoder.pth', # noqa: E501
)))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW', lr=1.5e-3, betas=(0.9, 0.999), weight_decay=0.05),
clip_grad=dict(max_norm=3.0),
paramwise_cfg=dict(
custom_keys={
# the following configurations are designed for BEiT
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'q_bias': dict(decay_mult=0.0),
'v_bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0),
'.gamma': dict(decay_mult=0.0),
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=1e-5,
by_epoch=True,
begin=10,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
find_unused_parameters = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
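
The `BEiTMaskGenerator` settings above fix the masking budget; a quick sketch, assuming blockwise masking on the 14x14 token grid until `num_masking_patches` is reached:

```python
grid_tokens = 14 * 14       # 196 patch tokens for ViT-B/16 at 224x224
num_masking_patches = 75
mask_fraction = num_masking_patches / grid_tokens  # ~0.38 of tokens masked
print(round(mask_fraction, 2))
```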

View File

@@ -0,0 +1,134 @@
# mmcls:: means we use the default settings from MMClassification
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
data_preprocessor = dict(
num_classes=1000,
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='BEiT',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
avg_token=True,
output_cls_token=False,
use_abs_pos_emb=False,
use_rel_pos_bias=True,
use_shared_rel_pos_bias=False),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.02)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
file_client_args = dict(backend='disk')
train_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=4e-3,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit',
layer_decay_rate=0.65),
constructor='mmselfsup.LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
_delete_=True,
custom_keys={
# the following configurations are designed for BEiT
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'q_bias': dict(decay_mult=0.0),
'v_bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0),
'.gamma': dict(decay_mult=0.0),
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=20,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=20,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=2))
train_cfg = dict(by_epoch=True, max_epochs=100)
randomness = dict(seed=0)
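
The `layer_decay_rate=0.65` above applies layer-wise LR decay; a hedged sketch under the commonly used rule (the exact indexing inside `LearningRateDecayOptimWrapperConstructor` may differ):

```python
# Assumed rule: lr_i = base_lr * rate ** (num_layers + 1 - i), where i=0 is
# the patch embedding and i=num_layers+1 is the classification head.
base_lr, rate, num_layers = 4e-3, 0.65, 12  # ViT-B has 12 blocks
layer_lrs = [base_lr * rate ** (num_layers + 1 - i)
             for i in range(num_layers + 2)]
# layer_lrs[0] is the smallest (earliest layer); layer_lrs[-1] == base_lr
```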

View File

@@ -1,7 +1,7 @@
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
data_preprocessor = dict(

View File

@@ -29,13 +29,38 @@ Models:
- ImageNet-21k
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 85.28
Top 5 Accuracy: 97.59
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 85.28
Top 5 Accuracy: 97.59
Weights: https://download.openmmlab.com/mmclassification/v0/beit/beit-base_3rdparty_in1k_20221114-c0a4df23.pth
Converted From:
Weights: https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth
Code: https://github.com/microsoft/unilm/tree/master/beit
Config: configs/beit/beit-base-p16_8xb64_in1k.py
Config: configs/beit/benchmarks/beit-base-p16_8xb64_in1k.py
- Name: beit_beit-base-p16_8xb256-amp-coslr-300e_in1k
In Collection: BEiT
Metadata:
Epochs: 300
Batch Size: 2048
FLOPs: 17581219584
Parameters: 86530984
Results: null
Config: configs/beit/beit_beit-base-p16_8xb256-amp-coslr-300e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/beit/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k_20221128-ab79e626.pth
Downstream:
- beit-base-p16_beit-pre_8xb128-coslr-100e_in1k
- Name: beit-base-p16_beit-pre_8xb128-coslr-100e_in1k
In Collection: BEiT
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.1
Config: configs/beit/benchmarks/beit-base-p16_8xb128-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/beit/beit_vit-base-p16_8xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20221128-0ca393e9.pth

View File

@@ -0,0 +1,119 @@
_base_ = [
'../_base_/datasets/imagenet_bs256_beitv2.py',
'../_base_/default_runtime.py',
]
# model settings
vqkd_encoder = dict(
arch='base',
img_size=224,
patch_size=16,
in_channels=3,
out_indices=-1,
drop_rate=0.,
drop_path_rate=0.,
norm_cfg=dict(type='LN', eps=1e-6),
final_norm=True,
with_cls_token=True,
avg_token=False,
frozen_stages=-1,
output_cls_token=False,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
layer_scale_init_value=0.,
interpolate_mode='bicubic',
patch_cfg=dict(),
layer_cfgs=dict(),
init_cfg=None)
layer_scale_init_value = 0.1
drop_path_rate = 0.1 # 0. for 300 epochs and 0.1 for 1600 epochs.
model = dict(
type='BEiT',
backbone=dict(
type='BEiTViT',
arch='base',
patch_size=16,
out_indices=[-4, -1],
drop_path_rate=drop_path_rate,
final_norm=False,
layer_scale_init_value=layer_scale_init_value,
init_cfg=[
dict(type='TruncNormal', std=0.02, layer='Linear'),
dict(type='TruncNormal', std=0.02, layer='Conv2d'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
neck=dict(
type='BEiTV2Neck',
num_layers=2,
early_layers=9,
backbone_arch='base',
drop_path_rate=drop_path_rate,
layer_scale_init_value=layer_scale_init_value,
),
head=dict(
type='BEiTV2Head',
embed_dims=768,
num_embed=8192,
loss=dict(type='BEiTLoss')),
target_generator=dict(
type='VQKD',
encoder_config=vqkd_encoder,
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa
'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/vqkd_encoder.pth' # noqa
)))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
# betas: (0.9, 0.98) for 300 epochs and (0.9, 0.999) for 1600 epochs.
optimizer=dict(
type='AdamW', lr=1.5e-3, betas=(0.9, 0.999), weight_decay=0.05),
clip_grad=dict(max_norm=3.0),
paramwise_cfg=dict(
custom_keys={
# the following configurations are designed for BEiT
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'q_bias': dict(decay_mult=0.0),
'v_bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0),
'.gamma': dict(decay_mult=0.0),
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=1e-5,
by_epoch=True,
begin=10,
end=1600,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=1600)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
find_unused_parameters = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)

View File

@@ -0,0 +1,119 @@
_base_ = [
'../_base_/datasets/imagenet_bs256_beitv2.py',
'../_base_/default_runtime.py',
]
# model settings
vqkd_encoder = dict(
arch='base',
img_size=224,
patch_size=16,
in_channels=3,
out_indices=-1,
drop_rate=0.,
drop_path_rate=0.,
norm_cfg=dict(type='LN', eps=1e-6),
final_norm=True,
with_cls_token=True,
avg_token=False,
frozen_stages=-1,
output_cls_token=False,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
layer_scale_init_value=0.,
interpolate_mode='bicubic',
patch_cfg=dict(),
layer_cfgs=dict(),
init_cfg=None)
layer_scale_init_value = 0.1
drop_path_rate = 0. # 0. for 300 epochs and 0.1 for 1600 epochs.
model = dict(
type='BEiT',
backbone=dict(
type='BEiTViT',
arch='base',
patch_size=16,
out_indices=[-4, -1],
drop_path_rate=drop_path_rate,
final_norm=False,
layer_scale_init_value=layer_scale_init_value,
init_cfg=[
dict(type='TruncNormal', std=0.02, layer='Linear'),
dict(type='TruncNormal', std=0.02, layer='Conv2d'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
neck=dict(
type='BEiTV2Neck',
num_layers=2,
early_layers=9,
backbone_arch='base',
drop_path_rate=drop_path_rate,
layer_scale_init_value=layer_scale_init_value,
),
head=dict(
type='BEiTV2Head',
embed_dims=768,
num_embed=8192,
loss=dict(type='BEiTLoss')),
target_generator=dict(
type='VQKD',
encoder_config=vqkd_encoder,
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa
'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/vqkd_encoder.pth' # noqa
)))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
# betas: (0.9, 0.98) for 300 epochs and (0.9, 0.999) for 1600 epochs.
optimizer=dict(
type='AdamW', lr=1.5e-3, betas=(0.9, 0.98), weight_decay=0.05),
clip_grad=dict(max_norm=3.0),
paramwise_cfg=dict(
custom_keys={
# the following configurations are designed for BEiT
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'q_bias': dict(decay_mult=0.0),
'v_bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0),
'.gamma': dict(decay_mult=0.0),
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
eta_min=1e-5,
by_epoch=True,
begin=10,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
find_unused_parameters = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)

View File

@@ -0,0 +1,128 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='BEiT',
arch='base',
img_size=224,
patch_size=16,
# 0.2 for 1600 epochs pretrained models and 0.1 for 300 epochs.
drop_path_rate=0.1,
avg_token=True,
output_cls_token=False,
use_abs_pos_emb=False,
use_rel_pos_bias=True,
use_shared_rel_pos_bias=False),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.02)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
file_client_args = dict(backend='disk')
train_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=5e-4,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit',
# 0.6 for 1600 epochs pretrained models and 0.65 for 300 epochs
layer_decay_rate=0.65),
constructor='mmselfsup.LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
_delete_=True,
custom_keys={
# the following configurations are designed for BEiT
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'q_bias': dict(decay_mult=0.0),
'v_bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0),
'.gamma': dict(decay_mult=0.0),
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=20,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
by_epoch=True,
begin=20,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=2))
train_cfg = dict(by_epoch=True, max_epochs=100)
randomness = dict(seed=0)

View File

@@ -1,7 +1,7 @@
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
model = dict(

View File

@@ -29,13 +29,36 @@ Models:
- ImageNet-21k
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 86.47
Top 5 Accuracy: 97.99
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 86.47
Top 5 Accuracy: 97.99
Weights: https://download.openmmlab.com/mmclassification/v0/beit/beitv2-base_3rdparty_in1k_20221114-73e11905.pth
Converted From:
Weights: https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth
Code: https://github.com/microsoft/unilm/tree/master/beit2
Config: configs/beitv2/beitv2-base-p16_8xb64_in1k.py
Config: configs/beitv2/benchmarks/beit-base-p16_8xb64_in1k.py
- Name: beitv2_beit-base-p16_8xb256-amp-coslr-300e_in1k
In Collection: BEiTv2
Metadata:
Epochs: 300
Batch Size: 2048
Results: null
Config: configs/beitv2/beitv2_beit-base-p16_8xb256-amp-coslr-300e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/beitv2/beitv2_vit-base-p16_8xb256-amp-coslr-300e_in1k/beitv2_vit-base-p16_8xb256-amp-coslr-300e_in1k_20221212-a157be30.pth
Downstream:
- beit-base-p16_beitv2-pre_8xb128-coslr-100e_in1k
- Name: beit-base-p16_beitv2-pre_8xb128-coslr-100e_in1k
In Collection: BEiTv2
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.0
Config: configs/beitv2/benchmarks/beit-base-p16_8xb128-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/beitv2/beitv2_vit-base-p16_8xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20221212-d1c0789e.pth

View File

@ -0,0 +1,134 @@
# BYOL
> [Bootstrap your own latent: A new approach to self-supervised Learning](https://arxiv.org/abs/2006.07733)
<!-- [ALGORITHM] -->
## Abstract
**B**ootstrap **Y**our **O**wn **L**atent (BYOL) is a new approach to self-supervised image representation learning. BYOL relies on two neural networks, referred to as online and target networks, that interact and learn from each other. From an augmented view of an image, we train the online network to predict the target network representation of the same image under a different augmented view. At the same time, we update the target network with a slow-moving average of the online network.
<div align="center">
<img src="https://user-images.githubusercontent.com/36138628/149720208-5ffbee78-1437-44c7-9ddb-b8caab60d2c3.png" width="800" />
</div>
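To make the slow-moving average concrete, here is a minimal sketch of the momentum update a BYOL-style target network performs, assuming plain PyTorch modules (the pre-training config in this PR uses `base_momentum=0.99`):

```python
import torch

@torch.no_grad()
def momentum_update(online: torch.nn.Module, target: torch.nn.Module,
                    momentum: float = 0.99) -> None:
    """EMA update: target = m * target + (1 - m) * online."""
    for p_online, p_target in zip(online.parameters(), target.parameters()):
        p_target.mul_(momentum).add_(p_online, alpha=1.0 - momentum)

# Usage sketch: call once per iteration, after the optimizer step.
online, target = torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)
momentum_update(online, target)
```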
## Models and Benchmarks
On this page, we provide as many benchmarks as possible to evaluate our pre-trained models. Unless mentioned otherwise, all models are pre-trained on the ImageNet-1k dataset.
### Classification
The classification benchmarks include 4 downstream task datasets, **VOC**, **ImageNet**, **iNaturalist2018** and **Places205**. If not specified, the results are Top-1 (%).
#### VOC SVM / Low-shot SVM
The **Best Layer** indicates which layer's feature map yields the best result. For example, if the **Best Layer** is **feature3**, the best result is obtained from the second stage of ResNet (1 for the stem layer, 2-5 for the 4 stage layers).
Besides, k=1 to 96 indicates the hyper-parameter k of the Low-shot SVM.
| Self-Supervised Config | Best Layer | SVM | k=1 | k=2 | k=4 | k=8 | k=16 | k=32 | k=64 | k=96 |
| ------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- |
| [resnet50_8xb32-accum16-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | feature5 | 86.31 | 45.37 | 56.83 | 68.47 | 74.12 | 78.30 | 81.53 | 83.56 | 84.73 |
#### ImageNet Linear Evaluation
**Feature1 - Feature5** are features without GlobalAveragePooling: each feature map is pooled to a specific dimension and then fed into a linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of the config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 |
| --------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- |
| [resnet50_8xb32-accum16-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 15.16 | 35.26 | 47.77 | 63.10 | 71.21 |
| [resnet50_16xb256-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_16xb256-coslr-200e_in1k.py) | 15.41 | 35.15 | 47.77 | 62.59 | 71.85 |
<table class="docutils">
<thead>
<tr>
<th rowspan="2">Algorithm</th>
<th rowspan="2">Backbone</th>
<th rowspan="2">Epoch</th>
<th rowspan="2">Batch Size</th>
<th colspan="2" align="center">Results (Top-1 %)</th>
<th colspan="3" align="center">Links</th>
</tr>
<tr>
<th>Linear Eval</th>
<th>Fine-tuning</th>
<th>Pretrain</th>
<th>Linear Eval</th>
<th>Fine-tuning</th>
</tr>
</thead>
<tbody>
<tr>
<td>BYOL</td>
<td>ResNet50</td>
<td>200</td>
<td>4096</td>
<td>71.8</td>
<td>/</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/byol/byol_resnet50_16xb256-coslr-200e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/byol_resnet50_16xb256-coslr-200e_in1k_20220825-de817331.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/byol_resnet50_16xb256-coslr-200e_in1k_20220721_150515.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/resnet50_linear-8xb512-coslr-90e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/resnet50_linear-8xb512-coslr-90e_in1k/resnet50_linear-8xb512-coslr-90e_in1k_20220825-7596c6f5.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/resnet50_linear-8xb512-coslr-90e_in1k/resnet50_linear-8xb512-coslr-90e_in1k_20220724_130251.json'>log</a></td>
<td>/</td>
</tr>
</tbody>
</table>
#### Places205 Linear Evaluation
**Feature1 - Feature5** are features without GlobalAveragePooling: each feature map is pooled to a specific dimension and then fed into a linear layer for classification. Please refer to [resnet50_mhead_8xb32-steplr-28e_places205.py](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/classification/places205/resnet50_mhead_8xb32-steplr-28e_places205.py) for details of the config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 |
| --------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- |
| [resnet50_8xb32-accum16-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 21.25 | 36.55 | 43.66 | 50.74 | 53.82 |
| [resnet50_8xb32-accum16-coslr-300e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | 21.18 | 36.68 | 43.42 | 51.04 | 54.06 |
#### ImageNet Nearest-Neighbor Classification
The results are obtained from the features after GlobalAveragePooling. Here, k=10 to 200 indicates different numbers of nearest neighbors.
| Self-Supervised Config | k=10 | k=20 | k=100 | k=200 |
| --------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ---- | ----- | ----- |
| [resnet50_8xb32-accum16-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 63.9 | 64.2 | 62.9 | 61.9 |
| [resnet50_8xb32-accum16-coslr-300e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | 66.1 | 66.3 | 65.2 | 64.4 |
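As a rough sketch of how such a nearest-neighbor evaluation can be run on the pooled features (a plain cosine-similarity k-NN with similarity-weighted votes, not the exact benchmark script):

```python
import torch

def knn_classify(train_feats, train_labels, test_feats, k=20, num_classes=1000):
    """Cosine-similarity k-NN over globally average-pooled features."""
    train_feats = torch.nn.functional.normalize(train_feats, dim=1)
    test_feats = torch.nn.functional.normalize(test_feats, dim=1)
    sim = test_feats @ train_feats.T                 # (N_test, N_train)
    topk_sim, topk_idx = sim.topk(k, dim=1)
    votes = torch.zeros(test_feats.size(0), num_classes)
    votes.scatter_add_(1, train_labels[topk_idx], topk_sim)  # weighted votes
    return votes.argmax(dim=1)
```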
### Detection
The detection benchmarks include 2 downstream task datasets, **Pascal VOC 2007 + 2012** and **COCO2017**. This benchmark follows the evaluation protocols set up by MoCo.
#### Pascal VOC 2007 + 2012
Please refer to [config](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/mmdetection/voc0712/faster-rcnn_r50-c4_ms-24k_voc0712.py) for details.
| Self-Supervised Config | AP50 |
| --------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| [resnet50_8xb32-accum16-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 80.35 |
#### COCO2017
Please refer to [config](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/mmdetection/coco/mask-rcnn_r50_fpn_ms-1x_coco.py) for details.
| Self-Supervised Config | mAP(Box) | AP50(Box) | AP75(Box) | mAP(Mask) | AP50(Mask) | AP75(Mask) |
| ------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | --------- | --------- | --------- | ---------- | ---------- |
| [resnet50_8xb32-accum16-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 40.9 | 61.0 | 44.6 | 36.8 | 58.1 | 39.5 |
### Segmentation
The segmentation benchmarks include 2 downstream task datasets, **Cityscapes** and **Pascal VOC 2012 + Aug**. It follows the evaluation protocols set up by MMSegmentation.
#### Pascal VOC 2012 + Aug
Please refer to [config](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/mmsegmentation/voc12aug/fcn_r50-d8_4xb4-20k_voc12aug-512x512.py) for details.
| Self-Supervised Config | mIOU |
| --------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| [resnet50_8xb32-accum16-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 67.16 |
## Citation
```bibtex
@inproceedings{grill2020bootstrap,
title={Bootstrap your own latent: A new approach to self-supervised learning},
author={Grill, Jean-Bastien and Strub, Florian and Altch{\'e}, Florent and Tallec, Corentin and Richemond, Pierre H and Buchatskaya, Elena and Doersch, Carl and Pires, Bernardo Avila and Guo, Zhaohan Daniel and Azar, Mohammad Gheshlaghi and others},
booktitle={NeurIPS},
year={2020}
}
```

View File

@ -0,0 +1,15 @@
_base_ = [
'../../_base_/models/resnet50.py',
'../../_base_/datasets/imagenet_bs32_pillow.py',
'../../_base_/schedules/imagenet_lars_coslr_90e.py',
'../../_base_/default_runtime.py',
]
model = dict(backbone=dict(frozen_stages=4))
# dataset summary
train_dataloader = dict(batch_size=512)
# runtime settings
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
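`frozen_stages=4` freezes the entire ResNet backbone, so only the appended linear head is trained. A minimal sketch of what that amounts to, assuming a torchvision ResNet rather than the MMSelfSup backbone:

```python
from torchvision.models import resnet50

model = resnet50()
# Train only the final linear classifier; everything else stays frozen.
for name, param in model.named_parameters():
    param.requires_grad = name.startswith('fc.')
model.eval()  # also keep BN running statistics fixed during the probe
print([n for n, p in model.named_parameters() if p.requires_grad])
# ['fc.weight', 'fc.bias']
```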

View File

@ -0,0 +1,61 @@
_base_ = [
'../_base_/datasets/imagenet_bs32_byol.py',
'../_base_/schedules/imagenet_lars_coslr_200e.py',
'../_base_/default_runtime.py',
]
train_dataloader = dict(batch_size=256)
# model settings
model = dict(
type='BYOL',
base_momentum=0.99,
backbone=dict(
type='ResNet',
depth=50,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='SyncBN')),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=True,
with_last_bn=False,
with_avg_pool=True),
head=dict(
type='LatentPredictHead',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=True,
with_last_bn=False,
with_avg_pool=False),
loss=dict(type='CosineSimilarityLoss')),
)
# optimizer
optimizer = dict(type='LARS', lr=4.8, momentum=0.9, weight_decay=1e-6)
optim_wrapper = dict(
type='OptimWrapper',
optimizer=optimizer,
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lars_exclude=True),
'bias': dict(decay_mult=0, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(decay_mult=0, lars_exclude=True),
}),
)
# runtime settings
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
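Concretely, the linear scaling rule behind `auto_scale_lr` multiplies the configured lr by the ratio of the actual total batch size to `base_batch_size`. A one-function sketch:

```python
def scale_lr(base_lr: float, actual_batch_size: int,
             base_batch_size: int = 4096) -> float:
    """Linear scaling rule: lr grows proportionally with total batch size."""
    return base_lr * actual_batch_size / base_batch_size

# E.g. running the config above on 8 GPUs x 256 samples instead of 16 x 256:
print(scale_lr(4.8, actual_batch_size=8 * 256))  # 2.4
```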

View File

@ -0,0 +1,38 @@
Collections:
- Name: BYOL
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- LARS
Training Resources: 8x V100 GPUs (b256), 16x A100-80G GPUs (b4096)
Architecture:
- ResNet
- BYOL
Paper:
URL: https://arxiv.org/abs/2006.07733
Title: "Bootstrap your own latent: A new approach to self-supervised Learning"
README: configs/byol/README.md
Models:
- Name: byol_resnet50_16xb256-coslr-200e_in1k
In Collection: BYOL
Metadata:
Epochs: 200
Batch Size: 4096
Results: null
Config: configs/byol/byol_resnet50_16xb256-coslr-200e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/byol_resnet50_16xb256-coslr-200e_in1k_20220825-de817331.pth
Downstream:
- resnet50_byol-pre_8xb512-linear-coslr-90e_in1k
- Name: resnet50_byol-pre_8xb512-linear-coslr-90e_in1k
In Collection: BYOL
Metadata:
Epochs: 90
Batch Size: 4096
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 71.8
Config: configs/byol/benchmarks/resnet50_8xb512-linear-coslr-90e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/byol/byol_resnet50_16xb256-coslr-200e_in1k/resnet50_linear-8xb512-coslr-90e_in1k/resnet50_linear-8xb512-coslr-90e_in1k_20220825-7596c6f5.pth
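The metafile above follows the new convention: the pre-training entry carries `Results: null` and points to its benchmark models through `Downstream`. A quick sketch of walking that link with plain `yaml` (the path is illustrative):

```python
import yaml

# Path is illustrative; any metafile following the new convention works.
with open('configs/byol/metafile.yml') as f:
    metafile = yaml.safe_load(f)

for model in metafile.get('Models', []):
    # Pre-training entries have `Results: null` plus a `Downstream` list;
    # the downstream benchmark entries carry the actual metrics.
    print(model['Name'], '->', model.get('Downstream', []))
```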

View File

@ -0,0 +1,39 @@
# CAE
> [Context Autoencoder for Self-Supervised Representation Learning](https://arxiv.org/abs/2202.03026)
<!-- [ALGORITHM] -->
## Abstract
We present a novel masked image modeling (MIM) approach, context autoencoder (CAE), for self-supervised learning. We randomly partition the image into two sets: visible patches and masked patches. The CAE architecture consists of: (i) an encoder that takes visible patches as input and outputs their latent representations, (ii) a latent context regressor that predicts the masked patch representations from the visible patch representations that are not updated in this regressor, (iii) a decoder that takes the estimated masked patch representations as input and makes predictions for the masked patches, and (iv) an alignment module that aligns the masked patch representation estimation with the masked patch representations computed from the encoder. In comparison to previous MIM methods that couple the encoding and decoding roles, e.g., using a single module in BEiT, our approach attempts to separate the encoding role (content understanding) from the decoding role (making predictions for masked patches) using different modules, improving the content understanding capability. In addition, our approach makes predictions from the visible patches to the masked patches in the latent representation space that is expected to take on semantics. In addition, we present the explanations about why contrastive pretraining and supervised pretraining perform similarly and why MIM potentially performs better. We demonstrate the effectiveness of our CAE through superior transfer performance in downstream tasks: semantic segmentation, and object detection and instance segmentation.
<div align="center">
<img src="https://user-images.githubusercontent.com/30762564/165459947-6c6ef13c-0593-4765-b44e-6da0a079802a.png" width="40%"/>
</div>
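To summarize the four components described above, a schematic forward pass might look like the sketch below, where every callable is a stand-in module (this follows the paper's description, not the mmselfsup implementation):

```python
def cae_forward(img, encoder, regressor, decoder, target_encoder, tokenizer,
                visible_idx, masked_idx):
    """Schematic CAE forward pass; all arguments are stand-ins."""
    z_vis = encoder(img, visible_idx)                # (i) encode visible patches
    z_msk = regressor(z_vis, masked_idx)             # (ii) regress masked latents
    pred = decoder(z_msk)                            # (iii) predict masked patches
    align_target = target_encoder(img, masked_idx)   # (iv) alignment target
    token_target = tokenizer(img)                    # discrete targets (DALL-E)
    return pred, token_target, z_msk, align_target
```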
## Prerequisite
Create a new folder `cae_ckpt` under the root directory and download the
[weights](https://download.openmmlab.com/mmselfsup/cae/dalle_encoder.pth) for the `dalle` encoder into that folder.
## Models and Benchmarks
Here we report the results of the model pre-trained on ImageNet-1k
for 300 epochs; the details are below:
| Backbone | Pre-train epoch | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download |
| :------: | :-------------: | :---------------: | :-----------------------------------------------------: | :-------------------------------------------------------: | :----------------------------------------------: |
| ViT-B/16 | 300 | 83.2 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.pth) \| [log](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.log.json) |
## Citation
```bibtex
@article{CAE,
title={Context Autoencoder for Self-Supervised Representation Learning},
author={Xiaokang Chen, Mingyu Ding, Xiaodi Wang, Ying Xin, Shentong Mo,
Yunhao Wang, Shumin Han, Ping Luo, Gang Zeng, Jingdong Wang},
journal={ArXiv},
year={2022}
}
```

View File

@ -0,0 +1,136 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# CAE fine-tuning setting
# dataset
data_preprocessor = dict(
num_classes=1000,
# RGB format normalization parameters
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
# convert image from BGR to RGB
to_rgb=True,
)
bgr_mean = data_preprocessor['mean'][::-1]
bgr_std = data_preprocessor['std'][::-1]
file_client_args = dict(backend='disk')
train_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(
pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=1 / 3,
fill_color=bgr_mean,
fill_std=bgr_std),
dict(type='PackClsInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline), batch_size=128)
val_dataloader = dict(dataset=dict(pipeline=test_pipeline), batch_size=128)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='BEiT',
arch='base',
img_size=224,
patch_size=16,
avg_token=True, # use average token for cls head
final_norm=False, # do not use final norm
drop_path_rate=0.1,
layer_scale_init_value=0.1,
output_cls_token=False,
use_abs_pos_emb=True,
use_rel_pos_bias=True,
use_shared_rel_pos_bias=False,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=dict(type='TruncNormal', layer='Linear', std=2e-5)),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=8e-3,
betas=(0.9, 0.999),
weight_decay=0.05,
model_type='vit', # layer-wise lr decay type
layer_decay_rate=0.65), # layer-wise lr decay factor
constructor='mmselfsup.LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
by_epoch=True,
begin=5,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
train_cfg = dict(by_epoch=True, max_epochs=100)
randomness = dict(seed=0)
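The `train_cfg.augments` entry above mixes samples within each batch. A minimal sketch of the Mixup half, assuming one-hot targets (CutMix instead swaps rectangular image regions with the same label-blending rule):

```python
import torch

def mixup(images, one_hot_targets, alpha: float = 0.8):
    """Blend random sample pairs: x = lam * x + (1 - lam) * x[perm]."""
    lam = float(torch.distributions.Beta(alpha, alpha).sample())
    perm = torch.randperm(images.size(0))
    mixed = lam * images + (1 - lam) * images[perm]
    targets = lam * one_hot_targets + (1 - lam) * one_hot_targets[perm]
    return mixed, targets

imgs = torch.rand(4, 3, 224, 224)
labels = torch.nn.functional.one_hot(torch.randint(0, 1000, (4,)), 1000).float()
mixed_imgs, mixed_labels = mixup(imgs, labels)
```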

View File

@ -0,0 +1,4 @@
_base_ = 'cae_vit-base-p16_8xb256-amp-coslr-300e_in1k.py'
# dataset 128 x 16
train_dataloader = dict(batch_size=128)

View File

@ -0,0 +1,122 @@
_base_ = '../_base_/default_runtime.py'
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
file_client_args = dict(backend='disk')
data_preprocessor = dict(
type='CAEDataPreprocessor',
mean=[124, 117, 104],
std=[59, 58, 58],
bgr_to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='RandomFlip', prob=0.5),
dict(
type='RandomResizedCropAndInterpolationWithTwoPic',
size=224,
second_size=112,
interpolation='bicubic',
second_interpolation='lanczos',
scale=(0.08, 1.0)),
dict(
type='BEiTMaskGenerator',
input_size=(14, 14),
num_masking_patches=75,
max_num_patches=None,
min_num_patches=16),
dict(
type='PackSelfSupInputs',
algorithm_keys=['mask'],
meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=256,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))
# model settings
model = dict(
type='CAE',
backbone=dict(
type='CAEViT',
arch='b',
patch_size=16,
init_values=0.1,
qkv_bias=False),
neck=dict(
type='CAENeck',
patch_size=16,
embed_dims=768,
num_heads=12,
regressor_depth=4,
decoder_depth=4,
mlp_ratio=4,
init_values=0.1,
),
head=dict(type='CAEHead', loss=dict(type='CAELoss', lambd=2)),
target_generator=dict(
type='DALL-E',
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa: E251
'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/dalle_encoder.pth', # noqa: E501
)),
data_preprocessor=dict(
type='mmselfsup.CAEDataPreprocessor',
mean=[124, 117, 104],
std=[59, 58, 58],
bgr_to_rgb=True),
base_momentum=0.0)
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='AdamW', lr=1.5e-3, betas=(0.9, 0.999)),
clip_grad=dict(max_norm=3.0),
paramwise_cfg=dict(
bias_decay_mult=0.0, norm_decay_mult=0.0, flat_decay_mult=0.0))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=290,
eta_min=1e-5,
by_epoch=True,
begin=10,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
find_unused_parameters = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
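The `BEiTMaskGenerator` above masks 75 of the 14 x 14 = 196 patch positions (roughly 38%). As an illustrative stand-in, uniform random masking over the patch grid looks like this (the real generator samples block-shaped regions instead):

```python
import torch

def random_patch_mask(input_size=(14, 14), num_masking_patches=75):
    """Uniform random mask over the patch grid; 1 = masked, 0 = visible."""
    h, w = input_size
    mask = torch.zeros(h * w)
    mask[torch.randperm(h * w)[:num_masking_patches]] = 1
    return mask.reshape(h, w)

print(random_patch_mask().sum())  # tensor(75.)
```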

View File

@ -0,0 +1,37 @@
Collections:
- Name: CAE
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- AdamW
Training Resources: 16x A100-80G GPUs
Architecture:
- ViT
Paper:
URL: https://arxiv.org/abs/2202.03026
Title: "Context Autoencoder for Self-Supervised Representation Learning"
README: configs/cae/README.md
Models:
- Name: cae_vit-base-p16_8xb256-amp-coslr-300e_in1k
In Collection: CAE
Metadata:
Epochs: 300
Batch Size: 2048
Results: null
Config: configs/cae/cae_vit-base-p16_16xb128-amp-coslr-300e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/cae/cae_vit-base-p16_16xb128-fp16-coslr-300e_in1k/cae_vit-base-p16_16xb128-fp16-coslr-300e_in1k_20220825-404a1929.pth
Downstream:
- beit-base-p16_cae-pre_8xb128-coslr-100e_in1k
- Name: beit-base-p16_cae-pre_8xb128-coslr-100e_in1k
In Collection: CAE
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.2
Config: configs/cae/benchmarks/beit-base-p16_8xb128-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/cae/cae_vit-base-p16_16xb128-fp16-coslr-300e_in1k/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k_20220825-f3d234cd.pth

View File

@ -11,7 +11,7 @@ Collections:
README: configs/convmixer/README.md
Models:
- Name: convmixer-768-32_10xb64_in1k
- Name: convmixer-768-32_3rdparty_in1k
Metadata:
FLOPs: 19623051264
Parameters: 21110248
@ -27,7 +27,7 @@ Models:
Converted From:
Weights: https://github.com/tmp-iclr/convmixer/releases/download/v1.0/convmixer_768_32_ks7_p7_relu.pth.tar
Code: https://github.com/locuslab/convmixer
- Name: convmixer-1024-20_10xb64_in1k
- Name: convmixer-1024-20_3rdparty_in1k
Metadata:
FLOPs: 5550112768
Parameters: 24383464
@ -43,7 +43,7 @@ Models:
Converted From:
Weights: https://github.com/tmp-iclr/convmixer/releases/download/v1.0/convmixer_1024_20_ks9_p14.pth.tar
Code: https://github.com/locuslab/convmixer
- Name: convmixer-1536-20_10xb64_in1k
- Name: convmixer-1536-20_3rdparty_in1k
Metadata:
FLOPs: 48713170944
Parameters: 51625960

View File

@ -18,22 +18,22 @@ Recently, neural networks purely based on attention were shown to address image
The teacher of the distilled version DeiT is RegNetY-16GF.
| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :-------------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------: | :---------------------------------------------------------------------: |
| DeiT-tiny | From scratch | 5.72 | 1.08 | 74.50 | 92.24 | [config](./deit-tiny_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.log.json) |
| DeiT-tiny distilled\* | From scratch | 5.72 | 1.08 | 74.51 | 91.90 | [config](./deit-tiny-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny-distilled_3rdparty_pt-4xb256_in1k_20211216-c429839a.pth) |
| DeiT-small | From scratch | 22.05 | 4.24 | 80.69 | 95.06 | [config](./deit-small_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.log.json) |
| DeiT-small distilled\* | From scratch | 22.05 | 4.24 | 81.17 | 95.40 | [config](./deit-small-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small-distilled_3rdparty_pt-4xb256_in1k_20211216-4de1d725.pth) |
| DeiT-base | From scratch | 86.57 | 16.86 | 81.76 | 95.81 | [config](./deit-base_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.log.json) |
| DeiT-base\* | From scratch | 86.57 | 16.86 | 81.79 | 95.59 | [config](./deit-base_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_pt-16xb64_in1k_20211124-6f40c188.pth) |
| DeiT-base distilled\* | From scratch | 86.57 | 16.86 | 83.33 | 96.49 | [config](./deit-base-distilled_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_pt-16xb64_in1k_20211216-42891296.pth) |
| DeiT-base 384px\* | ImageNet-1k | 86.86 | 49.37 | 83.04 | 96.31 | [config](./deit-base_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_ft-16xb32_in1k-384px_20211124-822d02f2.pth) |
| DeiT-base distilled 384px\* | ImageNet-1k | 86.86 | 49.37 | 85.55 | 97.35 | [config](./deit-base-distilled_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_ft-16xb32_in1k-384px_20211216-e48d6000.pth) |
| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------------------------: | :-------------------------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------: | :----------------------------------------------: |
| deit-tiny_4xb256_in1k | From scratch | 5.72 | 1.26 | 74.50 | 92.24 | [config](./deit-tiny_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.log.json) |
| deit-tiny-distilled_3rdparty_in1k\* | From scratch | 5.91 | 1.27 | 74.51 | 91.90 | [config](./deit-tiny-distilled_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny-distilled_3rdparty_pt-4xb256_in1k_20211216-c429839a.pth) |
| deit-small_4xb256_in1k | From scratch | 22.05 | 4.61 | 80.69 | 95.06 | [config](./deit-small_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.log.json) |
| deit-small-distilled_3rdparty_in1k\* | From scratch | 22.44 | 4.63 | 81.17 | 95.40 | [config](./deit-small-distilled_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small-distilled_3rdparty_pt-4xb256_in1k_20211216-4de1d725.pth) |
| deit-base_16xb64_in1k | From scratch | 86.57 | 17.58 | 81.76 | 95.81 | [config](./deit-base_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.log.json) |
| deit-base_3rdparty_in1k\* | From scratch | 86.57 | 17.58 | 81.79 | 95.59 | [config](./deit-base_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_pt-16xb64_in1k_20211124-6f40c188.pth) |
| deit-base-distilled_3rdparty_in1k\* | From scratch | 87.34 | 17.67 | 83.33 | 96.49 | [config](./deit-base-distilled_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_pt-16xb64_in1k_20211216-42891296.pth) |
| deit-base_224px-pre_3rdparty_in1k-384px\* | ImageNet-1k 224px | 86.86 | 55.54 | 83.04 | 96.31 | [config](./deit-base_16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_ft-16xb32_in1k-384px_20211124-822d02f2.pth) |
| deit-base-distilled_224px-pre_3rdparty_in1k-384px\* | ImageNet-1k 224px distilled | 87.63 | 55.65 | 85.55 | 97.35 | [config](./deit-base-distilled_16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_ft-16xb32_in1k-384px_20211216-e48d6000.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/deit). The config files of these models are only for validation. We cannot guarantee the training accuracy of these configs, and we welcome you to contribute your reproduction results.*
```{warning}
MMClassification doesn't support training the distilled version DeiT.
MMPretrain doesn't support training the distilled version DeiT.
And we provide distilled version checkpoints for inference only.
```
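For reference, a distilled DeiT carries both a class token and a distillation token, and at inference their predictions are typically averaged. A minimal sketch of such a head, assuming two linear classifiers over the two tokens (per the DeiT paper, not the exact MMPretrain `DeiTClsHead`):

```python
import torch
import torch.nn as nn

class DistilledHeadSketch(nn.Module):
    """Average class-token and distillation-token logits at inference."""

    def __init__(self, in_channels: int = 768, num_classes: int = 1000):
        super().__init__()
        self.cls_fc = nn.Linear(in_channels, num_classes)
        self.dist_fc = nn.Linear(in_channels, num_classes)

    def forward(self, cls_token: torch.Tensor, dist_token: torch.Tensor):
        return (self.cls_fc(cls_token) + self.dist_fc(dist_token)) / 2

head = DistilledHeadSketch()
logits = head(torch.rand(2, 768), torch.rand(2, 768))  # (2, 1000)
```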

View File

@ -0,0 +1,37 @@
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs4096_AdamW.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='DistilledVisionTransformer',
arch='deit-base',
img_size=384,
patch_size=16,
),
neck=None,
head=dict(
type='DeiTClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
# Change to the path of the pretrained model
# init_cfg=dict(type='Pretrained', checkpoint=''),
)
# dataset settings
train_dataloader = dict(batch_size=32)
# schedule settings
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (16 GPUs) x (32 samples per GPU)
auto_scale_lr = dict(base_batch_size=512)

View File

@ -0,0 +1,46 @@
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='DistilledVisionTransformer',
arch='deit-base',
img_size=224,
patch_size=16),
neck=None,
head=dict(
type='DeiTClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]),
)
# dataset settings
train_dataloader = dict(batch_size=64)
# schedule settings
optim_wrapper = dict(
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=5.0),
)

View File

@ -1,9 +0,0 @@
_base_ = './deit-base_ft-16xb32_in1k-384px.py'
# model settings
model = dict(
backbone=dict(type='DistilledVisionTransformer'),
head=dict(type='DeiTClsHead'),
# Change to the path of the pretrained model
# init_cfg=dict(type='Pretrained', checkpoint=''),
)

View File

@ -1,10 +0,0 @@
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(type='DistilledVisionTransformer', arch='deit-base'),
head=dict(type='DeiTClsHead', in_channels=768),
)
# dataset settings
train_dataloader = dict(batch_size=64)

View File

@ -0,0 +1,50 @@
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='deit-base',
img_size=224,
patch_size=16,
drop_path_rate=0.1),
neck=None,
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]),
)
# dataset settings
train_dataloader = dict(batch_size=64)
# schedule settings
optim_wrapper = dict(
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=5.0),
)
# runtime settings
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
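The `EMAHook` keeps an exponential moving average of the model weights for evaluation. Roughly, it performs the per-iteration update sketched below (the exact update form in MMEngine may differ; with `momentum=4e-5` the average moves very slowly):

```python
import torch

@torch.no_grad()
def ema_step(avg_params, params, momentum: float = 4e-5):
    """avg = (1 - m) * avg + m * param, applied tensor by tensor."""
    for avg, p in zip(avg_params, params):
        avg.mul_(1.0 - momentum).add_(p, alpha=momentum)
```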

View File

@ -1,14 +0,0 @@
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(
type='VisionTransformer', arch='deit-base', drop_path_rate=0.1),
head=dict(type='VisionTransformerClsHead', in_channels=768),
)
# dataset settings
train_dataloader = dict(batch_size=64)
# runtime settings
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]

View File

@ -0,0 +1,46 @@
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='DistilledVisionTransformer',
arch='deit-small',
img_size=224,
patch_size=16),
neck=None,
head=dict(
type='DeiTClsHead',
num_classes=1000,
in_channels=384,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]),
)
# data settings
train_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=5.0),
)

View File

@ -1,7 +0,0 @@
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(type='DistilledVisionTransformer', arch='deit-small'),
head=dict(type='DeiTClsHead', in_channels=384),
)

View File

@ -0,0 +1,47 @@
# The distillation config is only for evaluation.
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='DistilledVisionTransformer',
arch='deit-tiny',
img_size=224,
patch_size=16),
neck=None,
head=dict(
type='DeiTClsHead',
num_classes=1000,
in_channels=192,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]),
)
# data settings
train_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=5.0),
)

View File

@ -1,7 +0,0 @@
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(type='DistilledVisionTransformer', arch='deit-tiny'),
head=dict(type='DeiTClsHead', in_channels=192),
)

View File

@ -0,0 +1,48 @@
# In the small and tiny archs, remove drop path and the EMA hook compared
# with the original config
_base_ = [
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='deit-tiny',
img_size=224,
patch_size=16),
neck=None,
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=192,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]),
)
# data settings
train_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=5.0),
)

View File

@ -1,7 +0,0 @@
_base_ = './deit-small_pt-4xb256_in1k.py'
# model settings
model = dict(
backbone=dict(type='VisionTransformer', arch='deit-tiny'),
head=dict(type='VisionTransformerClsHead', in_channels=192),
)

View File

@ -8,47 +8,47 @@ Collections:
- Attention Dropout
- Multi-Head Attention
Paper:
Title: Training data-efficient image transformers & distillation through attention
URL: https://arxiv.org/abs/2012.12877
Title: "Training data-efficient image transformers & distillation through attention"
README: configs/deit/README.md
Code:
URL: https://github.com/open-mmlab/mmclassification/blob/v0.19.0/mmcls/models/backbones/deit.py
Version: v0.19.0
Models:
- Name: deit-tiny_pt-4xb256_in1k
- Name: deit-tiny_4xb256_in1k
Metadata:
FLOPs: 1080000000
Parameters: 5720000
FLOPs: 1258219200
Parameters: 5717416
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 74.50
Top 1 Accuracy: 74.5
Top 5 Accuracy: 92.24
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.pth
Config: configs/deit/deit-tiny_pt-4xb256_in1k.py
- Name: deit-tiny-distilled_3rdparty_pt-4xb256_in1k
Config: configs/deit/deit-tiny_4xb256_in1k.py
- Name: deit-tiny-distilled_3rdparty_in1k
Metadata:
FLOPs: 1080000000
Parameters: 5720000
FLOPs: 1265371776
Parameters: 5910800
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 74.51
Top 5 Accuracy: 91.90
Top 5 Accuracy: 91.9
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny-distilled_3rdparty_pt-4xb256_in1k_20211216-c429839a.pth
Config: configs/deit/deit-tiny-distilled_4xb256_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth
Code: https://github.com/facebookresearch/deit/blob/f5123946205daf72a88783dae94cabff98c49c55/models.py#L108
Config: configs/deit/deit-tiny-distilled_pt-4xb256_in1k.py
- Name: deit-small_pt-4xb256_in1k
- Name: deit-small_4xb256_in1k
Metadata:
FLOPs: 4240000000
Parameters: 22050000
FLOPs: 4607954304
Parameters: 22050664
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
@ -57,27 +57,27 @@ Models:
Top 5 Accuracy: 95.06
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.pth
Config: configs/deit/deit-small_pt-4xb256_in1k.py
- Name: deit-small-distilled_3rdparty_pt-4xb256_in1k
Config: configs/deit/deit-small_4xb256_in1k.py
- Name: deit-small-distilled_3rdparty_in1k
Metadata:
FLOPs: 4240000000
Parameters: 22050000
FLOPs: 4632876288
Parameters: 22436432
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.17
Top 5 Accuracy: 95.40
Top 5 Accuracy: 95.4
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-small-distilled_3rdparty_pt-4xb256_in1k_20211216-4de1d725.pth
Config: configs/deit/deit-small-distilled_4xb256_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth
Code: https://github.com/facebookresearch/deit/blob/f5123946205daf72a88783dae94cabff98c49c55/models.py#L123
Config: configs/deit/deit-small-distilled_pt-4xb256_in1k.py
- Name: deit-base_pt-16xb64_in1k
- Name: deit-base_16xb64_in1k
Metadata:
FLOPs: 16860000000
Parameters: 86570000
FLOPs: 17581972224
Parameters: 86567656
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
@ -86,11 +86,11 @@ Models:
Top 5 Accuracy: 95.81
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.pth
Config: configs/deit/deit-base_pt-16xb64_in1k.py
- Name: deit-base_3rdparty_pt-16xb64_in1k
Config: configs/deit/deit-base_16xb64_in1k.py
- Name: deit-base_3rdparty_in1k
Metadata:
FLOPs: 16860000000
Parameters: 86570000
FLOPs: 17581972224
Parameters: 86567656
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
@ -99,14 +99,14 @@ Models:
Top 5 Accuracy: 95.59
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_pt-16xb64_in1k_20211124-6f40c188.pth
Config: configs/deit/deit-base_16xb64_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth
Code: https://github.com/facebookresearch/deit/blob/f5123946205daf72a88783dae94cabff98c49c55/models.py#L93
Config: configs/deit/deit-base_pt-16xb64_in1k.py
- Name: deit-base-distilled_3rdparty_pt-16xb64_in1k
- Name: deit-base-distilled_3rdparty_in1k
Metadata:
FLOPs: 16860000000
Parameters: 86570000
FLOPs: 17674283520
Parameters: 87338192
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
@ -115,14 +115,14 @@ Models:
Top 5 Accuracy: 96.49
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_pt-16xb64_in1k_20211216-42891296.pth
Config: configs/deit/deit-base-distilled_16xb64_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth
Code: https://github.com/facebookresearch/deit/blob/f5123946205daf72a88783dae94cabff98c49c55/models.py#L138
Config: configs/deit/deit-base-distilled_pt-16xb64_in1k.py
- Name: deit-base_3rdparty_ft-16xb32_in1k-384px
- Name: deit-base_224px-pre_3rdparty_in1k-384px
Metadata:
FLOPs: 49370000000
Parameters: 86860000
FLOPs: 55538974464
Parameters: 86859496
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
@ -131,14 +131,14 @@ Models:
Top 5 Accuracy: 96.31
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_ft-16xb32_in1k-384px_20211124-822d02f2.pth
Config: configs/deit/deit-base_16xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth
Code: https://github.com/facebookresearch/deit/blob/f5123946205daf72a88783dae94cabff98c49c55/models.py#L153
Config: configs/deit/deit-base_ft-16xb32_in1k-384px.py
- Name: deit-base-distilled_3rdparty_ft-16xb32_in1k-384px
- Name: deit-base-distilled_224px-pre_3rdparty_in1k-384px
Metadata:
FLOPs: 49370000000
Parameters: 86860000
FLOPs: 55645294080
Parameters: 87630032
In Collection: DeiT
Results:
- Dataset: ImageNet-1k
@ -147,7 +147,7 @@ Models:
Top 5 Accuracy: 97.35
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_ft-16xb32_in1k-384px_20211216-e48d6000.pth
Config: configs/deit/deit-base-distilled_16xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth
Code: https://github.com/facebookresearch/deit/blob/f5123946205daf72a88783dae94cabff98c49c55/models.py#L168
Config: configs/deit/deit-base-distilled_ft-16xb32_in1k-384px.py

View File

@ -0,0 +1,131 @@
# DenseCL
> [Dense Contrastive Learning for Self-Supervised Visual Pre-Training](https://arxiv.org/abs/2011.09157)
<!-- [ALGORITHM] -->
## Abstract
To date, most existing self-supervised learning methods are designed and optimized for image classification. These pre-trained models can be sub-optimal for dense prediction tasks due to the discrepancy between image-level prediction and pixel-level prediction. To fill this gap, we aim to design an effective, dense self-supervised learning method that directly works at the level of pixels (or local features) by taking into account the correspondence between local features. We present dense contrastive learning (DenseCL), which implements self-supervised learning by optimizing a pairwise contrastive (dis)similarity loss at the pixel level between two views of input images.
<div align="center">
<img src="https://user-images.githubusercontent.com/36138628/149721111-bab03a6d-a30d-418e-b338-43c3689cfc65.png" width="900" />
</div>
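The pixel-level loss needs a correspondence between the two views' feature grids. A minimal sketch of that matching step, pairing each location in one view with its most similar location in the other by cosine similarity (a sketch of the idea, not the mmselfsup implementation):

```python
import torch

def dense_correspondence(feat_q: torch.Tensor, feat_k: torch.Tensor):
    """feat_*: (B, C, H, W). For each query grid location, return the index
    of the most similar key location over the flattened H*W grid."""
    q = torch.nn.functional.normalize(feat_q.flatten(2), dim=1)  # (B, C, HW)
    k = torch.nn.functional.normalize(feat_k.flatten(2), dim=1)
    sim = torch.einsum('bci,bcj->bij', q, k)                     # (B, HW, HW)
    return sim.argmax(dim=2)                                     # (B, HW)

match = dense_correspondence(torch.rand(2, 128, 7, 7), torch.rand(2, 128, 7, 7))
```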
## Models and Benchmarks
On this page, we provide as many benchmarks as possible to evaluate our pre-trained models. Unless mentioned otherwise, all models are pre-trained on the ImageNet-1k dataset.
### Classification
The classification benchmarks include 4 downstream task datasets, **VOC**, **ImageNet**, **iNaturalist2018** and **Places205**. If not specified, the results are Top-1 (%).
#### VOC SVM / Low-shot SVM
The **Best Layer** indicates which layer's feature map yields the best result. For example, if the **Best Layer** is **feature3**, the best result is obtained from the second stage of ResNet (1 for the stem layer, 2-5 for the 4 stage layers).
Besides, k=1 to 96 indicates the hyper-parameter k of the Low-shot SVM.
| Self-Supervised Config | Best Layer | SVM | k=1 | k=2 | k=4 | k=8 | k=16 | k=32 | k=64 | k=96 |
| ----------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | ---- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- |
| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | feature5 | 82.5 | 42.68 | 50.64 | 61.74 | 68.17 | 72.99 | 76.07 | 79.19 | 80.55 |
#### ImageNet Linear Evaluation
**Feature1 - Feature5** are features without GlobalAveragePooling: each feature map is pooled to a specific dimension and then fed into a linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of the config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 |
| ----------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- |
| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 15.86 | 35.47 | 49.46 | 64.06 | 62.95 |
<table class="docutils">
<thead>
<tr>
<th rowspan="2">Algorithm</th>
<th rowspan="2">Backbone</th>
<th rowspan="2">Epoch</th>
<th rowspan="2">Batch Size</th>
<th colspan="2" align="center">Results (Top-1 %)</th>
<th colspan="3" align="center">Links</th>
</tr>
<tr>
<th>Linear Eval</th>
<th>Fine-tuning</th>
<th>Pretrain</th>
<th>Linear Eval</th>
<th>Fine-tuning</th>
</tr>
</thead>
<tbody>
<tr>
<td>DenseCL</td>
<td>ResNet50</td>
<td>200</td>
<td>256</td>
<td>63.5</td>
<td>/</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/densecl/densecl_resnet50_8xb32-coslr-200e_in1k/densecl_resnet50_8xb32-coslr-200e_in1k_20220825-3078723b.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/densecl/densecl_resnet50_8xb32-coslr-200e_in1k/densecl_resnet50_8xb32-coslr-200e_in1k_20220727_221415.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/densecl/densecl_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-f0f0a579.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/densecl/densecl_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220730_091650.json'>log</a></td>
<td>/</td>
</tr>
</tbody>
</table>
#### Places205 Linear Evaluation
**Feature1 - Feature5** are features without GlobalAveragePooling: each feature map is pooled to a specific dimension and then fed into a linear layer for classification. Please refer to [resnet50_mhead_8xb32-steplr-28e_places205.py](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/classification/places205/resnet50_mhead_8xb32-steplr-28e_places205.py) for details of the config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 |
| ----------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- |
| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 21.32 | 36.20 | 43.97 | 51.04 | 50.45 |
#### ImageNet Nearest-Neighbor Classification
The results are obtained from the features after GlobalAveragePooling. Here, k=10 to 200 indicates different numbers of nearest neighbors.
| Self-Supervised Config | k=10 | k=20 | k=100 | k=200 |
| ----------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ---- | ----- | ----- |
| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 48.2 | 48.5 | 46.8 | 45.6 |
### Detection
The detection benchmarks include 2 downstream task datasets, **Pascal VOC 2007 + 2012** and **COCO2017**. This benchmark follows the evaluation protocols set up by MoCo.
#### Pascal VOC 2007 + 2012
Please refer to [config](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/mmdetection/voc0712/faster-rcnn_r50-c4_ms-24k_voc0712.py) for details.
| Self-Supervised Config | AP50 |
| ----------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 82.14 |
#### COCO2017
Please refer to [config](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/mmdetection/coco/mask-rcnn_r50_fpn_ms-1x_coco.py) for details.
| Self-Supervised Config | mAP(Box) | AP50(Box) | AP75(Box) | mAP(Mask) | AP50(Mask) | AP75(Mask) |
| ----------------------------------------------------------------------------------------------------------------------------------------------- | -------- | --------- | --------- | --------- | ---------- | ---------- |
| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | | | | | | |
### Segmentation
The segmentation benchmarks include 2 downstream task datasets, **Cityscapes** and **Pascal VOC 2012 + Aug**. It follows the evaluation protocols set up by MMSegmentation.
#### Pascal VOC 2012 + Aug
Please refer to [config](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/benchmarks/mmsegmentation/voc12aug/fcn_r50-d8_4xb4-20k_voc12aug-512x512.py) for details.
| Self-Supervised Config | mIOU |
| ----------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/1.x/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 69.47 |
## Citation
```bibtex
@inproceedings{wang2021dense,
title={Dense contrastive learning for self-supervised visual pre-training},
author={Wang, Xinlong and Zhang, Rufeng and Shen, Chunhua and Kong, Tao and Li, Lei},
booktitle={CVPR},
year={2021}
}
```

View File

@ -0,0 +1,17 @@
_base_ = [
'../../_base_/models/resnet50.py',
'../../_base_/datasets/imagenet_bs32_pillow.py',
'../../_base_/schedules/imagenet_sgd_steplr_100e.py',
'../../_base_/default_runtime.py',
]
model = dict(backbone=dict(frozen_stages=4))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=30., momentum=0.9, weight_decay=0.))
# runtime settings
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))

View File

@ -0,0 +1,44 @@
_base_ = [
'../_base_/datasets/imagenet_bs32_mocov2.py',
'../_base_/schedules/imagenet_sgd_coslr_200e.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='DenseCL',
queue_len=65536,
feat_dim=128,
momentum=0.999,
loss_lambda=0.5,
data_preprocessor=dict(
mean=(123.675, 116.28, 103.53),
std=(58.395, 57.12, 57.375),
bgr_to_rgb=True),
backbone=dict(
type='ResNet',
depth=50,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN')),
neck=dict(
type='DenseCLNeck',
in_channels=2048,
hid_channels=2048,
out_channels=128,
num_grid=None),
head=dict(
type='ContrastiveHead',
loss=dict(type='CrossEntropyLoss'),
temperature=0.2),
)
find_unused_parameters = True
# runtime settings
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=256)

View File

@ -0,0 +1,38 @@
Collections:
- Name: DenseCL
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- SGD with Momentum
- Weight Decay
Training Resources: 8x V100 GPUs
Architecture:
- ResNet
Paper:
URL: https://arxiv.org/abs/2011.09157
Title: "Dense contrastive learning for self-supervised visual pre-training"
README: configs/densecl/README.md
Models:
- Name: densecl_resnet50_8xb32-coslr-200e_in1k
In Collection: DenseCL
Metadata:
Epochs: 200
Batch Size: 256
Results: null
Config: configs/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/densecl/densecl_resnet50_8xb32-coslr-200e_in1k/densecl_resnet50_8xb32-coslr-200e_in1k_20220825-3078723b.pth
Downstream:
- resnet50_densecl-pre_8xb32-linear-steplr-100e_in1k
- Name: resnet50_densecl-pre_8xb32-linear-steplr-100e_in1k
In Collection: DenseCL
Metadata:
Epochs: 100
Batch Size: 256
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 63.5
Config: configs/densecl/benchmarks/resnet50_8xb32-linear-steplr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/densecl/densecl_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-f0f0a579.pth

View File

@ -10,7 +10,7 @@ Collections:
README: configs/densenet/README.md
Models:
- Name: densenet121_4xb256_in1k
- Name: densenet121_3rdparty_in1k
Metadata:
FLOPs: 2881695488
Parameters: 7978856
@ -26,7 +26,7 @@ Models:
Converted From:
Weights: https://download.pytorch.org/models/densenet121-a639ec97.pth
Code: https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
- Name: densenet169_4xb256_in1k
- Name: densenet169_3rdparty_in1k
Metadata:
FLOPs: 3416860160
Parameters: 14149480
@ -42,7 +42,7 @@ Models:
Converted From:
Weights: https://download.pytorch.org/models/densenet169-b2777c0a.pth
Code: https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
- Name: densenet201_4xb256_in1k
- Name: densenet201_3rdparty_in1k
Metadata:
FLOPs: 4365236736
Parameters: 20013928
@ -58,7 +58,7 @@ Models:
Converted From:
Weights: https://download.pytorch.org/models/densenet201-c1103571.pth
Code: https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
- Name: densenet161_4xb256_in1k
- Name: densenet161_3rdparty_in1k
Metadata:
FLOPs: 7816363968
Parameters: 28681000

View File

@ -0,0 +1,120 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
avg_token=True,
output_cls_token=False,
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.02)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=4e-4,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit', # layer-wise lr decay type
layer_decay_rate=0.65), # layer-wise lr decay factor
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
by_epoch=True,
begin=5,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=100)
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
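A short sketch of the arithmetic behind `layer_decay_rate=0.65` above (not the constructor's actual code): each layer's lr is the base lr scaled by the decay rate raised to that layer's distance from the head.

```python
base_lr, decay, num_layers = 4e-4, 0.65, 12  # ViT-base has 12 blocks

for layer_id in range(num_layers + 1):  # 0 = patch embedding
    scale = decay ** (num_layers - layer_id)  # deeper layers keep more lr
    print(f'layer {layer_id:2d}: lr = {base_lr * scale:.2e}')
```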

View File

@ -0,0 +1,70 @@
_base_ = [
'../../_base_/datasets/imagenet_bs32_pillow.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
train_dataloader = dict(batch_size=2048, drop_last=True)
val_dataloader = dict(drop_last=False)
test_dataloader = dict(drop_last=False)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
frozen_stages=12,
avg_token=False,
final_norm=True,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=dict(type='mmselfsup.ClsBatchNormNeck', input_features=768),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]),
data_preprocessor=dict(
num_classes=1000,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True,
))
# optimizer
optim_wrapper = dict(
_delete_=True,
type='AmpOptimWrapper',
optimizer=dict(type='LARS', lr=3.2, weight_decay=0.0, momentum=0.9),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=90,
by_epoch=True,
begin=10,
end=100,
eta_min=0.0,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=100)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3),
logger=dict(type='LoggerHook', interval=10))
randomness = dict(seed=0, diff_rank_seed=True)

View File

@ -0,0 +1,85 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=256)
# model settings
model = dict(
type='EVA',
backbone=dict(init_cfg=[
dict(type='Xavier', distribution='uniform', layer='Linear'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
neck=dict(
type='MAEPretrainDecoder',
predict_feature_dim=512,
init_cfg=[
dict(type='Xavier', distribution='uniform', layer='Linear'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
head=dict(
type='MILANPretrainHead',
loss=dict(
type='CosineSimilarityLoss', shift_factor=2.0, scale_factor=2.0),
),
target_generator=dict(
type='CLIPGenerator',
tokenizer_path= # noqa
'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/clip_vit_base_16.pth.tar' # noqa
),
init_cfg=None)
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
find_unused_parameters = True
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=360,
by_epoch=True,
begin=40,
end=400,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=400)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
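The lr expression `1.5e-4 * 4096 / 256` is the linear scaling rule written out: a base lr tuned for a reference batch of 256, scaled to this run's effective batch (16 GPUs x 256 samples, per the config name). A sketch:

```python
base_lr = 1.5e-4
reference_batch = 256    # matches auto_scale_lr's base_batch_size
total_batch = 16 * 256   # 16 GPUs x 256 samples each

print(base_lr * total_batch / reference_batch)  # 0.0024
```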

View File

@ -212,3 +212,39 @@ Models:
Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_l_psz14_196px_21k_to_1k_ft_88p6.pt
Code: https://github.com/baaivision/EVA
Config: configs/eva/eva-l-p14_8xb16_in1k-196px.py
- Name: eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k
In Collection: EVA
Metadata:
Epochs: 400
Batch Size: 4096
Results: null
Config: configs/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k_20221226-26d90f07.pth
Downstream:
- vit-base-p16_eva-mae-style-pre_8xb128-coslr-100e_in1k
- vit-base-p16_eva-mae-style-pre_8xb2048-linear-coslr-100e_in1k
- Name: vit-base-p16_eva-mae-style-pre_8xb128-coslr-100e_in1k
In Collection: EVA
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.7
Config: configs/eva/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20221226-f61cf992.pth
- Name: vit-base-p16_eva-mae-style-pre_8xb2048-linear-coslr-100e_in1k
In Collection: EVA
Metadata:
Epochs: 100
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 69.0
Config: configs/eva/benchmarks/vit-base-p16_8xb2048-linear-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/eva/eva-mae-style_vit-base-p16_16xb256-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k_20221226-ef51bf09.pth

View File

@ -0,0 +1,168 @@
# MAE
> [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377)
<!-- [ALGORITHM] -->
## Abstract
This paper shows that masked autoencoders (MAE) are scalable self-supervised learners for computer vision. Our MAE approach is simple: we mask random patches of the input image and reconstruct the missing pixels. It is based on two core designs. First, we develop an asymmetric encoder-decoder architecture, with an encoder that operates only on the visible subset of patches (without mask tokens), along with a lightweight decoder that reconstructs the original image from the latent representation and mask tokens. Second, we find that masking a high proportion of the input image, e.g., 75%, yields a nontrivial and meaningful self-supervisory task. Coupling these two designs enables us to train large models efficiently and effectively: we accelerate training (by 3× or more) and improve accuracy. Our scalable approach allows for learning high-capacity models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. Transfer performance in downstream tasks outperforms supervised pre-training and shows promising scaling behavior.
<div align="center">
<img src="https://user-images.githubusercontent.com/30762564/150733959-2959852a-c7bd-4d3f-911f-3e8d8839fe67.png" width="40%"/>
</div>
## Models and Benchmarks
<table class="docutils">
<thead>
<tr>
<th rowspan="2">Algorithm</th>
<th rowspan="2">Backbone</th>
<th rowspan="2">Epoch</th>
<th rowspan="2">Batch Size</th>
<th colspan="2" align="center">Results (Top-1 %)</th>
<th colspan="3" align="center">Links</th>
</tr>
<tr>
<th>Linear Eval</th>
<th>Fine-tuning</th>
<th>Pretrain</th>
<th>Linear Eval</th>
<th>Fine-tuning</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="9">MAE</td>
<td>ViT-base</td>
<td>300</td>
<td>4096</td>
<td>60.8</td>
<td>83.1</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-base-p16_8xb512-amp-coslr-300e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-300e_in1k/mae_vit-base-p16_8xb512-coslr-300e-fp16_in1k_20220829-c2cf66ba.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-300e_in1k/mae_vit-base-p16_8xb512-coslr-300e-fp16_in1k_20220718_152424.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_linear-8xb2048-coslr-90e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-300e_in1k/vit-base-p16_linear-8xb2048-coslr-90e_in1k/vit-base-p16_linear-8xb2048-coslr-90e_in1k_20220720_104514.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-300e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20220713_140138.json'>log</a></td>
</tr>
<tr>
<td>ViT-base</td>
<td>400</td>
<td>4096</td>
<td>62.5</td>
<td>83.3</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-base-p16_8xb512-amp-coslr-400e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-400e_in1k/mae_vit-base-p16_8xb512-coslr-400e-fp16_in1k_20220825-bc79e40b.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-400e_in1k/mae_vit-base-p16_8xb512-coslr-400e-fp16_in1k_20220628_200815.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_linear-8xb2048-coslr-90e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-90e_in1k/vit-base-p16_linear-8xb2048-coslr-90e_in1k_20220713_142534.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20220708_183134.json'>log</a></td>
</tr>
<tr>
<td>ViT-base</td>
<td>800</td>
<td>4096</td>
<td>65.1</td>
<td>83.3</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-base-p16_8xb512-amp-coslr-800e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-800e_in1k/mae_vit-base-p16_8xb512-coslr-800e-fp16_in1k_20220825-5d81fbc4.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-800e_in1k/mae_vit-base-p16_8xb512-coslr-800e-fp16_in1k_20220718_134405.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_linear-8xb2048-coslr-90e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-800e_in1k/vit-base-p16_linear-8xb2048-coslr-90e_in1k/vit-base-p16_linear-8xb2048-coslr-90e_in1k20220721_203941.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-800e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20220724_232940.json'>log</a></td>
</tr>
<tr>
<td>ViT-base</td>
<td>1600</td>
<td>4096</td>
<td>67.1</td>
<td>83.5</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-base-p16_8xb512-amp-coslr-1600e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220815_103458.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_linear-8xb2048-coslr-90e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k/vit-base-p16_linear-8xb2048-coslr-90e_in1k/vit-base-p16_linear-8xb2048-coslr-90e_in1k_20220724_232557.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20220825-cf70aa21.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20220721_202304.json'>log</a></td>
</tr>
<tr>
<td>ViT-large</td>
<td>400</td>
<td>4096</td>
<td>70.7</td>
<td>85.2</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-large-p16_8xb512-amp-coslr-400e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-400e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-400e_in1k_20220825-b11d0425.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-400e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-400e_in1k_20220726_202204.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-large-p16_linear-8xb2048-coslr-90e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-400e_in1k/vit-large-p16_linear-8xb2048-coslr-90e_in1k/vit-large-p16_linear-8xb2048-coslr-90e_in1k_20220803_101331.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-large-p16_ft-8xb128-coslr-50e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-400e_in1k/vit-large-p16_ft-8xb128-coslr-50e_in1k/vit-large-p16_ft-8xb128-coslr-50e_in1k_20220729_122511.json'>log</a></td>
</tr>
<tr>
<td>ViT-large</td>
<td>800</td>
<td>4096</td>
<td>73.7</td>
<td>85.4</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-large-p16_8xb512-amp-coslr-800e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-800e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-800e_in1k_20220825-df72726a.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-800e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-800e_in1k_20220804_104018.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-large-p16_linear-8xb2048-coslr-90e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-800e_in1k/vit-large-p16_linear-8xb2048-coslr-90e_in1k/vit-large-p16_linear-8xb2048-coslr-90e_in1k_20220808_092730.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-large-p16_ft-8xb128-coslr-50e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-800e_in1k/vit-large-p16_ft-8xb128-coslr-50e_in1k/vit-large-p16_ft-8xb128-coslr-50e_in1k_20220730_235819.json'>log</a></td>
</tr>
<tr>
<td>ViT-large</td>
<td>1600</td>
<td>4096</td>
<td>75.5</td>
<td>85.7</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-large-p16_8xb512-amp-coslr-1600e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-1600e_in1k_20220825-cc7e98c9.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-1600e_in1k_20220806_210725.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-large-p16_linear-8xb2048-coslr-90e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-1600e_in1k/vit-large-p16_linear-8xb2048-coslr-90e_in1k/vit-large-p16_linear-8xb2048-coslr-90e_in1k_20220813_155615.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-large-p16_ft-8xb128-coslr-50e_in1k.py'>config</a> | model | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-1600e_in1k/vit-large-p16_ft-8xb128-coslr-50e_in1k/vit-large-p16_ft-8xb128-coslr-50e_in1k_20220813_125305.json'>log</a></td>
</tr>
<tr>
<td>ViT-huge-FT-224</td>
<td>1600</td>
<td>4096</td>
<td>/</td>
<td>86.9</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-huge-p16_8xb512-amp-coslr-1600e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k_20220916-ff848775.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k_20220814_135241.json'>log</a></td>
<td>/</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-huge-p16_ft-8xb128-coslr-50e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/vit-huge-p16_ft-8xb128-coslr-50e_in1k/vit-huge-p16_ft-8xb128-coslr-50e_in1k_20220916-0bfc9bfd.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/vit-huge-p16_ft-8xb128-coslr-50e_in1k/vit-huge-p16_ft-8xb128-coslr-50e_in1k_20220829_114027.json'>log</a></td>
</tr>
<tr>
<td>ViT-huge-FT-448</td>
<td>1600</td>
<td>4096</td>
<td>/</td>
<td>87.3</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/mae/mae_vit-huge-p16_8xb512-amp-coslr-1600e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k_20220916-ff848775.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k_20220814_135241.json'>log</a></td>
<td>/</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-huge-p16_ft-32xb8-coslr-50e_in1k-448.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/vit-huge-p16_ft-32xb8-coslr-50e_in1k-448/vit-huge-p16_ft-32xb8-coslr-50e_in1k-448_20220916-95b6a0ce.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/vit-huge-p16_ft-32xb8-coslr-50e_in1k-448/vit-huge-p16_ft-32xb8-coslr-50e_in1k-448_20220913_113737.json'>log</a></td>
</tr>
</tbody>
</table>
## Evaluating MAE on Detection and Segmentation
If you want to evaluate your model on the detection or segmentation task, we provide a [script](https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/tools/model_converters/mmcls2timm.py) to convert the model keys from MMClassification style to timm style.
```sh
cd $MMSELFSUP
python tools/model_converters/mmcls2timm.py $src_ckpt $dst_ckpt
```
Then, using the converted checkpoint, you can evaluate your model on the detection task following [Detectron2](https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet), and on the semantic segmentation task following this [project](https://github.com/implus/mae_segmentation). Alternatively, using the unconverted checkpoint, you can evaluate your model with [MMSegmentation](https://github.com/open-mmlab/mmsegmentation/tree/master/configs/mae).
## Citation
```bibtex
@article{He2021MaskedAA,
title={Masked Autoencoders Are Scalable Vision Learners},
author={Kaiming He and Xinlei Chen and Saining Xie and Yanghao Li and
Piotr Doll{\'a}r and Ross B. Girshick},
journal={arXiv},
year={2021}
}
```

View File

@ -0,0 +1,120 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
avg_token=True,
output_cls_token=False,
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=2e-5)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=2e-3,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit', # layer-wise lr decay type
layer_decay_rate=0.65), # layer-wise lr decay factor
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
by_epoch=True,
begin=5,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
train_cfg = dict(by_epoch=True, max_epochs=100)
randomness = dict(seed=0, diff_rank_seed=True)

View File

@ -0,0 +1,64 @@
_base_ = [
'../../_base_/datasets/imagenet_bs32_pillow.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=2048, drop_last=True)
val_dataloader = dict(drop_last=False)
test_dataloader = dict(drop_last=False)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
frozen_stages=12,
avg_token=False,
final_norm=True,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=dict(type='ClsBatchNormNeck', input_features=768),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]))
# optimizer
optim_wrapper = dict(
_delete_=True,
type='AmpOptimWrapper',
optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=80,
by_epoch=True,
begin=10,
end=90,
eta_min=0.0,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=90)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3),
logger=dict(type='LoggerHook', interval=10))
randomness = dict(seed=0, diff_rank_seed=True)
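`_delete_=True` above makes MMEngine discard the AdamW `optim_wrapper` inherited from the swin schedule instead of merging into it, so only the LARS keys remain. A sketch of verifying this (config path taken from the MAE metafile later in this diff):

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/mae/benchmarks/vit-base-p16_8xb2048-linear-coslr-90e_in1k.py')
print(cfg.optim_wrapper.type)            # AmpOptimWrapper
print(cfg.optim_wrapper.optimizer.type)  # LARS
```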

View File

@ -0,0 +1,122 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=448,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=512,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=448),
dict(type='PackClsInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='huge',
img_size=448,
patch_size=14,
drop_path_rate=0.3, # set to 0.3
avg_token=True,
output_cls_token=False,
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1280,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=2e-5)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
# learning rate and layer decay rate are set to 0.004 and 0.75 respectively
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=4e-3,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit', # layer-wise lr decay type
layer_decay_rate=0.75), # layer-wise lr decay factor
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=45,
by_epoch=True,
begin=5,
end=50,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=50)
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)

View File

@ -0,0 +1,121 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='huge',
img_size=224,
patch_size=14,
drop_path_rate=0.3, # set to 0.3
avg_token=True,
output_cls_token=False,
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1280,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=2e-5)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
# learning rate and layer decay rate are set to 0.004 and 0.75 respectively
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=4e-3,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit', # layer-wise lr decay type
layer_decay_rate=0.75), # layer-wise lr decay factor
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=45,
by_epoch=True,
begin=5,
end=50,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=50)
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)

View File

@ -0,0 +1,121 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='large',
img_size=224,
patch_size=16,
drop_path_rate=0.2, # set to 0.2
avg_token=True,
output_cls_token=False,
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=2e-5)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
# learning rate and layer decay rate are set to 0.004 and 0.75 respectively
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=4e-3,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit', # layer-wise lr decay type
layer_decay_rate=0.75), # layer-wise lr decay factor
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=45,
by_epoch=True,
begin=5,
end=50,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=50)
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)

View File

@ -0,0 +1,64 @@
_base_ = [
'../../_base_/datasets/imagenet_bs32_pillow.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=2048, drop_last=True)
val_dataloader = dict(drop_last=False)
test_dataloader = dict(drop_last=False)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='large',
img_size=224,
patch_size=16,
frozen_stages=24,
avg_token=False,
final_norm=True,
init_cfg=dict(type='Pretrained', checkpoint='')),
neck=dict(type='ClsBatchNormNeck', input_features=1024),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(type='CrossEntropyLoss'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]))
# optimizer
optim_wrapper = dict(
_delete_=True,
type='AmpOptimWrapper',
optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=80,
by_epoch=True,
begin=10,
end=90,
eta_min=0.0,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=90)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3),
logger=dict(type='LoggerHook', interval=10))
randomness = dict(seed=0, diff_rank_seed=True)

View File

@ -0,0 +1,56 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=0.0001,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=1560,
by_epoch=True,
begin=40,
end=1600,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=1600)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
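The scheduler above amounts to 40 warmup epochs followed by cosine decay over the remaining 1560 (T_max = 1600 - 40). A standalone sketch of the resulting lr factor per epoch (eta_min assumed 0, as set explicitly in the linear-probe configs):

```python
import math

def lr_factor(epoch, warmup=40, total=1600, start=1e-4):
    if epoch < warmup:  # linear warmup from start_factor to 1.0
        return start + (1 - start) * epoch / warmup
    progress = (epoch - warmup) / (total - warmup)
    return 0.5 * (1 + math.cos(math.pi * progress))  # cosine decay to 0

for epoch in (0, 20, 40, 800, 1600):
    print(epoch, round(lr_factor(epoch), 4))
```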

View File

@ -0,0 +1,56 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=0.0001,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)

View File

@ -0,0 +1,56 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=360,
by_epoch=True,
begin=40,
end=400,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=400)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)

View File

@ -0,0 +1,56 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
        start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=760,
by_epoch=True,
begin=40,
end=800,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=800)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)

View File

@ -0,0 +1,55 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=360,
by_epoch=True,
begin=40,
end=400,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=400)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)

View File

@ -0,0 +1,66 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
backbone=dict(type='MAEViT', arch='h', patch_size=14),
neck=dict(
type='MAEPretrainDecoder',
embed_dim=1280,
patch_size=14,
num_patches=256),
head=dict(patch_size=14))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=1560,
by_epoch=True,
begin=40,
end=1600,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=1600)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
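Why `num_patches=256` for the 14x14 patches above (a sketch, assuming the default 224x224 pretraining input):

```python
img_size, patch_size = 224, 14
print((img_size // patch_size) ** 2)  # 256 patches
```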

View File

@ -0,0 +1,61 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
backbone=dict(type='MAEViT', arch='l'),
neck=dict(type='MAEPretrainDecoder', embed_dim=1024))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=0.0001,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=1560,
by_epoch=True,
begin=40,
end=1600,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=1600)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)

View File

@ -0,0 +1,61 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
backbone=dict(type='MAEViT', arch='l'),
neck=dict(type='MAEPretrainDecoder', embed_dim=1024))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=0.0001,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)

View File

@ -0,0 +1,61 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
backbone=dict(type='MAEViT', arch='l'),
neck=dict(type='MAEPretrainDecoder', embed_dim=1024))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=360,
by_epoch=True,
begin=40,
end=400,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=400)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)

View File

@ -0,0 +1,61 @@
_base_ = [
'../_base_/models/mae_vit-base-p16.py',
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
backbone=dict(type='MAEViT', arch='l'),
neck=dict(type='MAEPretrainDecoder', embed_dim=1024))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
        start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=760,
by_epoch=True,
begin=40,
end=800,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=800)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)

View File

@ -0,0 +1,289 @@
Collections:
- Name: MAE
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- AdamW
Training Resources: 8x A100-80G GPUs
Architecture:
- ViT
Paper:
URL: https://arxiv.org/abs/2111.06377
Title: "Masked Autoencoders Are Scalable Vision Learners"
README: configs/mae/README.md
Models:
- Name: mae_vit-base-p16_8xb512-amp-coslr-300e_in1k
In Collection: MAE
Metadata:
Epochs: 300
Batch Size: 4096
Results: null
Config: configs/mae/mae_vit-base-p16_8xb512-amp-coslr-300e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-300e_in1k/mae_vit-base-p16_8xb512-coslr-300e-fp16_in1k_20220829-c2cf66ba.pth
Downstream:
- vit-base-p16_mae-300e-pre_8xb2048-linear-coslr-90e_in1k
- vit-base-p16_mae-300e-pre_8xb128-coslr-100e_in1k
- Name: vit-base-p16_mae-300e-pre_8xb2048-linear-coslr-90e_in1k
In Collection: MAE
Metadata:
Epochs: 90
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 60.8
Config: configs/mae/benchmarks/vit-base-p16_8xb2048-linear-coslr-90e_in1k.py
- Name: vit-base-p16_mae-300e-pre_8xb128-coslr-100e_in1k
In Collection: MAE
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.1
Config: configs/mae/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
  - Name: mae_vit-base-p16_8xb512-amp-coslr-400e_in1k
In Collection: MAE
Metadata:
Epochs: 400
Batch Size: 4096
Results: null
Config: configs/mae/mae_vit-base-p16_8xb512-amp-coslr-400e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-400e_in1k/mae_vit-base-p16_8xb512-coslr-400e-fp16_in1k_20220825-bc79e40b.pth
Downstream:
- vit-base-p16_mae-400e-pre_8xb2048-linear-coslr-90e_in1k
- vit-base-p16_mae-400e-pre_8xb128-coslr-100e_in1k
- Name: vit-base-p16_mae-400e-pre_8xb2048-linear-coslr-90e_in1k
In Collection: MAE
Metadata:
Epochs: 90
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 62.5
Config: configs/mae/benchmarks/vit-base-p16_8xb2048-linear-coslr-90e_in1k.py
- Name: vit-base-p16_mae-400e-pre_8xb128-coslr-100e_in1k
In Collection: MAE
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.3
Config: configs/mae/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
- Name: mae_vit-base-p16_8xb512-amp-coslr-800e_in1k
In Collection: MAE
Metadata:
Epochs: 800
Batch Size: 4096
Results: null
Config: configs/mae/mae_vit-base-p16_8xb512-amp-coslr-800e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-800e_in1k/mae_vit-base-p16_8xb512-coslr-800e-fp16_in1k_20220825-5d81fbc4.pth
Downstream:
- vit-base-p16_mae-800e-pre_8xb2048-linear-coslr-90e_in1k
- vit-base-p16_mae-800e-pre_8xb128-coslr-100e_in1k
- Name: vit-base-p16_mae-800e-pre_8xb2048-linear-coslr-90e_in1k
In Collection: MAE
Metadata:
Epochs: 90
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 65.1
Config: configs/mae/benchmarks/vit-base-p16_8xb2048-linear-coslr-90e_in1k.py
- Name: vit-base-p16_mae-800e-pre_8xb128-coslr-100e_in1k
In Collection: MAE
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.3
Config: configs/mae/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
- Name: mae_vit-base-p16_8xb512-amp-coslr-1600e_in1k
In Collection: MAE
Metadata:
Epochs: 1600
Batch Size: 4096
Results: null
Config: configs/mae/mae_vit-base-p16_8xb512-amp-coslr-1600e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth
Downstream:
- vit-base-p16_mae-1600e-pre_8xb2048-linear-coslr-90e_in1k
- vit-base-p16_mae-1600e-pre_8xb128-coslr-100e_in1k
- Name: vit-base-p16_mae-1600e-pre_8xb2048-linear-coslr-90e_in1k
In Collection: MAE
Metadata:
Epochs: 90
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 67.1
Config: configs/mae/benchmarks/vit-base-p16_8xb2048-linear-coslr-90e_in1k.py
- Name: vit-base-p16_mae-1600e-pre_8xb128-coslr-100e_in1k
In Collection: MAE
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.5
Config: configs/mae/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k_20220825-cf70aa21.pth
- Name: mae_vit-large-p16_8xb512-amp-coslr-400e_in1k
In Collection: MAE
Metadata:
Epochs: 400
Batch Size: 4096
Results: null
Config: configs/mae/mae_vit-large-p16_8xb512-amp-coslr-400e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-400e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-400e_in1k_20220825-b11d0425.pth
Downstream:
- vit-large-p16_mae-400e-pre_8xb2048-linear-coslr-90e_in1k
- vit-large-p16_mae-400e-pre_8xb128-coslr-50e_in1k
- Name: vit-large-p16_mae-400e-pre_8xb2048-linear-coslr-90e_in1k
In Collection: MAE
Metadata:
Epochs: 90
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 70.7
Config: configs/mae/benchmarks/vit-large-p16_8xb2048-linear-coslr-90e_in1k.py
- Name: vit-large-p16_mae-400e-pre_8xb128-coslr-50e_in1k
In Collection: MAE
Metadata:
Epochs: 50
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.2
Config: configs/mae/benchmarks/vit-large-p16_8xb128-coslr-50e_in1k.py
- Name: mae_vit-large-p16_8xb512-amp-coslr-800e_in1k
In Collection: MAE
Metadata:
Epochs: 800
Batch Size: 4096
Results: null
Config: configs/mae/mae_vit-large-p16_8xb512-amp-coslr-800e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-800e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-800e_in1k_20220825-df72726a.pth
Downstream:
- vit-large-p16_mae-800e-pre_8xb2048-linear-coslr-90e_in1k
- vit-large-p16_mae-800e-pre_8xb128-coslr-50e_in1k
- Name: vit-large-p16_mae-800e-pre_8xb2048-linear-coslr-90e_in1k
In Collection: MAE
Metadata:
Epochs: 90
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 73.7
Config: configs/mae/benchmarks/vit-large-p16_8xb2048-linear-coslr-90e_in1k.py
- Name: vit-large-p16_mae-800e-pre_8xb128-coslr-50e_in1k
In Collection: MAE
Metadata:
Epochs: 50
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.4
Config: configs/mae/benchmarks/vit-large-p16_8xb128-coslr-50e_in1k.py
- Name: mae_vit-large-p16_8xb512-amp-coslr-1600e_in1k
In Collection: MAE
Metadata:
Epochs: 1600
Batch Size: 4096
Results: null
Config: configs/mae/mae_vit-large-p16_8xb512-amp-coslr-1600e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-large-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-large-p16_8xb512-fp16-coslr-1600e_in1k_20220825-cc7e98c9.pth
Downstream:
- vit-large-p16_mae-1600e-pre_8xb2048-linear-coslr-90e_in1k
- vit-large-p16_mae-1600e-pre_8xb128-coslr-50e_in1k
- Name: vit-large-p16_mae-1600e-pre_8xb2048-linear-coslr-90e_in1k
In Collection: MAE
Metadata:
Epochs: 90
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 75.5
Config: configs/mae/benchmarks/vit-large-p16_8xb2048-linear-coslr-90e_in1k.py
- Name: vit-large-p16_mae-1600e-pre_8xb128-coslr-50e_in1k
In Collection: MAE
Metadata:
Epochs: 50
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.7
Config: configs/mae/benchmarks/vit-large-p16_8xb128-coslr-50e_in1k.py
- Name: mae_vit-huge-p14_8xb512-amp-coslr-1600e_in1k
In Collection: MAE
Metadata:
Epochs: 1600
Batch Size: 4096
Results: null
Config: configs/mae/mae_vit-huge-p14_8xb512-amp-coslr-1600e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k_20220916-ff848775.pth
Downstream:
- vit-huge-p14_mae-1600e-pre_8xb128-coslr-50e_in1k
- vit-huge-p14_mae-1600e-pre_32xb8-coslr-50e_in1k-448px
- Name: vit-huge-p14_mae-1600e-pre_8xb128-coslr-50e_in1k
In Collection: MAE
Metadata:
Epochs: 50
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.9
Config: configs/mae/benchmarks/vit-huge-p14_8xb128-coslr-50e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/vit-huge-p16_ft-8xb128-coslr-50e_in1k/vit-huge-p16_ft-8xb128-coslr-50e_in1k_20220916-0bfc9bfd.pth
- Name: vit-huge-p14_mae-1600e-pre_32xb8-coslr-50e_in1k-448px
In Collection: MAE
Metadata:
Epochs: 50
Batch Size: 256
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 87.3
Config: configs/mae/benchmarks/vit-huge-p14_32xb8-coslr-50e_in1k-448px.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mae/mae_vit-huge-p16_8xb512-fp16-coslr-1600e_in1k/vit-huge-p16_ft-32xb8-coslr-50e_in1k-448/vit-huge-p16_ft-32xb8-coslr-50e_in1k-448_20220916-95b6a0ce.pth
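Each pretraining entry above links its fine-tuning and linear-probe benchmarks through the `Downstream` field, which must name other models in the same metafile. A minimal sketch of checking that invariant with the `modelindex` package (the metafile path and the `model.data` access for extra fields are assumptions):

```python
# Sketch: verify every `Downstream` reference resolves to a model Name.
from modelindex import load

model_index = load('configs/mae/metafile.yml')  # example path
known_names = {model.name for model in model_index.models}

for model in model_index.models:
    # Extra metafile fields are assumed to be exposed via `model.data`.
    for ref in (model.data.get('Downstream') or []):
        assert ref in known_names, (
            f'{model.name}: unknown downstream model {ref!r}')
```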

@@ -0,0 +1,60 @@
# MaskFeat
> [Masked Feature Prediction for Self-Supervised Visual Pre-Training](https://arxiv.org/abs/2112.09133v1)
<!-- [ALGORITHM] -->
## Abstract
We present Masked Feature Prediction (MaskFeat) for self-supervised pre-training of video models. Our approach first randomly masks out a portion of the input sequence and then predicts the feature of the masked regions. We study five different types of features and find that Histograms of Oriented Gradients (HOG), a hand-crafted feature descriptor, works particularly well in terms of both performance and efficiency. We observe that the local contrast normalization in HOG is essential for good results, which is in line with earlier work using HOG for visual recognition. Our approach can learn abundant visual knowledge and drive large-scale Transformer-based models. Without using extra model weights or supervision, MaskFeat pre-trained on unlabeled videos achieves unprecedented results of 86.7% with MViT-L on Kinetics-400, 88.3% on Kinetics-600, 80.4% on Kinetics-700, 38.8 mAP on AVA, and 75.0% on SSv2. MaskFeat further generalizes to image input, which can be interpreted as a video with a single frame, and obtains competitive results on ImageNet.
<div align="center">
<img src="https://user-images.githubusercontent.com/48178838/190090285-428f07c0-0887-4ce8-b94f-f719cfd25622.png" width="60%"/>
</div>
## Models and Benchmarks
Here, we report the results of the model on ImageNet. The details are below:
<table class="docutils">
<thead>
<tr>
<th rowspan="2">Algorithm</th>
<th rowspan="2">Backbone</th>
<th rowspan="2">Epoch</th>
<th rowspan="2">Batch Size</th>
<th colspan="2" align="center">Results (Top-1 %)</th>
<th colspan="3" align="center">Links</th>
</tr>
<tr>
<th>Linear Eval</th>
<th>Fine-tuning</th>
<th>Pretrain</th>
<th>Linear Eval</th>
<th>Fine-tuning</th>
</tr>
</thead>
<tbody>
<tr>
<td>MaskFeat</td>
<td>ViT-base</td>
<td>300</td>
<td>2048</td>
<td>/</td>
<td>83.4</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/maskfeat/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k_20221101-6dfc8bf3.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k_20221019_194256.json'>log</a></td>
<td>/</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb256-coslr-100e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb256-coslr-100e_in1k/vit-base-p16_ft-8xb256-coslr-100e_in1k_20221028-5134431c.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb256-coslr-100e_in1k/vit-base-p16_ft-8xb256-coslr-100e_in1k_20221026_105344.json'>log</a></td>
</tr>
</tbody>
</table>
## Citation
```bibtex
@InProceedings{wei2022masked,
author = {Wei, Chen and Fan, Haoqi and Xie, Saining and Wu, Chao-Yuan and Yuille, Alan and Feichtenhofer, Christoph},
title = {Masked Feature Prediction for Self-Supervised Visual Pre-Training},
booktitle = {CVPR},
year = {2022},
}
```
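Since the HOG target is the crux of MaskFeat, here is a hedged illustration using scikit-image's `hog` (the repository implements its own `HOGGenerator`; the settings below only mirror the spirit of 9 orientation bins over 8×8-pixel cells):

```python
# Compute dense HOG descriptors over a 224x224 image, 9 bins per 8x8 cell.
import numpy as np
from skimage.feature import hog

image = np.random.rand(224, 224, 3)   # stand-in for a real input image
features = hog(
    image,
    orientations=9,
    pixels_per_cell=(8, 8),
    cells_per_block=(1, 1),
    channel_axis=-1,
)
print(features.shape)  # (28 * 28 * 9,) -> one 9-bin histogram per cell
```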

@@ -0,0 +1,120 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ResizeEdge', scale=256, edge='short', backend='pillow'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs'),
]
train_dataloader = dict(batch_size=256, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=256, dataset=dict(pipeline=test_pipeline))
# If you want a standard test, please configure the test dataset manually.
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
avg_token=True,
output_cls_token=False,
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='')),  # set the pre-trained checkpoint path here
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=2e-5, bias=2e-5)
]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=8e-3,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit', # layer-wise lr decay type
layer_decay_rate=0.65), # layer-wise lr decay factor
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=20,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=80,
by_epoch=True,
begin=20,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100)
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0)
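The `model_type` and `layer_decay_rate` keys in the optimizer above drive layer-wise learning-rate decay. As a rough illustration (one common formulation, not a verbatim copy of `LearningRateDecayOptimWrapperConstructor`), each parameter group's learning rate shrinks geometrically with its distance from the head:

```python
# Illustrative sketch of layer-wise lr decay for a depth-12 ViT-B.
# Layer ids: 0 = patch embedding, 1..12 = transformer blocks, 13 = head;
# the id scheme and exact formula are assumptions for illustration.
base_lr = 8e-3          # from the optimizer above
decay_rate = 0.65       # `layer_decay_rate`
max_layer_id = 13

def group_lr(layer_id: int) -> float:
    # Groups nearer the input get geometrically smaller learning rates.
    return base_lr * decay_rate ** (max_layer_id - layer_id)

for lid in (0, 6, 13):
    print(f'layer {lid:2d}: lr = {group_lr(lid):.2e}')
```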

@@ -0,0 +1,110 @@
_base_ = '../_base_/default_runtime.py'
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
type='SelfSupDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
size=224,
scale=(0.5, 1.0),
ratio=(0.75, 1.3333),
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='BEiTMaskGenerator',
input_size=14,
num_masking_patches=78,
min_num_patches=15,
),
dict(
type='PackSelfSupInputs',
algorithm_keys=['mask'],
meta_keys=['img_path'])
]
train_dataloader = dict(
batch_size=256,
num_workers=8,
persistent_workers=True,
pin_memory=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix=dict(img_path='train/'),
pipeline=train_pipeline))
# model settings
model = dict(
type='MaskFeat',
data_preprocessor=dict(
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True),
backbone=dict(type='MaskFeatViT', arch='b', patch_size=16),
neck=dict(
type='LinearNeck',
in_channels=768,
out_channels=108,  # 9 HOG bins x 3 channels x (16/8)^2 cells per patch
with_avg_pool=False,
init_cfg=dict(type='TruncNormal', layer='Linear', std=0.02, bias=0)),
head=dict(
type='MaskFeatPretrainHead',
loss=dict(type='PixelReconstructionLoss', criterion='L2')),
target_generator=dict(
type='HOGGenerator', nbins=9, pool=8, gaussian_window=16))
# optimizer wrapper
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW', lr=2e-4 * 8, betas=(0.9, 0.999), weight_decay=0.05),
clip_grad=dict(max_norm=0.02),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
# 'pos_embed' and 'cls_token' are commented out to avoid the loss getting stuck
custom_keys={
# 'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
# 'cls_token': dict(decay_mult=0.)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-6,
by_epoch=True,
begin=0,
end=30,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=270,
by_epoch=True,
begin=30,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
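The `auto_scale_lr` field records the total batch size the learning rate above was tuned for. A minimal sketch of the linear scaling rule it enables (my reading; in MMEngine-based repos it is opt-in, typically via a launcher flag such as `--auto-scale-lr`):

```python
# Linear scaling rule sketch: scale the configured lr by the ratio of the
# actual total batch size to `base_batch_size`. Values come from the
# config above; the helper itself is hypothetical.
base_batch_size = 2048
configured_lr = 2e-4 * 8   # tuned for 8 GPUs x 256 samples

def scaled_lr(num_gpus: int, samples_per_gpu: int) -> float:
    actual_batch_size = num_gpus * samples_per_gpu
    return configured_lr * actual_batch_size / base_batch_size

print(scaled_lr(8, 256))   # 0.0016 -- unchanged at the reference setup
print(scaled_lr(4, 256))   # 0.0008 -- halved on 4 GPUs
```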

@@ -0,0 +1,37 @@
Collections:
- Name: MaskFeat
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- AdamW
Training Resources: 8x A100-80G GPUs
Architecture:
- ViT
Paper:
URL: https://arxiv.org/abs/2112.09133v1
Title: "Masked Feature Prediction for Self-Supervised Visual Pre-Training"
README: configs/maskfeat/README.md
Models:
- Name: maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k
In Collection: MaskFeat
Metadata:
Epochs: 300
Batch Size: 2048
Results: null
Config: configs/maskfeat/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k_20221101-6dfc8bf3.pth
Downstream:
- vit-base-p16_maskfeat-pre_8xb256-coslr-100e_in1k
- Name: vit-base-p16_maskfeat-pre_8xb256-coslr-100e_in1k
In Collection: MaskFeat
Metadata:
Epochs: 100
Batch Size: 2048
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.4
Config: configs/maskfeat/benchmarks/vit-base-p16_8xb256-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_vit-base-p16_8xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb256-coslr-100e_in1k/vit-base-p16_ft-8xb256-coslr-100e_in1k_20221028-5134431c.pth

@@ -0,0 +1,82 @@
# MILAN
> [MILAN: Masked Image Pretraining on Language Assisted Representation](https://arxiv.org/pdf/2208.06049)
<!-- [ALGORITHM] -->
## Abstract
Self-attention based transformer models have been dominating many computer
vision tasks in the past few years. Their superb model qualities heavily depend
on the excessively large labeled image datasets. In order to reduce the reliance
on large labeled datasets, reconstruction based masked autoencoders are gaining
popularity, which learn high quality transferable representations from unlabeled
images. For the same purpose, recent weakly supervised image pretraining methods
explore language supervision from text captions accompanying the images. In this
work, we propose masked image pretraining on language assisted representation,
dubbed as MILAN. Instead of predicting raw pixels or low level features, our
pretraining objective is to reconstruct the image features with substantial semantic
signals that are obtained using caption supervision. Moreover, to accommodate our
reconstruction target, we propose a more efficient prompting decoder architecture
and a semantic aware mask sampling mechanism, which further advance the
transfer performance of the pretrained model. Experimental results demonstrate
that MILAN delivers higher accuracy than the previous works. When the masked
autoencoder is pretrained and finetuned on ImageNet-1K dataset with an input
resolution of 224×224, MILAN achieves a top-1 accuracy of 85.4% on ViT-B/16, surpassing the previous state of the art by 1%. In the downstream semantic
segmentation task, MILAN achieves 52.7 mIoU using ViT-B/16 backbone on
ADE20K dataset, outperforming previous masked pretraining results by 4 points.
<div align="center">
<img src="https://user-images.githubusercontent.com/30762564/205210369-41a65c4c-bcd4-4147-91ea-c6c9061ab455.png" width="80%"/>
</div>
## Models and Benchmarks
Here, we report the results of the model, which is pre-trained on ImageNet-1k
for 400 epochs. The details are below:
<table class="docutils">
<thead>
<tr>
<th rowspan="2">Algorithm</th>
<th rowspan="2">Backbone</th>
<th rowspan="2">Epoch</th>
<th rowspan="2">Batch Size</th>
<th colspan="2" align="center">Results (Top-1 %)</th>
<th colspan="3" align="center">Links</th>
</tr>
<tr>
<th>Linear Eval</th>
<th>Fine-tuning</th>
<th>Pretrain</th>
<th>Linear Eval</th>
<th>Fine-tuning</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="1">MILAN</td>
<td>ViT-B/16</td>
<td>400</td>
<td>4096</td>
<td>78.9</td>
<td>85.3</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k_20221129-180922e8.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k_20221123_112721.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/milan/classification/vit-base-p16_linear-8xb2048-coslr-100e_in1k.py'>config</a> |<a href='https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k_20221129-03f26f85.pth'> model </a>| <a href='https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan_20221125_031826.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/configs/selfsup/milan/classification/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan.py'>config</a> |<a href='https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan_20221129-74ac94fa.pth'> model </a>| <a href='https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan_20221125_031826.json'>log</a></td>
</tr>
</tbody>
</table>
## Citation
```bibtex
@article{Hou2022MILANMI,
title={MILAN: Masked Image Pretraining on Language Assisted Representation},
author={Zejiang Hou and Fei Sun and Yen-Kuang Chen and Yuan Xie and S. Y. Kung},
journal={ArXiv},
year={2022}
}
```

@@ -0,0 +1,120 @@
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=0.3333333333333333,
fill_color=[103.53, 116.28, 123.675],
fill_std=[57.375, 57.12, 58.395]),
dict(type='PackClsInputs')
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs')
]
train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
avg_token=True,
output_cls_token=False,
final_norm=False,
init_cfg=dict(type='Pretrained', checkpoint='')),  # set the pre-trained checkpoint path here
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.02)]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer wrapper
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=4e-4,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
model_type='vit', # layer-wise lr decay type
layer_decay_rate=0.65), # layer-wise lr decay factor
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0),
'.bias': dict(decay_mult=0.0),
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
by_epoch=True,
begin=5,
end=100,
eta_min=1e-6,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=100)
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)

@@ -0,0 +1,70 @@
_base_ = [
'../../_base_/datasets/imagenet_bs32_pillow.py',
'../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../../_base_/default_runtime.py'
]
train_dataloader = dict(batch_size=2048, drop_last=True)
val_dataloader = dict(drop_last=False)
test_dataloader = dict(drop_last=False)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
frozen_stages=12,
avg_token=False,
final_norm=True,
init_cfg=dict(type='Pretrained', checkpoint='')),  # set the pre-trained checkpoint path here
neck=dict(type='mmselfsup.ClsBatchNormNeck', input_features=768),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss'),
init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]),
data_preprocessor=dict(
num_classes=1000,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True,
))
# optimizer
optim_wrapper = dict(
_delete_=True,
type='AmpOptimWrapper',
optimizer=dict(type='LARS', lr=3.2, weight_decay=0.0, momentum=0.9),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=90,
by_epoch=True,
begin=10,
end=100,
eta_min=0.0,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=100)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3),
logger=dict(type='LoggerHook', interval=10))
randomness = dict(seed=0, diff_rank_seed=True)
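This linear probe relies on `LARS` with an unusually large `lr=3.2`, which stays stable because LARS rescales each layer's update by a trust ratio. A simplified sketch of that ratio (the shipped optimizer has more moving parts, e.g. momentum and exclusion rules):

```python
# Simplified LARS trust ratio: the effective step for a layer is the
# global lr times eta * ||w|| / (||g|| + weight_decay * ||w||).
# `eta` is the trust coefficient; 0.001 is a typical default, assumed here.
import torch

def lars_trust_ratio(w: torch.Tensor, g: torch.Tensor,
                     weight_decay: float = 0.0, eta: float = 0.001) -> float:
    w_norm, g_norm = w.norm(), g.norm()
    if w_norm == 0 or g_norm == 0:
        return 1.0  # fall back to the plain update
    return (eta * w_norm / (g_norm + weight_decay * w_norm)).item()

w = torch.randn(768, 1000)
g = 1e-3 * torch.randn_like(w)
print(lars_trust_ratio(w, g))  # large ||w||/||g|| -> larger local step
```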

@@ -0,0 +1,51 @@
Collections:
- Name: MILAN
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- AdamW
Training Resources: 16x A100-80G GPUs
Architecture:
- ViT
Paper:
URL: https://arxiv.org/pdf/2208.06049
Title: "MILAN: Masked Image Pretraining on Language Assisted Representation"
README: configs/milan/README.md
Models:
- Name: milan_vit-base-p16_16xb256-amp-coslr-400e_in1k
In Collection: MILAN
Metadata:
Epochs: 400
Batch Size: 4096
Results: null
Config: configs/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k_20221129-180922e8.pth
Downstream:
- vit-base-p16_milan-pre_8xb128-coslr-100e_in1k
- vit-base-p16_milan-pre_8xb2048-linear-coslr-100e_in1k
- Name: vit-base-p16_milan-pre_8xb128-coslr-100e_in1k
In Collection: MILAN
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.3
Config: configs/milan/benchmarks/vit-base-p16_8xb128-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k/vit-base-p16_ft-8xb128-coslr-100e_in1k-milan_20221129-74ac94fa.pth
- Name: vit-base-p16_milan-pre_8xb2048-linear-coslr-100e_in1k
In Collection: MILAN
Metadata:
Epochs: 100
Batch Size: 16384
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.9
Config: configs/milan/benchmarks/vit-base-p16_8xb2048-linear-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/milan/milan_vit-base-p16_16xb256-amp-coslr-400e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k/vit-base-p16_linear-8xb2048-coslr-100e_in1k_20221129-03f26f85.pth

@@ -0,0 +1,88 @@
_base_ = [
'../_base_/datasets/imagenet_bs512_mae.py',
'../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=256)
# model settings
model = dict(
type='MILAN',
backbone=dict(
type='MILANViT',
arch='b',
patch_size=16,
mask_ratio=0.75,
init_cfg=[
dict(type='Xavier', distribution='uniform', layer='Linear'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
neck=dict(
type='MILANPretrainDecoder',
init_cfg=[
dict(type='Xavier', distribution='uniform', layer='Linear'),
dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
]),
head=dict(
type='MILANPretrainHead',
loss=dict(
type='CosineSimilarityLoss', shift_factor=2.0, scale_factor=2.0),
),
target_generator=dict(
type='CLIPGenerator',
tokenizer_path= # noqa
'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/clip_vit_base_16.pth.tar' # noqa
),
init_cfg=None)
# optimizer wrapper
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=1.5e-4 * 4096 / 256,
betas=(0.9, 0.95),
weight_decay=0.05),
paramwise_cfg=dict(
custom_keys={
'ln': dict(decay_mult=0.0),
'bias': dict(decay_mult=0.0),
'pos_embed': dict(decay_mult=0.),
'mask_token': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
find_unused_parameters = True
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=360,
by_epoch=True,
begin=40,
end=400,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=400)
default_hooks = dict(
# only keeps the latest 3 checkpoints
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
randomness = dict(seed=0, diff_rank_seed=True)
# auto resume
resume = True
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
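The head's `CosineSimilarityLoss` with `shift_factor=2.0` and `scale_factor=2.0` admits a natural reading: if the loss is computed as `shift - scale * cos(pred, target)`, it coincides with the squared euclidean distance between l2-normalized features, since ||a - b||^2 = 2 - 2<a, b> for unit vectors. A small sketch checking that identity (the loss formula itself is my assumption, not lifted from the implementation):

```python
# Verify: 2 - 2 * cos(a, b) == ||a - b||^2 for l2-normalized vectors.
import torch
import torch.nn.functional as F

pred = F.normalize(torch.randn(4, 512), dim=-1)
target = F.normalize(torch.randn(4, 512), dim=-1)

cosine_form = 2.0 - 2.0 * F.cosine_similarity(pred, target, dim=-1)
mse_form = (pred - target).pow(2).sum(dim=-1)
assert torch.allclose(cosine_form, mse_form, atol=1e-5)
```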

@@ -0,0 +1,130 @@
_base_ = [
'../../_base_/models/mixmim/mixmim_base.py',
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'ImageNet'
data_root = 'data/imagenet/'
data_preprocessor = dict(
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True,
)
bgr_mean = data_preprocessor['mean'][::-1]
bgr_std = data_preprocessor['std'][::-1]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=9,
magnitude_std=0.5,
hparams=dict(
pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
dict(
type='RandomErasing',
erase_prob=0.25,
mode='rand',
min_area_ratio=0.02,
max_area_ratio=1 / 3,
fill_color=bgr_mean,
fill_std=bgr_std),
dict(type='PackClsInputs'),
]
train_dataloader = dict(
batch_size=128,
num_workers=16,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/train.txt',
data_prefix='train',
pipeline=train_pipeline),
sampler=dict(type='DefaultSampler', shuffle=True),
persistent_workers=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackClsInputs'),
]
val_dataloader = dict(
batch_size=64,
num_workers=8,
pin_memory=True,
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='meta/val.txt',
data_prefix='val',
pipeline=test_pipeline),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
test_dataloader = val_dataloader
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW',
lr=5e-4 * (8 * 128 / 256),
model_type='mixmim',
layer_decay_rate=0.7,
betas=(0.9, 0.999),
weight_decay=0.05),
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
custom_keys={
'.ln': dict(decay_mult=0.0), # do not decay on ln and bias
'.bias': dict(decay_mult=0.0)
}))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-6,
by_epoch=True,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
eta_min=1e-6,
by_epoch=True,
begin=5,
end=100,
convert_to_iter_based=True)
]
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=10)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
# save checkpoint per epoch.
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1))
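The `param_scheduler` list above chains a 5-epoch linear warmup into a 95-epoch cosine decay. A rough sketch of the resulting curve, under my reading of the `begin`/`end` semantics (warmup over epochs [0, 5), cosine from epoch 5 to 100):

```python
# Approximate lr curve for the schedule above (illustrative only).
import math

base_lr = 5e-4 * (8 * 128 / 256)   # from the optimizer above
start_factor, eta_min = 1e-6, 1e-6
warmup_end, max_epochs = 5, 100

def lr_at(epoch: float) -> float:
    if epoch < warmup_end:          # LinearLR ramp
        start = start_factor * base_lr
        return start + (base_lr - start) * epoch / warmup_end
    t = (epoch - warmup_end) / (max_epochs - warmup_end)  # CosineAnnealingLR
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))

for e in (0, 5, 50, 100):
    print(f'epoch {e:3d}: lr ~ {lr_at(e):.2e}')
```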

@@ -0,0 +1,6 @@
_base_ = [
'../../_base_/models/mixmim/mixmim_base.py',
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/schedules/imagenet_bs256.py',
'../../_base_/default_runtime.py'
]

@@ -34,6 +34,29 @@ Models:
Top 1 Accuracy: 84.6
Top 5 Accuracy: 97.0
Weights: https://download.openmmlab.com/mmclassification/v0/mixmim/mixmim-base_3rdparty_in1k_20221206-e40e2c8c.pth
Config: configs/mixmim/mixmim-base_8xb64_in1k.py
Config: configs/mixmim/benchmarks/mixmim-base_8xb64_in1k.py
Converted From:
Code: https://github.com/Sense-X/MixMIM
- Name: mixmim_mixmim-base_16xb128-coslr-300e_in1k
In Collection: MixMIM
Metadata:
Epochs: 300
Batch Size: 2048
Results: null
Config: configs/mixmim/mixmim_mixmim-base_16xb128-coslr-300e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_16xb128-coslr-300e_in1k_20221208-44fe8d2c.pth
Downstream:
- mixmim-base_mixmim-pre_8xb128-coslr-100e_in1k
- Name: mixmim-base_mixmim-pre_8xb128-coslr-100e_in1k
In Collection: MixMIM
Metadata:
Epochs: 100
Batch Size: 1024
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 84.63
Config: configs/mixmim/benchmarks/mixmim-base_8xb128-coslr-100e_in1k.py
Weights: https://download.openmmlab.com/mmselfsup/1.x/mixmim/mixmim-base-p16_16xb128-coslr-300e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k/mixmim-base-p16_ft-8xb128-coslr-100e_in1k_20221208-41ecada9.pth

@@ -1,5 +0,0 @@
_base_ = [
'../_base_/models/mixmim/mixmim_base.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py'
]

Some files were not shown because too many files have changed in this diff.