# MoCo v3 / ResNet-50 pre-training config with a 100-epoch schedule.
# Dataset pipeline and default runtime are inherited from the base files.
_base_ = [
    '../_base_/datasets/imagenet_bs512_mocov3.py',
    '../_base_/default_runtime.py',
]

# model settings
# `temperature` is shared on purpose: it is passed to the head and also
# scales the loss weight (2 * temperature), so the two stay in sync.
temperature = 1.0
model = dict(
    type='MoCoV3',
    base_momentum=0.01,  # 0.01 for 100e and 300e, 0.004 for 1000e
    backbone=dict(type='ResNet', depth=50, norm_cfg=dict(type='SyncBN')),
    # projector: 2048 -> 4096 -> 256, final BN without affine parameters
    neck=dict(
        type='NonLinearNeck',
        in_channels=2048,
        hid_channels=4096,
        out_channels=256,
        num_layers=2,
        with_bias=False,
        with_last_bn=True,
        with_last_bn_affine=False,
        with_last_bias=False,
        with_avg_pool=True),
    head=dict(
        type='MoCoV3Head',
        # predictor: 256 -> 4096 -> 256, no final BN and no average pooling
        predictor=dict(
            type='NonLinearNeck',
            in_channels=256,
            hid_channels=4096,
            out_channels=256,
            num_layers=2,
            with_bias=False,
            with_last_bn=False,
            with_last_bn_affine=False,
            with_last_bias=False,
            with_avg_pool=False),
        loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
        temperature=temperature))

# optimizer
optim_wrapper = dict(
    type='AmpOptimWrapper',
    loss_scale='dynamic',
    optimizer=dict(type='LARS', lr=9.6, weight_decay=1e-6, momentum=0.9),
    # parameters whose names match these keys get no weight decay and are
    # flagged with `lars_exclude=True`
    paramwise_cfg=dict(
        custom_keys={
            'bn': dict(decay_mult=0, lars_exclude=True),
            'bias': dict(decay_mult=0, lars_exclude=True),
            # bn layer in ResNet block downsample module
            'downsample.1': dict(decay_mult=0, lars_exclude=True),
        }),
)

# learning rate scheduler
# Two consecutive phases expressed in epochs: linear warm-up over [0, 10),
# then cosine annealing over [10, 100). `T_max` equals the length of the
# cosine phase (100 - 10), and the cosine `end` equals `max_epochs` below;
# keep these in sync when changing the schedule length.
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=1e-4,
        by_epoch=True,
        begin=0,
        end=10,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        T_max=90,
        by_epoch=True,
        begin=10,
        end=100,
        convert_to_iter_based=True)
]

# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))

# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)