diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py
index 90240e49..ba965a45 100644
--- a/configs/_base_/default_runtime.py
+++ b/configs/_base_/default_runtime.py
@@ -8,8 +8,6 @@ log_config = dict(
         # dict(type='TensorboardLoggerHook')
     ])
 # yapf:enable
-# You can register your own hooks like this
-# custom_hooks=[dict(type='EMAHook')]
 
 dist_params = dict(backend='nccl')
 log_level = 'INFO'
diff --git a/configs/_base_/schedules/imagenet_bs4096_AdamW.py b/configs/_base_/schedules/imagenet_bs4096_AdamW.py
new file mode 100644
index 00000000..859cf4b2
--- /dev/null
+++ b/configs/_base_/schedules/imagenet_bs4096_AdamW.py
@@ -0,0 +1,18 @@
+# optimizer
+optimizer = dict(type='AdamW', lr=0.003, weight_decay=0.3)
+optimizer_config = dict(grad_clip=dict(max_norm=1.0))
+
+# specific to vit pretrain
+paramwise_cfg = dict(
+    custom_keys={
+        '.backbone.cls_token': dict(decay_mult=0.0),
+        '.backbone.pos_embed': dict(decay_mult=0.0)
+    })
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    min_lr=0,
+    warmup='linear',
+    warmup_iters=10000,
+    warmup_ratio=1e-4)
+runner = dict(type='EpochBasedRunner', max_epochs=300)
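For context (not part of the patch): a minimal sketch of how a `paramwise_cfg` like the one above is typically consumed. In mmcv, `DefaultOptimizerConstructor` matches each `custom_keys` entry against parameter names and scales that parameter's weight decay by `decay_mult`, so `cls_token` and `pos_embed` would train with zero weight decay while everything else keeps 0.3. The `nn.Conv2d` model below is a hypothetical stand-in; how the top-level `paramwise_cfg` variable gets wired into the optimizer is left to the configs that inherit this schedule.

```python
# Sketch only: how mmcv's optimizer builder applies custom_keys.
import torch.nn as nn
from mmcv.runner import build_optimizer

model = nn.Conv2d(3, 8, 3)  # hypothetical stand-in for a ViT classifier

optimizer_cfg = dict(
    type='AdamW',
    lr=0.003,
    weight_decay=0.3,
    # Parameters whose names contain these keys get
    # weight_decay = 0.3 * decay_mult = 0.
    paramwise_cfg=dict(
        custom_keys={
            '.backbone.cls_token': dict(decay_mult=0.0),
            '.backbone.pos_embed': dict(decay_mult=0.0)
        }))

optimizer = build_optimizer(model, optimizer_cfg)
print(optimizer)  # AdamW with per-parameter-group settings
```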
diff --git a/configs/vision_transformer/vit_base_patch16_224_pretrain_imagenet.py b/configs/vision_transformer/vit_base_patch16_224_pretrain_imagenet.py
index 8513938c..55f02496 100644
--- a/configs/vision_transformer/vit_base_patch16_224_pretrain_imagenet.py
+++ b/configs/vision_transformer/vit_base_patch16_224_pretrain_imagenet.py
@@ -1,7 +1,7 @@
 _base_ = [
     '../_base_/models/vit_base_patch16_224_pretrain.py',
-    '../_base_/datasets/imagenet_bs32_pil_resize.py',
-    '../_base_/schedules/imagenet_bs2048_AdamW.py',
+    '../_base_/datasets/imagenet_bs64_pil_resize.py',
+    '../_base_/schedules/imagenet_bs4096_AdamW.py',
     '../_base_/default_runtime.py'
 ]
 
@@ -123,7 +123,7 @@ test_pipeline = [
     dict(type='Collect', keys=['img'])
 ]
 data = dict(
-    samples_per_gpu=32,
+    samples_per_gpu=64,
     workers_per_gpu=2,
     train=dict(
         type=dataset_type,
diff --git a/configs/vision_transformer/vit_large_patch16_224_finetune_imagenet.py b/configs/vision_transformer/vit_large_patch16_224_finetune_imagenet.py
index 7809d26b..a7410b77 100644
--- a/configs/vision_transformer/vit_large_patch16_224_finetune_imagenet.py
+++ b/configs/vision_transformer/vit_large_patch16_224_finetune_imagenet.py
@@ -8,14 +8,3 @@ _base_ = [
 
 img_norm_cfg = dict(
     mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True)
-
-test_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='Resize', size=(384, -1), backend='pillow'),
-    dict(type='CenterCrop', crop_size=384),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='ImageToTensor', keys=['img']),
-    dict(type='Collect', keys=['img'])
-]
-
-data = dict(test=dict(pipeline=test_pipeline))
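Because the finetune config above no longer overrides `test_pipeline`, it falls back to whatever its `_base_` files define. A quick way to verify the merged result is a sketch like the following, assuming it is run from the repository root:

```python
# Sketch only: inspect how mmcv.Config merges the _base_ chain.
from mmcv import Config

cfg = Config.fromfile(
    'configs/vision_transformer/vit_large_patch16_224_finetune_imagenet.py')
# The test pipeline now comes from the inherited _base_ files,
# not from a local override in this config.
print(cfg.data.test.pipeline)
```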
diff --git a/docs/model_zoo.md b/docs/model_zoo.md
index 53160654..5ae3af1f 100644
--- a/docs/model_zoo.md
+++ b/docs/model_zoo.md
@@ -20,10 +20,10 @@ The ResNet family models below are trained by standard data augmentations, i.e.,
 | ResNet-50 | 25.56 | 4.12 | 76.55 | 93.15 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_b32x8_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.log.json) |
 | ResNet-101 | 44.55 | 7.85 | 78.18 | 94.03 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet101_b32x8_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.log.json) |
 | ResNet-152 | 60.19 | 11.58 | 78.63 | 94.16 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet152_b32x8_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.log.json) |
-| ResNeSt-50 | 27.48 | 5.41 | 81.13 | 95.59 | | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth) | [log]() |
-| ResNeSt-101 | 48.28 | 10.27 | 82.32 | 96.24 | | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth) | [log]() |
-| ResNeSt-200 | 70.2 | 17.53 | 82.41 | 96.22 | | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth) | [log]() |
-| ResNeSt-269 | 110.93 | 22.58 | 82.70 | 96.28 | | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth) | [log]() |
+| ResNeSt-50* | 27.48 | 5.41 | 81.13 | 95.59 | | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth) | [log]() |
+| ResNeSt-101* | 48.28 | 10.27 | 82.32 | 96.24 | | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth) | [log]() |
+| ResNeSt-200* | 70.2 | 17.53 | 82.41 | 96.22 | | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth) | [log]() |
+| ResNeSt-269* | 110.93 | 22.58 | 82.70 | 96.28 | | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth) | [log]() |
 | ResNetV1D-50 | 25.58 | 4.36 | 77.4 | 93.66 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d50_b32x8_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_batch256_imagenet_20200708-1ad0ce94.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_batch256_imagenet_20200708-1ad0ce94.log.json) |
 | ResNetV1D-101 | 44.57 | 8.09 | 78.85 | 94.38 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d101_b32x8_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_batch256_imagenet_20200708-9cb302ef.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_batch256_imagenet_20200708-9cb302ef.log.json) |
 | ResNetV1D-152 | 60.21 | 11.82 | 79.35 | 94.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d152_b32x8_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_batch256_imagenet_20200708-e79cb6a2.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_batch256_imagenet_20200708-e79cb6a2.log.json) |
@@ -36,6 +36,10 @@ The ResNet family models below are trained by standard data augmentations, i.e.,
 | ShuffleNetV1 1.0x (group=3) | 1.87 | 0.146 | 68.13 | 87.81 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v1/shufflenet_v1_1x_b64x16_linearlr_bn_nowd_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth) | [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.log.json) |
 | ShuffleNetV2 1.0x | 2.28 | 0.149 | 69.55 | 88.92 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v2/shufflenet_v2_1x_b64x16_linearlr_bn_nowd_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth) | [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200804-8860eec9.log.json) |
 | MobileNet V2 | 3.5 | 0.319 | 71.86 | 90.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mobilenet_v2/mobilenet_v2_b32x8_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth) | [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.log.json) |
+| ViT-B/16* | 86.86 | 33.03 | 84.20 | 97.18 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit_base_patch16_384_finetune_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit_base_patch16_384.pth) | [log]() |
+| ViT-B/32* | 88.3 | 8.56 | 81.73 | 96.13 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit_base_patch32_384_finetune_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit_base_patch32_384.pth) | [log]() |
+| ViT-L/16* | 304.72 | 116.68 | 85.08 | 97.38 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit_large_patch16_384_finetune_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit_large_patch16_384.pth) | [log]() |
+| ViT-L/32* | 306.63 | 29.66 | 81.52 | 96.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit_large_patch32_384_finetune_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit_large_patch32_384.pth) | [log]() |
 
 Models with * are converted from other repos, others are trained by ourselves.
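As a usage note (not part of the patch), the converted ViT checkpoints added to the table can be exercised with mmcls's high-level inference API. A sketch, where 'demo.jpg' is a placeholder image path:

```python
# Sketch only: run one of the new ViT model-zoo entries end to end.
from mmcls.apis import inference_model, init_model

config_file = ('configs/vision_transformer/'
               'vit_base_patch16_384_finetune_imagenet.py')
checkpoint = ('https://download.openmmlab.com/mmclassification/'
              'v0/vit/vit_base_patch16_384.pth')

model = init_model(config_file, checkpoint, device='cpu')
result = inference_model(model, 'demo.jpg')
print(result)  # e.g. {'pred_label': ..., 'pred_score': ..., 'pred_class': ...}
```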