From 7301f8c7b63453264b37ec7df31e302b25f3930b Mon Sep 17 00:00:00 2001
From: Cathy0908 <30484308+Cathy0908@users.noreply.github.com>
Date: Wed, 14 Sep 2022 12:04:52 +0800
Subject: [PATCH 1/9] fix io.copytree (#193)
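A minimal sketch of the fixed behavior, mirroring the new unit test below (the
bucket path here is a placeholder and OSS access is assumed to be configured):

    from easycv.file import io

    # Copy a nested OSS directory to a local dir; before this fix, OSS
    # "directory" keys leaked into the recursive listing and broke the copy.
    io.copytree('oss://my-bucket/data/io_test_dir/multi_dirs/', '/tmp/multi_dirs')
    print(io.listdir('/tmp/multi_dirs', recursive=True))
    # expected: ['dir1/a.txt', 'dir1/dir1_1/a.txt', 'dir1/dir1_1/b.txt', 'dir2/b.txt']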
---
easycv/file/file_io.py | 5 +++++
tests/file/test_file_io.py | 25 ++++++++++++++++++++++++-
tests/ut_config.py | 3 ++-
3 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/easycv/file/file_io.py b/easycv/file/file_io.py
index 50e052eb..04e13743 100644
--- a/easycv/file/file_io.py
+++ b/easycv/file/file_io.py
@@ -519,6 +519,11 @@ class IO(IOLocal):
]
if path in files:
files.remove(path)
+ if recursive:
+ files = [
+ i for i in files
+ if not self.isdir(f'{OSS_PREFIX}{bucket.bucket_name}/{i}')
+ ]
if not files and not self._obj_exists(bucket, path):
raise FileNotFoundError(
diff --git a/tests/file/test_file_io.py b/tests/file/test_file_io.py
index e67f75ed..31929f0c 100644
--- a/tests/file/test_file_io.py
+++ b/tests/file/test_file_io.py
@@ -7,7 +7,8 @@ import unittest
import uuid
from tests.ut_config import (BASE_LOCAL_PATH, CLS_DATA_NPY_LOCAL,
- CLS_DATA_NPY_OSS, IO_DATA_TXTX_OSS, TMP_DIR_OSS)
+ CLS_DATA_NPY_OSS, IO_DATA_MULTI_DIRS_OSS,
+ IO_DATA_TXTX_OSS, TMP_DIR_OSS)
from easycv.file import io
@@ -128,6 +129,28 @@ class IOForOSSTest(unittest.TestCase):
io.remove(temp_dir)
io.remove(oss_file_path2)
+ def test_copytree_multi_dirs(self):
+ target = [
+ 'dir1/a.txt', 'dir1/dir1_1/a.txt', 'dir1/dir1_1/b.txt',
+ 'dir2/b.txt'
+ ]
+ # test copy dir from oss to local
+ oss_file_path1 = IO_DATA_MULTI_DIRS_OSS
+ temp_dir = tempfile.TemporaryDirectory().name
+ io.copytree(oss_file_path1, temp_dir)
+ self.assertTrue(io.exists(temp_dir))
+ self.assertCountEqual(io.listdir(temp_dir, recursive=True), target)
+
+ # test copy dir from local to oss
+ oss_file_path2 = os.path.join(TMP_DIR_OSS, '%s' % uuid.uuid4().hex)
+ io.copytree(temp_dir, oss_file_path2)
+ self.assertTrue(io.exists(oss_file_path2))
+ self.assertCountEqual(
+ io.listdir(oss_file_path2, recursive=True), target)
+
+ io.remove(temp_dir)
+ io.remove(oss_file_path2)
+
def test_listdir(self):
# with suffix /
files = io.listdir(IO_DATA_TXTX_OSS.rstrip('/') + '/')
diff --git a/tests/ut_config.py b/tests/ut_config.py
index 64284b1d..59f32e63 100644
--- a/tests/ut_config.py
+++ b/tests/ut_config.py
@@ -45,7 +45,8 @@ SMALL_IMAGENET_TFRECORD_OSS = os.path.join(
BASE_OSS_PATH, 'data/classification/small_imagenet_tfrecord/')
IO_DATA_TXTX_OSS = os.path.join(BASE_OSS_PATH, 'data/io_test_dir/txts/')
-
+IO_DATA_MULTI_DIRS_OSS = os.path.join(BASE_OSS_PATH,
+ 'data/io_test_dir/multi_dirs/')
DET_DATA_SMALL_COCO_LOCAL = os.path.join(BASE_LOCAL_PATH,
'data/detection/small_coco')
From 29f0e4242703e3c68c1605767099d205559ced89 Mon Sep 17 00:00:00 2001
From: Cathy0908 <30484308+Cathy0908@users.noreply.github.com>
Date: Wed, 14 Sep 2022 12:05:52 +0800
Subject: [PATCH 2/9] fix import error in quantize_utils.py (#180)
---
easycv/toolkit/quantize/quantize_utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/easycv/toolkit/quantize/quantize_utils.py b/easycv/toolkit/quantize/quantize_utils.py
index 8759c593..c7ef8aa8 100644
--- a/easycv/toolkit/quantize/quantize_utils.py
+++ b/easycv/toolkit/quantize/quantize_utils.py
@@ -7,8 +7,8 @@ import numpy as np
import torch
from mmcv.parallel import scatter_kwargs
+from easycv.models.detection.detectors.yolox.yolo_head import YOLOXHead
from easycv.models.detection.utils import output_postprocess, postprocess
-from easycv.models.detection.yolox.yolo_head import YOLOXHead
def quantize_config_check(device, backend, model_type=''):
From 0cb91de0cb80845f2a09f6de7c3697f28b51629e Mon Sep 17 00:00:00 2001
From: zzoneee <55594658+zzoneee@users.noreply.github.com>
Date: Wed, 14 Sep 2022 15:24:54 +0800
Subject: [PATCH 3/9] add DeiT III (#171)
1. Add a backbone: DeiT III.
2. Add an optimizer: Lamb.
3. Add a sampler: RASampler.
4. Add an lr update hook: CosineAnnealingWarmupByEpochLrUpdaterHook.
5. Remove the default mixup_cfg from easycv/models/classification/classification.py to keep classification.py clean.
A condensed config sketch wiring these pieces together follows below.
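A condensed sketch of how these pieces are enabled through the config system,
taken from the deitiii_base_patch16_192.py config added in this patch
(dataset/pipeline and paramwise_options settings omitted):

    data = dict(
        imgs_per_gpu=256,
        workers_per_gpu=8,
        use_repeated_augment_sampler=True,  # build_dataloader then uses RASampler
    )
    optimizer = dict(type='Lamb', lr=0.003, weight_decay=0.05, eps=1e-8)
    lr_config = dict(
        policy='CosineAnnealingWarmupByEpoch',  # the new lr update hook
        by_epoch=True,
        min_lr_ratio=0.00001 / 0.003,
        warmup='linear',
        warmup_by_epoch=True,
        warmup_iters=5,
        warmup_ratio=0.000001 / 0.003,
    )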
---
.../imagenet/vit/deitiii_base_patch16_192.py | 143 +++++++++
.../imagenet_deitiii_base_patch16_192_jpg.py | 17 ++
.../imagenet_deitiii_large_patch16_192_jpg.py | 17 ++
.../imagenet_deitiii_small_patch16_224_jpg.py | 86 ++++++
docs/source/model_zoo_cls.md | 3 +
easycv/core/optimizer/__init__.py | 1 +
easycv/core/optimizer/lamb.py | 166 ++++++++++
.../classification/pipelines/auto_augment.py | 36 ++-
easycv/datasets/loader/build_loader.py | 14 +-
easycv/datasets/loader/sampler.py | 71 +++++
easycv/hooks/__init__.py | 6 +-
easycv/hooks/lr_update_hook.py | 27 ++
easycv/models/backbones/__init__.py | 1 +
easycv/models/backbones/conv_mae_vit.py | 2 +-
easycv/models/backbones/vision_transformer.py | 287 ++++++++++++++++++
.../backbones/vit_transfomer_dynamic.py | 190 +-----------
.../models/classification/classification.py | 30 +-
easycv/models/heads/cls_head.py | 6 +-
easycv/models/loss/cross_entropy_loss.py | 53 +++-
tests/models/backbones/test_deitiii.py | 42 +++
tools/train.py | 5 +-
21 files changed, 982 insertions(+), 221 deletions(-)
create mode 100644 configs/classification/imagenet/vit/deitiii_base_patch16_192.py
create mode 100644 configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py
create mode 100644 configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py
create mode 100644 configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py
create mode 100644 easycv/core/optimizer/lamb.py
create mode 100644 easycv/models/backbones/vision_transformer.py
create mode 100644 tests/models/backbones/test_deitiii.py
diff --git a/configs/classification/imagenet/vit/deitiii_base_patch16_192.py b/configs/classification/imagenet/vit/deitiii_base_patch16_192.py
new file mode 100644
index 00000000..46b620f1
--- /dev/null
+++ b/configs/classification/imagenet/vit/deitiii_base_patch16_192.py
@@ -0,0 +1,143 @@
+# from PIL import Image
+
+_base_ = 'configs/base.py'
+
+log_config = dict(
+ interval=10,
+ hooks=[dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')])
+
+# model settings
+model = dict(
+ type='Classification',
+ train_preprocess=['mixUp'],
+ pretrained=False,
+ mixup_cfg=dict(
+ mixup_alpha=0.8,
+ cutmix_alpha=1.0,
+ cutmix_minmax=None,
+ prob=1.0,
+ switch_prob=0.5,
+ mode='batch',
+ label_smoothing=0.0,
+ num_classes=1000),
+ backbone=dict(
+ type='VisionTransformer',
+ img_size=[192],
+ num_classes=1000,
+ patch_size=16,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ qkv_bias=True,
+ drop_rate=0.,
+ drop_path_rate=0.2,
+ use_layer_scale=True),
+ head=dict(
+ type='ClsHead',
+ loss_config=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ loss_weight=1.0,
+ label_ceil=True),
+ with_fc=False,
+ use_num_classes=False))
+
+data_train_list = 'data/imagenet1k/train.txt'
+data_train_root = 'data/imagenet1k/train/'
+data_test_list = 'data/imagenet1k/val.txt'
+data_test_root = 'data/imagenet1k/val/'
+
+dataset_type = 'ClsDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+three_augment_policies = [[
+ dict(type='PILGaussianBlur', prob=1.0, radius_min=0.1, radius_max=2.0),
+], [
+ dict(type='Solarization', threshold=128),
+], [
+ dict(type='Grayscale', num_output_channels=3),
+]]
+train_pipeline = [
+ dict(
+ type='RandomResizedCrop', size=192, scale=(0.08, 1.0),
+ interpolation=3), # interpolation='bicubic'
+ dict(type='RandomHorizontalFlip'),
+ dict(type='MMAutoAugment', policies=three_augment_policies),
+ dict(type='ColorJitter', brightness=0.3, contrast=0.3, saturation=0.3),
+ dict(type='ToTensor'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Collect', keys=['img', 'gt_labels'])
+]
+size = int((256 / 224) * 192)
+test_pipeline = [
+ dict(type='Resize', size=size, interpolation=3),
+ dict(type='CenterCrop', size=192),
+ dict(type='ToTensor'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Collect', keys=['img', 'gt_labels'])
+]
+
+data = dict(
+ imgs_per_gpu=256,
+ workers_per_gpu=8,
+ use_repeated_augment_sampler=True,
+ train=dict(
+ type=dataset_type,
+ data_source=dict(
+ list_file=data_train_list,
+ root=data_train_root,
+ type='ClsSourceImageList'),
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_source=dict(
+ list_file=data_test_list,
+ root=data_test_root,
+ type='ClsSourceImageList'),
+ pipeline=test_pipeline))
+
+eval_config = dict(initial=True, interval=1, gpu_collect=True)
+eval_pipelines = [
+ dict(
+ mode='test',
+ data=data['val'],
+ dist_eval=True,
+ evaluators=[dict(type='ClsEvaluator', topk=(1, 5))],
+ )
+]
+
+# additional hooks
+custom_hooks = []
+
+# optimizer
+optimizer = dict(
+ type='Lamb',
+ lr=0.003,
+ weight_decay=0.05,
+ eps=1e-8,
+ paramwise_options={
+ 'cls_token': dict(weight_decay=0.),
+ 'pos_embed': dict(weight_decay=0.),
+ 'bias': dict(weight_decay=0.),
+ 'norm': dict(weight_decay=0.),
+ 'gamma_1': dict(weight_decay=0.),
+ 'gamma_2': dict(weight_decay=0.),
+ })
+optimizer_config = dict(grad_clip=None, update_interval=1)
+
+lr_config = dict(
+ policy='CosineAnnealingWarmupByEpoch',
+ by_epoch=True,
+ min_lr_ratio=0.00001 / 0.003,
+ warmup='linear',
+ warmup_by_epoch=True,
+ warmup_iters=5,
+ warmup_ratio=0.000001 / 0.003,
+)
+checkpoint_config = dict(interval=10)
+
+# runtime settings
+total_epochs = 800
+
+ema = dict(decay=0.99996)
diff --git a/configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py b/configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py
new file mode 100644
index 00000000..5a35f946
--- /dev/null
+++ b/configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py
@@ -0,0 +1,17 @@
+_base_ = './deitiii_base_patch16_192.py'
+# model settings
+model = dict(
+ type='Classification',
+ backbone=dict(
+ type='VisionTransformer',
+ img_size=[192],
+ num_classes=1000,
+ patch_size=16,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4,
+ qkv_bias=True,
+ drop_rate=0.,
+ drop_path_rate=0.2,
+ use_layer_scale=True))
diff --git a/configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py b/configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py
new file mode 100644
index 00000000..4c82cf9a
--- /dev/null
+++ b/configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py
@@ -0,0 +1,17 @@
+_base_ = './deitiii_base_patch16_192.py'
+# model settings
+model = dict(
+ type='Classification',
+ backbone=dict(
+ type='VisionTransformer',
+ img_size=[192],
+ num_classes=1000,
+ patch_size=16,
+ embed_dim=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ qkv_bias=True,
+ drop_rate=0.,
+ drop_path_rate=0.45,
+ use_layer_scale=True))
diff --git a/configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py b/configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py
new file mode 100644
index 00000000..9ba9cf77
--- /dev/null
+++ b/configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py
@@ -0,0 +1,86 @@
+_base_ = './deitiii_base_patch16_192.py'
+# model settings
+model = dict(
+ type='Classification',
+ backbone=dict(
+ type='VisionTransformer',
+ img_size=[224],
+ num_classes=1000,
+ patch_size=16,
+ embed_dim=384,
+ depth=12,
+ num_heads=6,
+ mlp_ratio=4,
+ qkv_bias=True,
+ drop_rate=0.,
+ drop_path_rate=0.05,
+ use_layer_scale=True))
+
+data_train_list = 'data/imagenet1k/train.txt'
+data_train_root = 'data/imagenet1k/train/'
+data_test_list = 'data/imagenet1k/val.txt'
+data_test_root = 'data/imagenet1k/val/'
+
+dataset_type = 'ClsDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+three_augment_policies = [[
+ dict(type='PILGaussianBlur', prob=1.0, radius_min=0.1, radius_max=2.0),
+], [
+ dict(type='Solarization', threshold=128),
+], [
+ dict(type='Grayscale', num_output_channels=3),
+]]
+train_pipeline = [
+ dict(
+ type='RandomResizedCrop', size=224, scale=(0.08, 1.0),
+ interpolation=3), # interpolation='bicubic'
+ dict(type='RandomHorizontalFlip'),
+ dict(type='MMAutoAugment', policies=three_augment_policies),
+ dict(type='ColorJitter', brightness=0.3, contrast=0.3, saturation=0.3),
+ dict(type='ToTensor'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Collect', keys=['img', 'gt_labels'])
+]
+test_pipeline = [
+ dict(type='Resize', size=256, interpolation=3),
+ dict(type='CenterCrop', size=224),
+ dict(type='ToTensor'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Collect', keys=['img', 'gt_labels'])
+]
+
+data = dict(
+ imgs_per_gpu=256,
+ workers_per_gpu=8,
+ use_repeated_augment_sampler=True,
+ train=dict(
+ type=dataset_type,
+ data_source=dict(
+ list_file=data_train_list,
+ root=data_train_root,
+ type='ClsSourceImageList'),
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_source=dict(
+ list_file=data_test_list,
+ root=data_test_root,
+ type='ClsSourceImageList'),
+ pipeline=test_pipeline))
+
+eval_pipelines = [
+ dict(
+ mode='test',
+ data=data['val'],
+ dist_eval=True,
+ evaluators=[dict(type='ClsEvaluator', topk=(1, 5))],
+ )
+]
+
+# optimizer
+optimizer = dict(lr=0.004)
+
+lr_config = dict(
+ min_lr_ratio=0.00001 / 0.004,
+ warmup_ratio=0.000001 / 0.004,
+)
diff --git a/docs/source/model_zoo_cls.md b/docs/source/model_zoo_cls.md
index 3d91275e..a2254ddf 100644
--- a/docs/source/model_zoo_cls.md
+++ b/docs/source/model_zoo_cls.md
@@ -21,6 +21,9 @@
| hrnetw64 | [hrnetw64](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/hrnet/imagenet_hrnetw64_jpg.py) | 79.884 | 95.04 | 5120 | 54.74 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/resnet/hrnetw64/epoch_100.pth) |
| vit-base-patch16 | [vit-base-patch16](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/vit/imagenet_vit_base_patch16_224_jpg.py) | 76.082 | 92.026 | 346 | 8.03 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/vit/vit-base-patch16/epoch_300.pth) |
| swin-tiny-patch4-window7 | [swin-tiny-patch4-window7](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/swint/imagenet_swin_tiny_patch4_window7_224_jpg.py) | 80.528 | 94.822 | 132 | 12.94 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/swint/swin-tiny-patch4-window7/epoch_300.pth) |
+| deitiii-small-patch16-224 | [deitiii-small-patch16-224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py) | 81.408 | 95.388 | 89 | 4.53 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/deitiii/imagenet_deitiii_small_patch16_224/deitiii_small.pth) |
+| deitiii-base-patch16-192 | [deitiii-base-patch16-192](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py) | 82.982 | 95.95 | 337 | 4.63 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/deitiii/imagenet_deitiii_base_patch16_192/deitiii_base.pth) |
+| deitiii-large-patch16-192 | [deitiii-large-patch16-192](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py) | 83.902 | 96.296 | 1170 | 10.17 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/deitiii/imagenet_deitiii_large_patch16_192/deitiii_large.pth) |
(ps: the results are obtained with models trained by EasyCV; the default inference input size is 224 and the default machine is a V100 16G; the gpu memory column records the GPU peak memory)
diff --git a/easycv/core/optimizer/__init__.py b/easycv/core/optimizer/__init__.py
index b4df330b..4c2bf30d 100644
--- a/easycv/core/optimizer/__init__.py
+++ b/easycv/core/optimizer/__init__.py
@@ -4,6 +4,7 @@ import torch
from torch.optim import *
from .builder import build_optimizer_constructor
+from .lamb import Lamb
from .lars import LARS
from .layer_decay_optimizer_constructor import LayerDecayOptimizerConstructor
from .ranger import Ranger
diff --git a/easycv/core/optimizer/lamb.py b/easycv/core/optimizer/lamb.py
new file mode 100644
index 00000000..6295cdc7
--- /dev/null
+++ b/easycv/core/optimizer/lamb.py
@@ -0,0 +1,166 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+
+import torch
+from mmcv.runner import OPTIMIZERS
+from torch.optim import Optimizer
+
+
+@OPTIMIZERS.register_module()
+class Lamb(Optimizer):
+ """A pure pytorch variant of FuseLAMB (NvLamb variant) optimizer.
+ This class is copied from `timm`_. The LAMB was proposed in `Large Batch
+ Optimization for Deep Learning - Training BERT in 76 minutes`_.
+ .. _timm:
+ https://github.com/rwightman/pytorch-image-models/blob/master/timm/optim/lamb.py
+ .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:
+ https://arxiv.org/abs/1904.00962
+ Arguments:
+ params (iterable): iterable of parameters to optimize or dicts defining
+ parameter groups.
+ lr (float, optional): learning rate. (default: 1e-3)
+ betas (Tuple[float, float], optional): coefficients used for computing
+ running averages of gradient and its norm. (default: (0.9, 0.999))
+ eps (float, optional): term added to the denominator to improve
+ numerical stability. (default: 1e-8)
+ weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+ grad_averaging (bool, optional): whether apply (1-beta2) to grad when
+ calculating running averages of gradient. (default: True)
+ max_grad_norm (float, optional): value used to clip global grad norm
+ (default: 1.0)
+ trust_clip (bool): enable LAMBC trust ratio clipping (default: False)
+ always_adapt (boolean, optional): Apply adaptive learning rate to 0.0
+ weight decay parameter (default: False)
+ """ # noqa: E501
+
+ def __init__(self,
+ params,
+ lr=1e-3,
+ bias_correction=True,
+ betas=(0.9, 0.999),
+ eps=1e-6,
+ weight_decay=0.01,
+ grad_averaging=True,
+ max_grad_norm=1.0,
+ trust_clip=False,
+ always_adapt=False):
+ defaults = dict(
+ lr=lr,
+ bias_correction=bias_correction,
+ betas=betas,
+ eps=eps,
+ weight_decay=weight_decay,
+ grad_averaging=grad_averaging,
+ max_grad_norm=max_grad_norm,
+ trust_clip=trust_clip,
+ always_adapt=always_adapt)
+ super().__init__(params, defaults)
+
+ @torch.no_grad()
+ def step(self, closure=None):
+ """Performs a single optimization step.
+ Arguments:
+ closure (callable, optional): A closure that reevaluates the model
+ and returns the loss.
+ """
+ loss = None
+ if closure is not None:
+ with torch.enable_grad():
+ loss = closure()
+
+ device = self.param_groups[0]['params'][0].device
+ one_tensor = torch.tensor(
+ 1.0, device=device
+ ) # because torch.where doesn't handle scalars correctly
+ global_grad_norm = torch.zeros(1, device=device)
+ for group in self.param_groups:
+ for p in group['params']:
+ if p.grad is None:
+ continue
+ grad = p.grad
+ if grad.is_sparse:
+ raise RuntimeError(
+ 'Lamb does not support sparse gradients, consider '
+ 'SparseAdam instead.')
+ global_grad_norm.add_(grad.pow(2).sum())
+
+ global_grad_norm = torch.sqrt(global_grad_norm)
+ # FIXME it'd be nice to remove explicit tensor conversion of scalars
+ # when torch.where promotes
+ # scalar types properly https://github.com/pytorch/pytorch/issues/9190
+ max_grad_norm = torch.tensor(
+ self.defaults['max_grad_norm'], device=device)
+ clip_global_grad_norm = torch.where(global_grad_norm > max_grad_norm,
+ global_grad_norm / max_grad_norm,
+ one_tensor)
+
+ for group in self.param_groups:
+ bias_correction = 1 if group['bias_correction'] else 0
+ beta1, beta2 = group['betas']
+ grad_averaging = 1 if group['grad_averaging'] else 0
+ beta3 = 1 - beta1 if grad_averaging else 1.0
+
+ # assume same step across group now to simplify things
+            # per parameter step can be easily supported by making it a tensor, or
+ # pass list into kernel
+ if 'step' in group:
+ group['step'] += 1
+ else:
+ group['step'] = 1
+
+ if bias_correction:
+ bias_correction1 = 1 - beta1**group['step']
+ bias_correction2 = 1 - beta2**group['step']
+ else:
+ bias_correction1, bias_correction2 = 1.0, 1.0
+
+ for p in group['params']:
+ if p.grad is None:
+ continue
+ grad = p.grad.div_(clip_global_grad_norm)
+ state = self.state[p]
+
+ # State initialization
+ if len(state) == 0:
+                    # Exponential moving average of gradient values
+ state['exp_avg'] = torch.zeros_like(p)
+ # Exponential moving average of squared gradient values
+ state['exp_avg_sq'] = torch.zeros_like(p)
+
+ exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+
+ # Decay the first and second moment running average coefficient
+ exp_avg.mul_(beta1).add_(grad, alpha=beta3) # m_t
+ exp_avg_sq.mul_(beta2).addcmul_(
+ grad, grad, value=1 - beta2) # v_t
+
+ denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(
+ group['eps'])
+ update = (exp_avg / bias_correction1).div_(denom)
+
+ weight_decay = group['weight_decay']
+ if weight_decay != 0:
+ update.add_(p, alpha=weight_decay)
+
+ if weight_decay != 0 or group['always_adapt']:
+ # Layer-wise LR adaptation. By default, skip adaptation on
+ # parameters that are
+ # excluded from weight decay, unless always_adapt == True,
+ # then always enabled.
+ w_norm = p.norm(2.0)
+ g_norm = update.norm(2.0)
+ # FIXME nested where required since logical and/or not
+ # working in PT XLA
+ trust_ratio = torch.where(
+ w_norm > 0,
+ torch.where(g_norm > 0, w_norm / g_norm, one_tensor),
+ one_tensor,
+ )
+ if group['trust_clip']:
+ # LAMBC trust clipping, upper bound fixed at one
+ trust_ratio = torch.minimum(trust_ratio, one_tensor)
+ update.mul_(trust_ratio)
+
+ p.add_(update, alpha=-group['lr'])
+
+ return loss
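Note: a minimal sketch of using the new Lamb optimizer directly, outside the
config system (the toy model is only for illustration):

    import torch
    from easycv.core.optimizer import Lamb

    model = torch.nn.Linear(16, 4)
    optimizer = Lamb(model.parameters(), lr=3e-3, weight_decay=0.05, eps=1e-8)

    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    optimizer.step()       # applies the LAMB trust-ratio scaled update
    optimizer.zero_grad()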
diff --git a/easycv/datasets/classification/pipelines/auto_augment.py b/easycv/datasets/classification/pipelines/auto_augment.py
index 9f4137e6..e9bef83a 100644
--- a/easycv/datasets/classification/pipelines/auto_augment.py
+++ b/easycv/datasets/classification/pipelines/auto_augment.py
@@ -8,7 +8,7 @@ from typing import Sequence
import mmcv
import numpy as np
-from PIL import Image
+from PIL import Image, ImageFilter
from easycv.datasets.registry import PIPELINES
from easycv.datasets.shared.pipelines import Compose
@@ -1043,3 +1043,37 @@ class Cutout(object):
repr_str += f'pad_val={self.pad_val}, '
repr_str += f'prob={self.prob})'
return repr_str
+
+
+@PIPELINES.register_module()
+class PILGaussianBlur(object):
+
+ def __init__(self, prob=0.1, radius_min=0.1, radius_max=2.):
+ assert 0 <= prob <= 1.0, 'The prob should be in range [0,1], ' \
+ f'got {prob} instead.'
+ assert isinstance(radius_min, (int, float)), 'The radius_min type must '\
+ f'be int or float, but got {type(radius_min)} instead.'
+ assert isinstance(radius_max, (int, float)), 'The radius_max type must '\
+ f'be int or float, but got {type(radius_max)} instead.'
+
+ self.prob = prob
+ self.radius_min = radius_min
+ self.radius_max = radius_max
+
+ def __call__(self, results):
+ if np.random.rand() > self.prob:
+ return results
+
+ for key in results.get('img_fields', ['img']):
+ img = results[key].filter(
+ ImageFilter.GaussianBlur(
+ radius=random.uniform(self.radius_min, self.radius_max)))
+ results[key] = img
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(prob={self.prob}, '
+ repr_str += f'radius_min={self.radius_min}, '
+ repr_str += f'radius_max={self.radius_max})'
+ return repr_str
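Note: a minimal sketch of the new PILGaussianBlur transform applied to a
pipeline `results` dict holding a PIL image (the dummy image is only for
illustration):

    import numpy as np
    from PIL import Image
    from easycv.datasets.classification.pipelines.auto_augment import PILGaussianBlur

    blur = PILGaussianBlur(prob=1.0, radius_min=0.1, radius_max=2.0)
    results = {'img': Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))}
    results = blur(results)  # results['img'] is now a Gaussian-blurred PIL image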
diff --git a/easycv/datasets/loader/build_loader.py b/easycv/datasets/loader/build_loader.py
index 6af50073..08127325 100644
--- a/easycv/datasets/loader/build_loader.py
+++ b/easycv/datasets/loader/build_loader.py
@@ -14,7 +14,7 @@ from easycv.datasets.shared.odps_reader import set_dataloader_workid
from easycv.utils.dist_utils import sync_random_seed
from easycv.utils.torchacc_util import is_torchacc_enabled
from .collate import CollateWrapper
-from .sampler import DistributedMPSampler, DistributedSampler
+from .sampler import DistributedMPSampler, DistributedSampler, RASampler
if platform.system() != 'Windows':
# https://github.com/pytorch/pytorch/issues/973
@@ -35,6 +35,7 @@ def build_dataloader(dataset,
odps_config=None,
persistent_workers=False,
collate_hooks=None,
+ use_repeated_augment_sampler=False,
**kwargs):
"""Build PyTorch DataLoader.
In distributed training, each GPU/process has a dataloader.
@@ -56,6 +57,8 @@ def build_dataloader(dataset,
data in worker process can be reused.
persistent_workers (bool) : After pytorch1.7, could use persistent_workers=True to
avoid reconstruct dataworker before each epoch, speed up before epoch
+        use_repeated_augment_sampler (bool) : If set to True, RASampler is used.
+ Default: False.
kwargs: any keyword argument to be used to initialize DataLoader
Returns:
DataLoader: A PyTorch dataloader.
@@ -68,7 +71,9 @@ def build_dataloader(dataset,
'split_huge_listfile_byrank',
False)
- if hasattr(dataset, 'm_per_class') and dataset.m_per_class > 1:
+ if use_repeated_augment_sampler:
+ sampler = RASampler(dataset, world_size, rank, shuffle=shuffle)
+ elif hasattr(dataset, 'm_per_class') and dataset.m_per_class > 1:
sampler = DistributedMPSampler(
dataset,
world_size,
@@ -88,7 +93,10 @@ def build_dataloader(dataset,
else:
if replace:
raise NotImplementedError
- if hasattr(dataset, 'm_per_class') and dataset.m_per_class > 1:
+
+ if use_repeated_augment_sampler:
+ sampler = RASampler(dataset, 1, 0, shuffle=shuffle)
+ elif hasattr(dataset, 'm_per_class') and dataset.m_per_class > 1:
sampler = DistributedMPSampler(
dataset, 1, 0, shuffle=shuffle, replace=replace)
else:
diff --git a/easycv/datasets/loader/sampler.py b/easycv/datasets/loader/sampler.py
index 6fe6863c..4c22695f 100644
--- a/easycv/datasets/loader/sampler.py
+++ b/easycv/datasets/loader/sampler.py
@@ -6,6 +6,7 @@ import random
import numpy as np
import torch
+import torch.distributed as dist
from mmcv.runner import get_dist_info
from torch.utils.data import DistributedSampler as _DistributedSampler
from torch.utils.data import Sampler
@@ -469,3 +470,73 @@ class DistributedGivenIterationSampler(Sampler):
def set_epoch(self, epoch):
pass
+
+
+class RASampler(torch.utils.data.Sampler):
+    """Sampler that restricts data loading to a subset of the dataset for
+    distributed training, with repeated augmentation.
+    It ensures that each augmented version of a sample is visible to a
+    different process (GPU).
+    Heavily based on torch.utils.data.DistributedSampler.
+    """
+
+ def __init__(self,
+ dataset,
+ num_replicas=None,
+ rank=None,
+ shuffle=True,
+ num_repeats: int = 3):
+ if num_replicas is None:
+ if not dist.is_available():
+ raise RuntimeError(
+ 'Requires distributed package to be available')
+ num_replicas = dist.get_world_size()
+ if rank is None:
+ if not dist.is_available():
+ raise RuntimeError(
+ 'Requires distributed package to be available')
+ rank = dist.get_rank()
+ if num_repeats < 1:
+ raise ValueError('num_repeats should be greater than 0')
+ self.dataset = dataset
+ self.num_replicas = num_replicas
+ self.rank = rank
+ self.num_repeats = num_repeats
+ self.epoch = 0
+ self.num_samples = int(
+ math.ceil(
+ len(self.dataset) * self.num_repeats / self.num_replicas))
+ self.total_size = self.num_samples * self.num_replicas
+ # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
+ self.num_selected_samples = int(
+ math.floor(len(self.dataset) // 256 * 256 / self.num_replicas))
+ self.shuffle = shuffle
+
+ def __iter__(self):
+ if self.shuffle:
+ # deterministically shuffle based on epoch
+ g = torch.Generator()
+ g.manual_seed(self.epoch)
+ indices = torch.randperm(len(self.dataset), generator=g)
+ else:
+ indices = torch.arange(start=0, end=len(self.dataset))
+
+ # add extra samples to make it evenly divisible
+ indices = torch.repeat_interleave(
+ indices, repeats=self.num_repeats, dim=0).tolist()
+ padding_size: int = self.total_size - len(indices)
+ if padding_size > 0:
+ indices += indices[:padding_size]
+ assert len(indices) == self.total_size
+
+ # subsample
+ indices = indices[self.rank:self.total_size:self.num_replicas]
+ assert len(indices) == self.num_samples
+
+ return iter(indices[:self.num_selected_samples])
+
+ def __len__(self):
+ return self.num_selected_samples
+
+ def set_epoch(self, epoch):
+ self.epoch = epoch
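Note: a minimal sketch of RASampler in a single-process setting (world size 1,
rank 0), mirroring how build_dataloader constructs it when
use_repeated_augment_sampler=True:

    import torch
    from easycv.datasets.loader.sampler import RASampler

    dataset = torch.utils.data.TensorDataset(torch.arange(512))
    sampler = RASampler(dataset, num_replicas=1, rank=0, shuffle=False, num_repeats=3)
    sampler.set_epoch(0)
    loader = torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=2)
    # With shuffle=False the emitted indices show each sample repeated
    # num_repeats times: [0, 0, 0, 1, 1, 1, ...]
    print(list(iter(sampler))[:9])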
diff --git a/easycv/hooks/__init__.py b/easycv/hooks/__init__.py
index c292038f..f2814dac 100644
--- a/easycv/hooks/__init__.py
+++ b/easycv/hooks/__init__.py
@@ -13,7 +13,8 @@ from .eval_hook import DistEvalHook, EvalHook
from .export_hook import ExportHook
from .extractor import Extractor
from .logger import PreLoggerHook
-from .lr_update_hook import StepFixCosineAnnealingLrUpdaterHook
+from .lr_update_hook import (CosineAnnealingWarmupByEpochLrUpdaterHook,
+ StepFixCosineAnnealingLrUpdaterHook)
from .optimizer_hook import OptimizerHook
from .oss_sync_hook import OSSSyncHook
from .registry import HOOKS
@@ -33,7 +34,8 @@ __all__ = [
'OSSSyncHook', 'HOOKS', 'TIMEHook', 'SWAVHook', 'SyncNormHook',
'SyncRandomSizeHook', 'TensorboardLoggerHookV2', 'WandbLoggerHookV2',
'YOLOXLrUpdaterHook', 'YOLOXModeSwitchHook', 'MixupCollateHook',
- 'PreLoggerHook', 'StepFixCosineAnnealingLrUpdaterHook', 'ThroughputHook'
+ 'PreLoggerHook', 'StepFixCosineAnnealingLrUpdaterHook',
+ 'CosineAnnealingWarmupByEpochLrUpdaterHook', 'ThroughputHook'
]
if LooseVersion(torch.__version__) >= LooseVersion('1.6.0'):
diff --git a/easycv/hooks/lr_update_hook.py b/easycv/hooks/lr_update_hook.py
index 39ca8f53..af1bc514 100644
--- a/easycv/hooks/lr_update_hook.py
+++ b/easycv/hooks/lr_update_hook.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv import runner
from mmcv.runner import HOOKS
from mmcv.runner.hooks.lr_updater import (CosineAnnealingLrUpdaterHook,
annealing_cos)
@@ -54,3 +55,29 @@ class StepFixCosineAnnealingLrUpdaterHook(CosineAnnealingLrUpdaterHook):
target_lr = self.min_lr
return annealing_cos(base_lr, target_lr, progress / max_progress)
+
+
+@HOOKS.register_module()
+class CosineAnnealingWarmupByEpochLrUpdaterHook(CosineAnnealingLrUpdaterHook):
+
+ def before_train_iter(self, runner: 'runner.BaseRunner'):
+ cur_iter = runner.iter
+ epoch_len = len(runner.data_loader)
+ assert isinstance(self.warmup_iters, int)
+ if not self.by_epoch:
+ self.regular_lr = self.get_regular_lr(runner)
+ if self.warmup is None or cur_iter >= self.warmup_iters:
+ self._set_lr(runner, self.regular_lr)
+ else:
+ if cur_iter % epoch_len == 0:
+ warmup_lr = self.get_warmup_lr(cur_iter)
+ self._set_lr(runner, warmup_lr)
+ elif self.by_epoch:
+ if self.warmup is None or cur_iter > self.warmup_iters:
+ return
+ elif cur_iter == self.warmup_iters:
+ self._set_lr(runner, self.regular_lr)
+ else:
+ if cur_iter % epoch_len == 0:
+ warmup_lr = self.get_warmup_lr(cur_iter)
+ self._set_lr(runner, warmup_lr)
diff --git a/easycv/models/backbones/__init__.py b/easycv/models/backbones/__init__.py
index f0be50ae..add29a07 100644
--- a/easycv/models/backbones/__init__.py
+++ b/easycv/models/backbones/__init__.py
@@ -19,4 +19,5 @@ from .resnet_jit import ResNetJIT
from .resnext import ResNeXt
from .shuffle_transformer import ShuffleTransformer
from .swin_transformer import SwinTransformer
+from .vision_transformer import VisionTransformer
from .vitdet import ViTDet
diff --git a/easycv/models/backbones/conv_mae_vit.py b/easycv/models/backbones/conv_mae_vit.py
index 02755faf..ce9b7b61 100644
--- a/easycv/models/backbones/conv_mae_vit.py
+++ b/easycv/models/backbones/conv_mae_vit.py
@@ -10,7 +10,7 @@ from timm.models.layers import trunc_normal_
from easycv.models.registry import BACKBONES
from easycv.models.utils import DropPath
from easycv.models.utils.pos_embed import get_2d_sincos_pos_embed
-from .vit_transfomer_dynamic import Block
+from .vision_transformer import Block
class PatchEmbed(nn.Module):
diff --git a/easycv/models/backbones/vision_transformer.py b/easycv/models/backbones/vision_transformer.py
new file mode 100644
index 00000000..2061979d
--- /dev/null
+++ b/easycv/models/backbones/vision_transformer.py
@@ -0,0 +1,287 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+"""
+Mostly copy-paste from timm library.
+https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+
+"""
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.models.layers import trunc_normal_
+
+from easycv.models.utils import DropPath, Mlp
+from ..registry import BACKBONES
+
+
+class Attention(nn.Module):
+
+ def __init__(self,
+ dim,
+ num_heads=8,
+ qkv_bias=False,
+ qk_scale=None,
+ attn_drop=0.,
+ proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim**-0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x, rel_pos_bias=None):
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+ C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+
+ if rel_pos_bias is not None:
+ attn = attn + rel_pos_bias
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x, attn
+
+
+class Block(nn.Module):
+
+ def __init__(self,
+ dim,
+ num_heads,
+ mlp_ratio=4.,
+ qkv_bias=False,
+ qk_scale=None,
+ drop=0.,
+ attn_drop=0.,
+ drop_path=0.,
+ act_layer=nn.GELU,
+ norm_layer=nn.LayerNorm,
+ use_layer_scale=False,
+ init_values=1e-4):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop,
+ proj_drop=drop)
+ self.drop_path = DropPath(
+ drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(
+ in_features=dim,
+ hidden_features=mlp_hidden_dim,
+ act_layer=act_layer,
+ drop=drop)
+ self.use_layer_scale = use_layer_scale
+ if self.use_layer_scale:
+ self.gamma_1 = nn.Parameter(
+ init_values * torch.ones((dim)), requires_grad=True)
+ self.gamma_2 = nn.Parameter(
+ init_values * torch.ones((dim)), requires_grad=True)
+
+ def forward(self, x, return_attention=False, rel_pos_bias=None):
+ y, attn = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
+ if return_attention:
+ return attn
+ if self.use_layer_scale:
+ x = x + self.drop_path(self.gamma_1 * y)
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+ else:
+ x = x + self.drop_path(y)
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+ def forward_fea_and_attn(self, x):
+ y, attn = self.attn(self.norm1(x))
+ if self.use_layer_scale:
+ x = x + self.drop_path(self.gamma_1 * y)
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+ else:
+ x = x + self.drop_path(y)
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x, attn
+
+
+class PatchEmbed(nn.Module):
+ """ Image to Patch Embedding
+ """
+
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+ super().__init__()
+ num_patches = (img_size // patch_size) * (img_size // patch_size)
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+
+ self.proj = nn.Conv2d(
+ in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, x):
+ B, C, H, W = x.shape
+ x = self.proj(x).flatten(2).transpose(1, 2)
+ return x
+
+
+@BACKBONES.register_module
+class VisionTransformer(nn.Module):
+    """ DeiT III is based on ViT. It uses several strategies to improve the ViT
+    model, such as layer scale, stochastic depth and 3-Augment.
+
+ Paper link: https://arxiv.org/pdf/2204.07118.pdf (DeiT III: Revenge of the ViT)
+
+ Args:
+ img_size (list): Input image size. img_size=[224] means the image size is
+ 224*224. img_size=[192, 224] means the image size is 192*224.
+ patch_size (int): The patch size. Default: 16
+ in_chans (int): The num of input channels. Default: 3
+ num_classes (int): The num of picture classes. Default: 1000
+ embed_dim (int): The dimensions of embedding. Default: 768
+ depth (int): The num of blocks. Default: 12
+ num_heads (int): Parallel attention heads. Default: 12
+ mlp_ratio (float): Mlp expansion ratio. Default: 4.0
+        qkv_bias (bool): Whether to add a learnable bias to the qkv projection. Default: False
+ qk_scale (float | None): In the step of self-attention, if qk_scale is not
+ None, it will use qk_scale to scale the q @ k. Otherwise it will use
+ head_dim**-0.5 instead of qk_scale. Default: None
+ drop_rate (float): Probability of an element to be zeroed after the feed
+ forward layer. Default: 0.0
+ drop_path_rate (float): Stochastic depth rate. Default: 0
+ norm_layer (nn.Module): normalization layer
+        use_dense_prediction (bool): If True, the global pool and the norm
+            before the head are removed (if any). Default: False
+ global_pool (bool): Global pool before head. Default: False
+ use_layer_scale (bool): If use_layer_scale is True, it will use layer
+ scale. Default: False
+ init_scale (float): It is used for layer scale in Block to scale the
+ gamma_1 and gamma_2.
+
+ """
+
+ def __init__(self,
+ img_size=[224],
+ patch_size=16,
+ in_chans=3,
+ num_classes=1000,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4.,
+ qkv_bias=False,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.,
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ use_dense_prediction=False,
+ global_pool=False,
+ use_layer_scale=False,
+ init_scale=1e-4,
+ **kwargs):
+ super().__init__()
+
+ self.num_features = self.embed_dim = embed_dim
+
+ self.patch_embed = PatchEmbed(
+ img_size=img_size[0],
+ patch_size=patch_size,
+ in_chans=in_chans,
+ embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ self.drop_path_rate = drop_path_rate
+ self.depth = depth
+ dpr = [drop_path_rate for i in range(depth)]
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=embed_dim,
+ num_heads=num_heads,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ drop_path=dpr[i],
+ norm_layer=norm_layer,
+ use_layer_scale=use_layer_scale,
+ init_values=init_scale) for i in range(depth)
+ ])
+ self.norm = norm_layer(embed_dim)
+
+ # Classifier head
+ self.head = nn.Linear(
+ embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ # Dense prediction head
+ self.use_dense_prediction = use_dense_prediction
+ if self.use_dense_prediction:
+ self.head_dense = None
+
+ # Use global average pooling
+ self.global_pool = global_pool
+ if self.global_pool:
+ self.fc_norm = norm_layer(embed_dim)
+ self.norm = None
+
+ def init_weights(self):
+ trunc_normal_(self.pos_embed, std=.02)
+ trunc_normal_(self.cls_token, std=.02)
+
+ for m in self.modules():
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ def forward(self, x):
+
+ x = self.forward_features(x)
+ x = self.pos_drop(x)
+ x = self.head(x)
+
+ return [x]
+
+ def forward_features(self, x):
+ B = x.shape[0]
+ x = self.patch_embed(x)
+
+ cls_tokens = self.cls_token.expand(B, -1, -1)
+
+ x = x + self.pos_embed
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ for blk in self.blocks:
+ x = blk(x)
+ if self.norm is not None:
+ x = self.norm(x)
+
+ if self.use_dense_prediction:
+ return x[:, 0], x[:, 1:]
+ else:
+ if self.global_pool:
+ x = x[:, 1:, :].mean(dim=1)
+ return self.fc_norm(x)
+ else:
+ return x[:, 0]
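Note: a minimal sketch of running the new VisionTransformer backbone (DeiT III
variant) on a dummy batch, with the shape settings of deitiii_base_patch16_192:

    import torch
    from easycv.models.backbones.vision_transformer import VisionTransformer

    model = VisionTransformer(
        img_size=[192], patch_size=16, embed_dim=768, depth=12, num_heads=12,
        mlp_ratio=4, qkv_bias=True, num_classes=1000,
        drop_path_rate=0.2, use_layer_scale=True)
    model.init_weights()
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 192, 192))
    print(out[0].shape)  # torch.Size([1, 1000]); forward returns a list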
diff --git a/easycv/models/backbones/vit_transfomer_dynamic.py b/easycv/models/backbones/vit_transfomer_dynamic.py
index 3ff94701..6df88d2c 100644
--- a/easycv/models/backbones/vit_transfomer_dynamic.py
+++ b/easycv/models/backbones/vit_transfomer_dynamic.py
@@ -12,197 +12,25 @@ from functools import partial
import torch
import torch.nn as nn
-from timm.models.layers import trunc_normal_
-from easycv.models.utils import DropPath, Mlp
+from easycv.models.backbones.vision_transformer import VisionTransformer
-class Attention(nn.Module):
-
- def __init__(self,
- dim,
- num_heads=8,
- qkv_bias=False,
- qk_scale=None,
- attn_drop=0.,
- proj_drop=0.):
- super().__init__()
- self.num_heads = num_heads
- head_dim = dim // num_heads
- self.scale = qk_scale or head_dim**-0.5
-
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
- self.attn_drop = nn.Dropout(attn_drop)
- self.proj = nn.Linear(dim, dim)
- self.proj_drop = nn.Dropout(proj_drop)
-
- def forward(self, x, rel_pos_bias=None):
- B, N, C = x.shape
- qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
- C // self.num_heads).permute(2, 0, 3, 1, 4)
- q, k, v = qkv[0], qkv[1], qkv[2]
-
- attn = (q @ k.transpose(-2, -1)) * self.scale
-
- if rel_pos_bias is not None:
- attn = attn + rel_pos_bias
-
- attn = attn.softmax(dim=-1)
- attn = self.attn_drop(attn)
-
- x = (attn @ v).transpose(1, 2).reshape(B, N, C)
- x = self.proj(x)
- x = self.proj_drop(x)
- return x, attn
-
-
-class Block(nn.Module):
-
- def __init__(self,
- dim,
- num_heads,
- mlp_ratio=4.,
- qkv_bias=False,
- qk_scale=None,
- drop=0.,
- attn_drop=0.,
- drop_path=0.,
- act_layer=nn.GELU,
- norm_layer=nn.LayerNorm):
- super().__init__()
- self.norm1 = norm_layer(dim)
- self.attn = Attention(
- dim,
- num_heads=num_heads,
- qkv_bias=qkv_bias,
- qk_scale=qk_scale,
- attn_drop=attn_drop,
- proj_drop=drop)
- self.drop_path = DropPath(
- drop_path) if drop_path > 0. else nn.Identity()
- self.norm2 = norm_layer(dim)
- mlp_hidden_dim = int(dim * mlp_ratio)
- self.mlp = Mlp(
- in_features=dim,
- hidden_features=mlp_hidden_dim,
- act_layer=act_layer,
- drop=drop)
-
- def forward(self, x, return_attention=False, rel_pos_bias=None):
- y, attn = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
- if return_attention:
- return attn
- x = x + self.drop_path(y)
- x = x + self.drop_path(self.mlp(self.norm2(x)))
- return x
-
- def forward_fea_and_attn(self, x):
- y, attn = self.attn(self.norm1(x))
- x = x + self.drop_path(y)
- x = x + self.drop_path(self.mlp(self.norm2(x)))
- return x, attn
-
-
-class PatchEmbed(nn.Module):
- """ Image to Patch Embedding
- """
-
- def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
- super().__init__()
- num_patches = (img_size // patch_size) * (img_size // patch_size)
- self.img_size = img_size
- self.patch_size = patch_size
- self.num_patches = num_patches
-
- self.proj = nn.Conv2d(
- in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-
- def forward(self, x):
- B, C, H, W = x.shape
- x = self.proj(x).flatten(2).transpose(1, 2)
- return x
-
-
-class DynamicVisionTransformer(nn.Module):
+class DynamicVisionTransformer(VisionTransformer):
"""Dynamic Vision Transformer """
- def __init__(self,
- img_size=[224],
- patch_size=16,
- in_chans=3,
- num_classes=0,
- embed_dim=768,
- depth=12,
- num_heads=12,
- mlp_ratio=4.,
- qkv_bias=False,
- qk_scale=None,
- drop_rate=0.,
- attn_drop_rate=0.,
- drop_path_rate=0.,
- norm_layer=nn.LayerNorm,
- use_dense_prediction=False,
- global_pool=False,
- **kwargs):
- super().__init__()
- self.num_features = self.embed_dim = embed_dim
+ def __init__(self, **kwargs):
+ super(DynamicVisionTransformer, self).__init__(**kwargs)
- self.patch_embed = PatchEmbed(
- img_size=img_size[0],
- patch_size=patch_size,
- in_chans=in_chans,
- embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches
- self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.pos_embed = nn.Parameter(
- torch.zeros(1, num_patches + 1, embed_dim))
- self.pos_drop = nn.Dropout(p=drop_rate)
+ torch.zeros(1, num_patches + 1, self.embed_dim))
- dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
- ] # stochastic depth decay rule
- self.blocks = nn.ModuleList([
- Block(
- dim=embed_dim,
- num_heads=num_heads,
- mlp_ratio=mlp_ratio,
- qkv_bias=qkv_bias,
- qk_scale=qk_scale,
- drop=drop_rate,
- attn_drop=attn_drop_rate,
- drop_path=dpr[i],
- norm_layer=norm_layer) for i in range(depth)
- ])
- self.norm = norm_layer(embed_dim)
-
- # Classifier head
- self.head = nn.Linear(
- embed_dim, num_classes) if num_classes > 0 else nn.Identity()
-
- # Dense prediction head
- self.use_dense_prediction = use_dense_prediction
- if self.use_dense_prediction:
- self.head_dense = None
-
-
-# Use global average pooling
- self.global_pool = global_pool
- if self.global_pool:
- self.fc_norm = norm_layer(embed_dim)
- self.norm = None
-
- trunc_normal_(self.pos_embed, std=.02)
- trunc_normal_(self.cls_token, std=.02)
-
- def init_weights(self):
- for m in self.modules():
- if isinstance(m, nn.Linear):
- trunc_normal_(m.weight, std=.02)
- if isinstance(m, nn.Linear) and m.bias is not None:
- nn.init.constant_(m.bias, 0)
- elif isinstance(m, nn.LayerNorm):
- nn.init.constant_(m.bias, 0)
- nn.init.constant_(m.weight, 1.0)
+ dpr = [
+ x.item()
+ for x in torch.linspace(0, self.drop_path_rate, self.depth)
+ ]
def forward(self, x):
# convert to list
diff --git a/easycv/models/classification/classification.py b/easycv/models/classification/classification.py
index ddcc9e31..34bde969 100644
--- a/easycv/models/classification/classification.py
+++ b/easycv/models/classification/classification.py
@@ -53,22 +53,15 @@ class Classification(BaseModel):
if 'mixUp' in train_preprocess:
rank, _ = get_dist_info()
np.random.seed(rank + 12)
- if not mixup_cfg:
- num_classes = head.get(
- 'num_classes',
- 1000) if 'num_classes' in head else backbone.get(
- 'num_classes', 1000)
- mixup_cfg = dict(
- mixup_alpha=0.8,
- cutmix_alpha=1.0,
- cutmix_minmax=None,
- prob=1.0,
- switch_prob=0.5,
- mode='batch',
- label_smoothing=0.1,
- num_classes=num_classes)
- self.mixup = Mixup(**mixup_cfg)
- head.loss_config = {'type': 'SoftTargetCrossEntropy'}
+ if mixup_cfg is not None:
+ if 'num_classes' in mixup_cfg:
+ self.mixup = Mixup(**mixup_cfg)
+ elif 'num_classes' in head or 'num_classes' in backbone:
+ num_classes = head.get(
+ 'num_classes'
+ ) if 'num_classes' in head else backbone.get('num_classes')
+ mixup_cfg['num_classes'] = num_classes
+ self.mixup = Mixup(**mixup_cfg)
train_preprocess.remove('mixUp')
self.train_preprocess = [
self.preprocess_key_map[i] for i in train_preprocess
@@ -173,7 +166,10 @@ class Classification(BaseModel):
for preprocess in self.train_preprocess:
img = preprocess(img)
- if hasattr(self, 'mixup'):
+ # When the number of samples in the dataset is odd, the last batch size of each epoch will be odd,
+ # which will cause mixup to report an error. To avoid this situation, mixup is applied only when
+ # the batch size is even.
+ if hasattr(self, 'mixup') and len(img) % 2 == 0:
img, gt_labels = self.mixup(img, gt_labels)
x = self.forward_backbone(img)
diff --git a/easycv/models/heads/cls_head.py b/easycv/models/heads/cls_head.py
index 4b4654d6..ff3724d7 100644
--- a/easycv/models/heads/cls_head.py
+++ b/easycv/models/heads/cls_head.py
@@ -28,7 +28,8 @@ class ClsHead(nn.Module):
},
input_feature_index=[0],
init_cfg=dict(
- type='Normal', layer='Linear', std=0.01, bias=0.)):
+ type='Normal', layer='Linear', std=0.01, bias=0.),
+ use_num_classes=True):
super(ClsHead, self).__init__()
self.with_avg_pool = with_avg_pool
@@ -46,7 +47,8 @@ class ClsHead(nn.Module):
'label_smooth must be given as a float number in [0,1]'
logger.info(f'=> Augment: using label smooth={self.label_smooth}')
loss_config['label_smooth'] = label_smooth
- loss_config['num_classes'] = num_classes
+ if use_num_classes:
+ loss_config['num_classes'] = num_classes
self.criterion = build_from_cfg(loss_config, LOSSES)
diff --git a/easycv/models/loss/cross_entropy_loss.py b/easycv/models/loss/cross_entropy_loss.py
index 69e1f615..0f9d5074 100644
--- a/easycv/models/loss/cross_entropy_loss.py
+++ b/easycv/models/loss/cross_entropy_loss.py
@@ -115,6 +115,7 @@ def binary_cross_entropy(pred,
class_weight=None,
ignore_index=-100,
avg_non_ignore=False,
+ label_ceil=False,
**kwargs):
"""Calculate the binary CrossEntropy loss.
@@ -132,11 +133,14 @@ def binary_cross_entropy(pred,
avg_non_ignore (bool): The flag decides to whether the loss is
only averaged over non-ignored targets. Default: False.
`New in version 0.23.0.`
+        label_ceil (bool): When using BCE with label_ceil=True, label elements
+            in (0, 1] are rounded up to 1.
+            Default: False.
Returns:
torch.Tensor: The calculated loss
"""
- if len(pred.shape) > 1 and pred.shape(1) == 1:
+ if len(pred.shape) > 1 and pred.shape[1] == 1:
# For binary class segmentation, the shape of pred is
# [N, 1, H, W] and that of label is [N, H, W].
# As the ignore_index often set as 255, so the
@@ -162,6 +166,8 @@ def binary_cross_entropy(pred,
weight = weight * valid_mask
else:
weight = valid_mask
+ if label_ceil:
+ label = label.gt(0.0).type(label.dtype)
# average loss over non-ignored and valid elements
if reduction == 'mean' and avg_factor is None and avg_non_ignore:
avg_factor = valid_mask.sum().item()
@@ -234,6 +240,9 @@ class CrossEntropyLoss(nn.Module):
avg_non_ignore (bool): The flag decides to whether the loss is
only averaged over non-ignored targets. Default: False.
`New in version 0.23.0.`
+        label_ceil (bool): When using BCE with label_ceil=True, label elements
+            in (0, 1] are rounded up to 1.
+            Default: False.
"""
def __init__(self,
@@ -243,10 +252,16 @@ class CrossEntropyLoss(nn.Module):
class_weight=None,
loss_weight=1.0,
loss_name='loss_ce',
- avg_non_ignore=False):
+ avg_non_ignore=False,
+ label_ceil=False):
super(CrossEntropyLoss, self).__init__()
assert (use_sigmoid is False) or (use_mask is False)
self.use_sigmoid = use_sigmoid
+ if label_ceil:
+ if not use_sigmoid:
+ raise ValueError(
+                    "'label_ceil' is supported only when 'use_sigmoid' is True. If BCE is not used, please set label_ceil=False."
+ )
self.use_mask = use_mask
self.reduction = reduction
self.loss_weight = loss_weight
@@ -266,6 +281,7 @@ class CrossEntropyLoss(nn.Module):
else:
self.cls_criterion = cross_entropy
self._loss_name = loss_name
+ self.label_ceil = label_ceil
def extra_repr(self):
"""Extra repr."""
@@ -289,16 +305,29 @@ class CrossEntropyLoss(nn.Module):
else:
class_weight = None
# Note: for BCE loss, label < 0 is invalid.
- loss_cls = self.loss_weight * self.cls_criterion(
- cls_score,
- label,
- weight,
- class_weight=class_weight,
- reduction=reduction,
- avg_factor=avg_factor,
- avg_non_ignore=self.avg_non_ignore,
- ignore_index=ignore_index,
- **kwargs)
+ if self.use_sigmoid:
+ loss_cls = self.loss_weight * self.cls_criterion(
+ cls_score,
+ label,
+ weight,
+ class_weight=class_weight,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ avg_non_ignore=self.avg_non_ignore,
+ ignore_index=ignore_index,
+ label_ceil=self.label_ceil,
+ **kwargs)
+ else:
+ loss_cls = self.loss_weight * self.cls_criterion(
+ cls_score,
+ label,
+ weight,
+ class_weight=class_weight,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ avg_non_ignore=self.avg_non_ignore,
+ ignore_index=ignore_index,
+ **kwargs)
return loss_cls
@property
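Note: a minimal sketch of the new label_ceil option: with use_sigmoid=True (BCE)
and soft mixup-style targets in (0, 1], non-zero targets are rounded up to 1
before the loss is computed (the tensors are only for illustration):

    import torch
    from easycv.models.loss.cross_entropy_loss import CrossEntropyLoss

    criterion = CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0, label_ceil=True)
    cls_score = torch.randn(4, 1000)    # logits
    label = torch.zeros(4, 1000)
    label[:, 0] = 0.8                   # soft (mixup-style) targets
    loss = criterion(cls_score, label)  # targets > 0 are treated as 1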
diff --git a/tests/models/backbones/test_deitiii.py b/tests/models/backbones/test_deitiii.py
new file mode 100644
index 00000000..f95f6ea5
--- /dev/null
+++ b/tests/models/backbones/test_deitiii.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import numpy as np
+import torch
+from numpy.testing import assert_array_almost_equal
+
+
+class DeiTIIITest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+ @unittest.skip('skip DeiT III unittest')
+ def test_deitiii(self):
+ model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/deitiii/epoch_800.pth'
+ config_path = 'configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py'
+ img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/deitiii_demo.JPEG'
+ # deitiii = ClsPredictor(model_path, config_path)
+ deitiii = []
+ output = deitiii.predict(img)
+
+ self.assertIn('prob', output)
+ self.assertIn('class', output)
+ self.assertEqual(len(output['prob'][0]), 1000)
+
+ assert_array_almost_equal(
+ output['prob'][0][:10],
+ torch.Tensor([
+ 2.04629918698628899e-06, 5.27398606209317222e-06,
+ 5.52915162188583054e-06, 3.60625563189387321e-06,
+ 3.29447357216849923e-06, 5.61309570912271738e-06,
+ 8.93703327164985240e-06, 4.89157764604897238e-06,
+ 4.39371024185675196e-06, 5.21611764270346612e-06
+ ]),
+ decimal=8)
+
+ self.assertEqual(int(output['class']), 948)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tools/train.py b/tools/train.py
index 2241e760..96f93db8 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -273,8 +273,9 @@ def main():
drop_last=getattr(cfg.data, 'drop_last', False),
reuse_worker_cache=cfg.data.get('reuse_worker_cache', False),
persistent_workers=cfg.data.get('persistent_workers', False),
- collate_hooks=cfg.data.get('train_collate_hooks', []))
- for ds in datasets
+ collate_hooks=cfg.data.get('train_collate_hooks', []),
+ use_repeated_augment_sampler=cfg.data.get(
+ 'use_repeated_augment_sampler', False)) for ds in datasets
]
else:
default_args = dict(
From 9f01a37ad4df57b30430c41df08459025174e8fd Mon Sep 17 00:00:00 2001
From: tuofeilun <38110862+tuofeilunhifi@users.noreply.github.com>
Date: Fri, 16 Sep 2022 11:03:53 +0800
Subject: [PATCH 4/9] Refactor ViTDet backbone and simple feature pyramid
(#177)
1. The ViTDet backbone implemented following detectron2 (d2) is about 20% faster than the ViTDet backbone originally reproduced in EasyCV.
2. 50.57 -> 50.65
---
.../detection/vitdet/lsj_coco_detection.py | 6 +-
configs/detection/vitdet/lsj_coco_instance.py | 6 +-
.../vitdet/vitdet_basicblock_100e.py | 3 -
.../vitdet/vitdet_bottleneck_100e.py | 3 -
.../vitdet/vitdet_cascade_mask_rcnn.py | 231 ++++
.../vitdet/vitdet_cascade_mask_rcnn_100e.py | 4 +
.../detection/vitdet/vitdet_faster_rcnn.py | 31 +-
.../vitdet/vitdet_faster_rcnn_100e.py | 2 +-
configs/detection/vitdet/vitdet_mask_rcnn.py | 31 +-
...itdet_100e.py => vitdet_mask_rcnn_100e.py} | 0
.../detection/vitdet/vitdet_schedule_100e.py | 21 +-
docs/source/_static/result.jpg | 4 +-
docs/source/model_zoo_det.md | 2 +-
.../layer_decay_optimizer_constructor.py | 78 +-
easycv/models/backbones/vitdet.py | 1057 ++++++-----------
easycv/models/detection/necks/fpn.py | 3 -
easycv/models/detection/necks/sfp.py | 216 +---
easycv/predictors/detector.py | 10 +-
tests/models/backbones/test_vitdet.py | 23 +-
tests/predictors/test_detector.py | 189 ++-
20 files changed, 925 insertions(+), 995 deletions(-)
delete mode 100644 configs/detection/vitdet/vitdet_basicblock_100e.py
delete mode 100644 configs/detection/vitdet/vitdet_bottleneck_100e.py
create mode 100644 configs/detection/vitdet/vitdet_cascade_mask_rcnn.py
create mode 100644 configs/detection/vitdet/vitdet_cascade_mask_rcnn_100e.py
rename configs/detection/vitdet/{vitdet_100e.py => vitdet_mask_rcnn_100e.py} (100%)
diff --git a/configs/detection/vitdet/lsj_coco_detection.py b/configs/detection/vitdet/lsj_coco_detection.py
index f5da1064..fb243a23 100644
--- a/configs/detection/vitdet/lsj_coco_detection.py
+++ b/configs/detection/vitdet/lsj_coco_detection.py
@@ -101,13 +101,15 @@ val_dataset = dict(
pipeline=test_pipeline)
data = dict(
- imgs_per_gpu=1, workers_per_gpu=2, train=train_dataset, val=val_dataset)
+ imgs_per_gpu=4, workers_per_gpu=2, train=train_dataset, val=val_dataset
+) # 64(total batch size) = 4 (batch size/per gpu) x 8 (gpu num) x 2(node)
# evaluation
-eval_config = dict(interval=1, gpu_collect=False)
+eval_config = dict(initial=False, interval=1, gpu_collect=False)
eval_pipelines = [
dict(
mode='test',
+ # dist_eval=True,
evaluators=[
dict(type='CocoDetectionEvaluator', classes=CLASSES),
],
diff --git a/configs/detection/vitdet/lsj_coco_instance.py b/configs/detection/vitdet/lsj_coco_instance.py
index a42aa040..5271363f 100644
--- a/configs/detection/vitdet/lsj_coco_instance.py
+++ b/configs/detection/vitdet/lsj_coco_instance.py
@@ -101,13 +101,15 @@ val_dataset = dict(
pipeline=test_pipeline)
data = dict(
- imgs_per_gpu=1, workers_per_gpu=2, train=train_dataset, val=val_dataset)
+ imgs_per_gpu=4, workers_per_gpu=2, train=train_dataset, val=val_dataset
+) # 64(total batch size) = 4 (batch size/per gpu) x 8 (gpu num) x 2(node)
# evaluation
-eval_config = dict(interval=1, gpu_collect=False)
+eval_config = dict(initial=False, interval=1, gpu_collect=False)
eval_pipelines = [
dict(
mode='test',
+ # dist_eval=True,
evaluators=[
dict(type='CocoDetectionEvaluator', classes=CLASSES),
dict(type='CocoMaskEvaluator', classes=CLASSES)
diff --git a/configs/detection/vitdet/vitdet_basicblock_100e.py b/configs/detection/vitdet/vitdet_basicblock_100e.py
deleted file mode 100644
index a3ea54e7..00000000
--- a/configs/detection/vitdet/vitdet_basicblock_100e.py
+++ /dev/null
@@ -1,3 +0,0 @@
-_base_ = './vitdet_100e.py'
-
-model = dict(backbone=dict(aggregation='basicblock'))
diff --git a/configs/detection/vitdet/vitdet_bottleneck_100e.py b/configs/detection/vitdet/vitdet_bottleneck_100e.py
deleted file mode 100644
index a6031797..00000000
--- a/configs/detection/vitdet/vitdet_bottleneck_100e.py
+++ /dev/null
@@ -1,3 +0,0 @@
-_base_ = './vitdet_100e.py'
-
-model = dict(backbone=dict(aggregation='bottleneck'))
diff --git a/configs/detection/vitdet/vitdet_cascade_mask_rcnn.py b/configs/detection/vitdet/vitdet_cascade_mask_rcnn.py
new file mode 100644
index 00000000..dfe0d68d
--- /dev/null
+++ b/configs/detection/vitdet/vitdet_cascade_mask_rcnn.py
@@ -0,0 +1,231 @@
+# model settings
+
+norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True)
+
+pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth'
+model = dict(
+ type='CascadeRCNN',
+ pretrained=pretrained,
+ backbone=dict(
+ type='ViTDet',
+ img_size=1024,
+ patch_size=16,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ drop_path_rate=0.1,
+ window_size=14,
+ mlp_ratio=4,
+ qkv_bias=True,
+ window_block_indexes=[
+            # 2, 5, 8, 11 for global attention
+ 0,
+ 1,
+ 3,
+ 4,
+ 6,
+ 7,
+ 9,
+ 10,
+ ],
+ residual_block_indexes=[],
+ use_rel_pos=True),
+ neck=dict(
+ type='SFP',
+ in_channels=768,
+ out_channels=256,
+ scale_factors=(4.0, 2.0, 1.0, 0.5),
+ norm_cfg=norm_cfg,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ num_convs=2,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+ roi_head=dict(
+ type='CascadeRoIHead',
+ num_stages=3,
+ stage_loss_weights=[1, 0.5, 0.25],
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=[
+ dict(
+ type='Shared4Conv1FCBBoxHead',
+ conv_out_channels=256,
+ norm_cfg=norm_cfg,
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared4Conv1FCBBoxHead',
+ conv_out_channels=256,
+ norm_cfg=norm_cfg,
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared4Conv1FCBBoxHead',
+ conv_out_channels=256,
+ norm_cfg=norm_cfg,
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+ ],
+ mask_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ mask_head=dict(
+ type='FCNMaskHead',
+ norm_cfg=norm_cfg,
+ num_convs=4,
+ in_channels=256,
+ conv_out_channels=256,
+ num_classes=80,
+ loss_mask=dict(
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=2000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=[
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.6,
+ min_pos_iou=0.6,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.7,
+ min_pos_iou=0.7,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)
+ ]),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100,
+ mask_thr_binary=0.5)))
+
+mmlab_modules = [
+ dict(type='mmdet', name='CascadeRCNN', module='model'),
+ dict(type='mmdet', name='RPNHead', module='head'),
+ dict(type='mmdet', name='CascadeRoIHead', module='head'),
+]
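For reference, a minimal sketch of the cascade schedule encoded in this new config: each refinement stage re-assigns proposals with a stricter IoU threshold and a smaller loss weight. It assumes mmcv is installed and the repository root is the working directory:

    from mmcv import Config

    cfg = Config.fromfile('configs/detection/vitdet/vitdet_cascade_mask_rcnn.py')
    for i, stage_cfg in enumerate(cfg.model.train_cfg.rcnn):
        print('stage %d: pos_iou_thr=%.1f, loss_weight=%.2f' %
              (i, stage_cfg.assigner.pos_iou_thr,
               cfg.model.roi_head.stage_loss_weights[i]))
    # stage 0: pos_iou_thr=0.5, loss_weight=1.00
    # stage 1: pos_iou_thr=0.6, loss_weight=0.50
    # stage 2: pos_iou_thr=0.7, loss_weight=0.25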
diff --git a/configs/detection/vitdet/vitdet_cascade_mask_rcnn_100e.py b/configs/detection/vitdet/vitdet_cascade_mask_rcnn_100e.py
new file mode 100644
index 00000000..bbbc339f
--- /dev/null
+++ b/configs/detection/vitdet/vitdet_cascade_mask_rcnn_100e.py
@@ -0,0 +1,4 @@
+_base_ = [
+ './vitdet_cascade_mask_rcnn.py', './lsj_coco_instance.py',
+ './vitdet_schedule_100e.py'
+]
diff --git a/configs/detection/vitdet/vitdet_faster_rcnn.py b/configs/detection/vitdet/vitdet_faster_rcnn.py
index 48604d8b..0a00b397 100644
--- a/configs/detection/vitdet/vitdet_faster_rcnn.py
+++ b/configs/detection/vitdet/vitdet_faster_rcnn.py
@@ -1,6 +1,6 @@
# model settings
-norm_cfg = dict(type='GN', num_groups=1, requires_grad=True)
+norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True)
pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth'
model = dict(
@@ -9,22 +9,32 @@ model = dict(
backbone=dict(
type='ViTDet',
img_size=1024,
+ patch_size=16,
embed_dim=768,
depth=12,
num_heads=12,
+ drop_path_rate=0.1,
+ window_size=14,
mlp_ratio=4,
qkv_bias=True,
- qk_scale=None,
- drop_rate=0.,
- attn_drop_rate=0.,
- drop_path_rate=0.1,
- use_abs_pos_emb=True,
- aggregation='attn',
- ),
+ window_block_indexes=[
+            # 2, 5, 8, 11 for global attention
+ 0,
+ 1,
+ 3,
+ 4,
+ 6,
+ 7,
+ 9,
+ 10,
+ ],
+ residual_block_indexes=[],
+ use_rel_pos=True),
neck=dict(
type='SFP',
- in_channels=[768, 768, 768, 768],
+ in_channels=768,
out_channels=256,
+ scale_factors=(4.0, 2.0, 1.0, 0.5),
norm_cfg=norm_cfg,
num_outs=5),
rpn_head=dict(
@@ -32,7 +42,6 @@ model = dict(
in_channels=256,
feat_channels=256,
num_convs=2,
- norm_cfg=norm_cfg,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
@@ -98,7 +107,7 @@ model = dict(
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
- match_low_quality=True,
+ match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
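With this change the neck receives a single stride-16 map (in_channels=768) and builds the pyramid itself from scale_factors, matching the SFP refactor later in this patch. A shape-only sketch of what each stage produces for a 1024x1024 input (assumes torch, mmcv and an installed easycv are available):

    import torch
    from easycv.models.detection.necks.sfp import SFP

    neck = SFP(
        in_channels=768,
        out_channels=256,
        scale_factors=(4.0, 2.0, 1.0, 0.5),
        num_outs=5,
        norm_cfg=dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True))
    x = torch.randn(1, 768, 64, 64)  # ViTDet output: 1024 / patch_size 16 = 64
    for scale, stage in zip(neck.scale_factors, neck.stages):
        print(scale, tuple(stage(x).shape))
    # 4.0 -> (1, 256, 256, 256)  stride 4
    # 2.0 -> (1, 256, 128, 128)  stride 8
    # 1.0 -> (1, 256, 64, 64)    stride 16
    # 0.5 -> (1, 256, 32, 32)    stride 32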
diff --git a/configs/detection/vitdet/vitdet_faster_rcnn_100e.py b/configs/detection/vitdet/vitdet_faster_rcnn_100e.py
index 5a43b575..bfeab9d1 100644
--- a/configs/detection/vitdet/vitdet_faster_rcnn_100e.py
+++ b/configs/detection/vitdet/vitdet_faster_rcnn_100e.py
@@ -1,4 +1,4 @@
_base_ = [
- './vitdet_faster_rcnn.py', './lsj_coco_detection.py',
+ './vitdet_faster_rcnn.py', './lsj_coco_instance.py',
'./vitdet_schedule_100e.py'
]
diff --git a/configs/detection/vitdet/vitdet_mask_rcnn.py b/configs/detection/vitdet/vitdet_mask_rcnn.py
index 890f6e8f..6b1ed1ce 100644
--- a/configs/detection/vitdet/vitdet_mask_rcnn.py
+++ b/configs/detection/vitdet/vitdet_mask_rcnn.py
@@ -1,6 +1,6 @@
# model settings
-norm_cfg = dict(type='GN', num_groups=1, requires_grad=True)
+norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True)
pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth'
model = dict(
@@ -9,22 +9,32 @@ model = dict(
backbone=dict(
type='ViTDet',
img_size=1024,
+ patch_size=16,
embed_dim=768,
depth=12,
num_heads=12,
+ drop_path_rate=0.1,
+ window_size=14,
mlp_ratio=4,
qkv_bias=True,
- qk_scale=None,
- drop_rate=0.,
- attn_drop_rate=0.,
- drop_path_rate=0.1,
- use_abs_pos_emb=True,
- aggregation='attn',
- ),
+ window_block_indexes=[
+            # 2, 5, 8, 11 for global attention
+ 0,
+ 1,
+ 3,
+ 4,
+ 6,
+ 7,
+ 9,
+ 10,
+ ],
+ residual_block_indexes=[],
+ use_rel_pos=True),
neck=dict(
type='SFP',
- in_channels=[768, 768, 768, 768],
+ in_channels=768,
out_channels=256,
+ scale_factors=(4.0, 2.0, 1.0, 0.5),
norm_cfg=norm_cfg,
num_outs=5),
rpn_head=dict(
@@ -32,7 +42,6 @@ model = dict(
in_channels=256,
feat_channels=256,
num_convs=2,
- norm_cfg=norm_cfg,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
@@ -112,7 +121,7 @@ model = dict(
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
- match_low_quality=True,
+ match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
diff --git a/configs/detection/vitdet/vitdet_100e.py b/configs/detection/vitdet/vitdet_mask_rcnn_100e.py
similarity index 100%
rename from configs/detection/vitdet/vitdet_100e.py
rename to configs/detection/vitdet/vitdet_mask_rcnn_100e.py
diff --git a/configs/detection/vitdet/vitdet_schedule_100e.py b/configs/detection/vitdet/vitdet_schedule_100e.py
index e659b1f6..a9160eba 100644
--- a/configs/detection/vitdet/vitdet_schedule_100e.py
+++ b/configs/detection/vitdet/vitdet_schedule_100e.py
@@ -1,26 +1,29 @@
_base_ = 'configs/base.py'
+log_config = dict(
+ interval=200,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
checkpoint_config = dict(interval=10)
+
# optimizer
-paramwise_options = {
- 'norm': dict(weight_decay=0.),
- 'bias': dict(weight_decay=0.),
- 'pos_embed': dict(weight_decay=0.),
- 'cls_token': dict(weight_decay=0.)
-}
optimizer = dict(
type='AdamW',
lr=1e-4,
betas=(0.9, 0.999),
weight_decay=0.1,
- paramwise_options=paramwise_options)
-optimizer_config = dict(grad_clip=None, loss_scale=512.)
+ constructor='LayerDecayOptimizerConstructor',
+ paramwise_options=dict(num_layers=12, layer_decay_rate=0.7))
+optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=250,
- warmup_ratio=0.067,
+ warmup_ratio=0.001,
step=[88, 96])
total_epochs = 100
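The optimizer now delegates per-parameter lr scaling to LayerDecayOptimizerConstructor (see the change to layer_decay_optimizer_constructor.py below). For the epoch-level lr itself, a minimal sketch of the step policy above; gamma=0.1 is mmcv's default for the 'step' policy and is an assumption here, since this config does not set it:

    base_lr, gamma, steps = 1e-4, 0.1, [88, 96]

    def lr_at_epoch(epoch):
        # each milestone already passed multiplies the lr by gamma once
        return base_lr * gamma ** sum(epoch >= s for s in steps)

    print(lr_at_epoch(0), lr_at_epoch(90), lr_at_epoch(99))
    # -> 1e-04, 1e-05, 1e-06 (up to float rounding); warmup covers the first 250 iters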
diff --git a/docs/source/_static/result.jpg b/docs/source/_static/result.jpg
index 5bb73d81..d63bad1d 100644
--- a/docs/source/_static/result.jpg
+++ b/docs/source/_static/result.jpg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:ee64c0caef841c61c7e6344b7fe2c07a38fba07a8de81ff38c0686c641e0a283
-size 190356
+oid sha256:c696a58a2963b5ac47317751f04ff45bfed4723f2f70bacf91eac711f9710e54
+size 189432
diff --git a/docs/source/model_zoo_det.md b/docs/source/model_zoo_det.md
index 03eb3588..474496f0 100644
--- a/docs/source/model_zoo_det.md
+++ b/docs/source/model_zoo_det.md
@@ -22,7 +22,7 @@ Pretrained on COCO2017 dataset. (The result has been optimized with PAI-Blade, a
 | Algorithm | Config | Params (backbone/total) | inference time(V100) (ms/img) | bbox_mAP val 0.5:0.95 | mask_mAP val 0.5:0.95 | Download |
| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_100e.py) | 88M/118M | 163ms | 50.57 | 44.96 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.log.json) |
+| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_mask_rcnn_100e.py) | 86M/111M | 138ms | 50.65 | 45.41 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/20220901_135827.log.json) |
## FCOS
diff --git a/easycv/core/optimizer/layer_decay_optimizer_constructor.py b/easycv/core/optimizer/layer_decay_optimizer_constructor.py
index 45625494..310bb38c 100644
--- a/easycv/core/optimizer/layer_decay_optimizer_constructor.py
+++ b/easycv/core/optimizer/layer_decay_optimizer_constructor.py
@@ -1,5 +1,3 @@
-# Reference from https://github.com/ViTAE-Transformer/ViTDet/blob/main/mmcv_custom/layer_decay_optimizer_constructor.py
-
import json
from mmcv.runner import DefaultOptimizerConstructor, get_dist_info
@@ -7,23 +5,32 @@ from mmcv.runner import DefaultOptimizerConstructor, get_dist_info
from .builder import OPTIMIZER_BUILDERS
-def get_num_layer_for_vit(var_name, num_max_layer, layer_sep=None):
- if var_name in ('backbone.cls_token', 'backbone.mask_token',
- 'backbone.pos_embed'):
- return 0
- elif var_name.startswith('backbone.patch_embed'):
- return 0
- elif var_name.startswith('backbone.blocks'):
- layer_id = int(var_name.split('.')[2])
- return layer_id + 1
- else:
- return num_max_layer - 1
+def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
+ """
+ Calculate lr decay rate for different ViT blocks.
+ Reference from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py
+ Args:
+ name (string): parameter name.
+ lr_decay_rate (float): base lr decay rate.
+ num_layers (int): number of ViT blocks.
+ Returns:
+        the layer id and lr decay scale for the given parameter.
+ """
+ layer_id = num_layers + 1
+ if '.pos_embed' in name or '.patch_embed' in name:
+ layer_id = 0
+ elif '.blocks.' in name and '.residual.' not in name:
+ layer_id = int(name[name.find('.blocks.'):].split('.')[2]) + 1
+
+ scale = lr_decay_rate**(num_layers + 1 - layer_id)
+
+ return layer_id, scale
@OPTIMIZER_BUILDERS.register_module()
class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
- def add_params(self, params, module, prefix='', is_dcn_module=None):
+ def add_params(self, params, module):
"""Add all parameters of module to the params list.
The parameters of the given module will be added to the list of param
groups, with specific rules defined by paramwise_cfg.
@@ -31,54 +38,41 @@ class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
params (list[dict]): A list of param groups, it will be modified
in place.
module (nn.Module): The module to be added.
- prefix (str): The prefix of the module
- is_dcn_module (int|float|None): If the current module is a
- submodule of DCN, `is_dcn_module` will be passed to
- control conv_offset layer's learning rate. Defaults to None.
+
+ Reference from https://github.com/ViTAE-Transformer/ViTDet/blob/main/mmcv_custom/layer_decay_optimizer_constructor.py
+ Note: Currently, this optimizer constructor is built for ViTDet.
"""
- # get param-wise options
parameter_groups = {}
print(self.paramwise_cfg)
- num_layers = self.paramwise_cfg.get('num_layers') + 2
- layer_sep = self.paramwise_cfg.get('layer_sep', None)
- layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
+ lr_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
+ num_layers = self.paramwise_cfg.get('num_layers')
print('Build LayerDecayOptimizerConstructor %f - %d' %
- (layer_decay_rate, num_layers))
+ (lr_decay_rate, num_layers))
+ lr = self.base_lr
weight_decay = self.base_wd
- custom_keys = self.paramwise_cfg.get('custom_keys', {})
- # first sort with alphabet order and then sort with reversed len of str
- sorted_keys = sorted(custom_keys.keys())
-
for name, param in module.named_parameters():
if not param.requires_grad:
continue # frozen weights
- if len(param.shape) == 1 or name.endswith('.bias') or (
- 'pos_embed' in name) or ('cls_token'
- in name) or ('rel_pos_' in name):
+ if 'backbone' in name and ('.norm' in name or '.pos_embed' in name
+ or '.gn.' in name or '.ln.' in name):
group_name = 'no_decay'
this_weight_decay = 0.
else:
group_name = 'decay'
this_weight_decay = weight_decay
- layer_id = get_num_layer_for_vit(name, num_layers, layer_sep)
+ if name.startswith('backbone'):
+ layer_id, scale = get_vit_lr_decay_rate(
+ name, lr_decay_rate=lr_decay_rate, num_layers=num_layers)
+ else:
+ layer_id, scale = -1, 1
group_name = 'layer_%d_%s' % (layer_id, group_name)
- # if the parameter match one of the custom keys, ignore other rules
- this_lr_multi = 1.
- for key in sorted_keys:
- if key in f'{name}':
- lr_mult = custom_keys[key].get('lr_mult', 1.)
- this_lr_multi = lr_mult
- group_name = '%s_%s' % (group_name, key)
- break
-
if group_name not in parameter_groups:
- scale = layer_decay_rate**(num_layers - layer_id - 1)
parameter_groups[group_name] = {
'weight_decay': this_weight_decay,
@@ -86,7 +80,7 @@ class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
'param_names': [],
'lr_scale': scale,
'group_name': group_name,
- 'lr': scale * self.base_lr * this_lr_multi,
+ 'lr': scale * lr,
}
parameter_groups[group_name]['params'].append(param)
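A worked sketch of the new decay rule with the values wired up in vitdet_schedule_100e.py (layer_decay_rate=0.7, num_layers=12): pos_embed/patch_embed land in layer 0, block i in layer i+1, and everything else keeps lr_scale 1.0 (add_params only calls the helper for names starting with 'backbone'; other parameters get scale 1.0 directly):

    from easycv.core.optimizer.layer_decay_optimizer_constructor import \
        get_vit_lr_decay_rate

    for name in ['backbone.pos_embed',
                 'backbone.blocks.0.attn.qkv.weight',
                 'backbone.blocks.11.mlp.fc2.weight',
                 'roi_head.bbox_head.fc_cls.weight']:
        layer_id, scale = get_vit_lr_decay_rate(
            name, lr_decay_rate=0.7, num_layers=12)
        print(name, layer_id, round(scale, 4))
    # backbone.pos_embed                 -> layer 0,  lr_scale 0.0097
    # backbone.blocks.0.attn.qkv.weight  -> layer 1,  lr_scale 0.0138
    # backbone.blocks.11.mlp.fc2.weight  -> layer 12, lr_scale 0.7
    # roi_head.bbox_head.fc_cls.weight   -> layer 13, lr_scale 1.0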
diff --git a/easycv/models/backbones/vitdet.py b/easycv/models/backbones/vitdet.py
index 83e11efa..9380f740 100644
--- a/easycv/models/backbones/vitdet.py
+++ b/easycv/models/backbones/vitdet.py
@@ -1,5 +1,3 @@
-# Copyright 2018-2023 OpenMMLab. All rights reserved.
-# Reference: https://github.com/ViTAE-Transformer/ViTDet/blob/main/mmdet/models/backbones/vit.py
import math
from functools import partial
@@ -7,793 +5,466 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
-from mmcv.cnn import build_norm_layer, constant_init, kaiming_init
-from mmcv.runner import get_dist_info
-from timm.models.layers import to_2tuple, trunc_normal_
-from torch.nn.modules.batchnorm import _BatchNorm
+from timm.models.layers import DropPath, trunc_normal_
-from easycv.models.utils import DropPath, Mlp
+from easycv.models.utils import Mlp
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.logger import get_root_logger
from ..registry import BACKBONES
-from ..utils import build_conv_layer
-
-
-class BasicBlock(nn.Module):
- expansion = 1
-
- def __init__(self,
- inplanes,
- planes,
- stride=1,
- dilation=1,
- conv_cfg=None,
- norm_cfg=dict(type='BN')):
- super(BasicBlock, self).__init__()
-
- self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
- self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
-
- self.conv1 = build_conv_layer(
- conv_cfg,
- inplanes,
- planes,
- 3,
- stride=stride,
- padding=dilation,
- dilation=dilation,
- bias=False)
- self.add_module(self.norm1_name, norm1)
- self.conv2 = build_conv_layer(
- conv_cfg, planes, planes, 3, padding=1, bias=False)
- self.add_module(self.norm2_name, norm2)
-
- self.relu = nn.ReLU(inplace=True)
- self.stride = stride
- self.dilation = dilation
-
- @property
- def norm1(self):
- return getattr(self, self.norm1_name)
-
- @property
- def norm2(self):
- return getattr(self, self.norm2_name)
-
- def forward(self, x, H, W):
- B, _, C = x.shape
- x = x.permute(0, 2, 1).reshape(B, -1, H, W)
- identity = x
-
- out = self.conv1(x)
- out = self.norm1(out)
- out = self.relu(out)
-
- out = self.conv2(out)
- out = self.norm2(out)
-
- out += identity
- out = self.relu(out)
- out = out.flatten(2).transpose(1, 2)
- return out
-
-
-class Bottleneck(nn.Module):
- expansion = 4
-
- def __init__(self,
- inplanes,
- planes,
- stride=1,
- dilation=1,
- conv_cfg=None,
- norm_cfg=dict(type='BN')):
- """Bottleneck block for ResNet.
- If style is "pytorch", the stride-two layer is the 3x3 conv layer,
- if it is "caffe", the stride-two layer is the first 1x1 conv layer.
- """
- super(Bottleneck, self).__init__()
-
- self.inplanes = inplanes
- self.planes = planes
- self.stride = stride
- self.dilation = dilation
- self.conv_cfg = conv_cfg
- self.norm_cfg = norm_cfg
-
- self.conv1_stride = 1
- self.conv2_stride = stride
-
- self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
- self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
- self.norm3_name, norm3 = build_norm_layer(
- norm_cfg, planes * self.expansion, postfix=3)
-
- self.conv1 = build_conv_layer(
- conv_cfg,
- inplanes,
- planes,
- kernel_size=1,
- stride=self.conv1_stride,
- bias=False)
- self.add_module(self.norm1_name, norm1)
- self.conv2 = build_conv_layer(
- conv_cfg,
- planes,
- planes,
- kernel_size=3,
- stride=self.conv2_stride,
- padding=dilation,
- dilation=dilation,
- bias=False)
- self.add_module(self.norm2_name, norm2)
- self.conv3 = build_conv_layer(
- conv_cfg,
- planes,
- planes * self.expansion,
- kernel_size=1,
- bias=False)
- self.add_module(self.norm3_name, norm3)
-
- self.relu = nn.ReLU(inplace=True)
-
- @property
- def norm1(self):
- return getattr(self, self.norm1_name)
-
- @property
- def norm2(self):
- return getattr(self, self.norm2_name)
-
- @property
- def norm3(self):
- return getattr(self, self.norm3_name)
-
- def forward(self, x, H, W):
- B, _, C = x.shape
- x = x.permute(0, 2, 1).reshape(B, -1, H, W)
- identity = x
-
- out = self.conv1(x)
- out = self.norm1(out)
- out = self.relu(out)
-
- out = self.conv2(out)
- out = self.norm2(out)
- out = self.relu(out)
-
- out = self.conv3(out)
- out = self.norm3(out)
-
- out += identity
- out = self.relu(out)
- out = out.flatten(2).transpose(1, 2)
- return out
-
-
-class Attention(nn.Module):
-
- def __init__(self,
- dim,
- num_heads=8,
- qkv_bias=False,
- qk_scale=None,
- attn_drop=0.,
- proj_drop=0.,
- window_size=None,
- attn_head_dim=None):
- super().__init__()
- self.num_heads = num_heads
- head_dim = dim // num_heads
- if attn_head_dim is not None:
- head_dim = attn_head_dim
- all_head_dim = head_dim * self.num_heads
- # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
- self.scale = qk_scale or head_dim**-0.5
-
- self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
- self.window_size = window_size
- q_size = window_size[0]
- kv_size = q_size
- rel_sp_dim = 2 * q_size - 1
- self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, head_dim))
- self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, head_dim))
-
- self.attn_drop = nn.Dropout(attn_drop)
- self.proj = nn.Linear(all_head_dim, dim)
- self.proj_drop = nn.Dropout(proj_drop)
-
- def forward(self, x, H, W, rel_pos_bias=None):
- B, N, C = x.shape
- # qkv_bias = None
- # if self.q_bias is not None:
- # qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
- # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
- qkv = self.qkv(x)
- qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
- q, k, v = qkv[0], qkv[1], qkv[
- 2] # make torchscript happy (cannot use tensor as tuple)
-
- q = q * self.scale
- attn = (q @ k.transpose(-2, -1))
- attn = calc_rel_pos_spatial(attn, q, self.window_size,
- self.window_size, self.rel_pos_h,
- self.rel_pos_w)
- # if self.relative_position_bias_table is not None:
- # relative_position_bias = \
- # self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
- # self.window_size[0] * self.window_size[1] + 1,
- # self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
- # relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
- # attn = attn + relative_position_bias.unsqueeze(0)
-
- # if rel_pos_bias is not None:
- # attn = attn + rel_pos_bias
-
- attn = attn.softmax(dim=-1)
- attn = self.attn_drop(attn)
-
- x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
- x = self.proj(x)
- x = self.proj_drop(x)
- return x
def window_partition(x, window_size):
"""
+ Partition into non-overlapping windows with padding if needed.
Args:
- x: (B, H, W, C)
- window_size (int): window size
+ x (tensor): input tokens with [B, H, W, C].
+ window_size (int): window size.
Returns:
- windows: (num_windows*B, window_size, window_size, C)
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
+ (Hp, Wp): padded height and width before partition
"""
B, H, W, C = x.shape
- x = x.view(B, H // window_size, window_size, W // window_size, window_size,
- C)
+
+ pad_h = (window_size - H % window_size) % window_size
+ pad_w = (window_size - W % window_size) % window_size
+ if pad_h > 0 or pad_w > 0:
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+ Hp, Wp = H + pad_h, W + pad_w
+
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size,
+ window_size, C)
windows = x.permute(0, 1, 3, 2, 4,
5).contiguous().view(-1, window_size, window_size, C)
- return windows
+ return windows, (Hp, Wp)
-def window_reverse(windows, window_size, H, W):
+def window_unpartition(windows, window_size, pad_hw, hw):
"""
+ Window unpartition into original sequences and removing padding.
Args:
- windows: (num_windows*B, window_size, window_size, C)
- window_size (int): Window size
- H (int): Height of image
- W (int): Width of image
+        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+ window_size (int): window size.
+ pad_hw (Tuple): padded height and width (Hp, Wp).
+ hw (Tuple): original height and width (H, W) before padding.
Returns:
- x: (B, H, W, C)
+ x: unpartitioned sequences with [B, H, W, C].
"""
- B = int(windows.shape[0] / (H * W / window_size / window_size))
- x = windows.view(B, H // window_size, W // window_size, window_size,
+ Hp, Wp = pad_hw
+ H, W = hw
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+ x = windows.view(B, Hp // window_size, Wp // window_size, window_size,
window_size, -1)
- x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
+
+ if Hp > H or Wp > W:
+ x = x[:, :H, :W, :].contiguous()
return x
-def calc_rel_pos_spatial(
- attn,
- q,
- q_shape,
- k_shape,
- rel_pos_h,
- rel_pos_w,
-):
+def get_rel_pos(q_size, k_size, rel_pos):
"""
- Spatial Relative Positional Embeddings.
+ Get relative positional embeddings according to the relative positions of
+ query and key sizes.
+ Args:
+ q_size (int): size of query q.
+ k_size (int): size of key k.
+ rel_pos (Tensor): relative position embeddings (L, C).
+ Returns:
+ Extracted positional embeddings according to relative positions.
"""
- sp_idx = 0
- q_h, q_w = q_shape
- k_h, k_w = k_shape
+ max_rel_dist = int(2 * max(q_size, k_size) - 1)
+ # Interpolate rel pos if needed.
+ if rel_pos.shape[0] != max_rel_dist:
+ # Interpolate rel pos.
+ rel_pos_resized = F.interpolate(
+ rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
+ size=max_rel_dist,
+ mode='linear',
+ )
+ rel_pos_resized = rel_pos_resized.reshape(-1,
+ max_rel_dist).permute(1, 0)
+ else:
+ rel_pos_resized = rel_pos
- # Scale up rel pos if shapes for q and k are different.
- q_h_ratio = max(k_h / q_h, 1.0)
- k_h_ratio = max(q_h / k_h, 1.0)
- dist_h = (
- torch.arange(q_h)[:, None] * q_h_ratio -
- torch.arange(k_h)[None, :] * k_h_ratio)
- dist_h += (k_h - 1) * k_h_ratio
- q_w_ratio = max(k_w / q_w, 1.0)
- k_w_ratio = max(q_w / k_w, 1.0)
- dist_w = (
- torch.arange(q_w)[:, None] * q_w_ratio -
- torch.arange(k_w)[None, :] * k_w_ratio)
- dist_w += (k_w - 1) * k_w_ratio
+ # Scale the coords with short length if shapes for q and k are different.
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+ k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+ relative_coords = (q_coords -
+ k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
- Rh = rel_pos_h[dist_h.long()]
- Rw = rel_pos_w[dist_w.long()]
+ return rel_pos_resized[relative_coords.long()]
- B, n_head, q_N, dim = q.shape
- r_q = q[:, :, sp_idx:].reshape(B, n_head, q_h, q_w, dim)
- rel_h = torch.einsum('byhwc,hkc->byhwk', r_q, Rh)
- rel_w = torch.einsum('byhwc,wkc->byhwk', r_q, Rw)
+def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
+ """
+ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+ https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
+ Args:
+ attn (Tensor): attention map.
+ q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+ rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
+ rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
+ q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
+ k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
+ Returns:
+ attn (Tensor): attention map with added relative positional embeddings.
+ """
+ q_h, q_w = q_size
+ k_h, k_w = k_size
+ Rh = get_rel_pos(q_h, k_h, rel_pos_h)
+ Rw = get_rel_pos(q_w, k_w, rel_pos_w)
- attn[:, :, sp_idx:, sp_idx:] = (
- attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_h, q_w, k_h, k_w) +
- rel_h[:, :, :, :, :, None] + rel_w[:, :, :, :, None, :]).view(
- B, -1, q_h * q_w, k_h * k_w)
+ B, _, dim = q.shape
+ r_q = q.reshape(B, q_h, q_w, dim)
+ rel_h = torch.einsum('bhwc,hkc->bhwk', r_q, Rh)
+ rel_w = torch.einsum('bhwc,wkc->bhwk', r_q, Rw)
+
+ attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] +
+ rel_w[:, :, :, None, :]).view(B, q_h * q_w, k_h * k_w)
return attn
-class WindowAttention(nn.Module):
- """ Window based multi-head self attention (W-MSA) module with relative position bias.
- It supports both of shifted and non-shifted window.
+def get_abs_pos(abs_pos, has_cls_token, hw):
+ """
+ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
+ dimension for the original embeddings.
Args:
- dim (int): Number of input channels.
- window_size (tuple[int]): The height and width of the window.
- num_heads (int): Number of attention heads.
- qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
- qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
- attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
- proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+ abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
+ has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
+ hw (Tuple): size of input image tokens.
+ Returns:
+ Absolute positional embeddings after processing with shape (1, H, W, C)
+ """
+ h, w = hw
+ if has_cls_token:
+ abs_pos = abs_pos[:, 1:]
+ xy_num = abs_pos.shape[1]
+ size = int(math.sqrt(xy_num))
+ assert size * size == xy_num
+
+ if size != h or size != w:
+ new_abs_pos = F.interpolate(
+ abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
+ size=(h, w),
+ mode='bicubic',
+ align_corners=False,
+ )
+
+ return new_abs_pos.permute(0, 2, 3, 1)
+ else:
+ return abs_pos.reshape(1, h, w, -1)
+
+
+class PatchEmbed(nn.Module):
+ """
+ Image to Patch Embedding.
"""
def __init__(self,
- dim,
- window_size,
- num_heads,
- qkv_bias=True,
- qk_scale=None,
- attn_drop=0.,
- proj_drop=0.,
- attn_head_dim=None):
-
+ kernel_size=(16, 16),
+ stride=(16, 16),
+ padding=(0, 0),
+ in_chans=3,
+ embed_dim=768):
+ """
+ Args:
+ kernel_size (Tuple): kernel size of the projection layer.
+ stride (Tuple): stride of the projection layer.
+ padding (Tuple): padding size of the projection layer.
+ in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+ """
+ super().__init__()
+
+ self.proj = nn.Conv2d(
+ in_chans,
+ embed_dim,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding)
+
+ def forward(self, x):
+ x = self.proj(x)
+ # B C H W -> B H W C
+ x = x.permute(0, 2, 3, 1)
+ return x
+
+
+class Attention(nn.Module):
+ """Multi-head Attention block with relative position embeddings."""
+
+ def __init__(
+ self,
+ dim,
+ num_heads=8,
+ qkv_bias=True,
+ use_rel_pos=False,
+ rel_pos_zero_init=True,
+ input_size=None,
+ ):
+ """
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+ input_size (int or None): Input resolution for calculating the relative positional
+ parameter size.
+ """
super().__init__()
- self.dim = dim
- self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
- self.scale = qk_scale or head_dim**-0.5
-
- q_size = window_size[0]
- kv_size = window_size[1]
- rel_sp_dim = 2 * q_size - 1
- self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, head_dim))
- self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, head_dim))
+ self.scale = head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
- self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
- self.proj_drop = nn.Dropout(proj_drop)
- # trunc_normal_(self.relative_position_bias_table, std=.02)
- self.softmax = nn.Softmax(dim=-1)
+ self.use_rel_pos = use_rel_pos
+ if self.use_rel_pos:
+ # initialize relative positional embeddings
+ self.rel_pos_h = nn.Parameter(
+ torch.zeros(2 * input_size[0] - 1, head_dim))
+ self.rel_pos_w = nn.Parameter(
+ torch.zeros(2 * input_size[1] - 1, head_dim))
- def forward(self, x, H, W):
- """ Forward function.
- Args:
- x: input features with shape of (num_windows*B, N, C)
- mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
- """
- B_, N, C = x.shape
- x = x.reshape(B_, H, W, C)
- pad_l = pad_t = 0
- pad_r = (self.window_size[1] -
- W % self.window_size[1]) % self.window_size[1]
- pad_b = (self.window_size[0] -
- H % self.window_size[0]) % self.window_size[0]
+ if not rel_pos_zero_init:
+ trunc_normal_(self.rel_pos_h, std=0.02)
+ trunc_normal_(self.rel_pos_w, std=0.02)
- x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
- _, Hp, Wp, _ = x.shape
+ def forward(self, x):
+ B, H, W, _ = x.shape
+ # qkv with shape (3, B, nHead, H * W, C)
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads,
+ -1).permute(2, 0, 3, 1, 4)
+ # q, k, v with shape (B * nHead, H * W, C)
+ q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
- x = window_partition(
- x, self.window_size[0]) # nW*B, window_size, window_size, C
- x = x.view(-1, self.window_size[1] * self.window_size[0],
- C) # nW*B, window_size*window_size, C
- B_w = x.shape[0]
- N_w = x.shape[1]
- qkv = self.qkv(x).reshape(B_w, N_w, 3, self.num_heads,
- C // self.num_heads).permute(2, 0, 3, 1, 4)
- q, k, v = qkv[0], qkv[1], qkv[
- 2] # make torchscript happy (cannot use tensor as tuple)
+ attn = (q * self.scale) @ k.transpose(-2, -1)
- q = q * self.scale
- attn = (q @ k.transpose(-2, -1))
+ if self.use_rel_pos:
+ attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h,
+ self.rel_pos_w, (H, W), (H, W))
- attn = calc_rel_pos_spatial(attn, q, self.window_size,
- self.window_size, self.rel_pos_h,
- self.rel_pos_w)
-
- attn = self.softmax(attn)
-
- attn = self.attn_drop(attn)
-
- x = (attn @ v).transpose(1, 2).reshape(B_w, N_w, C)
+ attn = attn.softmax(dim=-1)
+ x = (attn @ v).view(B, self.num_heads, H, W,
+ -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
x = self.proj(x)
- x = self.proj_drop(x)
-
- x = x.view(-1, self.window_size[1], self.window_size[0], C)
- x = window_reverse(x, self.window_size[0], Hp, Wp) # B H' W' C
-
- if pad_r > 0 or pad_b > 0:
- x = x[:, :H, :W, :].contiguous()
-
- x = x.view(B_, H * W, C)
return x
class Block(nn.Module):
+ """Transformer blocks with support of window attention and residual propagation blocks"""
- def __init__(self,
- dim,
- num_heads,
- mlp_ratio=4.,
- qkv_bias=False,
- qk_scale=None,
- drop=0.,
- attn_drop=0.,
- drop_path=0.,
- init_values=None,
- act_layer=nn.GELU,
- norm_layer=nn.LayerNorm,
- window_size=None,
- attn_head_dim=None,
- window=False,
- aggregation='attn'):
+ def __init__(
+ self,
+ dim,
+ num_heads,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ drop_path=0.0,
+ norm_layer=nn.LayerNorm,
+ act_layer=nn.GELU,
+ use_rel_pos=False,
+ rel_pos_zero_init=True,
+ window_size=0,
+ use_residual_block=False,
+ input_size=None,
+ ):
+ """
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads in each ViT block.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
+ drop_path (float): Stochastic depth rate.
+ norm_layer (nn.Module): Normalization layer.
+ act_layer (nn.Module): Activation layer.
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+ window_size (int): Window size for window attention blocks. If it equals 0, then not
+ use window attention.
+ use_residual_block (bool): If True, use a residual block after the MLP block.
+ input_size (int or None): Input resolution for calculating the relative positional
+ parameter size.
+ """
super().__init__()
self.norm1 = norm_layer(dim)
- self.aggregation = aggregation
- self.window = window
- if not window:
- if aggregation == 'attn':
- self.attn = Attention(
- dim,
- num_heads=num_heads,
- qkv_bias=qkv_bias,
- qk_scale=qk_scale,
- attn_drop=attn_drop,
- proj_drop=drop,
- window_size=window_size,
- attn_head_dim=attn_head_dim)
- else:
- self.attn = WindowAttention(
- dim,
- num_heads=num_heads,
- qkv_bias=qkv_bias,
- qk_scale=qk_scale,
- attn_drop=attn_drop,
- proj_drop=drop,
- window_size=window_size,
- attn_head_dim=attn_head_dim)
- if aggregation == 'basicblock':
- self.conv_aggregation = BasicBlock(
- inplanes=dim, planes=dim)
- elif aggregation == 'bottleneck':
- self.conv_aggregation = Bottleneck(
- inplanes=dim, planes=dim // 4)
- else:
- self.attn = WindowAttention(
- dim,
- num_heads=num_heads,
- qkv_bias=qkv_bias,
- qk_scale=qk_scale,
- attn_drop=attn_drop,
- proj_drop=drop,
- window_size=window_size,
- attn_head_dim=attn_head_dim)
- # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ use_rel_pos=use_rel_pos,
+ rel_pos_zero_init=rel_pos_zero_init,
+ input_size=input_size if window_size == 0 else
+ (window_size, window_size),
+ )
+
self.drop_path = DropPath(
- drop_path) if drop_path > 0. else nn.Identity()
+ drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = norm_layer(dim)
- mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
- hidden_features=mlp_hidden_dim,
- act_layer=act_layer,
- drop=drop)
+ hidden_features=int(dim * mlp_ratio),
+ act_layer=act_layer)
- if init_values is not None:
- self.gamma_1 = nn.Parameter(
- init_values * torch.ones((dim)), requires_grad=True)
- self.gamma_2 = nn.Parameter(
- init_values * torch.ones((dim)), requires_grad=True)
- else:
- self.gamma_1, self.gamma_2 = None, None
+ self.window_size = window_size
- def forward(self, x, H, W):
- if self.gamma_1 is None:
- x = x + self.drop_path(self.attn(self.norm1(x), H, W))
- x = x + self.drop_path(self.mlp(self.norm2(x)))
- else:
- x = x + self.drop_path(
- self.gamma_1 * self.attn(self.norm1(x), H, W))
- x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
- if not self.window and self.aggregation != 'attn':
- x = self.conv_aggregation(x, H, W)
- return x
-
-
-class PatchEmbed(nn.Module):
- """ Image to Patch Embedding
- """
-
- def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
- super().__init__()
- img_size = to_2tuple(img_size)
- patch_size = to_2tuple(patch_size)
- num_patches = (img_size[1] // patch_size[1]) * (
- img_size[0] // patch_size[0])
- self.patch_shape = (img_size[0] // patch_size[0],
- img_size[1] // patch_size[1])
- self.img_size = img_size
- self.patch_size = patch_size
- self.num_patches = num_patches
-
- self.proj = nn.Conv2d(
- in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-
- def forward(self, x, **kwargs):
- B, C, H, W = x.shape
- # FIXME look at relaxing size constraints
- # assert H == self.img_size[0] and W == self.img_size[1], \
- # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
- x = self.proj(x)
- Hp, Wp = x.shape[2], x.shape[3]
-
- x = x.flatten(2).transpose(1, 2)
- return x, (Hp, Wp)
-
-
-class HybridEmbed(nn.Module):
- """ CNN Feature Map Embedding
- Extract feature map from CNN, flatten, project to embedding dim.
- """
-
- def __init__(self,
- backbone,
- img_size=224,
- feature_size=None,
- in_chans=3,
- embed_dim=768):
- super().__init__()
- assert isinstance(backbone, nn.Module)
- img_size = to_2tuple(img_size)
- self.img_size = img_size
- self.backbone = backbone
- if feature_size is None:
- with torch.no_grad():
- # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
- # map for all networks, the feature metadata has reliable channel and stride info, but using
- # stride to calc feature dim requires info about padding of each stage that isn't captured.
- training = backbone.training
- if training:
- backbone.eval()
- o = self.backbone(
- torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
- feature_size = o.shape[-2:]
- feature_dim = o.shape[1]
- backbone.train(training)
- else:
- feature_size = to_2tuple(feature_size)
- feature_dim = self.backbone.feature_info.channels()[-1]
- self.num_patches = feature_size[0] * feature_size[1]
- self.proj = nn.Linear(feature_dim, embed_dim)
+ self.use_residual_block = use_residual_block
def forward(self, x):
- x = self.backbone(x)[-1]
- x = x.flatten(2).transpose(1, 2)
- x = self.proj(x)
+ shortcut = x
+ x = self.norm1(x)
+ # Window partition
+ if self.window_size > 0:
+ H, W = x.shape[1], x.shape[2]
+ x, pad_hw = window_partition(x, self.window_size)
+
+ x = self.attn(x)
+ # Reverse window partition
+ if self.window_size > 0:
+ x = window_unpartition(x, self.window_size, pad_hw, (H, W))
+
+ x = shortcut + self.drop_path(x)
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+ if self.use_residual_block:
+ x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
+
return x
-class Norm2d(nn.Module):
-
- def __init__(self, embed_dim):
- super().__init__()
- self.ln = nn.LayerNorm(embed_dim, eps=1e-6)
-
- def forward(self, x):
- x = x.permute(0, 2, 3, 1)
- x = self.ln(x)
- x = x.permute(0, 3, 1, 2).contiguous()
- return x
-
-
-# todo: refactor vitdet and vit_transformer_dynamic
@BACKBONES.register_module()
class ViTDet(nn.Module):
- """ Vision Transformer with support for patch or hybrid CNN input stage
+ """
+ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
+ "Exploring Plain Vision Transformer Backbones for Object Detection",
+ https://arxiv.org/abs/2203.16527
"""
- def __init__(self,
- img_size=224,
- patch_size=16,
- in_chans=3,
- num_classes=80,
- embed_dim=768,
- depth=12,
- num_heads=12,
- mlp_ratio=4.,
- qkv_bias=False,
- qk_scale=None,
- drop_rate=0.,
- attn_drop_rate=0.,
- drop_path_rate=0.,
- hybrid_backbone=None,
- norm_layer=None,
- init_values=None,
- use_checkpoint=False,
- use_abs_pos_emb=False,
- use_rel_pos_bias=False,
- use_shared_rel_pos_bias=False,
- out_indices=[11],
- interval=3,
- pretrained=None,
- aggregation='attn'):
+ def __init__(
+ self,
+ img_size=1024,
+ patch_size=16,
+ in_chans=3,
+ embed_dim=768,
+ depth=12,
+ num_heads=12,
+ mlp_ratio=4.0,
+ qkv_bias=True,
+ drop_path_rate=0.0,
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ act_layer=nn.GELU,
+ use_abs_pos=True,
+ use_rel_pos=False,
+ rel_pos_zero_init=True,
+ window_size=0,
+ window_block_indexes=(),
+ residual_block_indexes=(),
+ use_act_checkpoint=False,
+ pretrain_img_size=224,
+ pretrain_use_cls_token=True,
+ pretrained=None,
+ ):
+ """
+ Args:
+ img_size (int): Input image size.
+ patch_size (int): Patch size.
+ in_chans (int): Number of input image channels.
+ embed_dim (int): Patch embedding dimension.
+ depth (int): Depth of ViT.
+ num_heads (int): Number of attention heads in each ViT block.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
+ drop_path_rate (float): Stochastic depth rate.
+ norm_layer (nn.Module): Normalization layer.
+ act_layer (nn.Module): Activation layer.
+ use_abs_pos (bool): If True, use absolute positional embeddings.
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+ window_size (int): Window size for window attention blocks.
+ window_block_indexes (list): Indexes for blocks using window attention.
+ residual_block_indexes (list): Indexes for blocks using conv propagation.
+ use_act_checkpoint (bool): If True, use activation checkpointing.
+ pretrain_img_size (int): input image size for pretraining models.
+            pretrain_use_cls_token (bool): If True, pretraining models use class token.
+ """
super().__init__()
- norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
- self.num_classes = num_classes
- self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ self.pretrain_use_cls_token = pretrain_use_cls_token
+ self.use_act_checkpoint = use_act_checkpoint
- if hybrid_backbone is not None:
- self.patch_embed = HybridEmbed(
- hybrid_backbone,
- img_size=img_size,
- in_chans=in_chans,
- embed_dim=embed_dim)
- else:
- self.patch_embed = PatchEmbed(
- img_size=img_size,
- patch_size=patch_size,
- in_chans=in_chans,
- embed_dim=embed_dim)
+ self.patch_embed = PatchEmbed(
+ kernel_size=(patch_size, patch_size),
+ stride=(patch_size, patch_size),
+ in_chans=in_chans,
+ embed_dim=embed_dim,
+ )
- num_patches = self.patch_embed.num_patches
-
- self.out_indices = out_indices
-
- if use_abs_pos_emb:
+ if use_abs_pos:
+ # Initialize absolute positional embedding with pretrain image size.
+ num_patches = (pretrain_img_size // patch_size) * (
+ pretrain_img_size // patch_size)
+ num_positions = (num_patches +
+ 1) if pretrain_use_cls_token else num_patches
self.pos_embed = nn.Parameter(
- torch.zeros(1, num_patches, embed_dim))
+ torch.zeros(1, num_positions, embed_dim))
else:
self.pos_embed = None
- self.pos_drop = nn.Dropout(p=drop_rate)
+ # stochastic depth decay rule
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
- dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
- ] # stochastic depth decay rule
- self.use_rel_pos_bias = use_rel_pos_bias
- self.use_checkpoint = use_checkpoint
- self.blocks = nn.ModuleList([
- Block(
+ self.blocks = nn.ModuleList()
+ for i in range(depth):
+ block = Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
- qk_scale=qk_scale,
- drop=drop_rate,
- attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
- init_values=init_values,
- window_size=(14, 14) if
- ((i + 1) % interval != 0
- or aggregation != 'attn') else self.patch_embed.patch_shape,
- window=((i + 1) % interval != 0),
- aggregation=aggregation) for i in range(depth)
- ])
+ act_layer=act_layer,
+ use_rel_pos=use_rel_pos,
+ rel_pos_zero_init=rel_pos_zero_init,
+ window_size=window_size if i in window_block_indexes else 0,
+ use_residual_block=i in residual_block_indexes,
+ input_size=(img_size // patch_size, img_size // patch_size),
+ )
+ self.blocks.append(block)
if self.pos_embed is not None:
- trunc_normal_(self.pos_embed, std=.02)
-
- self.norm = norm_layer(embed_dim)
+ trunc_normal_(self.pos_embed, std=0.02)
+ self.apply(self._init_weights)
self.pretrained = pretrained
- self._register_load_state_dict_pre_hook(self._prepare_checkpoint_hook)
- def fix_init_weight(self):
-
- def rescale(param, layer_id):
- param.div_(math.sqrt(2.0 * layer_id))
-
- for layer_id, layer in enumerate(self.blocks):
- rescale(layer.attn.proj.weight.data, layer_id + 1)
- rescale(layer.mlp.fc2.weight.data, layer_id + 1)
-
- def init_weights(self, pretrained=None):
- """Initialize the weights in backbone.
- Args:
- pretrained (str, optional): Path to pre-trained weights.
- Defaults to None.
- """
- self.fix_init_weight()
- pretrained = pretrained or self.pretrained
-
- def _init_weights(m):
- if isinstance(m, nn.Linear):
- trunc_normal_(m.weight, std=.02)
- if isinstance(m, nn.Linear) and m.bias is not None:
- nn.init.constant_(m.bias, 0)
- elif isinstance(m, nn.LayerNorm):
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=0.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
- nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
- if isinstance(m, nn.Conv2d):
- kaiming_init(m, mode='fan_in', nonlinearity='relu')
- elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
- constant_init(m, 1)
-
- if isinstance(m, Bottleneck):
- constant_init(m.norm3, 0)
- elif isinstance(m, BasicBlock):
- constant_init(m.norm2, 0)
-
- if isinstance(pretrained, str):
- self.apply(_init_weights)
+ def init_weights(self):
+ if isinstance(self.pretrained, str):
logger = get_root_logger()
- load_checkpoint(self, pretrained, strict=False, logger=logger)
- elif pretrained is None:
- self.apply(_init_weights)
- else:
- raise TypeError('pretrained must be a str or None')
-
- def _prepare_checkpoint_hook(self, state_dict, prefix, *args, **kwargs):
- rank, _ = get_dist_info()
- if 'pos_embed' in state_dict:
- pos_embed_checkpoint = state_dict['pos_embed']
- embedding_size = pos_embed_checkpoint.shape[-1]
- H, W = self.patch_embed.patch_shape
- num_patches = self.patch_embed.num_patches
- num_extra_tokens = 1
- # height (== width) for the checkpoint position embedding
- orig_size = int(
- (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5)
- # height (== width) for the new position embedding
- new_size = int(num_patches**0.5)
- # class_token and dist_token are kept unchanged
- if orig_size != new_size:
- if rank == 0:
- print('Position interpolate from %dx%d to %dx%d' %
- (orig_size, orig_size, H, W))
- # extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
- # only the position tokens are interpolated
- pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
- pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
- embedding_size).permute(
- 0, 3, 1, 2)
- pos_tokens = torch.nn.functional.interpolate(
- pos_tokens,
- size=(H, W),
- mode='bicubic',
- align_corners=False)
- new_pos_embed = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
- # new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
- state_dict['pos_embed'] = new_pos_embed
-
- def get_num_layers(self):
- return len(self.blocks)
-
- @torch.jit.ignore
- def no_weight_decay(self):
- return {'pos_embed', 'cls_token'}
-
- def forward_features(self, x):
- B, C, H, W = x.shape
- x, (Hp, Wp) = self.patch_embed(x)
- batch_size, seq_len, _ = x.size()
-
- if self.pos_embed is not None:
- x = x + self.pos_embed
- x = self.pos_drop(x)
-
- outs = []
- for i, blk in enumerate(self.blocks):
- if self.use_checkpoint:
- x = checkpoint.checkpoint(blk, x)
- else:
- x = blk(x, Hp, Wp)
-
- x = self.norm(x)
- xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp)
-
- outs.append(xp)
-
- return tuple(outs)
+ load_checkpoint(self, self.pretrained, strict=False, logger=logger)
def forward(self, x):
- x = self.forward_features(x)
- return x
+ x = self.patch_embed(x)
+ if self.pos_embed is not None:
+ x = x + get_abs_pos(self.pos_embed, self.pretrain_use_cls_token,
+ (x.shape[1], x.shape[2]))
+
+ for blk in self.blocks:
+ if self.use_act_checkpoint:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+
+ outputs = [x.permute(0, 3, 1, 2)]
+ return outputs
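A minimal smoke test of the refactored backbone (a sketch; the constructor arguments mirror vitdet_mask_rcnn.py but with img_size=224 so the dummy forward stays cheap). Unlike the old implementation, forward now returns a single stride-16 feature map; the multi-scale pyramid is produced by the SFP neck:

    import torch
    from easycv.models.backbones.vitdet import ViTDet

    backbone = ViTDet(
        img_size=224,
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        drop_path_rate=0.1,
        window_size=14,
        window_block_indexes=[0, 1, 3, 4, 6, 7, 9, 10],  # blocks 2, 5, 8, 11 stay global
        residual_block_indexes=[],
        use_rel_pos=True)
    backbone.eval()
    with torch.no_grad():
        feats = backbone(torch.randn(1, 3, 224, 224))
    print(len(feats), feats[0].shape)  # 1 torch.Size([1, 768, 14, 14]), i.e. stride 16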
diff --git a/easycv/models/detection/necks/fpn.py b/easycv/models/detection/necks/fpn.py
index 6d14bbef..8018903c 100644
--- a/easycv/models/detection/necks/fpn.py
+++ b/easycv/models/detection/necks/fpn.py
@@ -37,7 +37,6 @@ class FPN(nn.Module):
Default: None.
upsample_cfg (dict): Config dict for interpolate layer.
Default: dict(mode='nearest').
- init_cfg (dict or list[dict], optional): Initialization config dict.
Example:
>>> import torch
>>> in_channels = [2, 3, 5, 7]
@@ -67,8 +66,6 @@ class FPN(nn.Module):
norm_cfg=None,
act_cfg=None,
upsample_cfg=dict(mode='nearest')):
- # init_cfg=dict(
- # type='Xavier', layer='Conv2d', distribution='uniform')):
super(FPN, self).__init__()
assert isinstance(in_channels, list)
self.in_channels = in_channels
diff --git a/easycv/models/detection/necks/sfp.py b/easycv/models/detection/necks/sfp.py
index be1273b0..b588f643 100644
--- a/easycv/models/detection/necks/sfp.py
+++ b/easycv/models/detection/necks/sfp.py
@@ -2,26 +2,12 @@
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
-from mmcv.runner import BaseModule
from easycv.models.builder import NECKS
-class Norm2d(nn.Module):
-
- def __init__(self, embed_dim):
- super().__init__()
- self.ln = nn.LayerNorm(embed_dim, eps=1e-6)
-
- def forward(self, x):
- x = x.permute(0, 2, 3, 1)
- x = self.ln(x)
- x = x.permute(0, 3, 1, 2).contiguous()
- return x
-
-
@NECKS.register_module()
-class SFP(BaseModule):
+class SFP(nn.Module):
r"""Simple Feature Pyramid.
This is an implementation of paper `Exploring Plain Vision Transformer Backbones for Object Detection `_.
Args:
@@ -32,25 +18,12 @@ class SFP(BaseModule):
build the feature pyramid. Default: 0.
end_level (int): Index of the end input backbone level (exclusive) to
build the feature pyramid. Default: -1, which means the last level.
- add_extra_convs (bool | str): If bool, it decides whether to add conv
- layers on top of the original feature maps. Default to False.
- If True, it is equivalent to `add_extra_convs='on_input'`.
- If str, it specifies the source feature map of the extra convs.
- Only the following options are allowed
- - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
- - 'on_lateral': Last feature map after lateral convs.
- - 'on_output': The last output feature map after fpn convs.
- relu_before_extra_convs (bool): Whether to apply relu before the extra
conv. Default: False.
- no_norm_on_lateral (bool): Whether to apply norm on lateral.
Default: False.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Config dict for normalization layer. Default: None.
act_cfg (str): Config dict for activation layer in ConvModule.
Default: None.
- upsample_cfg (dict): Config dict for interpolate layer.
- Default: `dict(mode='nearest')`
- init_cfg (dict or list[dict], optional): Initialization config dict.
Example:
>>> import torch
>>> in_channels = [2, 3, 5, 7]
@@ -70,158 +43,83 @@ class SFP(BaseModule):
def __init__(self,
in_channels,
out_channels,
+ scale_factors,
num_outs,
- start_level=0,
- end_level=-1,
- add_extra_convs=False,
- relu_before_extra_convs=False,
- no_norm_on_lateral=False,
conv_cfg=None,
norm_cfg=None,
- act_cfg=None,
- upsample_cfg=dict(mode='nearest'),
- init_cfg=[
- dict(
- type='Xavier',
- layer=['Conv2d'],
- distribution='uniform'),
- dict(type='Constant', layer=['LayerNorm'], val=1, bias=0)
- ]):
- super(SFP, self).__init__(init_cfg)
- assert isinstance(in_channels, list)
- self.in_channels = in_channels
+ act_cfg=None):
+ super(SFP, self).__init__()
+ dim = in_channels
self.out_channels = out_channels
- self.num_ins = len(in_channels)
+ self.scale_factors = scale_factors
+ self.num_ins = len(scale_factors)
self.num_outs = num_outs
- self.relu_before_extra_convs = relu_before_extra_convs
- self.no_norm_on_lateral = no_norm_on_lateral
- self.upsample_cfg = upsample_cfg.copy()
- if end_level == -1:
- self.backbone_end_level = self.num_ins
- assert num_outs >= self.num_ins - start_level
- else:
- # if end_level < inputs, no extra level is allowed
- self.backbone_end_level = end_level
- assert end_level <= len(in_channels)
- assert num_outs == end_level - start_level
- self.start_level = start_level
- self.end_level = end_level
- self.add_extra_convs = add_extra_convs
- assert isinstance(add_extra_convs, (str, bool))
- if isinstance(add_extra_convs, str):
- # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
- assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
- elif add_extra_convs: # True
- self.add_extra_convs = 'on_input'
-
- self.top_downs = nn.ModuleList()
- self.lateral_convs = nn.ModuleList()
- self.fpn_convs = nn.ModuleList()
-
- for i in range(self.start_level, self.backbone_end_level):
- if i == 0:
- top_down = nn.Sequential(
+ self.stages = []
+ for idx, scale in enumerate(scale_factors):
+ out_dim = dim
+ if scale == 4.0:
+ layers = [
+ nn.ConvTranspose2d(dim, dim // 2, 2, stride=2, padding=0),
+ nn.GroupNorm(1, dim // 2, eps=1e-6),
+ nn.GELU(),
nn.ConvTranspose2d(
- in_channels[i], in_channels[i], 2, stride=2,
- padding=0), Norm2d(in_channels[i]), nn.GELU(),
- nn.ConvTranspose2d(
- in_channels[i], in_channels[i], 2, stride=2,
- padding=0))
- elif i == 1:
- top_down = nn.ConvTranspose2d(
- in_channels[i], in_channels[i], 2, stride=2, padding=0)
- elif i == 2:
- top_down = nn.Identity()
- elif i == 3:
- top_down = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
+ dim // 2, dim // 4, 2, stride=2, padding=0)
+ ]
+ out_dim = dim // 4
+ elif scale == 2.0:
+ layers = [
+ nn.ConvTranspose2d(dim, dim // 2, 2, stride=2, padding=0)
+ ]
+ out_dim = dim // 2
+ elif scale == 1.0:
+ layers = []
+ elif scale == 0.5:
+ layers = [nn.MaxPool2d(kernel_size=2, stride=2, padding=0)]
+ else:
+ raise NotImplementedError(
+ f'scale_factor={scale} is not supported yet.')
- l_conv = ConvModule(
- in_channels[i],
- out_channels,
- 1,
- conv_cfg=conv_cfg,
- norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
- act_cfg=act_cfg,
- inplace=False)
- fpn_conv = ConvModule(
- out_channels,
- out_channels,
- 3,
- padding=1,
- conv_cfg=conv_cfg,
- norm_cfg=norm_cfg,
- act_cfg=act_cfg,
- inplace=False)
-
- self.top_downs.append(top_down)
- self.lateral_convs.append(l_conv)
- self.fpn_convs.append(fpn_conv)
-
- # add extra conv layers (e.g., RetinaNet)
- extra_levels = num_outs - self.backbone_end_level + self.start_level
- if self.add_extra_convs and extra_levels >= 1:
- for i in range(extra_levels):
- if i == 0 and self.add_extra_convs == 'on_input':
- in_channels = self.in_channels[self.backbone_end_level - 1]
- else:
- in_channels = out_channels
- extra_fpn_conv = ConvModule(
- in_channels,
+ layers.extend([
+ ConvModule(
+ out_dim,
+ out_channels,
+ 1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg,
+ inplace=False),
+ ConvModule(
+ out_channels,
out_channels,
3,
- stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)
- self.fpn_convs.append(extra_fpn_conv)
+ ])
+
+ layers = nn.Sequential(*layers)
+ self.add_module(f'sfp_{idx}', layers)
+ self.stages.append(layers)
+
+ def init_weights(self):
+ pass
def forward(self, inputs):
"""Forward function."""
- assert len(inputs) == 1
+ features = inputs[0]
+ outs = []
- # build top-down path
- features = [
- top_down(inputs[0]) for _, top_down in enumerate(self.top_downs)
- ]
- assert len(features) == len(self.in_channels)
+ # part 1: build simple feature pyramid
+ for stage in self.stages:
+ outs.append(stage(features))
- # build laterals
- laterals = [
- lateral_conv(features[i + self.start_level])
- for i, lateral_conv in enumerate(self.lateral_convs)
- ]
-
- used_backbone_levels = len(laterals)
-
- # build outputs
- # part 1: from original levels
- outs = [
- self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
- ]
# part 2: add extra levels
- if self.num_outs > len(outs):
+ if self.num_outs > self.num_ins:
# use max pool to get more levels on top of outputs
# (e.g., Faster R-CNN, Mask R-CNN)
- if not self.add_extra_convs:
- for i in range(self.num_outs - used_backbone_levels):
- outs.append(F.max_pool2d(outs[-1], 1, stride=2))
- # add conv layers on top of original feature maps (RetinaNet)
- else:
- if self.add_extra_convs == 'on_input':
- extra_source = inputs[self.backbone_end_level - 1]
- elif self.add_extra_convs == 'on_lateral':
- extra_source = laterals[-1]
- elif self.add_extra_convs == 'on_output':
- extra_source = outs[-1]
- else:
- raise NotImplementedError
- outs.append(self.fpn_convs[used_backbone_levels](extra_source))
- for i in range(used_backbone_levels + 1, self.num_outs):
- if self.relu_before_extra_convs:
- outs.append(self.fpn_convs[i](F.relu(outs[-1])))
- else:
- outs.append(self.fpn_convs[i](outs[-1]))
+ for i in range(self.num_outs - self.num_ins):
+ outs.append(F.max_pool2d(outs[-1], 1, stride=2))
return tuple(outs)
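The rewritten SFP above replaces the FPN-style lateral/top-down machinery with one stage per entry in `scale_factors`, all fed from a single backbone feature map, plus optional extra levels via max pooling. A minimal usage sketch of the new interface (assuming the `easycv`/`mmcv` packages from this patch are importable; the shapes follow a ViT-Base/16 backbone on a 1024x1024 input, i.e. one 64x64 map with 768 channels):

```python
import torch
from easycv.models.detection.necks.sfp import SFP

# one stage per scale factor; num_outs=5 adds one extra max-pooled level on top
neck = SFP(
    in_channels=768,                     # embedding dim of the ViT backbone
    out_channels=256,
    scale_factors=[4.0, 2.0, 1.0, 0.5],
    num_outs=5)
neck.init_weights()                      # defined as a no-op in this patch

feat = torch.randn(1, 768, 64, 64)       # single-scale backbone output
outs = neck([feat])                      # forward expects a one-element sequence
print([tuple(o.shape) for o in outs])
# the four stages yield spatial sizes 256/128/64/32 (strides 4/8/16/32), all
# with 256 channels; the fifth level comes from F.max_pool2d (spatial size 16)
```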
diff --git a/easycv/predictors/detector.py b/easycv/predictors/detector.py
index f9d05992..017d671e 100644
--- a/easycv/predictors/detector.py
+++ b/easycv/predictors/detector.py
@@ -253,11 +253,11 @@ class DetrPredictor(PredictorInterface):
img,
bboxes,
labels=labels,
- colors='green',
- text_color='white',
- font_size=20,
- thickness=1,
- font_scale=0.5,
+ colors='cyan',
+ text_color='cyan',
+ font_size=18,
+ thickness=2,
+ font_scale=0.0,
show=show,
out_file=out_file)
diff --git a/tests/models/backbones/test_vitdet.py b/tests/models/backbones/test_vitdet.py
index 3f0350a2..82012aed 100644
--- a/tests/models/backbones/test_vitdet.py
+++ b/tests/models/backbones/test_vitdet.py
@@ -14,18 +14,27 @@ class ViTDetTest(unittest.TestCase):
def test_vitdet(self):
model = ViTDet(
img_size=1024,
+ patch_size=16,
embed_dim=768,
depth=12,
num_heads=12,
+ drop_path_rate=0.1,
+ window_size=14,
mlp_ratio=4,
qkv_bias=True,
- qk_scale=None,
- drop_rate=0.,
- attn_drop_rate=0.,
- drop_path_rate=0.1,
- use_abs_pos_emb=True,
- aggregation='attn',
- )
+ window_block_indexes=[
+                # 2, 5, 8, 11 for global attention
+ 0,
+ 1,
+ 3,
+ 4,
+ 6,
+ 7,
+ 9,
+ 10,
+ ],
+ residual_block_indexes=[],
+ use_rel_pos=True)
model.init_weights()
model.train()
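The updated backbone test now exercises ViTDet's windowed attention (blocks 2, 5, 8 and 11 keep global attention) and relative position embeddings. The remainder of the test is unchanged context and not shown in this hunk; a rough sketch of a forward pass with the same configuration, where the import path and the backbone's return type are assumptions rather than facts from the patch:

```python
import torch
from easycv.models.backbones import ViTDet  # import path is an assumption

model = ViTDet(
    img_size=1024,
    patch_size=16,
    embed_dim=768,
    depth=12,
    num_heads=12,
    drop_path_rate=0.1,
    window_size=14,
    mlp_ratio=4,
    qkv_bias=True,
    window_block_indexes=[0, 1, 3, 4, 6, 7, 9, 10],  # 2, 5, 8, 11 stay global
    residual_block_indexes=[],
    use_rel_pos=True)
model.init_weights()
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 1024, 1024))
# ViT-Base/16 on a 1024x1024 input gives 64x64 patch tokens; whether `out` is a
# single tensor or a list of feature maps depends on the backbone's forward API.
```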
diff --git a/tests/predictors/test_detector.py b/tests/predictors/test_detector.py
index 9187d3a7..c3be2ed6 100644
--- a/tests/predictors/test_detector.py
+++ b/tests/predictors/test_detector.py
@@ -155,7 +155,7 @@ class DetectorTest(unittest.TestCase):
decimal=1)
def test_vitdet_detector(self):
- model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn_export.pth'
+ model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100_export.pth'
img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg'
out_file = './result.jpg'
vitdet = DetrPredictor(model_path)
@@ -167,63 +167,170 @@ class DetectorTest(unittest.TestCase):
self.assertIn('detection_classes', output)
self.assertIn('detection_masks', output)
self.assertIn('img_metas', output)
- self.assertEqual(len(output['detection_boxes'][0]), 30)
- self.assertEqual(len(output['detection_scores'][0]), 30)
- self.assertEqual(len(output['detection_classes'][0]), 30)
+ self.assertEqual(len(output['detection_boxes'][0]), 33)
+ self.assertEqual(len(output['detection_scores'][0]), 33)
+ self.assertEqual(len(output['detection_classes'][0]), 33)
self.assertListEqual(
output['detection_classes'][0].tolist(),
np.array([
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 7, 7, 13, 13, 13, 56
+ 2, 2, 2, 2, 2, 2, 7, 7, 13, 13, 13, 56
],
dtype=np.int32).tolist())
assert_array_almost_equal(
output['detection_scores'][0],
np.array([
- 0.99791867, 0.99665856, 0.99480623, 0.99060905, 0.9882515,
- 0.98319584, 0.9738879, 0.97290784, 0.9514897, 0.95104814,
- 0.9321701, 0.86165, 0.8228847, 0.7623552, 0.76129806,
- 0.6050861, 0.44348577, 0.3452973, 0.2895671, 0.22109479,
- 0.21265312, 0.17855245, 0.1205352, 0.08981906, 0.10596471,
- 0.05854294, 0.99749386, 0.9472857, 0.5945908, 0.09855112
+ 0.9975854158401489, 0.9965696334838867, 0.9922919869422913,
+ 0.9833580851554871, 0.983080267906189, 0.970454752445221,
+ 0.9701289534568787, 0.9649872183799744, 0.9642795324325562,
+ 0.9642238020896912, 0.9529680609703064, 0.9403366446495056,
+ 0.9391788244247437, 0.8941807150840759, 0.8178097009658813,
+ 0.8013413548469543, 0.6677654385566711, 0.3952914774417877,
+ 0.33463895320892334, 0.32501447200775146, 0.27323535084724426,
+ 0.20197080075740814, 0.15607696771621704, 0.1068163588643074,
+ 0.10183875262737274, 0.09735643863677979, 0.06559795141220093,
+ 0.08890066295862198, 0.076363705098629, 0.9954648613929749,
+ 0.9212945699691772, 0.5224372148513794, 0.20555885136127472
],
dtype=np.float32),
decimal=2)
assert_array_almost_equal(
output['detection_boxes'][0],
- np.array([[294.7058, 117.29371, 378.83713, 149.99928],
- [609.05444, 112.526474, 633.2971, 136.35175],
- [481.4165, 110.987335, 522.5531, 130.01529],
- [167.68184, 109.89049, 215.49057, 139.86987],
- [374.75082, 110.68697, 433.10028, 136.23654],
- [189.54971, 110.09322, 297.6167, 155.77412],
- [266.5185, 105.37718, 326.54385, 127.916374],
- [556.30225, 110.43166, 592.8248, 128.03764],
- [432.49252, 105.086464, 484.0512, 132.272],
- [0., 110.566444, 62.01249, 146.44017],
- [591.74664, 110.43527, 619.73816, 126.68549],
- [99.126854, 90.947975, 118.46699, 101.11096],
- [59.895264, 94.110054, 85.60521, 106.67633],
- [142.95819, 96.61966, 165.96964, 104.95929],
- [83.062515, 89.802605, 99.1546, 98.69074],
- [226.28802, 98.32568, 249.06772, 108.86408],
- [136.67789, 94.75706, 154.62924, 104.289536],
- [170.42459, 98.458694, 183.16309, 106.203156],
- [67.56731, 89.68286, 82.62955, 98.35645],
- [222.80092, 97.828445, 239.02655, 108.29377],
- [134.34427, 92.31653, 149.19615, 102.97457],
- [613.5186, 102.27066, 636.0434, 112.813644],
- [607.4787, 110.87984, 630.1123, 127.65646],
- [135.13664, 90.989876, 155.67192, 100.18036],
- [431.61505, 105.43844, 484.36508, 132.50078],
- [189.92722, 110.38832, 297.74353, 155.95557],
- [220.67035, 177.13489, 455.32092, 380.45712],
- [372.76584, 134.33807, 432.44357, 188.51534],
- [50.403812, 110.543495, 70.4368, 119.65186],
- [373.50272, 134.27258, 432.18475, 187.81824]]),
+ np.array([[
+ 294.22674560546875, 116.6078109741211, 379.4328918457031,
+ 150.14097595214844
+ ],
+ [
+ 482.6017761230469, 110.75955963134766,
+ 522.8798828125, 129.71286010742188
+ ],
+ [
+ 167.06460571289062, 109.95974731445312,
+ 212.83975219726562, 140.16102600097656
+ ],
+ [
+ 609.2930908203125, 113.13909149169922,
+ 637.3115844726562, 136.4690704345703
+ ],
+ [
+ 191.185791015625, 111.1408920288086, 301.31689453125,
+ 155.7731170654297
+ ],
+ [
+ 431.2244873046875, 106.19962310791016,
+ 483.860595703125, 132.21627807617188
+ ],
+ [
+ 267.48358154296875, 105.5920639038086,
+ 325.2832336425781, 127.11176300048828
+ ],
+ [
+ 591.2138671875, 110.29329681396484,
+ 619.8524169921875, 126.1990966796875
+ ],
+ [
+ 0.0, 110.7026596069336, 61.487945556640625,
+ 146.33018493652344
+ ],
+ [
+ 555.9155883789062, 110.03486633300781,
+ 591.7050170898438, 127.06097412109375
+ ],
+ [
+ 60.24559783935547, 94.12760162353516,
+ 85.63741302490234, 106.66705322265625
+ ],
+ [
+ 99.02665710449219, 90.53657531738281,
+ 118.83953094482422, 101.18717956542969
+ ],
+ [
+ 396.30438232421875, 111.59194946289062,
+ 431.559814453125, 133.96914672851562
+ ],
+ [
+ 83.81543731689453, 89.65665435791016,
+ 99.9166259765625, 98.25627899169922
+ ],
+ [
+ 139.29647827148438, 96.68000793457031,
+ 165.22410583496094, 105.60000610351562
+ ],
+ [
+ 67.27152252197266, 89.42798614501953,
+ 83.25617980957031, 98.0460205078125
+ ],
+ [
+ 223.74176025390625, 98.68321990966797,
+ 250.42506408691406, 109.32588958740234
+ ],
+ [
+ 136.7582244873047, 96.51412963867188,
+ 152.51190185546875, 104.73160552978516
+ ],
+ [
+ 221.71812438964844, 97.86445617675781,
+ 238.9705810546875, 106.96803283691406
+ ],
+ [
+ 135.06964111328125, 91.80916595458984, 155.24609375,
+ 102.20686340332031
+ ],
+ [
+ 169.11180114746094, 97.53628540039062,
+ 182.88504028320312, 105.95404815673828
+ ],
+ [
+ 133.8811798095703, 91.00375366210938,
+ 145.35507202148438, 102.3780288696289
+ ],
+ [
+ 614.2507934570312, 102.19828796386719,
+ 636.5692749023438, 112.59198760986328
+ ],
+ [
+ 35.94759750366211, 91.7213363647461,
+ 70.38274383544922, 117.19855499267578
+ ],
+ [
+ 554.6401977539062, 115.18976593017578,
+ 562.0255737304688, 127.4429931640625
+ ],
+ [
+ 39.07550811767578, 92.73261260986328,
+ 85.36636352539062, 106.73953247070312
+ ],
+ [
+ 200.85513305664062, 93.00469970703125,
+ 219.73086547851562, 107.99642181396484
+ ],
+ [
+ 0.0, 111.18904876708984, 61.7393684387207,
+ 146.72547912597656
+ ],
+ [
+ 191.88568115234375, 111.09577178955078,
+ 299.4097900390625, 155.14639282226562
+ ],
+ [
+ 221.06834411621094, 176.6427001953125,
+ 458.3475341796875, 378.89300537109375
+ ],
+ [
+ 372.7131652832031, 135.51429748535156,
+ 433.2494201660156, 188.0106658935547
+ ],
+ [
+ 52.19819641113281, 110.3646011352539,
+ 70.95110321044922, 120.10567474365234
+ ],
+ [
+ 376.1671447753906, 133.6930694580078,
+ 432.2721862792969, 187.99481201171875
+ ]]),
decimal=1)
From ad78dfd3a1320dca8e523dc7abcd82b3aaaf8bfb Mon Sep 17 00:00:00 2001
From: zzoneee <55594658+zzoneee@users.noreply.github.com>
Date: Sat, 17 Sep 2022 11:21:47 +0800
Subject: [PATCH 5/9] fix DeiTIII cr bug (#196)
* fix DeiTIII CR bug and rename vit_transfomer_dynamic.py to vit_transformer_dynamic.py
---
...dino_deit_small_p16_8xb2048_20e_feature.py | 2 +-
...moby_deit_small_p16_8xb2048_30e_feature.py | 2 +-
docs/source/api/easycv.models.backbones.rst | 2 +-
.../backbones/pytorch_image_models_wrapper.py | 12 +++----
easycv/models/backbones/vision_transformer.py | 30 ++++++++----------
..._dynamic.py => vit_transformer_dynamic.py} | 31 +++++++++++++++++--
6 files changed, 50 insertions(+), 29 deletions(-)
rename easycv/models/backbones/{vit_transfomer_dynamic.py => vit_transformer_dynamic.py} (89%)
diff --git a/benchmarks/selfsup/classification/imagenet/dino_deit_small_p16_8xb2048_20e_feature.py b/benchmarks/selfsup/classification/imagenet/dino_deit_small_p16_8xb2048_20e_feature.py
index 2d58705e..0e6ef6bc 100644
--- a/benchmarks/selfsup/classification/imagenet/dino_deit_small_p16_8xb2048_20e_feature.py
+++ b/benchmarks/selfsup/classification/imagenet/dino_deit_small_p16_8xb2048_20e_feature.py
@@ -10,7 +10,7 @@ oss_io_config = dict(
buckets=['your oss buckets'])
# model settings
-# 1920: merge 4 layers of features, open models/backbones/vit_transfomer_dynamic.py:311: self.forward_return_n_last_blocks
+# 1920: merge 4 layers of features, open models/backbones/vit_transformer_dynamic.py:311: self.forward_return_n_last_blocks
# 384: default
feature_num = 1920
model = dict(
diff --git a/benchmarks/selfsup/classification/imagenet/moby_deit_small_p16_8xb2048_30e_feature.py b/benchmarks/selfsup/classification/imagenet/moby_deit_small_p16_8xb2048_30e_feature.py
index 4c81e184..dcb45d31 100644
--- a/benchmarks/selfsup/classification/imagenet/moby_deit_small_p16_8xb2048_30e_feature.py
+++ b/benchmarks/selfsup/classification/imagenet/moby_deit_small_p16_8xb2048_30e_feature.py
@@ -10,7 +10,7 @@ oss_io_config = dict(
buckets=['your oss buckets'])
# model settings
-# 1920: merge 4 layers of features, open models/backbones/vit_transfomer_dynamic.py:311: self.forward_return_n_last_blocks
+# 1920: merge 4 layers of features, open models/backbones/vit_transformer_dynamic.py:311: self.forward_return_n_last_blocks
# 384: default
feature_num = 1920
model = dict(
diff --git a/docs/source/api/easycv.models.backbones.rst b/docs/source/api/easycv.models.backbones.rst
index 3f1ecfd0..4c742c37 100644
--- a/docs/source/api/easycv.models.backbones.rst
+++ b/docs/source/api/easycv.models.backbones.rst
@@ -156,7 +156,7 @@ easycv.models.backbones.swin\_transformer\_dynamic module
easycv.models.backbones.vit\_transfomer\_dynamic module
-------------------------------------------------------
-.. automodule:: easycv.models.backbones.vit_transfomer_dynamic
+.. automodule:: easycv.models.backbones.vit_transformer_dynamic
:members:
:undoc-members:
:show-inheritance:
diff --git a/easycv/models/backbones/pytorch_image_models_wrapper.py b/easycv/models/backbones/pytorch_image_models_wrapper.py
index 6b141489..176d286f 100644
--- a/easycv/models/backbones/pytorch_image_models_wrapper.py
+++ b/easycv/models/backbones/pytorch_image_models_wrapper.py
@@ -16,11 +16,11 @@ from .shuffle_transformer import (shuffletrans_base_p4_w7_224,
from .swin_transformer_dynamic import (dynamic_swin_base_p4_w7_224,
dynamic_swin_small_p4_w7_224,
dynamic_swin_tiny_p4_w7_224)
-from .vit_transfomer_dynamic import (dynamic_deit_small_p16,
- dynamic_deit_tiny_p16,
- dynamic_vit_base_p16,
- dynamic_vit_huge_p14,
- dynamic_vit_large_p16)
+from .vit_transformer_dynamic import (dynamic_deit_small_p16,
+ dynamic_deit_tiny_p16,
+ dynamic_vit_base_p16,
+ dynamic_vit_huge_p14,
+ dynamic_vit_large_p16)
from .xcit_transformer import (xcit_large_24_p8, xcit_medium_24_p8,
xcit_medium_24_p16, xcit_small_12_p8,
xcit_small_12_p16)
@@ -36,7 +36,7 @@ _MODEL_MAP = {
'dynamic_swin_small_p4_w7_224': dynamic_swin_small_p4_w7_224,
'dynamic_swin_base_p4_w7_224': dynamic_swin_base_p4_w7_224,
- # vit_transfomer_dynamic
+ # vit_transformer_dynamic
'dynamic_deit_small_p16': dynamic_deit_small_p16,
'dynamic_deit_tiny_p16': dynamic_deit_tiny_p16,
'dynamic_vit_base_p16': dynamic_vit_base_p16,
diff --git a/easycv/models/backbones/vision_transformer.py b/easycv/models/backbones/vision_transformer.py
index 2061979d..79a9c900 100644
--- a/easycv/models/backbones/vision_transformer.py
+++ b/easycv/models/backbones/vision_transformer.py
@@ -4,12 +4,10 @@ Mostly copy-paste from timm library.
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""
-import math
from functools import partial
import torch
import torch.nn as nn
-import torch.nn.functional as F
from timm.models.layers import trunc_normal_
from easycv.models.utils import DropPath, Mlp
@@ -163,8 +161,6 @@ class VisionTransformer(nn.Module):
forward layer. Default: 0.0
drop_path_rate (float): Stochastic depth rate. Default: 0
norm_layer (nn.Module): normalization layer
- use_dense_prediction (bool): If use_dense_prediction is True, the global
- pool and norm will before head will be removed.(if any) Default: False
global_pool (bool): Global pool before head. Default: False
use_layer_scale (bool): If use_layer_scale is True, it will use layer
scale. Default: False
@@ -188,7 +184,6 @@ class VisionTransformer(nn.Module):
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
- use_dense_prediction=False,
global_pool=False,
use_layer_scale=False,
init_scale=1e-4,
@@ -196,6 +191,15 @@ class VisionTransformer(nn.Module):
super().__init__()
self.num_features = self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.mlp_ratio = mlp_ratio
+ self.qkv_bias = qkv_bias
+ self.qk_scale = qk_scale
+ self.drop_rate = drop_rate
+ self.attn_drop_rate = attn_drop_rate
+ self.norm_layer = norm_layer
+ self.use_layer_scale = use_layer_scale
+ self.init_scale = init_scale
self.patch_embed = PatchEmbed(
img_size=img_size[0],
@@ -231,11 +235,6 @@ class VisionTransformer(nn.Module):
self.head = nn.Linear(
embed_dim, num_classes) if num_classes > 0 else nn.Identity()
- # Dense prediction head
- self.use_dense_prediction = use_dense_prediction
- if self.use_dense_prediction:
- self.head_dense = None
-
# Use global average pooling
self.global_pool = global_pool
if self.global_pool:
@@ -277,11 +276,8 @@ class VisionTransformer(nn.Module):
if self.norm is not None:
x = self.norm(x)
- if self.use_dense_prediction:
- return x[:, 0], x[:, 1:]
+ if self.global_pool:
+ x = x[:, 1:, :].mean(dim=1)
+ return self.fc_norm(x)
else:
- if self.global_pool:
- x = x[:, 1:, :].mean(dim=1)
- return self.fc_norm(x)
- else:
- return x[:, 0]
+ return x[:, 0]
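With `use_dense_prediction` moved out of the base class (it now lives in `DynamicVisionTransformer`, next file), the base forward keeps just two read-out modes: the class token, or global average pooling over the patch tokens followed by `fc_norm`. A tiny standalone sketch of those two branches (shapes assumed to be `[B, 1 + N, C]` with the class token at index 0):

```python
import torch

def readout(x, global_pool, fc_norm):
    """Mirrors the two branches above; x is [B, 1 + N, C], class token first."""
    if global_pool:
        return fc_norm(x[:, 1:, :].mean(dim=1))  # average patch tokens, then norm
    return x[:, 0]                               # plain class-token read-out

tokens = torch.randn(2, 197, 768)                # e.g. ViT-B/16 on a 224x224 input
print(readout(tokens, False, torch.nn.Identity()).shape)     # torch.Size([2, 768])
print(readout(tokens, True, torch.nn.LayerNorm(768)).shape)  # torch.Size([2, 768])
```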
diff --git a/easycv/models/backbones/vit_transfomer_dynamic.py b/easycv/models/backbones/vit_transformer_dynamic.py
similarity index 89%
rename from easycv/models/backbones/vit_transfomer_dynamic.py
rename to easycv/models/backbones/vit_transformer_dynamic.py
index 6df88d2c..3f8d0968 100644
--- a/easycv/models/backbones/vit_transfomer_dynamic.py
+++ b/easycv/models/backbones/vit_transformer_dynamic.py
@@ -13,13 +13,19 @@ from functools import partial
import torch
import torch.nn as nn
-from easycv.models.backbones.vision_transformer import VisionTransformer
+from easycv.models.backbones.vision_transformer import Block, VisionTransformer
class DynamicVisionTransformer(VisionTransformer):
- """Dynamic Vision Transformer """
+ """Dynamic Vision Transformer
- def __init__(self, **kwargs):
+ Args:
+        use_dense_prediction (bool): If use_dense_prediction is True, the global
+            pool and norm before the head will be removed (if any). Default: False
+
+ """
+
+ def __init__(self, use_dense_prediction=False, **kwargs):
super(DynamicVisionTransformer, self).__init__(**kwargs)
num_patches = self.patch_embed.num_patches
@@ -31,6 +37,25 @@ class DynamicVisionTransformer(VisionTransformer):
x.item()
for x in torch.linspace(0, self.drop_path_rate, self.depth)
]
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=self.embed_dim,
+ num_heads=self.num_heads,
+ mlp_ratio=self.mlp_ratio,
+ qkv_bias=self.qkv_bias,
+ qk_scale=self.qk_scale,
+ drop=self.drop_rate,
+ attn_drop=self.attn_drop_rate,
+ drop_path=dpr[i],
+ norm_layer=self.norm_layer,
+ use_layer_scale=self.use_layer_scale,
+ init_values=self.init_scale) for i in range(self.depth)
+ ])
+
+ # Dense prediction head
+ self.use_dense_prediction = use_dense_prediction
+ if self.use_dense_prediction:
+ self.head_dense = None
def forward(self, x):
# convert to list
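Rebuilding `self.blocks` here lets the dynamic subclass reuse the hyper-parameters that the base class now stores on `self`, while re-applying the usual linear stochastic-depth schedule `dpr`. A short sketch of what that schedule evaluates to for, e.g., `drop_path_rate=0.1` and `depth=12` (illustrative values only):

```python
import torch

drop_path_rate, depth = 0.1, 12
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
print([round(v, 3) for v in dpr])
# [0.0, 0.009, 0.018, ..., 0.091, 0.1] -- deeper blocks drop their residual
# paths more often, i.e. the standard linearly increasing drop-path schedule
```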
From 5ac638175873a66551869fee5cce2b94575a6842 Mon Sep 17 00:00:00 2001
From: Cathy0908 <30484308+Cathy0908@users.noreply.github.com>
Date: Mon, 19 Sep 2022 16:07:04 +0800
Subject: [PATCH 6/9] add error code (#146)
* add error code
---
benchmarks/tools/extract.py | 1 +
benchmarks/tools/extract_backbone_weights.py | 4 +-
benchmarks/tools/linear_eval.py | 5 +-
easycv/apis/export.py | 1 +
easycv/apis/test.py | 1 +
easycv/core/evaluation/coco_evaluation.py | 3 +-
easycv/core/evaluation/keypoint_eval.py | 1 +
easycv/core/evaluation/metric_registry.py | 2 +
easycv/core/evaluation/segmentation_eval.py | 1 +
easycv/core/evaluation/top_down_eval.py | 1 +
easycv/core/optimizer/adam.py | 2 +
easycv/core/optimizer/lamb.py | 2 +
easycv/core/optimizer/lars.py | 2 +
easycv/core/optimizer/ranger.py | 2 +
easycv/core/sailfish/linear.py | 1 +
easycv/core/sailfish/util.py | 1 +
easycv/core/visualization/image.py | 6 +-
.../classification/data_sources/image_list.py | 5 +-
easycv/datasets/classification/odps.py | 1 +
.../classification/pipelines/auto_augment.py | 1 +
.../datasets/detection/data_sources/base.py | 1 +
.../datasets/detection/data_sources/coco.py | 1 +
.../detection/data_sources/coco_panoptic.py | 1 +
.../detection/data_sources/pai_format.py | 1 +
easycv/datasets/detection/mix.py | 1 +
.../detection/pipelines/mm_transforms.py | 11 +-
easycv/datasets/detection/raw.py | 3 +-
easycv/datasets/loader/build_loader.py | 1 +
easycv/datasets/loader/sampler.py | 6 +-
easycv/datasets/pose/data_sources/coco.py | 1 +
easycv/datasets/pose/data_sources/top_down.py | 1 +
.../pose/hand_coco_wholebody_dataset.py | 1 +
easycv/datasets/pose/pipelines/transforms.py | 1 +
easycv/datasets/pose/top_down.py | 1 +
.../segmentation/data_sources/base.py | 1 +
.../selfsup/data_sources/image_list.py | 1 +
easycv/datasets/shared/multi_view.py | 1 +
easycv/datasets/shared/pipelines/format.py | 1 +
.../datasets/shared/pipelines/transforms.py | 1 +
easycv/datasets/shared/raw.py | 1 +
easycv/file/base.py | 2 +
easycv/file/file_io.py | 4 +-
easycv/file/image.py | 3 +-
easycv/file/utils.py | 2 +
easycv/framework/__init__.py | 0
easycv/framework/errors.py | 128 ++++++++++++++++++
easycv/hooks/eval_hook.py | 1 +
easycv/hooks/extractor.py | 1 +
easycv/hooks/optimizer_hook.py | 3 +-
easycv/models/backbones/hrnet.py | 1 +
easycv/models/backbones/lighthrnet.py | 1 +
easycv/models/backbones/mit.py | 1 +
easycv/models/backbones/mobilenetv2.py | 1 +
.../backbones/pytorch_image_models_wrapper.py | 1 +
easycv/models/backbones/resnest.py | 1 +
easycv/models/backbones/resnet.py | 1 +
easycv/models/backbones/resnet_jit.py | 1 +
.../models/backbones/shuffle_transformer.py | 1 +
easycv/models/backbones/xcit_transformer.py | 3 +-
easycv/models/base.py | 2 +
.../models/classification/classification.py | 3 +-
.../detection/detectors/dab_detr/attention.py | 2 +
.../dab_detr/dab_detr_transformer.py | 1 +
.../models/detection/detectors/detection.py | 1 +
.../detectors/dino/deformable_transformer.py | 1 +
.../detection/detectors/dino/dino_head.py | 1 +
.../models/detection/detectors/yolox/asff.py | 1 +
.../detectors/yolox/yolo_head_template.py | 1 +
easycv/models/detection/necks/fpn.py | 1 +
easycv/models/detection/necks/sfp.py | 1 +
easycv/models/detection/utils/misc.py | 2 +
easycv/models/loss/cross_entropy_loss.py | 1 +
easycv/models/loss/focal_loss.py | 1 +
easycv/models/loss/iou_loss.py | 1 +
easycv/models/loss/utils.py | 2 +
.../pose/heads/topdown_heatmap_base_head.py | 1 +
.../pose/heads/topdown_heatmap_simple_head.py | 1 +
easycv/models/segmentation/encoder_decoder.py | 1 +
easycv/models/segmentation/heads/base.py | 1 +
.../segmentation/heads/transformer_decoder.py | 2 +
easycv/models/segmentation/utils/criterion.py | 1 +
easycv/models/selfsup/byol.py | 3 +-
easycv/models/selfsup/dino.py | 3 +-
easycv/models/selfsup/mae.py | 3 +-
easycv/models/selfsup/moby.py | 7 +-
easycv/models/selfsup/moco.py | 7 +-
easycv/models/selfsup/simclr.py | 3 +-
easycv/models/selfsup/swav.py | 7 +-
easycv/models/utils/conv_module.py | 1 +
easycv/models/utils/norm.py | 2 +
easycv/models/utils/transformer.py | 2 +
easycv/predictors/base.py | 1 +
easycv/predictors/classifier.py | 3 +-
easycv/predictors/detector.py | 1 +
easycv/predictors/feature_extractor.py | 3 +-
easycv/predictors/pose_predictor.py | 1 +
easycv/runner/ev_runner.py | 1 +
easycv/toolkit/blade/cv_blade_utils.py | 6 +-
easycv/toolkit/prune/prune_utils.py | 4 +-
easycv/toolkit/quantize/quantize_utils.py | 1 +
easycv/toolkit/torchacc/convert_ops.py | 2 +
easycv/utils/checkpoint.py | 1 +
easycv/utils/collect.py | 1 +
easycv/utils/config_tools.py | 5 +-
easycv/utils/json_utils.py | 2 +
easycv/utils/logger.py | 2 +
easycv/utils/mmlab_utils.py | 3 +-
easycv/utils/registry.py | 2 +
easycv/utils/test_util.py | 1 +
easycv/utils/user_config_params_utils.py | 2 +
tests/core/evaluation/test_coco_evaluation.py | 1 +
tests/core/optimizer/test_optimizers.py | 2 +
.../data_sources/test_det_voc_datasource.py | 3 +-
tests/framework/__init__.py | 0
tests/framework/test_errors.py | 52 +++++++
tests/utils/test_json_utils.py | 1 +
tools/eval.py | 4 +-
117 files changed, 367 insertions(+), 41 deletions(-)
create mode 100644 easycv/framework/__init__.py
create mode 100644 easycv/framework/errors.py
create mode 100644 tests/framework/__init__.py
create mode 100644 tests/framework/test_errors.py
diff --git a/benchmarks/tools/extract.py b/benchmarks/tools/extract.py
index 1aff8fa6..9214a282 100644
--- a/benchmarks/tools/extract.py
+++ b/benchmarks/tools/extract.py
@@ -15,6 +15,7 @@ from mmcv.runner import get_dist_info, init_dist, load_checkpoint
from easycv.apis import set_random_seed
from easycv.datasets import build_dataloader, build_dataset
from easycv.file import io
+from easycv.framework.errors import ValueError
from easycv.models import build_model
from easycv.utils.collect import dist_forward_collect, nondist_forward_collect
from easycv.utils.config_tools import mmcv_config_fromfile
diff --git a/benchmarks/tools/extract_backbone_weights.py b/benchmarks/tools/extract_backbone_weights.py
index d3eb38f1..976611ee 100644
--- a/benchmarks/tools/extract_backbone_weights.py
+++ b/benchmarks/tools/extract_backbone_weights.py
@@ -3,6 +3,8 @@ import argparse
import torch
+from easycv.framework.errors import ValueError
+
def parse_args():
parser = argparse.ArgumentParser(
@@ -24,7 +26,7 @@ def main():
output_dict['state_dict'][key[9:]] = value
has_backbone = True
if not has_backbone:
- raise Exception('Cannot find a backbone module in the checkpoint.')
+ raise ValueError('Cannot find a backbone module in the checkpoint.')
torch.save(output_dict, args.output)
diff --git a/benchmarks/tools/linear_eval.py b/benchmarks/tools/linear_eval.py
index 2f74e191..a892e4bf 100644
--- a/benchmarks/tools/linear_eval.py
+++ b/benchmarks/tools/linear_eval.py
@@ -2,11 +2,12 @@
import argparse
import os
import shutil
-import sys
import time
import torch
+from easycv.framework.errors import ValueError
+
args = argparse.ArgumentParser(description='Process some integers.')
args.add_argument(
'model_path',
@@ -88,7 +89,7 @@ def extract_model(model_path):
output_dict['state_dict'][key[9:]] = value
has_backbone = True
if not has_backbone:
- raise Exception('Cannot find a backbone module in the checkpoint.')
+ raise ValueError('Cannot find a backbone module in the checkpoint.')
torch.save(output_dict, backbone_file)
return backbone_file
diff --git a/easycv/apis/export.py b/easycv/apis/export.py
index fe8a1850..c2633acb 100644
--- a/easycv/apis/export.py
+++ b/easycv/apis/export.py
@@ -13,6 +13,7 @@ import torchvision.transforms.functional as t_f
from mmcv.utils import Config
from easycv.file import io
+from easycv.framework.errors import ValueError
from easycv.models import (DINO, MOCO, SWAV, YOLOX, Classification, MoBY,
build_model)
from easycv.utils.checkpoint import load_checkpoint
diff --git a/easycv/apis/test.py b/easycv/apis/test.py
index 7d3e3dda..d27a0291 100644
--- a/easycv/apis/test.py
+++ b/easycv/apis/test.py
@@ -15,6 +15,7 @@ from mmcv.parallel import (MMDataParallel, MMDistributedDataParallel,
from mmcv.runner import get_dist_info
from easycv.file import io
+from easycv.framework.errors import ValueError
from easycv.utils.torchacc_util import is_torchacc_enabled
diff --git a/easycv/core/evaluation/coco_evaluation.py b/easycv/core/evaluation/coco_evaluation.py
index fe5cc075..63891626 100644
--- a/easycv/core/evaluation/coco_evaluation.py
+++ b/easycv/core/evaluation/coco_evaluation.py
@@ -31,6 +31,7 @@ from easycv.core import standard_fields
from easycv.core.evaluation import coco_tools
from easycv.core.post_processing.nms import oks_nms, soft_oks_nms
from easycv.core.standard_fields import DetectionResultFields, InputDataFields
+from easycv.framework.errors import KeyError, TypeError, ValueError
from easycv.utils.json_utils import MyEncoder
from .base_evaluator import Evaluator
from .builder import EVALUATORS
@@ -365,7 +366,7 @@ class CocoDetectionEvaluator(Evaluator):
def _check_mask_type_and_value(array_name, masks):
"""Checks whether mask dtype is uint8 and the values are either 0 or 1."""
if masks.dtype != np.uint8:
- raise ValueError('{} must be of type np.uint8. Found {}.'.format(
+ raise TypeError('{} must be of type np.uint8. Found {}.'.format(
array_name, masks.dtype))
if np.any(np.logical_and(masks != 0, masks != 1)):
raise ValueError(
diff --git a/easycv/core/evaluation/keypoint_eval.py b/easycv/core/evaluation/keypoint_eval.py
index 0549a71f..4ab4f0c6 100644
--- a/easycv/core/evaluation/keypoint_eval.py
+++ b/easycv/core/evaluation/keypoint_eval.py
@@ -3,6 +3,7 @@
# https://github.com/open-mmlab/mmpose/blob/master/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py
import numpy as np
+from easycv.framework.errors import KeyError
from .base_evaluator import Evaluator
from .builder import EVALUATORS
from .metric_registry import METRICS
diff --git a/easycv/core/evaluation/metric_registry.py b/easycv/core/evaluation/metric_registry.py
index 35f89f5d..5c2f3e0e 100644
--- a/easycv/core/evaluation/metric_registry.py
+++ b/easycv/core/evaluation/metric_registry.py
@@ -1,6 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import inspect
+from easycv.framework.errors import KeyError, TypeError
+
class MetricRegistry(object):
diff --git a/easycv/core/evaluation/segmentation_eval.py b/easycv/core/evaluation/segmentation_eval.py
index d76d4d66..81cbd82f 100644
--- a/easycv/core/evaluation/segmentation_eval.py
+++ b/easycv/core/evaluation/segmentation_eval.py
@@ -5,6 +5,7 @@ import numpy as np
import torch
from prettytable import PrettyTable
+from easycv.framework.errors import KeyError
from easycv.utils.logger import print_log
from .base_evaluator import Evaluator
from .builder import EVALUATORS
diff --git a/easycv/core/evaluation/top_down_eval.py b/easycv/core/evaluation/top_down_eval.py
index ebb505e8..47a4f2dc 100644
--- a/easycv/core/evaluation/top_down_eval.py
+++ b/easycv/core/evaluation/top_down_eval.py
@@ -6,6 +6,7 @@ import cv2
import numpy as np
from easycv.core.post_processing import transform_preds
+from easycv.framework.errors import ValueError
def _calc_distances(preds, targets, mask, normalize):
diff --git a/easycv/core/optimizer/adam.py b/easycv/core/optimizer/adam.py
index e015d523..f1bebd9d 100644
--- a/easycv/core/optimizer/adam.py
+++ b/easycv/core/optimizer/adam.py
@@ -8,6 +8,8 @@ from mmcv.runner.optimizer.builder import OPTIMIZERS
from torch import Tensor
from torch.optim import AdamW as _AdamW
+from easycv.framework.errors import RuntimeError
+
def adamw(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor],
diff --git a/easycv/core/optimizer/lamb.py b/easycv/core/optimizer/lamb.py
index 6295cdc7..92a296fa 100644
--- a/easycv/core/optimizer/lamb.py
+++ b/easycv/core/optimizer/lamb.py
@@ -5,6 +5,8 @@ import torch
from mmcv.runner import OPTIMIZERS
from torch.optim import Optimizer
+from easycv.framework.errors import RuntimeError
+
@OPTIMIZERS.register_module()
class Lamb(Optimizer):
diff --git a/easycv/core/optimizer/lars.py b/easycv/core/optimizer/lars.py
index f6700bce..07d7d5f6 100644
--- a/easycv/core/optimizer/lars.py
+++ b/easycv/core/optimizer/lars.py
@@ -3,6 +3,8 @@ import torch
from torch.optim import * # noqa: F401,F403
from torch.optim.optimizer import Optimizer, required
+from easycv.framework.errors import ValueError
+
class LARS(Optimizer):
r"""Implements layer-wise adaptive rate scaling for SGD.
diff --git a/easycv/core/optimizer/ranger.py b/easycv/core/optimizer/ranger.py
index 727b6f0e..5ec04aac 100644
--- a/easycv/core/optimizer/ranger.py
+++ b/easycv/core/optimizer/ranger.py
@@ -4,6 +4,8 @@ import math
import torch
from torch.optim.optimizer import Optimizer
+from easycv.framework.errors import ValueError
+
def centralized_gradient(x, use_gc=True, gc_conv_only=False):
'''credit - https://github.com/Yonghongwei/Gradient-Centralization '''
diff --git a/easycv/core/sailfish/linear.py b/easycv/core/sailfish/linear.py
index 6386dab6..939de5b4 100644
--- a/easycv/core/sailfish/linear.py
+++ b/easycv/core/sailfish/linear.py
@@ -22,6 +22,7 @@ import torch
from easycv.core.sailfish.util import (BiasUniformInitializer,
KaimingUniformInitializer,
ModelParallel, RenormUniformInitializer)
+from easycv.framework.errors import ValueError
class Linear(torch.nn.Module):
diff --git a/easycv/core/sailfish/util.py b/easycv/core/sailfish/util.py
index 57155cbc..c54fd61d 100644
--- a/easycv/core/sailfish/util.py
+++ b/easycv/core/sailfish/util.py
@@ -25,6 +25,7 @@ from easycv.core.sailfish.function import (all_cat, all_log_softmax,
shard_correct_predictions,
shard_target_and_mask,
shard_topk_correct_predictions)
+from easycv.framework.errors import NotImplementedError, ValueError
class DistributedParallel:
diff --git a/easycv/core/visualization/image.py b/easycv/core/visualization/image.py
index 3b61f50b..9c79341c 100644
--- a/easycv/core/visualization/image.py
+++ b/easycv/core/visualization/image.py
@@ -10,6 +10,8 @@ import numpy as np
from mmcv.utils.misc import deprecated_api_warning
from PIL import Image, ImageDraw, ImageFont
+from easycv.framework.errors import FileNotFoundError
+
def get_font_path():
root_path = opd(opd(opd(os.path.realpath(__file__))))
@@ -22,8 +24,8 @@ def get_font_path():
elif os.path.exists(find_path_source):
return find_path_source
else:
- raise ValueError('Not find font file both in %s and %s' %
- (find_path_whl, find_path_source))
+        raise FileNotFoundError('Cannot find font file in either %s or %s' %
+ (find_path_whl, find_path_source))
_FONT_PATH = get_font_path()
diff --git a/easycv/datasets/classification/data_sources/image_list.py b/easycv/datasets/classification/data_sources/image_list.py
index 9835daa7..e37f9fa8 100644
--- a/easycv/datasets/classification/data_sources/image_list.py
+++ b/easycv/datasets/classification/data_sources/image_list.py
@@ -7,6 +7,7 @@ from PIL import Image, ImageFile
from easycv.datasets.registry import DATASOURCES
from easycv.file import io
+from easycv.framework.errors import TypeError
from easycv.utils.dist_utils import dist_zero_exec
from .utils import split_listfile_byrank
@@ -54,8 +55,8 @@ class ClsSourceImageList(object):
'list_file should be str or list(str)'
root = [root] if isinstance(root, str) else root
if not isinstance(root, list):
- raise ValueError('root must be str or list(str), but get %s' %
- type(root))
+ raise TypeError('root must be str or list(str), but get %s' %
+ type(root))
if len(root) < len(list_file):
logging.warning(
diff --git a/easycv/datasets/classification/odps.py b/easycv/datasets/classification/odps.py
index 24e24006..e8bf62f5 100644
--- a/easycv/datasets/classification/odps.py
+++ b/easycv/datasets/classification/odps.py
@@ -3,6 +3,7 @@ from PIL import Image
from easycv.datasets.registry import DATASETS
from easycv.datasets.shared.base import BaseDataset
+from easycv.framework.errors import NotImplementedError
@DATASETS.register_module
diff --git a/easycv/datasets/classification/pipelines/auto_augment.py b/easycv/datasets/classification/pipelines/auto_augment.py
index e9bef83a..84c8b3ae 100644
--- a/easycv/datasets/classification/pipelines/auto_augment.py
+++ b/easycv/datasets/classification/pipelines/auto_augment.py
@@ -12,6 +12,7 @@ from PIL import Image, ImageFilter
from easycv.datasets.registry import PIPELINES
from easycv.datasets.shared.pipelines import Compose
+from easycv.framework.errors import TypeError
# Default hyperparameters for all Ops
_HPARAMS_DEFAULT = dict(pad_val=128)
diff --git a/easycv/datasets/detection/data_sources/base.py b/easycv/datasets/detection/data_sources/base.py
index 60bf04a3..fb1677be 100644
--- a/easycv/datasets/detection/data_sources/base.py
+++ b/easycv/datasets/detection/data_sources/base.py
@@ -10,6 +10,7 @@ from mmcv.runner.dist_utils import get_dist_info
from tqdm import tqdm
from easycv.file.image import load_image
+from easycv.framework.errors import NotImplementedError, ValueError
def _load_image(img_path):
diff --git a/easycv/datasets/detection/data_sources/coco.py b/easycv/datasets/detection/data_sources/coco.py
index 1f1efca4..76709f32 100644
--- a/easycv/datasets/detection/data_sources/coco.py
+++ b/easycv/datasets/detection/data_sources/coco.py
@@ -4,6 +4,7 @@ from xtcocotools.coco import COCO
from easycv.datasets.registry import DATASOURCES, PIPELINES
from easycv.datasets.shared.pipelines import Compose
+from easycv.framework.errors import TypeError
from easycv.utils.registry import build_from_cfg
diff --git a/easycv/datasets/detection/data_sources/coco_panoptic.py b/easycv/datasets/detection/data_sources/coco_panoptic.py
index fcf2060a..9ee2ea96 100644
--- a/easycv/datasets/detection/data_sources/coco_panoptic.py
+++ b/easycv/datasets/detection/data_sources/coco_panoptic.py
@@ -8,6 +8,7 @@ from xtcocotools.coco import COCO
from easycv.datasets.detection.data_sources import DetSourceCoco
from easycv.datasets.registry import DATASOURCES, PIPELINES
from easycv.datasets.shared.pipelines import Compose
+from easycv.framework.errors import RuntimeError, TypeError
from easycv.utils.registry import build_from_cfg
try:
diff --git a/easycv/datasets/detection/data_sources/pai_format.py b/easycv/datasets/detection/data_sources/pai_format.py
index 6f390c86..8ad26e5a 100644
--- a/easycv/datasets/detection/data_sources/pai_format.py
+++ b/easycv/datasets/detection/data_sources/pai_format.py
@@ -8,6 +8,7 @@ import numpy as np
from easycv.datasets.detection.data_sources.base import DetSourceBase
from easycv.datasets.registry import DATASOURCES
from easycv.file import io
+from easycv.framework.errors import NotImplementedError, ValueError
def get_prior_task_id(keys):
diff --git a/easycv/datasets/detection/mix.py b/easycv/datasets/detection/mix.py
index 6e7d203f..19394908 100644
--- a/easycv/datasets/detection/mix.py
+++ b/easycv/datasets/detection/mix.py
@@ -9,6 +9,7 @@ import numpy as np
import torch
from easycv.datasets.registry import DATASETS, PIPELINES
+from easycv.framework.errors import TypeError
from easycv.utils.bbox_util import xyxy2xywh as xyxy2cxcywh
from easycv.utils.registry import build_from_cfg
from .raw import DetDataset
diff --git a/easycv/datasets/detection/pipelines/mm_transforms.py b/easycv/datasets/detection/pipelines/mm_transforms.py
index cd4257cc..0c5846ec 100644
--- a/easycv/datasets/detection/pipelines/mm_transforms.py
+++ b/easycv/datasets/detection/pipelines/mm_transforms.py
@@ -13,6 +13,7 @@ from torchvision.transforms import functional as F
from easycv.datasets.registry import PIPELINES
from easycv.datasets.shared.pipelines.transforms import Compose
+from easycv.framework.errors import KeyError, NotImplementedError, TypeError
try:
from panopticapi.utils import rgb2id
@@ -1122,8 +1123,8 @@ class MMRandomFlip:
elif flip_ratio is None:
pass
else:
- raise ValueError('flip_ratios must be None, float, '
- 'or list of float')
+ raise TypeError('flip_ratios must be None, float, '
+ 'or list of float')
self.flip_ratio = flip_ratio
valid_directions = ['horizontal', 'vertical', 'diagonal']
@@ -1133,7 +1134,7 @@ class MMRandomFlip:
assert mmcv.is_list_of(direction, str)
assert set(direction).issubset(set(valid_directions))
else:
- raise ValueError('direction must be either str or list of str')
+ raise TypeError('direction must be either str or list of str')
self.direction = direction
if isinstance(flip_ratio, list):
@@ -1168,7 +1169,7 @@ class MMRandomFlip:
flipped[..., 2::4] = w - bboxes[..., 0::4]
flipped[..., 3::4] = h - bboxes[..., 1::4]
else:
- raise ValueError(f"Invalid flipping direction '{direction}'")
+ raise KeyError(f"Invalid flipping direction '{direction}'")
return flipped
def __call__(self, results):
@@ -1274,7 +1275,7 @@ class MMRandomCrop:
if crop_type not in [
'relative_range', 'relative', 'absolute', 'absolute_range'
]:
- raise ValueError(f'Invalid crop_type {crop_type}.')
+ raise KeyError(f'Invalid crop_type {crop_type}.')
if crop_type in ['absolute', 'absolute_range']:
assert crop_size[0] > 0 and crop_size[1] > 0
assert isinstance(crop_size[0], int) and isinstance(
diff --git a/easycv/datasets/detection/raw.py b/easycv/datasets/detection/raw.py
index 49f6a3b1..3f6800a1 100644
--- a/easycv/datasets/detection/raw.py
+++ b/easycv/datasets/detection/raw.py
@@ -9,6 +9,7 @@ from easycv.datasets.detection.data_sources import DetSourceCoco
from easycv.datasets.registry import DATASETS
from easycv.datasets.shared.base import BaseDataset
from easycv.file.image import load_image
+from easycv.framework.errors import TimeoutError
@DATASETS.register_module
@@ -38,7 +39,7 @@ class DetDataset(BaseDataset):
count = 0
while True:
if count > 10:
- raise RuntimeError('Loops timeout')
+ raise TimeoutError('Loops timeout')
data_dict = self.data_source[idx]
data_dict = self.pipeline(data_dict)
if data_dict is None:
diff --git a/easycv/datasets/loader/build_loader.py b/easycv/datasets/loader/build_loader.py
index 08127325..4977553b 100644
--- a/easycv/datasets/loader/build_loader.py
+++ b/easycv/datasets/loader/build_loader.py
@@ -11,6 +11,7 @@ from mmcv.runner import get_dist_info
from torch.utils.data import DataLoader, RandomSampler
from easycv.datasets.shared.odps_reader import set_dataloader_workid
+from easycv.framework.errors import NotImplementedError
from easycv.utils.dist_utils import sync_random_seed
from easycv.utils.torchacc_util import is_torchacc_enabled
from .collate import CollateWrapper
diff --git a/easycv/datasets/loader/sampler.py b/easycv/datasets/loader/sampler.py
index 4c22695f..fd39d054 100644
--- a/easycv/datasets/loader/sampler.py
+++ b/easycv/datasets/loader/sampler.py
@@ -11,6 +11,8 @@ from mmcv.runner import get_dist_info
from torch.utils.data import DistributedSampler as _DistributedSampler
from torch.utils.data import Sampler
+from easycv.framework.errors import ValueError
+
class DistributedMPSampler(_DistributedSampler):
@@ -84,7 +86,9 @@ class DistributedMPSampler(_DistributedSampler):
self.label_list = []
if not self.dataset.data_source.has_labels:
- raise 'MPSampler need initial with classification datasets which has label!'
+ raise ValueError(
+                'MPSampler must be initialized with a classification dataset which has labels!'
+ )
for idx, label in enumerate(self.dataset.data_source.labels):
if label in self.label_dict.keys():
diff --git a/easycv/datasets/pose/data_sources/coco.py b/easycv/datasets/pose/data_sources/coco.py
index 86db5dd0..0f9e9260 100644
--- a/easycv/datasets/pose/data_sources/coco.py
+++ b/easycv/datasets/pose/data_sources/coco.py
@@ -7,6 +7,7 @@ import json_tricks as json
import numpy as np
from easycv.datasets.registry import DATASOURCES
+from easycv.framework.errors import ValueError
from .top_down import PoseTopDownSource
COCO_DATASET_INFO = dict(
diff --git a/easycv/datasets/pose/data_sources/top_down.py b/easycv/datasets/pose/data_sources/top_down.py
index f892bc4c..3f20d7b3 100644
--- a/easycv/datasets/pose/data_sources/top_down.py
+++ b/easycv/datasets/pose/data_sources/top_down.py
@@ -12,6 +12,7 @@ from mmcv.utils.path import is_filepath
from xtcocotools.coco import COCO
from easycv.datasets.registry import DATASOURCES
+from easycv.framework.errors import ValueError
class DatasetInfo:
diff --git a/easycv/datasets/pose/hand_coco_wholebody_dataset.py b/easycv/datasets/pose/hand_coco_wholebody_dataset.py
index 3084ba02..5cbd65a7 100644
--- a/easycv/datasets/pose/hand_coco_wholebody_dataset.py
+++ b/easycv/datasets/pose/hand_coco_wholebody_dataset.py
@@ -6,6 +6,7 @@ from easycv.core.evaluation.keypoint_eval import KeyPointEvaluator
from easycv.datasets.pose.data_sources.coco import PoseTopDownSource
from easycv.datasets.registry import DATASETS
from easycv.datasets.shared.base import BaseDataset
+from easycv.framework.errors import ValueError
@DATASETS.register_module()
diff --git a/easycv/datasets/pose/pipelines/transforms.py b/easycv/datasets/pose/pipelines/transforms.py
index 27c7c325..8401ee8f 100644
--- a/easycv/datasets/pose/pipelines/transforms.py
+++ b/easycv/datasets/pose/pipelines/transforms.py
@@ -9,6 +9,7 @@ from easycv.core.post_processing import (affine_transform, fliplr_joints,
get_affine_transform, get_warp_matrix,
warp_affine_joints)
from easycv.datasets.registry import PIPELINES
+from easycv.framework.errors import ValueError
@PIPELINES.register_module()
diff --git a/easycv/datasets/pose/top_down.py b/easycv/datasets/pose/top_down.py
index 3d972208..1946a654 100644
--- a/easycv/datasets/pose/top_down.py
+++ b/easycv/datasets/pose/top_down.py
@@ -3,6 +3,7 @@ from easycv.core.evaluation.coco_evaluation import CoCoPoseTopDownEvaluator
from easycv.datasets.pose.data_sources.coco import PoseTopDownSource
from easycv.datasets.registry import DATASETS
from easycv.datasets.shared.base import BaseDataset
+from easycv.framework.errors import ValueError
@DATASETS.register_module()
diff --git a/easycv/datasets/segmentation/data_sources/base.py b/easycv/datasets/segmentation/data_sources/base.py
index 888fd477..b8dc3673 100644
--- a/easycv/datasets/segmentation/data_sources/base.py
+++ b/easycv/datasets/segmentation/data_sources/base.py
@@ -12,6 +12,7 @@ from tqdm import tqdm
from easycv.datasets.registry import DATASOURCES
from easycv.file.image import load_image as _load_img
+from easycv.framework.errors import NotImplementedError, ValueError
def load_image(img_path):
diff --git a/easycv/datasets/selfsup/data_sources/image_list.py b/easycv/datasets/selfsup/data_sources/image_list.py
index fa61de5d..93637b57 100644
--- a/easycv/datasets/selfsup/data_sources/image_list.py
+++ b/easycv/datasets/selfsup/data_sources/image_list.py
@@ -7,6 +7,7 @@ from PIL import Image, ImageFile
from easycv.datasets.registry import DATASOURCES
from easycv.file import io
+from easycv.framework.errors import ValueError
@DATASOURCES.register_module
diff --git a/easycv/datasets/shared/multi_view.py b/easycv/datasets/shared/multi_view.py
index 7c96d9d6..b5bbd43f 100644
--- a/easycv/datasets/shared/multi_view.py
+++ b/easycv/datasets/shared/multi_view.py
@@ -7,6 +7,7 @@ from easycv.datasets.builder import build_datasource
from easycv.datasets.registry import DATASETS, PIPELINES
from easycv.datasets.shared.base import BaseDataset
from easycv.datasets.shared.pipelines.transforms import Compose
+from easycv.framework.errors import NotImplementedError
from easycv.utils.registry import build_from_cfg
diff --git a/easycv/datasets/shared/pipelines/format.py b/easycv/datasets/shared/pipelines/format.py
index 22a463e2..d46e0c34 100644
--- a/easycv/datasets/shared/pipelines/format.py
+++ b/easycv/datasets/shared/pipelines/format.py
@@ -7,6 +7,7 @@ import torch
from mmcv.parallel import DataContainer as DC
from easycv.datasets.registry import PIPELINES
+from easycv.framework.errors import TypeError
def to_tensor(data):
diff --git a/easycv/datasets/shared/pipelines/transforms.py b/easycv/datasets/shared/pipelines/transforms.py
index 7de5c0a9..31e4a966 100644
--- a/easycv/datasets/shared/pipelines/transforms.py
+++ b/easycv/datasets/shared/pipelines/transforms.py
@@ -6,6 +6,7 @@ import numpy as np
from easycv.datasets.registry import PIPELINES
from easycv.file.image import load_image
+from easycv.framework.errors import TypeError
from easycv.utils.registry import build_from_cfg
diff --git a/easycv/datasets/shared/raw.py b/easycv/datasets/shared/raw.py
index ed30275c..10f5e0a3 100644
--- a/easycv/datasets/shared/raw.py
+++ b/easycv/datasets/shared/raw.py
@@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.datasets.registry import DATASETS
+from easycv.framework.errors import NotImplementedError
from .base import BaseDataset
diff --git a/easycv/file/base.py b/easycv/file/base.py
index 291e219c..47ee5d73 100644
--- a/easycv/file/base.py
+++ b/easycv/file/base.py
@@ -9,6 +9,8 @@ from datetime import datetime
from functools import lru_cache
from typing import List, Union
+from easycv.framework.errors import NotImplementedError
+
class IOBase:
diff --git a/easycv/file/file_io.py b/easycv/file/file_io.py
index 04e13743..ac3f5d28 100644
--- a/easycv/file/file_io.py
+++ b/easycv/file/file_io.py
@@ -11,6 +11,8 @@ from typing import List, Union
from tqdm import tqdm
from tqdm.utils import CallbackIOWrapper
+from easycv.framework.errors import (FileNotFoundError, IOError, RuntimeError,
+ ValueError)
from .base import IOLocal
from .utils import (OSS_PREFIX, create_namedtuple, get_oss_config, is_oss_path,
mute_stderr, oss_progress)
@@ -198,7 +200,7 @@ class IO(IOLocal):
time.sleep(3)
if data is None:
- raise ValueError('Read file error: %s!' % full_path)
+ raise IOError('Read file error: %s!' % full_path)
if mode == 'rb':
return NullContextWrapper(BytesIO(data))
diff --git a/easycv/file/image.py b/easycv/file/image.py
index 2b5420b2..a6253427 100644
--- a/easycv/file/image.py
+++ b/easycv/file/image.py
@@ -7,6 +7,7 @@ import numpy as np
from PIL import Image
from easycv.file import io
+from easycv.framework.errors import IOError
from easycv.utils.constant import MAX_READ_IMAGE_TRY_TIMES
from .utils import is_oss_path
@@ -43,6 +44,6 @@ def load_image(img_path, mode='BGR', max_try_times=MAX_READ_IMAGE_TRY_TIMES):
try_cnt += 1
if img is None:
- raise ValueError('Read Image Error: ' + img_path)
+ raise IOError('Read Image Error: ' + img_path)
return img
diff --git a/easycv/file/utils.py b/easycv/file/utils.py
index dcf13f4c..49920611 100644
--- a/easycv/file/utils.py
+++ b/easycv/file/utils.py
@@ -10,6 +10,8 @@ from io import StringIO
from tqdm import tqdm
+from easycv.framework.errors import ValueError
+
OSS_PREFIX = 'oss://'
URL_PREFIX = 'https://'
diff --git a/easycv/framework/__init__.py b/easycv/framework/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/easycv/framework/errors.py b/easycv/framework/errors.py
new file mode 100644
index 00000000..4fa8e8b7
--- /dev/null
+++ b/easycv/framework/errors.py
@@ -0,0 +1,128 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+# total 64 bit
+# 63~64 (issue category): 01 (user), ...
+# 60~62 (error severity): 001 (ERROR), 010 (WARNING), 011 (INFO), 100 (DEBUG), ...
+# 54~59 (product): 00000011 (PAI)
+# 49~53 (sub product): 00000 (none)
+# 41~48 (framework): 00000001 (tensorflow), 00000010 (pytorch)
+# 1~40 (error code)
+OK = 0x5818008000000000
+RUNTIME = 0x4818008000000001
+UNIMPLEMENTED = 0x4818008000000002
+INVALID_ARGUMENT = 0x4818008000000003
+INVALID_VALUE = 0x4818008000000004
+INVALID_KEY = 0x4818008000000005
+INVALID_TYPE = 0x4818008000000006
+MODULE_NOT_FOUND = 0x4818008000000007
+FILE_NOT_FOUND = 0x4818008000000008
+IO_FAILED = 0x4818008000000009
+PERMISSION_DENIED = 0x481800800000000a
+TIMEOUT = 0x481800800000000b
+
+
+class BaseError(Exception):
+ """The base error class for exceptions.
+ """
+ code = None
+
+ def __init__(self, message='', details=None, op=None):
+        """Creates a new error indicating that a particular op failed.
+
+        Args:
+            message: The message string describing the failure.
+            details: The help message for handling the error.
+            op: The operation that failed, if known; otherwise None. This
+                field is optional and defaults to `None`.
+ """
+ super(BaseError, self).__init__()
+ self._op = op
+ self._message = message
+ self._details = details
+
+ @property
+ def message(self):
+ """The error message that describes the error."""
+ return self._message
+
+ @property
+ def details(self):
+        """The help message for handling the error."""
+ return self._details
+
+ @property
+ def op(self):
+ """The operation that failed, if known.
+ Returns:
+ The `Operation` that failed, or None.
+ """
+ return self._op
+
+ @property
+ def error_code(self):
+ """The integer error code that describes the error."""
+ return hex(self.code)
+
+ def __str__(self):
+ print_str = 'ErrorCode: ' + self.error_code
+ if self.op is not None:
+ print_str += '\n' + 'Operation: ' + str(self.op)
+ print_str += '\n' + 'Message: ' + self.message
+ if self.details is not None:
+ print_str += '\n' + 'Details: ' + self.details
+ return print_str
+
+
+class NotImplementedError(BaseError):
+ """Raised when an operation has not been implemented."""
+ code = UNIMPLEMENTED
+
+
+class RuntimeError(BaseError):
+ """Raised when the system experiences an internal error."""
+ code = RUNTIME
+
+
+class PermissionDeniedError(BaseError):
+ """Raised when the caller does not have permission to run an operation."""
+ code = PERMISSION_DENIED
+
+
+class FileNotFoundError(BaseError):
+ """Raised when a requested entity was not found."""
+ code = FILE_NOT_FOUND
+
+
+class ModuleNotFoundError(BaseError):
+ """Raised when a module could not be located."""
+ code = MODULE_NOT_FOUND
+
+
+class InvalidArgumentError(BaseError):
+ """Raised when an operation receives an invalid argument."""
+ code = INVALID_ARGUMENT
+
+
+class TimeoutError(BaseError):
+ """Raised when an operation timed out."""
+ code = TIMEOUT
+
+
+class IOError(BaseError):
+ """Raised when an operation returns a system-related error, including I/O failures."""
+ code = IO_FAILED
+
+
+class ValueError(BaseError):
+ """Raised when an operation receives an invalid value."""
+ code = INVALID_VALUE
+
+
+class KeyError(BaseError):
+ """Raised when a mapping (dictionary) key is not found in the set of existing keys."""
+ code = INVALID_KEY
+
+
+class TypeError(BaseError):
+ """Raised when an operation or function is applied to an object of inappropriate type."""
+ code = INVALID_TYPE
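These classes intentionally shadow the built-in exception names, which is why the rest of this patch imports them explicitly (`from easycv.framework.errors import ValueError`, and so on) instead of relying on the builtins. Each error carries a 64-bit code whose high bits encode category, severity, product and framework, with the low 40 bits identifying the specific error. A minimal usage sketch (the alias is only to avoid shadowing the built-in inside this snippet):

```python
from easycv.framework.errors import BaseError
from easycv.framework.errors import ValueError as EasyCVValueError

try:
    raise EasyCVValueError(
        'root must be str or list(str)', details='check the data_source config')
except BaseError as e:               # all easycv errors share this base class
    print(e.error_code)              # '0x4818008000000004', i.e. INVALID_VALUE
    print(e)                         # ErrorCode / Message / Details via __str__
```

Since every class derives from `BaseError`, callers can catch the whole family with a single `except BaseError` clause, as above.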
diff --git a/easycv/hooks/eval_hook.py b/easycv/hooks/eval_hook.py
index e221e160..a4065617 100644
--- a/easycv/hooks/eval_hook.py
+++ b/easycv/hooks/eval_hook.py
@@ -7,6 +7,7 @@ from mmcv.runner import Hook
from torch.utils.data import DataLoader
from easycv.datasets.loader.loader_wrapper import TorchaccLoaderWrapper
+from easycv.framework.errors import TypeError
from easycv.hooks.tensorboard import TensorboardLoggerHookV2
from easycv.hooks.wandb import WandbLoggerHookV2
diff --git a/easycv/hooks/extractor.py b/easycv/hooks/extractor.py
index 6e6acafc..e81be9ae 100644
--- a/easycv/hooks/extractor.py
+++ b/easycv/hooks/extractor.py
@@ -2,6 +2,7 @@
import torch.nn as nn
from torch.utils.data import Dataset
+from easycv.framework.errors import TypeError
from easycv.utils.collect import dist_forward_collect, nondist_forward_collect
diff --git a/easycv/hooks/optimizer_hook.py b/easycv/hooks/optimizer_hook.py
index 44a6d49b..e31be411 100644
--- a/easycv/hooks/optimizer_hook.py
+++ b/easycv/hooks/optimizer_hook.py
@@ -6,6 +6,7 @@ import torch
from mmcv.parallel import is_module_wrapper
from mmcv.runner import OptimizerHook as _OptimizerHook
+from easycv.framework.errors import TypeError
from easycv.utils.dist_utils import get_dist_info
from easycv.utils.torchacc_util import is_torchacc_enabled
@@ -134,7 +135,7 @@ class AMPFP16OptimizerHook(OptimizerHook):
elif isinstance(loss_scale, dict):
self.scaler = amp.GradScaler(**loss_scale)
else:
- raise ValueError(
+ raise TypeError(
'`loss_scale` type must be in [float, dict], but got {loss_scale}'
)
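For reference, the two `loss_scale` forms this message names (the config values below are hypothetical; the dict form is unpacked into `torch.cuda.amp.GradScaler`):

    optimizer_config = dict(loss_scale=512.0)                   # plain float scale
    optimizer_config = dict(loss_scale=dict(init_scale=512.0))  # kwargs forwarded to amp.GradScaler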
diff --git a/easycv/models/backbones/hrnet.py b/easycv/models/backbones/hrnet.py
index 90730d02..09cb2198 100644
--- a/easycv/models/backbones/hrnet.py
+++ b/easycv/models/backbones/hrnet.py
@@ -7,6 +7,7 @@ from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init,
normal_init)
from torch.nn.modules.batchnorm import _BatchNorm
+from easycv.framework.errors import NotImplementedError, TypeError, ValueError
from easycv.models.registry import BACKBONES
from ..modelzoo import hrnet as model_urls
from .resnet import BasicBlock
diff --git a/easycv/models/backbones/lighthrnet.py b/easycv/models/backbones/lighthrnet.py
index 503f9d24..13846e0e 100644
--- a/easycv/models/backbones/lighthrnet.py
+++ b/easycv/models/backbones/lighthrnet.py
@@ -11,6 +11,7 @@ from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule,
normal_init)
from torch.nn.modules.batchnorm import _BatchNorm
+from easycv.framework.errors import ValueError
from easycv.models.registry import BACKBONES
diff --git a/easycv/models/backbones/mit.py b/easycv/models/backbones/mit.py
index a9cf2c01..c957b737 100644
--- a/easycv/models/backbones/mit.py
+++ b/easycv/models/backbones/mit.py
@@ -13,6 +13,7 @@ from mmcv.cnn.utils.weight_init import (constant_init, normal_init,
trunc_normal_init)
from mmcv.runner import BaseModule, ModuleList, Sequential
+from easycv.framework.errors import TypeError
from easycv.models.registry import BACKBONES
from easycv.models.segmentation.utils import (PatchEmbed, nchw_to_nlc,
nlc_to_nchw)
diff --git a/easycv/models/backbones/mobilenetv2.py b/easycv/models/backbones/mobilenetv2.py
index 860b26d9..283cd55a 100644
--- a/easycv/models/backbones/mobilenetv2.py
+++ b/easycv/models/backbones/mobilenetv2.py
@@ -5,6 +5,7 @@ r""" This model is taken from the official PyTorch model zoo.
from torch import nn
+from easycv.framework.errors import ValueError
from ..modelzoo import mobilenetv2 as model_urls
from ..registry import BACKBONES
diff --git a/easycv/models/backbones/pytorch_image_models_wrapper.py b/easycv/models/backbones/pytorch_image_models_wrapper.py
index 176d286f..1072056d 100644
--- a/easycv/models/backbones/pytorch_image_models_wrapper.py
+++ b/easycv/models/backbones/pytorch_image_models_wrapper.py
@@ -7,6 +7,7 @@ import torch.nn as nn
from timm.models.helpers import load_pretrained
from timm.models.hub import download_cached_file
+from easycv.framework.errors import ValueError
from easycv.utils.logger import get_root_logger, print_log
from ..modelzoo import timm_models as model_urls
from ..registry import BACKBONES
diff --git a/easycv/models/backbones/resnest.py b/easycv/models/backbones/resnest.py
index 13ef9987..6bd5a08e 100644
--- a/easycv/models/backbones/resnest.py
+++ b/easycv/models/backbones/resnest.py
@@ -14,6 +14,7 @@ import torch.nn.functional as F
from torch.nn import Conv2d, Module, ReLU
from torch.nn.modules.utils import _pair
+from easycv.framework.errors import KeyError, NotImplementedError, RuntimeError
from ..registry import BACKBONES
diff --git a/easycv/models/backbones/resnet.py b/easycv/models/backbones/resnet.py
index 6e083f86..ca5c959f 100644
--- a/easycv/models/backbones/resnet.py
+++ b/easycv/models/backbones/resnet.py
@@ -4,6 +4,7 @@ import torch.utils.checkpoint as cp
from mmcv.cnn import constant_init, kaiming_init
from torch.nn.modules.batchnorm import _BatchNorm
+from easycv.framework.errors import KeyError
from ..modelzoo import resnet as model_urls
from ..registry import BACKBONES
from ..utils import FReLU, build_conv_layer, build_norm_layer
diff --git a/easycv/models/backbones/resnet_jit.py b/easycv/models/backbones/resnet_jit.py
index 0d55e59a..4e241f86 100644
--- a/easycv/models/backbones/resnet_jit.py
+++ b/easycv/models/backbones/resnet_jit.py
@@ -6,6 +6,7 @@ import torch.nn as nn
from mmcv.cnn import constant_init, kaiming_init
from torch.nn.modules.batchnorm import _BatchNorm
+from easycv.framework.errors import KeyError
from ..registry import BACKBONES
from ..utils import build_conv_layer, build_norm_layer
diff --git a/easycv/models/backbones/shuffle_transformer.py b/easycv/models/backbones/shuffle_transformer.py
index 965df17e..c53c103d 100644
--- a/easycv/models/backbones/shuffle_transformer.py
+++ b/easycv/models/backbones/shuffle_transformer.py
@@ -7,6 +7,7 @@ from einops import rearrange
from timm.models.layers import DropPath, trunc_normal_
from torch import nn
+from easycv.framework.errors import NotImplementedError
from ..registry import BACKBONES
diff --git a/easycv/models/backbones/xcit_transformer.py b/easycv/models/backbones/xcit_transformer.py
index 0ee9cf87..18722f1e 100644
--- a/easycv/models/backbones/xcit_transformer.py
+++ b/easycv/models/backbones/xcit_transformer.py
@@ -19,6 +19,7 @@ import torch.nn as nn
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.vision_transformer import Mlp, _cfg
+from easycv.framework.errors import ValueError
from ..registry import BACKBONES
@@ -109,7 +110,7 @@ class ConvPatchEmbed(nn.Module):
conv3x3(embed_dim // 2, embed_dim, 2),
)
else:
- raise (
+ raise ValueError(
'For convolutional projection, patch size has to be in [8, 16]'
)
diff --git a/easycv/models/base.py b/easycv/models/base.py
index 0385baf7..1f4c3278 100644
--- a/easycv/models/base.py
+++ b/easycv/models/base.py
@@ -8,6 +8,8 @@ import torch.distributed as dist
import torch.nn as nn
from torch import Tensor
+from easycv.framework.errors import NotImplementedError, TypeError
+
class BaseModel(nn.Module, metaclass=ABCMeta):
''' base class for model. '''
diff --git a/easycv/models/classification/classification.py b/easycv/models/classification/classification.py
index 34bde969..ccd30d50 100644
--- a/easycv/models/classification/classification.py
+++ b/easycv/models/classification/classification.py
@@ -7,6 +7,7 @@ import torch.nn as nn
from mmcv.runner import get_dist_info
from timm.data.mixup import Mixup
+from easycv.framework.errors import KeyError, NotImplementedError, ValueError
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.logger import get_root_logger, print_log
from easycv.utils.preprocess_function import (bninceptionPre, gaussianBlur,
@@ -300,4 +301,4 @@ class Classification(BaseModel):
rv['gt_labels'] = gt_labels.cpu()
return rv
else:
- raise Exception('No such mode: {}'.format(mode))
+ raise KeyError('No such mode: {}'.format(mode))
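Context for this and the similar `mode` fixes below: `forward` dispatches on a `mode` string, and an unrecognized value now raises the framework `KeyError` rather than a bare `Exception`. A hypothetical call pattern (argument names assumed for illustration only):

    # given a built Classification model `model` and an image batch `img`
    losses = model(img, mode='train')    # training losses (labels passed via kwargs)
    feats = model(img, mode='extract')   # feature extraction
    model(img, mode='predict')           # raises the easycv KeyError: 'No such mode: predict'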
diff --git a/easycv/models/detection/detectors/dab_detr/attention.py b/easycv/models/detection/detectors/dab_detr/attention.py
index ce4b6929..95b952cd 100644
--- a/easycv/models/detection/detectors/dab_detr/attention.py
+++ b/easycv/models/detection/detectors/dab_detr/attention.py
@@ -29,6 +29,8 @@ from torch.nn.init import constant_
from torch.nn.modules.linear import Linear
from torch.nn.modules.module import Module
+from easycv.framework.errors import RuntimeError
+
try:
from torch.overrides import has_torch_function, handle_torch_function
except:
diff --git a/easycv/models/detection/detectors/dab_detr/dab_detr_transformer.py b/easycv/models/detection/detectors/dab_detr/dab_detr_transformer.py
index 06345b11..1802f8e9 100644
--- a/easycv/models/detection/detectors/dab_detr/dab_detr_transformer.py
+++ b/easycv/models/detection/detectors/dab_detr/dab_detr_transformer.py
@@ -14,6 +14,7 @@ import torch
import torch.nn.functional as F
from torch import Tensor, nn
+from easycv.framework.errors import NotImplementedError, ValueError
from easycv.models.builder import NECKS
from easycv.models.detection.utils import inverse_sigmoid
from easycv.models.utils import (MLP, TransformerEncoder,
diff --git a/easycv/models/detection/detectors/detection.py b/easycv/models/detection/detectors/detection.py
index fe91fbf8..bcd8edf0 100644
--- a/easycv/models/detection/detectors/detection.py
+++ b/easycv/models/detection/detectors/detection.py
@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.framework.errors import ValueError
from easycv.models.base import BaseModel
from easycv.models.builder import (MODELS, build_backbone, build_head,
build_neck)
diff --git a/easycv/models/detection/detectors/dino/deformable_transformer.py b/easycv/models/detection/detectors/dino/deformable_transformer.py
index 57d5f51d..447af568 100644
--- a/easycv/models/detection/detectors/dino/deformable_transformer.py
+++ b/easycv/models/detection/detectors/dino/deformable_transformer.py
@@ -15,6 +15,7 @@ from typing import Optional
import torch
from torch import Tensor, nn
+from easycv.framework.errors import NotImplementedError
from easycv.models.builder import NECKS
from easycv.models.detection.utils import (gen_encoder_output_proposals,
gen_sineembed_for_position,
diff --git a/easycv/models/detection/detectors/dino/dino_head.py b/easycv/models/detection/detectors/dino/dino_head.py
index bd581418..19ac173c 100644
--- a/easycv/models/detection/detectors/dino/dino_head.py
+++ b/easycv/models/detection/detectors/dino/dino_head.py
@@ -7,6 +7,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
+from easycv.framework.errors import NotImplementedError
from easycv.models.builder import HEADS, build_neck
from easycv.models.detection.utils import (DetrPostProcess, box_xyxy_to_cxcywh,
inverse_sigmoid)
diff --git a/easycv/models/detection/detectors/yolox/asff.py b/easycv/models/detection/detectors/yolox/asff.py
index d4c62c3c..7af1ae84 100644
--- a/easycv/models/detection/detectors/yolox/asff.py
+++ b/easycv/models/detection/detectors/yolox/asff.py
@@ -3,6 +3,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
+from easycv.framework.errors import ValueError
from easycv.models.backbones.network_blocks import BaseConv
diff --git a/easycv/models/detection/detectors/yolox/yolo_head_template.py b/easycv/models/detection/detectors/yolox/yolo_head_template.py
index 63923abf..a8e4fb03 100644
--- a/easycv/models/detection/detectors/yolox/yolo_head_template.py
+++ b/easycv/models/detection/detectors/yolox/yolo_head_template.py
@@ -8,6 +8,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
+from easycv.framework.errors import KeyError, RuntimeError
from easycv.models.backbones.network_blocks import BaseConv, DWConv
from easycv.models.backbones.repvgg_yolox_backbone import RepVGGBlock
from easycv.models.detection.utils import bboxes_iou
diff --git a/easycv/models/detection/necks/fpn.py b/easycv/models/detection/necks/fpn.py
index 8018903c..6f71eda0 100644
--- a/easycv/models/detection/necks/fpn.py
+++ b/easycv/models/detection/necks/fpn.py
@@ -3,6 +3,7 @@ import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
+from easycv.framework.errors import NotImplementedError
from easycv.models.registry import NECKS
diff --git a/easycv/models/detection/necks/sfp.py b/easycv/models/detection/necks/sfp.py
index b588f643..62e581ea 100644
--- a/easycv/models/detection/necks/sfp.py
+++ b/easycv/models/detection/necks/sfp.py
@@ -3,6 +3,7 @@ import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
+from easycv.framework.errors import NotImplementedError
from easycv.models.builder import NECKS
diff --git a/easycv/models/detection/utils/misc.py b/easycv/models/detection/utils/misc.py
index a9605a3b..9a2de7a9 100644
--- a/easycv/models/detection/utils/misc.py
+++ b/easycv/models/detection/utils/misc.py
@@ -9,6 +9,8 @@ from packaging import version
from torch import Tensor
from torch.autograd import Function
+from easycv.framework.errors import NotImplementedError
+
if version.parse(torchvision.__version__) < version.parse('0.7'):
from torchvision.ops import _new_empty_tensor
from torchvision.ops.misc import _output_size
diff --git a/easycv/models/loss/cross_entropy_loss.py b/easycv/models/loss/cross_entropy_loss.py
index 0f9d5074..ad8661cb 100644
--- a/easycv/models/loss/cross_entropy_loss.py
+++ b/easycv/models/loss/cross_entropy_loss.py
@@ -7,6 +7,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
+from easycv.framework.errors import ValueError
from easycv.models.builder import LOSSES
from easycv.models.loss.utils import weight_reduce_loss
diff --git a/easycv/models/loss/focal_loss.py b/easycv/models/loss/focal_loss.py
index 0cec5ddb..f4ea5a47 100644
--- a/easycv/models/loss/focal_loss.py
+++ b/easycv/models/loss/focal_loss.py
@@ -4,6 +4,7 @@ import torch.nn as nn
import torch.nn.functional as F
from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss
+from easycv.framework.errors import NotImplementedError
from easycv.models.builder import LOSSES
from easycv.models.loss.utils import weight_reduce_loss
diff --git a/easycv/models/loss/iou_loss.py b/easycv/models/loss/iou_loss.py
index 8a4af4bb..72611f02 100644
--- a/easycv/models/loss/iou_loss.py
+++ b/easycv/models/loss/iou_loss.py
@@ -7,6 +7,7 @@ import mmcv
import torch
import torch.nn as nn
+from easycv.framework.errors import NotImplementedError
from easycv.models.detection.utils import bbox_overlaps
from easycv.models.loss.utils import weighted_loss
from ..registry import LOSSES
diff --git a/easycv/models/loss/utils.py b/easycv/models/loss/utils.py
index b08e7cf3..0164b104 100644
--- a/easycv/models/loss/utils.py
+++ b/easycv/models/loss/utils.py
@@ -4,6 +4,8 @@ import functools
import torch
import torch.nn.functional as F
+from easycv.framework.errors import ValueError
+
def reduce_loss(loss, reduction):
"""Reduce loss as specified.
diff --git a/easycv/models/pose/heads/topdown_heatmap_base_head.py b/easycv/models/pose/heads/topdown_heatmap_base_head.py
index adc3dfa6..afcfc8f3 100644
--- a/easycv/models/pose/heads/topdown_heatmap_base_head.py
+++ b/easycv/models/pose/heads/topdown_heatmap_base_head.py
@@ -7,6 +7,7 @@ import numpy as np
import torch.nn as nn
from easycv.core.evaluation.top_down_eval import keypoints_from_heatmaps
+from easycv.framework.errors import ValueError
class TopdownHeatmapBaseHead(nn.Module):
diff --git a/easycv/models/pose/heads/topdown_heatmap_simple_head.py b/easycv/models/pose/heads/topdown_heatmap_simple_head.py
index ba3c746b..8811d7eb 100644
--- a/easycv/models/pose/heads/topdown_heatmap_simple_head.py
+++ b/easycv/models/pose/heads/topdown_heatmap_simple_head.py
@@ -7,6 +7,7 @@ from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
from easycv.core.evaluation import pose_pck_accuracy
from easycv.core.post_processing import flip_back
+from easycv.framework.errors import TypeError, ValueError
from easycv.models.builder import HEADS, build_loss
from easycv.models.utils.ops import resize_tensor as resize
from .topdown_heatmap_base_head import TopdownHeatmapBaseHead
diff --git a/easycv/models/segmentation/encoder_decoder.py b/easycv/models/segmentation/encoder_decoder.py
index 6b96c98f..63577528 100644
--- a/easycv/models/segmentation/encoder_decoder.py
+++ b/easycv/models/segmentation/encoder_decoder.py
@@ -3,6 +3,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
+from easycv.framework.errors import TypeError, ValueError
from easycv.models import builder
from easycv.models.base import BaseModel
from easycv.models.builder import MODELS
diff --git a/easycv/models/segmentation/heads/base.py b/easycv/models/segmentation/heads/base.py
index f1508a25..3aaf85ff 100644
--- a/easycv/models/segmentation/heads/base.py
+++ b/easycv/models/segmentation/heads/base.py
@@ -7,6 +7,7 @@ import torch.nn as nn
from mmcv.cnn.utils import initialize
from easycv.core.evaluation.metrics import accuracy
+from easycv.framework.errors import TypeError
from easycv.models.builder import build_loss
from easycv.models.utils.ops import resize_tensor
from easycv.utils.logger import print_log
diff --git a/easycv/models/segmentation/heads/transformer_decoder.py b/easycv/models/segmentation/heads/transformer_decoder.py
index 3a42072d..88ef6303 100644
--- a/easycv/models/segmentation/heads/transformer_decoder.py
+++ b/easycv/models/segmentation/heads/transformer_decoder.py
@@ -5,6 +5,8 @@ import torch
from torch import Tensor, nn
from torch.nn import functional as F
+from easycv.framework.errors import RuntimeError, ValueError
+
class PositionEmbeddingSine(nn.Module):
"""
diff --git a/easycv/models/segmentation/utils/criterion.py b/easycv/models/segmentation/utils/criterion.py
index 490b2cb7..29345d12 100644
--- a/easycv/models/segmentation/utils/criterion.py
+++ b/easycv/models/segmentation/utils/criterion.py
@@ -8,6 +8,7 @@ import torchvision
from mmcv.runner import get_dist_info
from torch import Tensor, nn
+from easycv.framework.errors import ValueError
from .point_rend import (get_uncertain_point_coords_with_randomness,
point_sample)
diff --git a/easycv/models/selfsup/byol.py b/easycv/models/selfsup/byol.py
index 44d42d10..9bdce95d 100644
--- a/easycv/models/selfsup/byol.py
+++ b/easycv/models/selfsup/byol.py
@@ -2,6 +2,7 @@
import torch
import torch.nn as nn
+from easycv.framework.errors import KeyError
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.logger import get_root_logger
from .. import builder
@@ -97,4 +98,4 @@ class BYOL(BaseModel):
elif mode == 'extract':
return self.backbone(img)
else:
- raise Exception('No such mode: {}'.format(mode))
+ raise KeyError('No such mode: {}'.format(mode))
diff --git a/easycv/models/selfsup/dino.py b/easycv/models/selfsup/dino.py
index f9978974..1e21cad4 100644
--- a/easycv/models/selfsup/dino.py
+++ b/easycv/models/selfsup/dino.py
@@ -9,6 +9,7 @@ import torch.nn as nn
import torch.nn.functional as F
from mmcv.runner import get_dist_info
+from easycv.framework.errors import KeyError, NotImplementedError, ValueError
from easycv.utils.preprocess_function import (gaussianBlurDynamic,
randomGrayScale, solarize)
from .. import builder
@@ -416,4 +417,4 @@ class DINO(BaseModel):
# rv['gt_labels'] = gt_label.cpu()
# return rv
else:
- raise Exception('No such mode: {}'.format(mode))
+ raise KeyError('No such mode: {}'.format(mode))
diff --git a/easycv/models/selfsup/mae.py b/easycv/models/selfsup/mae.py
index 9efb686c..de9d062a 100644
--- a/easycv/models/selfsup/mae.py
+++ b/easycv/models/selfsup/mae.py
@@ -1,5 +1,6 @@
import torch
+from easycv.framework.errors import KeyError
from .. import builder
from ..base import BaseModel
from ..registry import MODELS
@@ -84,4 +85,4 @@ class MAE(BaseModel):
elif mode == 'test':
return self.forward_test(img, **kwargs)
else:
- raise Exception('No such mode: {}'.format(mode))
+ raise KeyError('No such mode: {}'.format(mode))
diff --git a/easycv/models/selfsup/moby.py b/easycv/models/selfsup/moby.py
index 8ed0e9e5..94ebb5c5 100644
--- a/easycv/models/selfsup/moby.py
+++ b/easycv/models/selfsup/moby.py
@@ -3,6 +3,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
+from easycv.framework.errors import KeyError, ValueError
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.logger import get_root_logger
from easycv.utils.preprocess_function import gaussianBlur, randomGrayScale
@@ -269,12 +270,14 @@ class MoBY(BaseModel):
if name in rd.keys():
rv[name] = rd[name]
else:
- raise 'Extract %s is not support in classification models' % name
+ raise ValueError(
+ 'Extract %s is not supported in classification models' %
+ name)
if gt_label is not None:
rv['gt_labels'] = gt_label.cpu()
return rv
else:
- raise Exception('No such mode: {}'.format(mode))
+ raise KeyError('No such mode: {}'.format(mode))
# utils
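A side note on why this family of replacements matters (the same pattern recurs in moco.py, swav.py and config_tools.py below): Python 3 refuses to raise a bare string, so the old form died with a TypeError before its message was ever shown. Illustrative snippet, not part of the patch:

    try:
        raise 'Extract %s is not supported in classification models' % 'neck'
    except TypeError as e:
        print(e)  # exceptions must derive from BaseException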
diff --git a/easycv/models/selfsup/moco.py b/easycv/models/selfsup/moco.py
index 5094f61d..e566f726 100644
--- a/easycv/models/selfsup/moco.py
+++ b/easycv/models/selfsup/moco.py
@@ -2,6 +2,7 @@
import torch
import torch.nn as nn
+from easycv.framework.errors import KeyError, ValueError
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.logger import get_root_logger
from easycv.utils.preprocess_function import gaussianBlur, randomGrayScale
@@ -232,12 +233,14 @@ class MOCO(BaseModel):
if name in rd.keys():
rv[name] = rd[name]
else:
- raise 'Extract %s is not support in classification models' % name
+ raise ValueError(
+ 'Extract %s is not supported in classification models' %
+ name)
if gt_label is not None:
rv['gt_labels'] = gt_label.cpu()
return rv
else:
- raise Exception('No such mode: {}'.format(mode))
+ raise KeyError('No such mode: {}'.format(mode))
# utils
diff --git a/easycv/models/selfsup/simclr.py b/easycv/models/selfsup/simclr.py
index 3b26eaf4..28e1b324 100644
--- a/easycv/models/selfsup/simclr.py
+++ b/easycv/models/selfsup/simclr.py
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
+from easycv.framework.errors import KeyError
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.logger import get_root_logger
from easycv.utils.preprocess_function import gaussianBlur, randomGrayScale
@@ -97,4 +98,4 @@ class SimCLR(BaseModel):
elif mode == 'extract':
return self.forward_backbone(img)
else:
- raise Exception('No such mode: {}'.format(mode))
+ raise KeyError('No such mode: {}'.format(mode))
diff --git a/easycv/models/selfsup/swav.py b/easycv/models/selfsup/swav.py
index a7e8af4f..1393fc29 100644
--- a/easycv/models/selfsup/swav.py
+++ b/easycv/models/selfsup/swav.py
@@ -5,6 +5,7 @@ import torch.distributed as dist
import torch.nn as nn
from mmcv.runner import get_dist_info
+from easycv.framework.errors import KeyError, ValueError
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.logger import get_root_logger
from easycv.utils.preprocess_function import gaussianBlur, randomGrayScale
@@ -193,12 +194,14 @@ class SWAV(BaseModel):
if name in rd.keys():
rv[name] = rd[name]
else:
- raise 'Extract %s is not support in classification models' % name
+ raise ValueError(
+ 'Extract %s is not supported in classification models' %
+ name)
if gt_label is not None:
rv['gt_labels'] = gt_label.cpu()
return rv
else:
- raise Exception('No such mode: {}'.format(mode))
+ raise KeyError('No such mode: {}'.format(mode))
class MultiPrototypes(nn.Module):
diff --git a/easycv/models/utils/conv_module.py b/easycv/models/utils/conv_module.py
index 26364a43..bfebf816 100644
--- a/easycv/models/utils/conv_module.py
+++ b/easycv/models/utils/conv_module.py
@@ -4,6 +4,7 @@ import warnings
import torch.nn as nn
from mmcv.cnn import constant_init, kaiming_init
+from easycv.framework.errors import KeyError
from .activation import build_activation_layer
from .conv_ws import ConvWS2d
from .norm import build_norm_layer
diff --git a/easycv/models/utils/norm.py b/easycv/models/utils/norm.py
index 85191b55..f5ca46e7 100644
--- a/easycv/models/utils/norm.py
+++ b/easycv/models/utils/norm.py
@@ -2,6 +2,8 @@
import torch
import torch.nn as nn
+from easycv.framework.errors import KeyError, NotImplementedError
+
class SyncIBN(nn.Module):
r"""Instance-Batch Normalization layer from
diff --git a/easycv/models/utils/transformer.py b/easycv/models/utils/transformer.py
index e76fbb44..3cb19931 100644
--- a/easycv/models/utils/transformer.py
+++ b/easycv/models/utils/transformer.py
@@ -6,6 +6,8 @@ import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
+from easycv.framework.errors import RuntimeError
+
class MLP(nn.Module):
""" Very simple multi-layer perceptron (also called FFN)"""
diff --git a/easycv/predictors/base.py b/easycv/predictors/base.py
index 49f3a728..d65b1bb6 100644
--- a/easycv/predictors/base.py
+++ b/easycv/predictors/base.py
@@ -10,6 +10,7 @@ from torchvision.transforms import Compose
from easycv.datasets.registry import PIPELINES
from easycv.file import io
+from easycv.framework.errors import ValueError
from easycv.models.builder import build_model
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.config_tools import mmcv_config_fromfile
diff --git a/easycv/predictors/classifier.py b/easycv/predictors/classifier.py
index be3a9967..e29d6736 100644
--- a/easycv/predictors/classifier.py
+++ b/easycv/predictors/classifier.py
@@ -4,6 +4,7 @@ import math
import numpy as np
import torch
+from easycv.framework.errors import ValueError
from .base import Predictor
from .builder import PREDICTORS
@@ -30,7 +31,7 @@ class TorchClassifier(PredictorInterface):
"""
self.predictor = Predictor(model_path)
if 'class_list' not in self.predictor.cfg and label_map_path is None:
- raise Exception(
+ raise ValueError(
"label_map_path need to be set, when ckpt doesn't contain class_list"
)
diff --git a/easycv/predictors/detector.py b/easycv/predictors/detector.py
index 017d671e..7637ca67 100644
--- a/easycv/predictors/detector.py
+++ b/easycv/predictors/detector.py
@@ -16,6 +16,7 @@ from easycv.datasets.registry import PIPELINES
from easycv.datasets.utils import replace_ImageToTensor
from easycv.file import io
from easycv.file.utils import is_url_path, url_path_exists
+from easycv.framework.errors import TypeError
from easycv.models import build_model
from easycv.models.detection.utils import postprocess
from easycv.utils.checkpoint import load_checkpoint
diff --git a/easycv/predictors/feature_extractor.py b/easycv/predictors/feature_extractor.py
index 79110bb9..fc0802af 100644
--- a/easycv/predictors/feature_extractor.py
+++ b/easycv/predictors/feature_extractor.py
@@ -6,6 +6,7 @@ import numpy as np
import torch
from PIL import Image
+from easycv.framework.errors import ValueError
from .base import Predictor
from .builder import PREDICTORS
@@ -522,7 +523,7 @@ class TorchFaceAttrExtractor(PredictorInterface):
neck_output.device)
neck_output = (distribute * neck_output).sum(dim=1)
else:
- raise Exception(
+ raise ValueError(
'TorchFaceAttrExtractor for neck %d only support attr_method softmax/distributed sum'
% (neck_idx))
neck_output = torch.argmax(neck_output, dim=1)
diff --git a/easycv/predictors/pose_predictor.py b/easycv/predictors/pose_predictor.py
index 34ca5475..b2408051 100644
--- a/easycv/predictors/pose_predictor.py
+++ b/easycv/predictors/pose_predictor.py
@@ -11,6 +11,7 @@ from torchvision.transforms import Compose
from easycv.datasets.pose.data_sources.top_down import DatasetInfo
from easycv.datasets.registry import PIPELINES
from easycv.file import io
+from easycv.framework.errors import ModuleNotFoundError, TypeError, ValueError
from easycv.models import build_model
from easycv.predictors.builder import PREDICTORS
from easycv.predictors.detector import TorchYoloXPredictor
diff --git a/easycv/runner/ev_runner.py b/easycv/runner/ev_runner.py
index d8b9cb14..7921808b 100644
--- a/easycv/runner/ev_runner.py
+++ b/easycv/runner/ev_runner.py
@@ -8,6 +8,7 @@ from mmcv.runner import EpochBasedRunner
from mmcv.runner.log_buffer import LogBuffer
from easycv.file import io
+from easycv.framework.errors import RuntimeError, TypeError
from easycv.utils.checkpoint import load_checkpoint, save_checkpoint
if LooseVersion(torch.__version__) >= LooseVersion('1.6.0'):
diff --git a/easycv/toolkit/blade/cv_blade_utils.py b/easycv/toolkit/blade/cv_blade_utils.py
index 0bfcb8f4..cd742161 100644
--- a/easycv/toolkit/blade/cv_blade_utils.py
+++ b/easycv/toolkit/blade/cv_blade_utils.py
@@ -17,6 +17,8 @@ import torch_blade.tensorrt
import torchvision
from torch_blade import optimize
+from easycv.framework.errors import RuntimeError
+
os.environ['DISC_ENABLE_STITCH'] = os.environ.get('DISC_ENABLE_STITCH', 'true')
os.environ['DISC_EXPERIMENTAL_SPECULATION_TLP_ENHANCE'] = os.environ.get(
'DISC_EXPERIMENTAL_SPECULATION_TLP_ENHANCE', 'true')
@@ -103,13 +105,13 @@ def opt_trt_config(
def cu_prof_start():
ret = _cudart.cudaProfilerStart()
if ret != 0:
- raise Exception('cudaProfilerStart() returned %d' % ret)
+ raise RuntimeError('cudaProfilerStart() returned %d' % ret)
def cu_prof_stop():
ret = _cudart.cudaProfilerStop()
if ret != 0:
- raise Exception('cudaProfilerStop() returned %d' % ret)
+ raise RuntimeError('cudaProfilerStop() returned %d' % ret)
@contextmanager
diff --git a/easycv/toolkit/prune/prune_utils.py b/easycv/toolkit/prune/prune_utils.py
index b9fb2aa2..48f05350 100644
--- a/easycv/toolkit/prune/prune_utils.py
+++ b/easycv/toolkit/prune/prune_utils.py
@@ -1,4 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.framework.errors import ValueError
+
try:
from nni.algorithms.compression.pytorch.pruning import AGPPrunerV2
except ImportError:
@@ -83,7 +85,7 @@ def load_pruner(model,
optimizer=optimizer,
pruning_algorithm=pruning_algorithm)
else:
- raise Exception(
+ raise ValueError(
'pruning class {} is not supported'.format(pruning_class))
return pruner
diff --git a/easycv/toolkit/quantize/quantize_utils.py b/easycv/toolkit/quantize/quantize_utils.py
index c7ef8aa8..41ce1ac1 100644
--- a/easycv/toolkit/quantize/quantize_utils.py
+++ b/easycv/toolkit/quantize/quantize_utils.py
@@ -7,6 +7,7 @@ import numpy as np
import torch
from mmcv.parallel import scatter_kwargs
+from easycv.framework.errors import ValueError
from easycv.models.detection.detectors.yolox.yolo_head import YOLOXHead
from easycv.models.detection.utils import output_postprocess, postprocess
diff --git a/easycv/toolkit/torchacc/convert_ops.py b/easycv/toolkit/torchacc/convert_ops.py
index 2e3d69ee..c17f898c 100644
--- a/easycv/toolkit/torchacc/convert_ops.py
+++ b/easycv/toolkit/torchacc/convert_ops.py
@@ -10,6 +10,8 @@ import torchacc.torch_xla.core.xla_model as xm
from prettytable import PrettyTable
from torch.distributed import ReduceOp
+from easycv.framework.errors import ValueError
+
DEFAULT_TAG = 'EasyCV-default-barrier-tag'
OpSpec = namedtuple('OpSpec', ['module', 'name', 'value'])
diff --git a/easycv/utils/checkpoint.py b/easycv/utils/checkpoint.py
index 4bf0af60..c583d9a0 100644
--- a/easycv/utils/checkpoint.py
+++ b/easycv/utils/checkpoint.py
@@ -8,6 +8,7 @@ from mmcv.runner.checkpoint import get_state_dict, weights_to_cpu
from torch.optim import Optimizer
from easycv.file import io
+from easycv.framework.errors import TypeError
from easycv.utils.constant import CACHE_DIR
diff --git a/easycv/utils/collect.py b/easycv/utils/collect.py
index 887ac417..904a44c9 100644
--- a/easycv/utils/collect.py
+++ b/easycv/utils/collect.py
@@ -5,6 +5,7 @@ import mmcv
import numpy as np
import torch
+from easycv.framework.errors import ValueError
from .gather import gather_tensors_batch
diff --git a/easycv/utils/config_tools.py b/easycv/utils/config_tools.py
index adc846df..90386c89 100644
--- a/easycv/utils/config_tools.py
+++ b/easycv/utils/config_tools.py
@@ -7,6 +7,7 @@ from importlib import import_module
from mmcv import Config, import_modules_from_strings
+from easycv.framework.errors import IOError, KeyError, ValueError
from .user_config_params_utils import check_value_type
if platform.system() == 'Windows':
@@ -58,7 +59,7 @@ def check_base_cfg_path(base_cfg_name='configs/base.py', ori_filename=None):
if osp.exists(base_cfg_path_3):
return base_cfg_path_3
- raise '%s not Found' % base_cfg_name
+ raise ValueError('%s not Found' % base_cfg_name)
# Read config without __base__
@@ -69,7 +70,7 @@ def mmcv_file2dict_raw(ori_filename):
# read configs/config_templates/detection_oss.py
filename = check_base_cfg_path(ori_filename)
else:
- raise '%s and %s not Found' % (ori_filename, filename)
+ raise ValueError('%s and %s not Found' % (ori_filename, filename))
fileExtname = osp.splitext(filename)[1]
if fileExtname not in ['.py', '.json', '.yaml', '.yml']:
diff --git a/easycv/utils/json_utils.py b/easycv/utils/json_utils.py
index 05dacacb..536966d6 100644
--- a/easycv/utils/json_utils.py
+++ b/easycv/utils/json_utils.py
@@ -23,6 +23,8 @@ from json import encoder
import numpy as np
+from easycv.framework.errors import ValueError
+
# python 3.5 and newer version does not have json.encoder.FLOAT_REPR
needs_class_hack = sys.version_info >= (3, 5)
diff --git a/easycv/utils/logger.py b/easycv/utils/logger.py
index 0c785311..9183af1e 100644
--- a/easycv/utils/logger.py
+++ b/easycv/utils/logger.py
@@ -3,6 +3,8 @@ import logging
from mmcv.utils import get_logger
+from easycv.framework.errors import TypeError
+
def get_root_logger(log_file=None, log_level=logging.INFO):
"""Get the root logger.
diff --git a/easycv/utils/mmlab_utils.py b/easycv/utils/mmlab_utils.py
index 17899d08..e4e2df86 100644
--- a/easycv/utils/mmlab_utils.py
+++ b/easycv/utils/mmlab_utils.py
@@ -11,6 +11,7 @@ import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
+from easycv.framework.errors import TypeError, ValueError
from easycv.models.registry import BACKBONES, HEADS, MODELS, NECKS
from .test_util import run_in_subprocess
@@ -159,7 +160,7 @@ class MMAdapter:
elif inspect.isclass(module_name):
module_obj = module_name
else:
- raise ValueError(
+ raise TypeError(
'Only support type `str` and `class` object, but get type {}'.
format(type(module_name)))
return module_obj
diff --git a/easycv/utils/registry.py b/easycv/utils/registry.py
index b9e19b7d..19d9ab5d 100644
--- a/easycv/utils/registry.py
+++ b/easycv/utils/registry.py
@@ -4,6 +4,8 @@ from functools import partial
import mmcv
+from easycv.framework.errors import KeyError, TypeError
+
class Registry(object):
diff --git a/easycv/utils/test_util.py b/easycv/utils/test_util.py
index 169b12e8..ddd7245d 100644
--- a/easycv/utils/test_util.py
+++ b/easycv/utils/test_util.py
@@ -18,6 +18,7 @@ import numpy as np
import torch
from easycv.file import io
+from easycv.framework.errors import RuntimeError
TEST_DIR = '/tmp/ev_pytorch_test'
diff --git a/easycv/utils/user_config_params_utils.py b/easycv/utils/user_config_params_utils.py
index 45d2772d..558741c9 100644
--- a/easycv/utils/user_config_params_utils.py
+++ b/easycv/utils/user_config_params_utils.py
@@ -1,3 +1,5 @@
+from easycv.framework.errors import TypeError
+
VALID_TYPES = {tuple, list, str, int, float, bool, type(None)}
diff --git a/tests/core/evaluation/test_coco_evaluation.py b/tests/core/evaluation/test_coco_evaluation.py
index badf94a9..7ea3a706 100644
--- a/tests/core/evaluation/test_coco_evaluation.py
+++ b/tests/core/evaluation/test_coco_evaluation.py
@@ -21,6 +21,7 @@ import numpy as np
from easycv.core import standard_fields
from easycv.core.evaluation import coco_evaluation
+from easycv.framework.errors import ValueError
class CocoDetectionEvaluationTest(unittest.TestCase):
diff --git a/tests/core/optimizer/test_optimizers.py b/tests/core/optimizer/test_optimizers.py
index fa569020..3a1c538e 100644
--- a/tests/core/optimizer/test_optimizers.py
+++ b/tests/core/optimizer/test_optimizers.py
@@ -9,6 +9,8 @@ from torch.autograd import Variable
from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau, StepLR
from torch.testing._internal.common_utils import TestCase
+from easycv.framework.errors import ValueError
+
@unittest.skipIf(
LooseVersion(torch.__version__) < LooseVersion('1.6.0'),
diff --git a/tests/datasets/detection/data_sources/test_det_voc_datasource.py b/tests/datasets/detection/data_sources/test_det_voc_datasource.py
index 8e016415..cb409c59 100644
--- a/tests/datasets/detection/data_sources/test_det_voc_datasource.py
+++ b/tests/datasets/detection/data_sources/test_det_voc_datasource.py
@@ -8,6 +8,7 @@ from tests.ut_config import DET_DATA_SMALL_VOC_LOCAL, VOC_CLASSES
from easycv.datasets.detection.data_sources.voc import DetSourceVOC
from easycv.file import io
+from easycv.framework.errors import ValueError
class DetSourceVOCTest(unittest.TestCase):
@@ -135,7 +136,7 @@ class DetSourceVOCTest(unittest.TestCase):
self.assertEqual(num_samples, 20)
self.assertEqual(data_source._retry_count, 2)
- self.assertEqual(exception.args[0], 'All samples failed to load!')
+ self.assertEqual(exception.message, 'All samples failed to load!')
if __name__ == '__main__':
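Note on the assertion change above: `BaseError.__init__` calls `super().__init__()` with no arguments, so `exception.args` stays empty and the text is only reachable through the `message` property. A quick illustrative check:

    from easycv.framework.errors import ValueError

    err = ValueError('All samples failed to load!')
    assert err.args == ()
    assert err.message == 'All samples failed to load!'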
diff --git a/tests/framework/__init__.py b/tests/framework/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/framework/test_errors.py b/tests/framework/test_errors.py
new file mode 100644
index 00000000..a01d3290
--- /dev/null
+++ b/tests/framework/test_errors.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+
+class ErrorsTest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+ def test_errors(self):
+ from easycv.framework import errors
+
+ def dummy_op():
+ pass
+
+ with self.assertRaises(errors.ValueError) as cm:
+ raise errors.ValueError(
+ 'value error', details='provide correct value', op=dummy_op)
+ value_exception = cm.exception
+ self.assertEqual(value_exception.error_code, hex(errors.INVALID_VALUE))
+ self.assertEqual(value_exception.op, dummy_op)
+ self.assertEqual(value_exception.details, 'provide correct value')
+ self.assertEqual(value_exception.message, 'value error')
+
+ with self.assertRaises(errors.NotImplementedError) as cm:
+ raise errors.NotImplementedError()
+ value_exception = cm.exception
+ self.assertEqual(value_exception.error_code, hex(errors.UNIMPLEMENTED))
+ self.assertEqual(value_exception.op, None)
+ self.assertEqual(value_exception.details, None)
+ self.assertEqual(value_exception.message, '')
+
+ with self.assertRaises(errors.FileNotFoundError) as cm:
+ raise errors.FileNotFoundError
+ value_exception = cm.exception
+ self.assertEqual(value_exception.error_code,
+ hex(errors.FILE_NOT_FOUND))
+ self.assertEqual(value_exception.op, None)
+ self.assertEqual(value_exception.details, None)
+ self.assertEqual(value_exception.message, '')
+
+ with self.assertRaises(errors.TimeoutError) as cm:
+ raise errors.TimeoutError('time out')
+ value_exception = cm.exception
+ self.assertEqual(value_exception.error_code, hex(errors.TIMEOUT))
+ self.assertEqual(value_exception.op, None)
+ self.assertEqual(value_exception.details, None)
+ self.assertEqual(value_exception.message, 'time out')
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/utils/test_json_utils.py b/tests/utils/test_json_utils.py
index f3d60696..7c906ae6 100644
--- a/tests/utils/test_json_utils.py
+++ b/tests/utils/test_json_utils.py
@@ -21,6 +21,7 @@ import tempfile
import unittest
from easycv.file import io
+from easycv.framework.errors import ValueError
from easycv.utils import json_utils
diff --git a/tools/eval.py b/tools/eval.py
index 60be08df..66b69f44 100644
--- a/tools/eval.py
+++ b/tools/eval.py
@@ -32,6 +32,7 @@ from easycv.utils.config_tools import (CONFIG_TEMPLATE_ZOO,
from easycv.utils.mmlab_utils import dynamic_adapt_for_mmlab
from easycv.utils.setup_env import setup_multi_processes
+from easycv.framework.errors import ValueError, NotImplementedError
from easycv.utils.misc import reparameterize_models
@@ -251,8 +252,7 @@ def main():
eval_kwargs.update(args.options)
if args.inference_only:
- raise RuntimeError('not implemented')
- dataset.format_results(outputs, **eval_kwargs)
+ raise NotImplementedError('not implemented')
if args.eval:
for t in eval_pipe.evaluators:
if 'metric_type' in t:
From b376d84fe0cc7cef9cb5739fdef170f6137e4d4b Mon Sep 17 00:00:00 2001
From: Cathy0908 <30484308+Cathy0908@users.noreply.github.com>
Date: Mon, 19 Sep 2022 18:00:34 +0800
Subject: [PATCH 7/9] add train memory (#178)
* add train memory
---
docs/source/model_zoo_det.md | 45 ++++++++++++++++++------------------
docs/source/model_zoo_seg.md | 26 ++++++++++-----------
docs/source/model_zoo_ssl.md | 42 ++++++++++++++++-----------------
3 files changed, 56 insertions(+), 57 deletions(-)
diff --git a/docs/source/model_zoo_det.md b/docs/source/model_zoo_det.md
index 474496f0..ccabed31 100644
--- a/docs/source/model_zoo_det.md
+++ b/docs/source/model_zoo_det.md
@@ -6,38 +6,37 @@ Inference default use V100 16G.
Pretrained on COCO2017 dataset. (The result has been optimized with PAI-Blade, and only computes the model inference time. To learn about end2end inference time, you can refer to [export.md](./tutorials/export.md).)
-| Algorithm | Config | Params | SpeedV100<br/>fp16 b32 | mAPval<br/>0.5:0.95 | APval<br/>50 | Download |
-|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|--------|-----------------------------------------|-------------------------------------|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| YOLOX-s | [yolox_s_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_s_8xb16_300e_coco.py) | 9M | 0.68ms | 40.0 | 58.9 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_s_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_s_bs16_lr002/log.txt) |
-| PAI-YOLOXs | [yoloxs_pai_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_8xb16_300e_coco.py) | 16M | 0.71ms | 41.4 | 60.0 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs.json) |
-| PAI-YOLOXs-ASFF | [yoloxs_pai_asff_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_asff_8xb16_300e_coco.py) | 21M | 0.87ms | 42.8 | 61.8 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs_asff.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs_asff.json) |
+| Algorithm | Config | Params | SpeedV100<br/>fp16 b32 | mAPval<br/>0.5:0.95 | APval<br/>50 | Download |
+| --------------------- | ------------------------------------------------------------ | ------ | --------------------------------------- | ----------------------------------- | ---------------------------- | ------------------------------------------------------------ |
+| YOLOX-s | [yolox_s_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_s_8xb16_300e_coco.py) | 9M | 0.68ms | 40.0 | 58.9 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_s_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_s_bs16_lr002/log.txt) |
+| PAI-YOLOXs | [yoloxs_pai_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_8xb16_300e_coco.py) | 16M | 0.71ms | 41.4 | 60.0 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs.json) |
+| PAI-YOLOXs-ASFF | [yoloxs_pai_asff_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_asff_8xb16_300e_coco.py) | 21M | 0.87ms | 42.8 | 61.8 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs_asff.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs_asff.json) |
| PAI-YOLOXs-ASFF-TOOD3 | [yoloxs_pai_asff_tood3_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_asff_tood3_8xb16_300e_coco.py) | 24M | 1.15ms | 43.9 | 62.1 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs_asff_tood3.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs_asff_tood3.json) |
-| YOLOX-m | [yolox_m_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_m_8xb16_300e_coco.py) | 25M | 1.52ms | 46.3 | 64.9 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_m_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_m_bs16_lr002/log.txt) |
-| YOLOX-l | [yolox_l_8xb8_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_m_8xb8_300e_coco.py) | 54M | 2.47ms | 48.9 | 67.5 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_l_bs8_lr001/epoch_290.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_l_bs8_lr001/log.txt) |
-| YOLOX-x | [yolox_x_8xb8_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_x_8xb8_300e_coco.py) | 99M | 4.74ms | 50.9 | 69.2 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_x_bs8_lr001/epoch_290.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_x_bs8_lr001/log.txt) |
-| YOLOX-tiny | [yolox_tiny_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_tiny_8xb16_300e_coco.py) | 5M | 0.28ms | 31.5 | 49.2 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_tiny_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_tiny_bs16_lr002/log.txt) |
-| YOLOX-nano | [yolox_nano_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_tiny_8xb16_300e_coco.py) | 2.2M | 0.19ms | 26.5 | 42.6 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_nano_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_nano_bs16_lr002/log.txt) |
+| YOLOX-m | [yolox_m_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_m_8xb16_300e_coco.py) | 25M | 1.52ms | 46.3 | 64.9 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_m_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_m_bs16_lr002/log.txt) |
+| YOLOX-l | [yolox_l_8xb8_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_m_8xb8_300e_coco.py) | 54M | 2.47ms | 48.9 | 67.5 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_l_bs8_lr001/epoch_290.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_l_bs8_lr001/log.txt) |
+| YOLOX-x | [yolox_x_8xb8_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_x_8xb8_300e_coco.py) | 99M | 4.74ms | 50.9 | 69.2 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_x_bs8_lr001/epoch_290.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_x_bs8_lr001/log.txt) |
+| YOLOX-tiny | [yolox_tiny_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_tiny_8xb16_300e_coco.py) | 5M | 0.28ms | 31.5 | 49.2 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_tiny_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_tiny_bs16_lr002/log.txt) |
+| YOLOX-nano | [yolox_nano_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_tiny_8xb16_300e_coco.py) | 2.2M | 0.19ms | 26.5 | 42.6 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_nano_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_nano_bs16_lr002/log.txt) |
## ViTDet
-
-| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | bbox_mAPval<br/>0.5:0.95 | mask_mAPval<br/>0.5:0.95 | Download |
-| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_mask_rcnn_100e.py) | 86M/111M | 138ms | 50.65 | 45.41 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/20220901_135827.log.json) |
+| Algorithm | Config | Params<br/>(backbone/total) | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | bbox_mAPval<br/>0.5:0.95 | mask_mAPval<br/>0.5:0.95 | Download |
+| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_mask_rcnn_100e.py) | 86M/111M | 13.3 (fp16) | 138ms | 50.65 | 45.41 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/20220901_135827.log.json) |
## FCOS
-| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | mAPval<br/>0.5:0.95 | APval<br/>50 | Download |
-| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| FCOS-r50(caffe) | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_r50_caffe_1x_coco.py) | 23M/32M | 85.8ms | 38.58 | 57.18 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220621_121315.log.json) |
-| FCOS-r50(torch) | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_r50_torch_1x_coco.py) | 23M/32M | 105.3ms | 38.88 | 58.01 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/fcos_epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220826_182628.log.json) |
+| Algorithm | Config | Params<br/>(backbone/total) | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | mAPval<br/>0.5:0.95 | APval<br/>50 | Download |
+| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| FCOS-r50(caffe) | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_r50_caffe_1x_coco.py) | 23M/32M | 5.0 | 85.8ms | 38.58 | 57.18 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220621_121315.log.json) |
+| FCOS-r50(torch) | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_r50_torch_1x_coco.py) | 23M/32M | 4.0 (fp16) | 105.3ms | 38.88 | 58.01 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/fcos_epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220826_182628.log.json) |
## DETR
-| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | bbox_mAPval<br/>0.5:0.95 | APval<br/>50 | Download |
-| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| DETR-r50 | [detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/detr/detr_r50_8x2_150e_coco.py) | 23M/41M | 48.5ms | 39.92 | 60.52 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/detr/epoch_150.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/detr/20220609_101243.log.json) |
-| DAB-DETR-r50 | [dab-detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dab_detr/dab_detr_r50_8x2_50e_coco.py) | 23M/43M | 58.5ms | 42.52 | 63.03 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dab_detr/dab_detr_epoch_50.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dab_detr/20220610_122811.log.json) |
-| DN-DETR-r50 | [dab-detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dab_detr/dn_detr_r50_8x2_50e_coco.py) | 23M/43M | 58.5ms | 44.39 | 64.66 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dn_detr/dn_detr_epoch_50.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dn_detr/20220713_105127.log.json) |
+| Algorithm | Config | Params<br/>(backbone/total) | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | bbox_mAPval<br/>0.5:0.95 | APval<br/>50 | Download |
+| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| DETR-r50 | [detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/detr/detr_r50_8x2_150e_coco.py) | 23M/41M | 8.5 | 48.5ms | 39.92 | 60.52 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/detr/epoch_150.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/detr/20220609_101243.log.json) |
+| DAB-DETR-r50 | [dab-detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dab_detr/dab_detr_r50_8x2_50e_coco.py) | 23M/43M | 2.6 | 58.5ms | 42.52 | 63.03 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dab_detr/dab_detr_epoch_50.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dab_detr/20220610_122811.log.json) |
+| DN-DETR-r50 | [dab-detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dab_detr/dn_detr_r50_8x2_50e_coco.py) | 23M/43M | 7.8 | 58.5ms | 44.39 | 64.66 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dn_detr/dn_detr_epoch_50.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dn_detr/20220713_105127.log.json) |
## DINO
diff --git a/docs/source/model_zoo_seg.md b/docs/source/model_zoo_seg.md
index 97820feb..a1cdbb82 100644
--- a/docs/source/model_zoo_seg.md
+++ b/docs/source/model_zoo_seg.md
@@ -4,29 +4,29 @@
Pretrained on **Pascal VOC 2012 + Aug**.
-| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | mIoU | Download |
-| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| fcn_r50_d8 | [fcn_r50-d8_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/fcn/fcn_r50-d8_512x512_8xb4_60e_voc12aug.py) | 23M/49M | 166ms | 69.01 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/fcn_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/fcn_r50/20220525_203606.log.json) |
+| Algorithm | Config | Params<br/>(backbone/total) | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | mIoU | Download |
+| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| fcn_r50_d8 | [fcn_r50-d8_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/fcn/fcn_r50-d8_512x512_8xb4_60e_voc12aug.py) | 23M/49M | 19.8 | 166ms | 69.01 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/fcn_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/fcn_r50/20220525_203606.log.json) |
## UperNet
Pretrained on **Pascal VOC 2012 + Aug**.
-
-| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | mIoU | Download |
-| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| upernet_r50 | [upernet_r50_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12aug.py) | 23M/66M | 282.9ms | 76.59 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/20220706_114712.log.json) |
+| Algorithm | Config | Params<br/>(backbone/total) | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | mIoU | Download |
+| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| upernet_r50 | [upernet_r50_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12aug.py) | 23M/66M | 5.5 | 282.9ms | 76.59 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/20220706_114712.log.json) |
## Mask2former
### Instance Segmentation on COCO
-| Algorithm | Config | box MAP | Mask mAP | Download |
-| ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- |
-| mask2former_r50 | [mask2former_r50_8xb2_e50_instance](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_instance.py) | 46.09 | 43.26 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_instance/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_instance/20220620_113639.log.json) |
+| Algorithm | Config | Train memory<br/>(GB) | box MAP | Mask mAP | Download |
+| ---------- | ------------------------------------------------------------ |----------|----------|----------|----------|
+| mask2former_r50 | [mask2former_r50_8xb2_e50_instance](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_instance.py) | 18.8 | 46.09 | 43.26 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_instance/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_instance/20220620_113639.log.json) |
### Panoptic Segmentation on COCO
-| Algorithm | Config | PQ | box MAP | Mask mAP | Download |
-| ---------- | ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- |
-| mask2former_r50 | [mask2former_r50_8xb2_e50_panopatic](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_panopatic.py) | 51.64 | 44.81 | 41.88 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/20220629_170721.log.json) |
+
+| Algorithm | Config | Train memory<br/>(GB) | PQ | box MAP | Mask mAP | Download |
+| ---------- | ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- |---------------------------------------------------------------------------- |
+| mask2former_r50 | [mask2former_r50_8xb2_e50_panopatic](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_panopatic.py) | 18.8 | 51.64 | 44.81 | 41.88 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/20220629_170721.log.json) |
## SegFormer
diff --git a/docs/source/model_zoo_ssl.md b/docs/source/model_zoo_ssl.md
index 81650e43..c6a1ef30 100644
--- a/docs/source/model_zoo_ssl.md
+++ b/docs/source/model_zoo_ssl.md
@@ -5,19 +5,19 @@
Pretrained on **ImageNet** dataset.
-| Config | Backbone | Params<br/>(backbone/total) | Flops | inference time(V100)<br/>(ms/img) | Epochs | Download |
-| ------------------------------------------------------------ | -------- | -------------------------- | ----- | -------------------------------- | ------ | ------------------------------------------------------------ |
-| [mae_vit_base_patch16_8xb64_400e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_base_patch16_8xb64_400e.py) | ViT-B/16 | 85M/111M | 9.8G | 8.03 | 400 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-400/pretrain_400.pth) |
-| [mae_vit_base_patch16_8xb64_1600e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_base_patch16_8xb64_1600e.py) | ViT-B/16 | 85M/111M | 9.8G | 8.03 | 1600 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/pretrain_1600.pth) |
-| [mae_vit_large_patch16_8xb32_1600e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_large_patch16_8xb32_1600e.py) | ViT-L/16 | 303M/329M | 20.8G | 16.30 | 1600 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-l-1600/pretrain_1600.pth) |
+| Config | Backbone | Params<br/>(backbone/total) | Train memory<br/>(GB) | Flops | inference time(V100)<br/>(ms/img) | Epochs | Download |
+| ------------------------------------------------------------ | -------- | -------------------------- | ------------------ | ----- | -------------------------------- | ------ | ------------------------------------------------------------ |
+| [mae_vit_base_patch16_8xb64_400e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_base_patch16_8xb64_400e.py) | ViT-B/16 | 85M/111M | 9.5 | 9.8G | 8.03 | 400 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-400/pretrain_400.pth) |
+| [mae_vit_base_patch16_8xb64_1600e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_base_patch16_8xb64_1600e.py) | ViT-B/16 | 85M/111M | 9.5 | 9.8G | 8.03 | 1600 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/pretrain_1600.pth) |
+| [mae_vit_large_patch16_8xb32_1600e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_large_patch16_8xb32_1600e.py) | ViT-L/16 | 303M/329M | 11.3 | 20.8G | 16.30 | 1600 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-l-1600/pretrain_1600.pth) |
### Fast ConvMAE
Pretrained on **ImageNet** dataset.
-| Config | Backbone | Params<br/>(backbone/total) | Flops | inference time(V100)<br/>(ms/img) | Epochs | Download |
-| ------------------------------------------------------------ | --------------- | --------------------------- | ----- | --------------------------------- | ------ | ------------------------------------------------------------ |
-| [fast_convmae_vit_base_patch16_8xb64_50e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/fast_convmae/fast_convmae_vit_base_patch16_8xb64_50e.py) | ConvViT-B/16 | 88M/115M | 45.1G | 6.88 | 50 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/FastConvMAE/pretrained/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/FastConvMAE/pretrained/20220617_110501.log.json) |
+| Config | Backbone | Params<br/>(backbone/total) | Train memory<br/>(GB) | Flops | inference time(V100)<br/>(ms/img) | Total train time | Epochs | Download |
+| ------------------------------------------------------------ | --------------- | --------------------------- | ----- | --------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [fast_convmae_vit_base_patch16_8xb64_50e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/fast_convmae/fast_convmae_vit_base_patch16_8xb64_50e.py) | ConvViT-B/16 | 88M/115M | 30.3 | 45.1G | 6.88 | 20h<br/>(8*A100) | 50 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/FastConvMAE/pretrained/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/FastConvMAE/pretrained/20220617_110501.log.json) |
> The FLOPs of Fast ConvMAE are about four times those of MAE: the MAE mask keeps only 25% of the tokens in each forward pass, while Fast ConvMAE adopts a complementary strategy that splits the mask into four complementary parts, each covering 25% of the tokens. This is equivalent to learning from four samples per forward pass, achieving roughly 4x the learning effect.
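A minimal sketch of the complementary masking idea described above, assuming 196 patch tokens split into four parts; the function name and shapes here are illustrative, not EasyCV's actual implementation:

```python
import torch

def complementary_masks(num_tokens: int, parts: int = 4, generator=None):
    """Split the token indices into `parts` disjoint visible sets that
    together cover every token exactly once."""
    perm = torch.randperm(num_tokens, generator=generator)
    chunk = num_tokens // parts
    masks = []
    for p in range(parts):
        visible = perm[p * chunk:(p + 1) * chunk]
        mask = torch.zeros(num_tokens, dtype=torch.bool)
        mask[visible] = True  # True marks the 25% of tokens kept in this forward
        masks.append(mask)
    return masks

# e.g. a 224x224 image with 16x16 patches -> 196 tokens
masks = complementary_masks(196)
assert sum(int(m.sum()) for m in masks) == 196  # each token is visible exactly once
```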
@@ -25,34 +25,34 @@ Pretrained on **ImageNet** dataset.
Pretrained on **ImageNet** dataset.
-| Config | Backbone | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | Epochs | Download |
-| ------------------------------------------------------------ | --------- | --------------------------- | --------------------------------- | ------ | ------------------------------------------------------------ |
-| [dino_deit_small_p16_8xb32_100e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/dino/dino_deit_small_p16_8xb32_100e_tfrecord.py) | DeiT-S/16 | 21M/88M | 6.17 | 100 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/dino_deit_small/epoch_100.pth) |
+| Config | Backbone | Params<br/>(backbone/total) | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | Epochs | Download |
+| ------------------------------------------------------------ | --------- | --------------------------- | ------------------ | --------------------------------- | ------ | ------------------------------------------------------------ |
+| [dino_deit_small_p16_8xb32_100e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/dino/dino_deit_small_p16_8xb32_100e_tfrecord.py) | DeiT-S/16 | 21M/88M | 10.5 | 6.17 | 100 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/dino_deit_small/epoch_100.pth) |
### MoBY
Pretrained on **ImageNet** dataset.
-| Config | Backbone | Params<br/>(backbone/total) | Flops | inference time(V100)<br/>(ms/img) | Epochs | Download |
-| ------------------------------------------------------------ | --------- | --------------------------- | ----- | --------------------------------- | ------ | ------------------------------------------------------------ |
-| [moby_deit_small_p16_4xb128_300e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/moby/moby_deit_small_p16_4xb128_300e_tfrecord.py) | DeiT-S/16 | 21M/26M | 18.6G | 6.17 | 300 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_deit_small_p16/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_deit_small_p16/log.txt) |
-| [moby_swin_tiny_8xb64_300e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/moby/moby_dynamic_swin_tiny_8xb64_300e_tfrecord.py) | Swin-T | 27M/33M | 18.1G | 9.74 | 300 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_dynamic_swin_tiny/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_dynamic_swin_tiny/log.txt) |
+| Config | Backbone | Params<br/>(backbone/total) | Flops | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | Epochs | Download |
+| ------------------------------------------------------------ | --------- | --------------------------- | ----- | ------------------ | --------------------------------- | ------ | ------------------------------------------------------------ |
+| [moby_deit_small_p16_4xb128_300e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/moby/moby_deit_small_p16_4xb128_300e_tfrecord.py) | DeiT-S/16 | 21M/26M | 18.6G | 21.4 | 6.17 | 300 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_deit_small_p16/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_deit_small_p16/log.txt) |
+| [moby_swin_tiny_8xb64_300e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/moby/moby_dynamic_swin_tiny_8xb64_300e_tfrecord.py) | Swin-T | 27M/33M | 18.1G | 16.1 | 9.74 | 300 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_dynamic_swin_tiny/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_dynamic_swin_tiny/log.txt) |
### MoCo V2
Pretrained on **ImageNet** dataset.
-| Config | Backbone | Params<br/>(backbone/total) | Flops | inference time(V100)<br/>(ms/img) | Epochs | Download |
-| ------------------------------------------------------------ | -------- | --------------------------- | ----- | --------------------------------- | ------ | ------------------------------------------------------------ |
-| [mocov2_resnet50_8xb32_200e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mocov2/mocov2_rn50_8xb32_200e_tfrecord.py) | ResNet50 | 23M/28M | 8.2G | 8.59 | 200 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mocov2_r50/epoch_200.pth) |
+| Config | Backbone | Params<br/>(backbone/total) | Flops | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | Epochs | Download |
+| ------------------------------------------------------------ | -------- | --------------------------- | ----- | ------------------ | --------------------------------- | ------ | ------------------------------------------------------------ |
+| [mocov2_resnet50_8xb32_200e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mocov2/mocov2_rn50_8xb32_200e_tfrecord.py) | ResNet50 | 23M/28M | 8.2G | 5.4 | 8.59 | 200 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mocov2_r50/epoch_200.pth) |
### SwAV
Pretrained on **ImageNet** dataset.
-| Config | Backbone | Params<br/>(backbone/total) | Flops | inference time(V100)<br/>(ms/img) | Epochs | Download |
-| ------------------------------------------------------------ | -------- | --------------------------- | ----- | --------------------------------- | ------ | ------------------------------------------------------------ |
-| [swav_resnet50_8xb32_200e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/swav/swav_rn50_8xb32_200e_tfrecord.py) | ResNet50 | 23M/28M | 12.9G | 8.59 | 200 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/swav_r50/epoch_200.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/swav_r50/log.txt) |
+| Config | Backbone | Params<br/>(backbone/total) | Flops | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | Epochs | Download |
+| ------------------------------------------------------------ | -------- | --------------------------- | ----- | ------------------ | --------------------------------- | ------ | ------------------------------------------------------------ |
+| [swav_resnet50_8xb32_200e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/swav/swav_rn50_8xb32_200e_tfrecord.py) | ResNet50 | 23M/28M | 12.9G | 11.3 | 8.59 | 200 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/swav_r50/epoch_200.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/swav_r50/log.txt) |
## Benchmarks
From bb53e066be94f21d651ad9dd8fff2b9db1810519 Mon Sep 17 00:00:00 2001
From: yhq
Date: Mon, 19 Sep 2022 19:52:21 +0800
Subject: [PATCH 8/9] fix missing vit model (#197)
* fix missing vit model
* set pretrained to False when exporting the cls model
---
...ge_patch16_8xb16_50e_lrdecay075_fintune.py | 3 +
.../tutorials/EasyCV图像自监督训练-MAE.ipynb | 55 ++++++++++++++++++-
easycv/apis/export.py | 3 +
easycv/models/modelzoo.py | 6 ++
4 files changed, 64 insertions(+), 3 deletions(-)
diff --git a/benchmarks/selfsup/classification/imagenet/mae_vit_large_patch16_8xb16_50e_lrdecay075_fintune.py b/benchmarks/selfsup/classification/imagenet/mae_vit_large_patch16_8xb16_50e_lrdecay075_fintune.py
index 31694589..fa008b3e 100644
--- a/benchmarks/selfsup/classification/imagenet/mae_vit_large_patch16_8xb16_50e_lrdecay075_fintune.py
+++ b/benchmarks/selfsup/classification/imagenet/mae_vit_large_patch16_8xb16_50e_lrdecay075_fintune.py
@@ -157,3 +157,6 @@ checkpoint_config = dict(interval=10)
# runtime settings
total_epochs = 50
+
+# export config
+export = dict(export_neck=True)
diff --git a/docs/source/tutorials/EasyCV图像自监督训练-MAE.ipynb b/docs/source/tutorials/EasyCV图像自监督训练-MAE.ipynb
index 71024860..f96f0b52 100644
--- a/docs/source/tutorials/EasyCV图像自监督训练-MAE.ipynb
+++ b/docs/source/tutorials/EasyCV图像自监督训练-MAE.ipynb
@@ -262,7 +262,7 @@
"state_dict = torch.load(weight_path)['state_dict']\n",
"state_dict_out = {}\n",
"for key in state_dict:\n",
- " state_dict_out[key.replace('encoder.','')] = state_dict[key]\n",
+ " state_dict_out['model.' + key.replace('encoder.','')] = state_dict[key]\n",
"torch.save(state_dict_out,weight_path)"
]
},
@@ -324,7 +324,7 @@
"outputs": [],
"source": [
"!python -m torch.distributed.launch --nproc_per_node=1 --master_port=29930 \\\n",
- "/home/pai/lib/python3.6/site-packages/easycv/tools/train.py mae_vit_base_patch16_8xb64_100e_lrdecay065_fintune.py --work_dir work_dir/selfsup/jpg/mae --launcher pytorch"
+ "/home/pai/lib/python3.6/site-packages/easycv/tools/train.py mae_vit_base_patch16_8xb64_100e_lrdecay065_fintune.py --work_dir work_dir/selfsup/jpg/mae_fintune --launcher pytorch"
]
},
{
@@ -333,7 +333,56 @@
"metadata": {},
"source": [
"### 预测\n",
- "参考EasyCV图像分类的demo,对训练好的模型导出并预测"
+ "对训练好的模型导出并预测"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4271c852",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! python -m easycv.tools.export mae_vit_base_patch16_8xb64_100e_lrdecay065_fintune.py work_dir/selfsup/jpg/mae_fintune/ClsEvaluator_neck_top1_best.pth work_dir/selfsup/jpg/mae_fintune/best_export.pth"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2cc9e6fc",
+ "metadata": {},
+ "source": [
+ "下载测试图片和标签文件"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "973d5bd4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! wget http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/cifar10/qince_data/predict/aeroplane_s_000004.png\n",
+ "! wget http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/doc/easycv/configs/selfsup/mae/label_map.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5a5a3632",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import cv2\n",
+ "from easycv.predictors.classifier import TorchClassifier\n",
+ "\n",
+ "output_ckpt = 'work_dir/selfsup/jpg/mae_fintune/best_export.pth'\n",
+ "tcls = TorchClassifier(output_ckpt, topk=1, label_map_path='label_map.txt')\n",
+ "\n",
+ "img = cv2.imread('aeroplane_s_000004.png')\n",
+ "# input image should be RGB order\n",
+ "img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
+ "output = tcls.predict([img])\n",
+ "print(output)"
]
}
],
diff --git a/easycv/apis/export.py b/easycv/apis/export.py
index c2633acb..9a0b9165 100644
--- a/easycv/apis/export.py
+++ b/easycv/apis/export.py
@@ -107,6 +107,9 @@ def _export_cls(model, cfg, filename):
backbone=replace_syncbn(cfg.model.backbone),
)
+ # avoid loading the pretrained model
+ model_config['pretrained'] = False
+
if export_neck:
if hasattr(cfg.model, 'neck'):
model_config['neck'] = cfg.model.neck
diff --git a/easycv/models/modelzoo.py b/easycv/models/modelzoo.py
index 0680dd21..58f005c4 100644
--- a/easycv/models/modelzoo.py
+++ b/easycv/models/modelzoo.py
@@ -253,4 +253,10 @@ timm_models = {
'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/pretrained_models/timm/swin_small_patch4_window7_224_statedict.pth',
'dynamic_swin_tiny_p4_w7_224':
'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/pretrained_models/timm/swin_tiny_patch4_window7_224_statedict.pth',
+
+ # dynamic_vit:
+ 'dynamic_vit_base_p16':
+ 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/timm/vit/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz',
+ 'dynamic_vit_large_p16':
+ 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/timm/vit/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz',
}
From 5dfe7b289829ef3eab2345279797c8b360faa813 Mon Sep 17 00:00:00 2001
From: Cathy0908 <30484308+Cathy0908@users.noreply.github.com>
Date: Tue, 20 Sep 2022 10:04:42 +0800
Subject: [PATCH 9/9] update some predictors, support batch inference (#195)
update some predictors, support batch inference
---
.gitignore | 3 -
.../resnet/resnet50_b32x8_100e_jpg.py | 10 +
.../segmentation/data_sources/base.py | 2 +-
easycv/file/image.py | 38 ++-
easycv/file/utils.py | 3 +-
easycv/predictors/__init__.py | 3 +-
easycv/predictors/base.py | 97 ++++--
easycv/predictors/classifier.py | 115 ++++++-
easycv/predictors/detector.py | 250 ++++-----------
easycv/predictors/face_keypoints_predictor.py | 9 +-
easycv/predictors/hand_keypoints_predictor.py | 149 +++++----
easycv/predictors/segmentation.py | 287 +++++-------------
easycv/utils/checkpoint.py | 44 ++-
easycv/utils/constant.py | 4 +-
easycv/utils/misc.py | 36 ++-
.../segmentation/test_seg_raw_dataset.py | 2 -
tests/predictors/test_classifier.py | 51 +++-
tests/predictors/test_detector.py | 51 +++-
tests/predictors/test_detector_blade.py | 11 +-
.../test_face_keypoints_predictor.py | 18 +-
.../test_hand_keypoints_predictor.py | 31 ++
tests/predictors/test_segmentation.py | 63 ++--
tests/predictors/test_segmentor.py | 48 ---
tests/ut_config.py | 8 +-
thirdparty/u2sod/sodpredictor.py | 5 +-
25 files changed, 671 insertions(+), 667 deletions(-)
delete mode 100644 tests/predictors/test_segmentor.py
diff --git a/.gitignore b/.gitignore
index 1828f7e8..f63cdea2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,6 +137,3 @@ pai_jobs/easycv/resources/
*.tar.gz
thirdparty/test
scripts/test
-
-# easycv default cache dir
-.easycv_cache
diff --git a/configs/classification/imagenet/resnet/resnet50_b32x8_100e_jpg.py b/configs/classification/imagenet/resnet/resnet50_b32x8_100e_jpg.py
index 03124f20..a81e4ae2 100644
--- a/configs/classification/imagenet/resnet/resnet50_b32x8_100e_jpg.py
+++ b/configs/classification/imagenet/resnet/resnet50_b32x8_100e_jpg.py
@@ -86,3 +86,13 @@ checkpoint_config = dict(interval=10)
# runtime settings
total_epochs = 100
+
+predict = dict(
+ type='ClassificationPredictor',
+ pipelines=[
+ dict(type='Resize', size=256),
+ dict(type='CenterCrop', size=224),
+ dict(type='ToTensor'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Collect', keys=['img'])
+ ])
diff --git a/easycv/datasets/segmentation/data_sources/base.py b/easycv/datasets/segmentation/data_sources/base.py
index b8dc3673..a893932e 100644
--- a/easycv/datasets/segmentation/data_sources/base.py
+++ b/easycv/datasets/segmentation/data_sources/base.py
@@ -27,7 +27,7 @@ def load_image(img_path):
def load_seg_map(seg_path, reduce_zero_label):
- gt_semantic_seg = _load_img(seg_path, mode='RGB')
+ gt_semantic_seg = _load_img(seg_path, mode='P')
# reduce zero_label
if reduce_zero_label:
# avoid using underflow conversion
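For context on the `mode='P'` fix above: VOC-style segmentation masks are palette PNGs whose pixel values are already class indices, so loading them as RGB yields color triplets instead of labels. The sketch below, using a hypothetical `mask.png`, shows the difference and the usual reduce-zero-label convention the comment refers to; it mirrors the common mmseg-style handling rather than the exact EasyCV code.

```python
import numpy as np
from PIL import Image

# 'P' mode keeps the palette indices, i.e. the class ids themselves.
mask_p = np.asarray(Image.open('mask.png'))                   # (H, W) class indices
mask_rgb = np.asarray(Image.open('mask.png').convert('RGB'))  # (H, W, 3) colors only

# Typical reduce-zero-label handling: shift labels down by one and map the
# old background (0) to the ignore index, avoiding uint8 underflow.
seg = mask_p.astype(np.uint8)
seg[seg == 0] = 255
seg = seg - 1
seg[seg == 254] = 255
```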
diff --git a/easycv/file/image.py b/easycv/file/image.py
index a6253427..3a1fff90 100644
--- a/easycv/file/image.py
+++ b/easycv/file/image.py
@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+import io
import logging
import time
@@ -6,10 +7,10 @@ import cv2
import numpy as np
from PIL import Image
-from easycv.file import io
+from easycv import file
from easycv.framework.errors import IOError
from easycv.utils.constant import MAX_READ_IMAGE_TRY_TIMES
-from .utils import is_oss_path
+from .utils import is_oss_path, is_url_path
def load_image(img_path, mode='BGR', max_try_times=MAX_READ_IMAGE_TRY_TIMES):
@@ -20,16 +21,31 @@ def load_image(img_path, mode='BGR', max_try_times=MAX_READ_IMAGE_TRY_TIMES):
img = None
while try_cnt < max_try_times:
try:
- with io.open(img_path, 'rb') as infile:
- # cv2.imdecode may corrupt when the img is broken
- image = Image.open(infile) # RGB
+ if is_url_path(img_path):
+ from mmcv.fileio.file_client import HTTPBackend
+ client = HTTPBackend()
+ img_bytes = client.get(img_path)
+ buff = io.BytesIO(img_bytes)
+ image = Image.open(buff)
+ if mode.upper() != 'BGR' and image.mode.upper() != mode.upper(
+ ):
+ image = image.convert(mode.upper())
img = np.asarray(image, dtype=np.uint8)
- if mode.upper() == 'BGR':
- img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
- assert mode.upper() in ['RGB', 'BGR'
- ], 'Only support `RGB` and `BGR` mode!'
- assert img is not None
- break
+ else:
+ with file.io.open(img_path, 'rb') as infile:
+ # cv2.imdecode may corrupt when the img is broken
+ image = Image.open(infile)
+ if mode.upper() != 'BGR' and image.mode.upper(
+ ) != mode.upper():
+ image = image.convert(mode.upper())
+ img = np.asarray(image, dtype=np.uint8)
+
+ if mode.upper() == 'BGR':
+ if image.mode.upper() != 'RGB':
+ image = image.convert('RGB')
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+ assert img is not None
+ break
except Exception as e:
logging.error(e)
logging.warning('Read file {} fault, try count : {}'.format(
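With the change above, `load_image` also handles http/https inputs (read through mmcv's `HTTPBackend`) and converts to the requested mode before the BGR swap. A brief usage sketch; both paths are placeholders:

```python
from easycv.file.image import load_image

img_bgr = load_image('data/demo.jpg', mode='BGR')                # local or OSS path
img_rgb = load_image('http://example.com/demo.jpg', mode='RGB')  # URL path
print(img_bgr.shape, img_rgb.shape)
```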
diff --git a/easycv/file/utils.py b/easycv/file/utils.py
index 49920611..f943c725 100644
--- a/easycv/file/utils.py
+++ b/easycv/file/utils.py
@@ -13,7 +13,7 @@ from tqdm import tqdm
from easycv.framework.errors import ValueError
OSS_PREFIX = 'oss://'
-URL_PREFIX = 'https://'
+URL_PREFIX = ('https://', 'http://')
def create_namedtuple(**kwargs):
@@ -33,6 +33,7 @@ def url_path_exists(url):
urllib.request.urlopen(url).code
except Exception as err:
print(err)
+ return False
return True
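Making `URL_PREFIX` a tuple pairs naturally with `str.startswith`, which accepts a tuple of prefixes. A sketch of how such a check presumably looks; `is_url_path` itself is not shown in this hunk, so the helper below is illustrative only:

```python
OSS_PREFIX = 'oss://'
URL_PREFIX = ('https://', 'http://')

def is_url_path(path: str) -> bool:
    # startswith with a tuple matches any of the listed prefixes.
    return path.startswith(URL_PREFIX)

assert is_url_path('http://example.com/a.jpg')
assert is_url_path('https://example.com/a.jpg')
assert not is_url_path('oss://bucket/a.jpg')
```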
diff --git a/easycv/predictors/__init__.py b/easycv/predictors/__init__.py
index 3fe86936..2d38f2f2 100644
--- a/easycv/predictors/__init__.py
+++ b/easycv/predictors/__init__.py
@@ -9,5 +9,4 @@ from .feature_extractor import (TorchFaceAttrExtractor,
from .hand_keypoints_predictor import HandKeypointsPredictor
from .pose_predictor import (TorchPoseTopDownPredictor,
TorchPoseTopDownPredictorWithDetector)
-from .segmentation import (Mask2formerPredictor, SegFormerPredictor,
- SegmentationPredictor)
+from .segmentation import Mask2formerPredictor, SegmentationPredictor
diff --git a/easycv/predictors/base.py b/easycv/predictors/base.py
index d65b1bb6..5b36f2fd 100644
--- a/easycv/predictors/base.py
+++ b/easycv/predictors/base.py
@@ -1,19 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
import os
import pickle
+import cv2
import numpy as np
import torch
from mmcv.parallel import collate, scatter_kwargs
from PIL import Image
+from torch.hub import load_state_dict_from_url
from torchvision.transforms import Compose
from easycv.datasets.registry import PIPELINES
from easycv.file import io
+from easycv.file.utils import is_url_path
from easycv.framework.errors import ValueError
from easycv.models.builder import build_model
from easycv.utils.checkpoint import load_checkpoint
-from easycv.utils.config_tools import mmcv_config_fromfile
+from easycv.utils.config_tools import Config, mmcv_config_fromfile
from easycv.utils.constant import CACHE_DIR
from easycv.utils.mmlab_utils import (dynamic_adapt_for_mmlab,
remove_adapt_for_mmlab)
@@ -107,7 +111,9 @@ class PredictorV2(object):
device (str): Support 'cuda' or 'cpu', if is None, detect device automatically.
save_results (bool): Whether to save predict results.
save_path (str): File path for saving results, only valid when `save_results` is True.
+ pipelines (list[dict]): Data pipeline configs.
"""
+ INPUT_IMAGE_MODE = 'BGR' # the image mode into the model
def __init__(self,
model_path,
@@ -116,30 +122,51 @@ class PredictorV2(object):
device=None,
save_results=False,
save_path=None,
- mode='rgb',
+ pipelines=None,
*args,
**kwargs):
self.model_path = model_path
self.batch_size = batch_size
self.save_results = save_results
self.save_path = save_path
+ self.config_file = config_file
if self.save_results:
assert self.save_path is not None
self.device = device
if self.device is None:
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
- self.cfg = None
if config_file is not None:
if isinstance(config_file, str):
self.cfg = mmcv_config_fromfile(config_file)
else:
self.cfg = config_file
+ else:
+ self.cfg = self._load_cfg_from_ckpt(self.model_path)
+
+ if self.cfg is None:
+ raise ValueError('Please provide "config_file"!')
self.model = self.prepare_model()
+ self.pipelines = pipelines
self.processor = self.build_processor()
self._load_op = None
- self.mode = mode
+
+ def _load_cfg_from_ckpt(self, model_path):
+ if is_url_path(model_path):
+ ckpt = load_state_dict_from_url(model_path)
+ else:
+ with io.open(model_path, 'rb') as infile:
+ ckpt = torch.load(infile, map_location='cpu')
+
+ cfg = None
+ if 'meta' in ckpt and 'config' in ckpt['meta']:
+ cfg = ckpt['meta']['config']
+ if isinstance(cfg, dict):
+ cfg = Config(cfg)
+ elif isinstance(cfg, str):
+ cfg = Config(json.loads(cfg))
+ return cfg
def prepare_model(self):
"""Build model from config file by default.
@@ -152,8 +179,6 @@ class PredictorV2(object):
return model
def _build_model(self):
- if self.cfg is None:
- raise ValueError('Please provide "config_file"!')
# Use mmdet model
dynamic_adapt_for_mmlab(self.cfg)
model = build_model(self.cfg.model)
@@ -165,16 +190,15 @@ class PredictorV2(object):
"""Build processor to process loaded input.
If you need custom preprocessing ops, you need to reimplement it.
"""
- if self.cfg is None:
- pipeline = []
+ if self.pipelines is not None:
+ pipelines = self.pipelines
else:
- pipeline = [
- build_from_cfg(p, PIPELINES)
- for p in self.cfg.get('test_pipeline', [])
- ]
+ pipelines = self.cfg.get('test_pipeline', [])
+
+ pipelines = [build_from_cfg(p, PIPELINES) for p in pipelines]
from easycv.datasets.shared.pipelines.transforms import Compose
- processor = Compose(pipeline)
+ processor = Compose(pipelines)
return processor
def _load_input(self, input):
@@ -190,10 +214,13 @@ class PredictorV2(object):
}
"""
if self._load_op is None:
- load_cfg = dict(type='LoadImage', mode=self.mode)
+ load_cfg = dict(type='LoadImage', mode=self.INPUT_IMAGE_MODE)
self._load_op = build_from_cfg(load_cfg, PIPELINES)
if not isinstance(input, str):
+ if isinstance(input, np.ndarray):
+ # Only support RGB mode if input is np.ndarray.
+ input = cv2.cvtColor(input, cv2.COLOR_RGB2BGR)
sample = self._load_op({'img': input})
else:
sample = self._load_op({'filename': input})
@@ -229,8 +256,32 @@ class PredictorV2(object):
return outputs
def postprocess(self, inputs, *args, **kwargs):
- """Process model outputs.
- If you need add some processing ops to process model outputs, you need to reimplement it.
+ """Process model batch outputs.
+ """
+ outputs = []
+ out_i = {}
+ batch_size = 1
+ # get current batch size
+ for k, batch_v in inputs.items():
+ if batch_v is not None:
+ batch_size = len(batch_v)
+ break
+
+ for i in range(batch_size):
+ for k, batch_v in inputs.items():
+ if batch_v is not None:
+ out_i[k] = batch_v[i]
+ else:
+ out_i[k] = None
+
+ out_i = self.postprocess_single(out_i)
+ outputs.append(out_i)
+
+ return outputs
+
+ def postprocess_single(self, inputs):
+ """Process outputs of single sample.
+ If you need add some processing ops, you need to reimplement it.
"""
return inputs
@@ -260,16 +311,22 @@ class PredictorV2(object):
results_list = []
for i in range(0, len(inputs), self.batch_size):
- batch = inputs[i:max(len(inputs) - 1, i + self.batch_size)]
+ batch = inputs[i:min(len(inputs), i + self.batch_size)]
batch_outputs = self.preprocess(batch)
batch_outputs = self.forward(batch_outputs)
results = self.postprocess(batch_outputs)
+ assert len(results) == len(
+ batch), f'Mismatch size {len(results)} != {len(batch)}'
if keep_inputs:
- results = {'inputs': batch, 'results': results}
+ for i in range(len(batch)):
+ results[i].update({'inputs': batch[i]})
# if dump, the outputs will not be added to the return value to prevent taking up too much memory
if self.save_results:
- self.dump([results], self.save_path, mode='ab+')
+ self.dump(results, self.save_path, mode='ab+')
else:
- results_list.append(results)
+ if isinstance(results, list):
+ results_list.extend(results)
+ else:
+ results_list.append(results)
return results_list
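The slicing fix above is easiest to see on a toy example: the old upper bound `max(len(inputs) - 1, i + batch_size)` produced oversized, overlapping batches, while `min(len(inputs), i + batch_size)` yields proper chunks.

```python
inputs = list(range(10))
batch_size = 4

# Old bound: max(len(inputs) - 1, i + batch_size) -> first batch gets 9 items.
old = [inputs[i:max(len(inputs) - 1, i + batch_size)]
       for i in range(0, len(inputs), batch_size)]
# New bound: min(len(inputs), i + batch_size) -> batches of 4, 4, 2.
new = [inputs[i:min(len(inputs), i + batch_size)]
       for i in range(0, len(inputs), batch_size)]

print([len(b) for b in old])  # [9, 5, 2] -- overlapping, wrong sizes
print([len(b) for b in new])  # [4, 4, 2]
```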
diff --git a/easycv/predictors/classifier.py b/easycv/predictors/classifier.py
index e29d6736..a788c354 100644
--- a/easycv/predictors/classifier.py
+++ b/easycv/predictors/classifier.py
@@ -3,17 +3,130 @@ import math
import numpy as np
import torch
+from PIL import Image, ImageFile
+from easycv.file import io
from easycv.framework.errors import ValueError
-from .base import Predictor
+from easycv.utils.misc import deprecated
+from .base import Predictor, PredictorV2
from .builder import PREDICTORS
+
+@PREDICTORS.register_module()
+class ClassificationPredictor(PredictorV2):
+ """Predictor for classification.
+ Args:
+ model_path (str): Path of model path.
+ config_file (Optional[str]): config file path for model and processor to init. Defaults to None.
+ batch_size (int): batch size for forward.
+ device (str): Support 'cuda' or 'cpu', if is None, detect device automatically.
+ save_results (bool): Whether to save predict results.
+ save_path (str): File path for saving results, only valid when `save_results` is True.
+ pipelines (list[dict]): Data pipeline configs.
+ topk (int): Return top-k results. Default: 1.
+ pil_input (bool): Whether to use PIL images as processor input. Set to True if the processor expects PIL input. Default: True.
+ label_map_path (str): File path of the label list, one class name per line.
+ """
+
+ def __init__(self,
+ model_path,
+ config_file=None,
+ batch_size=1,
+ device=None,
+ save_results=False,
+ save_path=None,
+ pipelines=None,
+ topk=1,
+ pil_input=True,
+ label_map_path=None,
+ *args,
+ **kwargs):
+ super(ClassificationPredictor, self).__init__(
+ model_path,
+ config_file=config_file,
+ batch_size=batch_size,
+ device=device,
+ save_results=save_results,
+ save_path=save_path,
+ pipelines=pipelines,
+ *args,
+ **kwargs)
+ self.topk = topk
+ self.pil_input = pil_input
+
+ # Adapt to torchvision transforms which process PIL inputs.
+ if self.pil_input:
+ self.INPUT_IMAGE_MODE = 'RGB'
+
+ if label_map_path is None:
+ class_list = self.cfg.get('CLASSES', [])
+ else:
+ with io.open(label_map_path, 'r') as f:
+ class_list = f.readlines()
+ self.label_map = [i.strip() for i in class_list]
+
+ def _load_input(self, input):
+ """Load image from file or numpy or PIL object.
+ Args:
+ input: File path or numpy or PIL object.
+ Returns:
+ {
+ 'filename': filename,
+ 'img': img,
+ 'img_shape': img_shape,
+ 'img_fields': ['img']
+ }
+ """
+ if self.pil_input:
+ results = {}
+ if isinstance(input, str):
+ img = Image.open(input)
+ if img.mode.upper() != self.INPUT_IMAGE_MODE.upper():
+ img = img.convert(self.INPUT_IMAGE_MODE.upper())
+ results['filename'] = input
+ else:
+ assert isinstance(input, ImageFile.ImageFile)
+ img = input
+ results['filename'] = None
+ results['img'] = img
+ results['img_shape'] = img.size
+ results['ori_shape'] = img.size
+ results['img_fields'] = ['img']
+ return results
+
+ return super()._load_input(input)
+
+ def postprocess(self, inputs, *args, **kwargs):
+ """Return top-k results."""
+ output_prob = inputs['prob'].data.cpu()
+ topk_class = torch.topk(output_prob, self.topk).indices.numpy()
+ output_prob = output_prob.numpy()
+ batch_results = []
+ batch_size = output_prob.shape[0]
+ for i in range(batch_size):
+ result = {'class': np.squeeze(topk_class[i]).tolist()}
+ if isinstance(result['class'], int):
+ result['class'] = [result['class']]
+
+ if len(self.label_map) > 0:
+ result['class_name'] = [
+ self.label_map[i] for i in result['class']
+ ]
+ result['class_probs'] = {}
+ for l_idx, l_name in enumerate(self.label_map):
+ result['class_probs'][l_name] = output_prob[i][l_idx]
+
+ batch_results.append(result)
+ return batch_results
+
+
try:
from easy_vision.python.inference.predictor import PredictorInterface
except:
from .interface import PredictorInterface
+@deprecated(reason='Please use ClassificationPredictor.')
@PREDICTORS.register_module()
class TorchClassifier(PredictorInterface):
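A minimal usage sketch for the new `ClassificationPredictor`, assuming an already exported checkpoint and a plain-text label map; both paths and the test image are placeholders, and the predictor is invoked directly since `PredictorV2` instances are callable:

```python
from easycv.predictors.classifier import ClassificationPredictor

predictor = ClassificationPredictor(
    'work_dir/cls/best_export.pth',      # hypothetical exported checkpoint
    topk=1,
    label_map_path='label_map.txt')      # hypothetical file, one class name per line

results = predictor(['demo.jpg'])        # batch inference over a list of images
print(results[0]['class'], results[0].get('class_name'))
```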
diff --git a/easycv/predictors/detector.py b/easycv/predictors/detector.py
index 7637ca67..38fd262f 100644
--- a/easycv/predictors/detector.py
+++ b/easycv/predictors/detector.py
@@ -5,9 +5,6 @@ from glob import glob
import numpy as np
import torch
-from mmcv.ops import RoIPool
-from mmcv.parallel import collate, scatter
-from torch.hub import load_state_dict_from_url
from torchvision.transforms import Compose
from easycv.apis.export import reparameterize_models
@@ -15,16 +12,12 @@ from easycv.core.visualization import imshow_bboxes
from easycv.datasets.registry import PIPELINES
from easycv.datasets.utils import replace_ImageToTensor
from easycv.file import io
-from easycv.file.utils import is_url_path, url_path_exists
-from easycv.framework.errors import TypeError
from easycv.models import build_model
from easycv.models.detection.utils import postprocess
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.constant import CACHE_DIR
-from easycv.utils.logger import get_root_logger
-from easycv.utils.mmlab_utils import (dynamic_adapt_for_mmlab,
- remove_adapt_for_mmlab)
+from easycv.utils.misc import deprecated
from easycv.utils.registry import build_from_cfg
from .base import PredictorV2
from .builder import PREDICTORS
@@ -47,14 +40,16 @@ class DetectionPredictor(PredictorV2):
"""
def __init__(self,
- model_path=None,
+ model_path,
config_file=None,
batch_size=1,
device=None,
save_results=False,
save_path=None,
- mode='rgb',
- score_threshold=0.5):
+ pipelines=None,
+ score_threshold=0.5,
+ *arg,
+ **kwargs):
super(DetectionPredictor, self).__init__(
model_path,
config_file=config_file,
@@ -62,194 +57,55 @@ class DetectionPredictor(PredictorV2):
device=device,
save_results=save_results,
save_path=save_path,
- mode=mode,
+ pipelines=pipelines,
)
self.score_thresh = score_threshold
+ self.CLASSES = self.cfg.get('CLASSES', None)
+
+ def build_processor(self):
+ if self.pipelines is not None:
+ pipelines = self.pipelines
+ elif self.cfg is None:
+ pipelines = []
+ else:
+ pipelines = self.cfg.get('test_pipeline', [])
+
+ # for batch inference
+ self.pipelines = replace_ImageToTensor(pipelines)
+
+ return super().build_processor()
+
+ def postprocess_single(self, inputs, *args, **kwargs):
+ if inputs['detection_scores'] is None or len(
+ inputs['detection_scores']) < 1:
+ return inputs
+
+ scores = inputs['detection_scores']
+ if scores is not None and self.score_thresh > 0:
+ keeped_ids = scores > self.score_thresh
+ inputs['detection_scores'] = inputs['detection_scores'][keeped_ids]
+ inputs['detection_boxes'] = inputs['detection_boxes'][keeped_ids]
+ inputs['detection_classes'] = inputs['detection_classes'][
+ keeped_ids]
+
+ class_names = []
+ for _, classes_id in enumerate(inputs['detection_classes']):
+ if classes_id is None:
+ class_names.append(None)
+ elif self.CLASSES is not None and len(self.CLASSES) > 0:
+ class_names.append(self.CLASSES[int(classes_id)])
+ else:
+ class_names.append(classes_id)
+
+ inputs['detection_class_names'] = class_names
- def postprocess(self, inputs, *args, **kwargs):
- for batch_index in range(self.batch_size):
- this_detection_scores = inputs['detection_scores'][batch_index]
- sel_ids = this_detection_scores > self.score_thresh
- inputs['detection_scores'][batch_index] = inputs[
- 'detection_scores'][batch_index][sel_ids]
- inputs['detection_boxes'][batch_index] = inputs['detection_boxes'][
- batch_index][sel_ids]
- inputs['detection_classes'][batch_index] = inputs[
- 'detection_classes'][batch_index][sel_ids]
- # TODO class label remapping
return inputs
-
-class DetrPredictor(PredictorInterface):
- """Inference image(s) with the detector.
- Args:
- model_path (str): checkpoint model and export model are shared.
- config_path (str): If config_path is specified, both checkpoint model and export model can be used; if config_path=None, the export model is used by default.
- """
-
- def __init__(self, model_path, config_path=None):
-
- self.model_path = model_path
-
- if config_path is not None:
- self.cfg = mmcv_config_fromfile(config_path)
- else:
- logger = get_root_logger()
- logger.warning('please use export model!')
- if is_url_path(self.model_path) and url_path_exists(
- self.model_path):
- checkpoint = load_state_dict_from_url(model_path)
- else:
- assert io.exists(
- self.model_path), f'{self.model_path} does not exists'
-
- with io.open(self.model_path, 'rb') as infile:
- checkpoint = torch.load(infile, map_location='cpu')
-
- assert 'meta' in checkpoint and 'config' in checkpoint[
- 'meta'], 'meta.config is missing from checkpoint'
-
- config_str = checkpoint['meta']['config']
- if isinstance(config_str, dict):
- config_str = json.dumps(config_str)
-
- # get config
- basename = os.path.basename(self.model_path)
- fname, _ = os.path.splitext(basename)
- self.local_config_file = os.path.join(CACHE_DIR,
- f'{fname}_config.json')
- if not os.path.exists(CACHE_DIR):
- os.makedirs(CACHE_DIR)
- with open(self.local_config_file, 'w') as ofile:
- ofile.write(config_str)
- self.cfg = mmcv_config_fromfile(self.local_config_file)
-
- # dynamic adapt mmdet models
- dynamic_adapt_for_mmlab(self.cfg)
-
- # build model
- self.model = build_model(self.cfg.model)
-
- self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
- map_location = 'cpu' if self.device == 'cpu' else 'cuda'
- self.ckpt = load_checkpoint(
- self.model, self.model_path, map_location=map_location)
-
- self.model.to(self.device)
- self.model.eval()
-
- self.CLASSES = self.cfg.CLASSES
-
- def predict(self, imgs):
- """
- Args:
- imgs (str/ndarray or list[str/ndarray] or tuple[str/ndarray]):
- Either image files or loaded images.
- Returns:
- If imgs is a list or tuple, the same length list type results
- will be returned, otherwise return the detection results directly.
- """
-
- if isinstance(imgs, (list, tuple)):
- is_batch = True
- else:
- imgs = [imgs]
- is_batch = False
-
- cfg = self.cfg
- device = next(self.model.parameters()).device # model device
-
- if isinstance(imgs[0], np.ndarray):
- cfg = cfg.copy()
- # set loading pipeline type
- cfg.data.val.pipeline.insert(0, dict(type='LoadImageFromWebcam'))
- else:
- cfg = cfg.copy()
- # set loading pipeline type
- cfg.data.val.pipeline.insert(
- 0,
- dict(
- type='LoadImageFromFile',
- file_client_args=dict(
- backend=('http' if imgs[0].startswith('http'
- ) else 'disk'))))
-
- cfg.data.val.pipeline = replace_ImageToTensor(cfg.data.val.pipeline)
-
- transforms = []
- for transform in cfg.data.val.pipeline:
- if 'img_scale' in transform:
- transform['img_scale'] = tuple(transform['img_scale'])
- if isinstance(transform, dict):
- transform = build_from_cfg(transform, PIPELINES)
- transforms.append(transform)
- elif callable(transform):
- transforms.append(transform)
- else:
- raise TypeError('transform must be callable or a dict')
- test_pipeline = Compose(transforms)
-
- datas = []
- for img in imgs:
- # prepare data
- if isinstance(img, np.ndarray):
- # directly add img
- data = dict(img=img)
- else:
- # add information into dict
- data = dict(img_info=dict(filename=img), img_prefix=None)
- # build the data pipeline
- data = test_pipeline(data)
- datas.append(data)
-
- data = collate(datas, samples_per_gpu=len(imgs))
- # just get the actual data from DataContainer
- data['img_metas'] = [
- img_metas.data[0] for img_metas in data['img_metas']
- ]
- data['img'] = [img.data[0] for img in data['img']]
- if next(self.model.parameters()).is_cuda:
- # scatter to specified GPU
- data = scatter(data, [device])[0]
- else:
- for m in self.model.modules():
- assert not isinstance(
- m, RoIPool
- ), 'CPU inference with RoIPool is not supported currently.'
-
- # forward the model
- with torch.no_grad():
- results = self.model(mode='test', **data)
-
- return results
-
- def visualize(self,
- img,
- results,
- score_thr=0.3,
- show=False,
- out_file=None):
- bboxes = results['detection_boxes'][0]
- scores = results['detection_scores'][0]
- labels = results['detection_classes'][0].tolist()
-
- # If self.CLASSES is not None, class_id will be converted to self.CLASSES for visualization,
- # otherwise the class_id will be displayed.
- # And don't try to modify the value in results, it may cause some bugs or even precision problems,
- # because `self.evaluate` will also use the results, refer to: https://github.com/alibaba/EasyCV/pull/67
-
- if self.CLASSES is not None and len(self.CLASSES) > 0:
- for i, classes_id in enumerate(labels):
- if classes_id is None:
- labels[i] = None
- else:
- labels[i] = self.CLASSES[int(classes_id)]
-
- if scores is not None and score_thr > 0:
- inds = scores > score_thr
- bboxes = bboxes[inds]
- labels = np.array(labels)[inds]
-
+ def visualize(self, img, results, show=False, out_file=None):
+ """Only support show one sample now."""
+ bboxes = results['detection_boxes']
+ labels = results['detection_class_names']
+ img = self._load_input(img)['img']
imshow_bboxes(
img,
bboxes,
@@ -263,6 +119,12 @@ class DetrPredictor(PredictorInterface):
out_file=out_file)
+@deprecated(reason='Please use DetectionPredictor.')
+@PREDICTORS.register_module()
+class DetrPredictor(DetectionPredictor):
+ """"""
+
+
@PREDICTORS.register_module()
class TorchYoloXPredictor(PredictorInterface):
diff --git a/easycv/predictors/face_keypoints_predictor.py b/easycv/predictors/face_keypoints_predictor.py
index 2c94f0a4..54b13424 100644
--- a/easycv/predictors/face_keypoints_predictor.py
+++ b/easycv/predictors/face_keypoints_predictor.py
@@ -25,6 +25,11 @@ class FaceKeypointsPredictor(PredictorV2):
Args:
model_path (str): Path of model path
config_file (str): config file path for model and processor to init. Defaults to None.
+ batch_size (int): batch size for forward.
+ device (str): Support 'cuda' or 'cpu', if is None, detect device automatically.
+ save_results (bool): Whether to save predict results.
+ save_path (str): File path for saving results, only valid when `save_results` is True.
+ pipelines (list[dict]): Data pipeline configs.
"""
def __init__(self,
@@ -34,7 +39,7 @@ class FaceKeypointsPredictor(PredictorV2):
device=None,
save_results=False,
save_path=None,
- mode='bgr'):
+ pipelines=None):
super(FaceKeypointsPredictor, self).__init__(
model_path,
config_file,
@@ -42,7 +47,7 @@ class FaceKeypointsPredictor(PredictorV2):
device=device,
save_results=save_results,
save_path=save_path,
- mode=mode)
+ pipelines=pipelines)
self.input_size = self.cfg.IMAGE_SIZE
self.point_number = self.cfg.POINT_NUMBER
diff --git a/easycv/predictors/hand_keypoints_predictor.py b/easycv/predictors/hand_keypoints_predictor.py
index 01d0b0ce..0e092488 100644
--- a/easycv/predictors/hand_keypoints_predictor.py
+++ b/easycv/predictors/hand_keypoints_predictor.py
@@ -25,9 +25,11 @@ class HandKeypointsPredictor(PredictorV2):
config_file: path or ``Config`` of config file
detection_model_config: dict of hand detection model predictor config,
example like ``dict(type="", model_path="", config_file="", ......)``
- batch_size: batch_size to infer
- save_results: bool
- save_path: path of result image
+ batch_size (int): batch size for forward.
+ device (str): Support 'cuda' or 'cpu', if is None, detect device automatically.
+ save_results (bool): Whether to save predict results.
+ save_path (str): File path for saving results, only valid when `save_results` is True.
+ pipelines (list[dict]): Data pipeline configs.
"""
def __init__(self,
@@ -38,7 +40,7 @@ class HandKeypointsPredictor(PredictorV2):
device=None,
save_results=False,
save_path=None,
- mode='rgb',
+ pipelines=None,
*args,
**kwargs):
super(HandKeypointsPredictor, self).__init__(
@@ -48,7 +50,7 @@ class HandKeypointsPredictor(PredictorV2):
device=device,
save_results=save_results,
save_path=save_path,
- mode=mode,
+ pipelines=pipelines,
*args,
**kwargs)
self.dataset_info = DatasetInfo(COCO_WHOLEBODY_HAND_DATASET_INFO)
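A construction sketch for `HandKeypointsPredictor` following the docstring's `detection_model_config` example above; every path and the score threshold below are hypothetical:

```python
from easycv.predictors.hand_keypoints_predictor import HandKeypointsPredictor

predictor = HandKeypointsPredictor(
    'hand_keypoints.pth',                     # hypothetical keypoint checkpoint
    config_file='hand_keypoints_config.py',   # hypothetical config file
    detection_model_config=dict(
        type='DetectionPredictor',
        model_path='hand_det.pth',
        config_file='hand_det_config.py',
        score_threshold=0.5),
    batch_size=1)

results = predictor(['hand.jpg'])             # hypothetical test image
```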
@@ -70,52 +72,48 @@ class HandKeypointsPredictor(PredictorV2):
}
}
"""
- image_paths = input['inputs']
- batch_data = []
+ image_path = input['inputs']
+ data_list = []
box_id = 0
- for batch_index, image_path in enumerate(image_paths):
- det_bbox_result = input['results']['detection_boxes'][batch_index]
- det_bbox_scores = input['results']['detection_scores'][batch_index]
- img = mmcv.imread(image_path, 'color', self.mode)
- for bbox, score in zip(det_bbox_result, det_bbox_scores):
- center, scale = _box2cs(self.cfg.data_cfg['image_size'], bbox)
- # prepare data
- data = {
- 'image_file':
- image_path,
- 'img':
- img,
- 'image_id':
- batch_index,
- 'center':
- center,
- 'scale':
- scale,
- 'bbox_score':
- score,
- 'bbox_id':
- box_id, # need to be assigned if batch_size > 1
- 'dataset':
- 'coco_wholebody_hand',
- 'joints_3d':
- np.zeros((self.cfg.data_cfg.num_joints, 3),
- dtype=np.float32),
- 'joints_3d_visible':
- np.zeros((self.cfg.data_cfg.num_joints, 3),
- dtype=np.float32),
- 'rotation':
- 0,
- 'flip_pairs':
- self.dataset_info.flip_pairs,
- 'ann_info': {
- 'image_size':
- np.array(self.cfg.data_cfg['image_size']),
- 'num_joints': self.cfg.data_cfg['num_joints']
- }
+ det_bbox_result = input['detection_boxes']
+ det_bbox_scores = input['detection_scores']
+ img = mmcv.imread(image_path, 'color', self.INPUT_IMAGE_MODE)
+ for bbox, score in zip(det_bbox_result, det_bbox_scores):
+ center, scale = _box2cs(self.cfg.data_cfg['image_size'], bbox)
+ # prepare data
+ data = {
+ 'image_file':
+ image_path,
+ 'img':
+ img,
+ 'image_id':
+ 0,
+ 'center':
+ center,
+ 'scale':
+ scale,
+ 'bbox_score':
+ score,
+ 'bbox_id':
+ box_id, # need to be assigned if batch_size > 1
+ 'dataset':
+ 'coco_wholebody_hand',
+ 'joints_3d':
+ np.zeros((self.cfg.data_cfg.num_joints, 3), dtype=np.float32),
+ 'joints_3d_visible':
+ np.zeros((self.cfg.data_cfg.num_joints, 3), dtype=np.float32),
+ 'rotation':
+ 0,
+ 'flip_pairs':
+ self.dataset_info.flip_pairs,
+ 'ann_info': {
+ 'image_size': np.array(self.cfg.data_cfg['image_size']),
+ 'num_joints': self.cfg.data_cfg['num_joints']
}
- batch_data.append(data)
- box_id += 1
- return batch_data
+ }
+ data_list.append(data)
+ box_id += 1
+ return data_list
def preprocess_single(self, input):
results = []
@@ -128,8 +126,11 @@ class HandKeypointsPredictor(PredictorV2):
"""Process all inputs list. And collate to batch and put to target device.
If you need custom ops to load or process a batch samples, you need to reimplement it.
"""
+ # hand det and return source image
+ det_results = self.detection_predictor(inputs, keep_inputs=True)
+
batch_outputs = []
- for i in inputs:
+ for i in det_results:
for res in self.preprocess_single(i, *args, **kwargs):
batch_outputs.append(res)
batch_outputs = self._collate_fn(batch_outputs)
@@ -137,37 +138,25 @@ class HandKeypointsPredictor(PredictorV2):
return batch_outputs
def postprocess(self, inputs, *args, **kwargs):
- output = {}
- output['keypoints'] = inputs['preds']
- output['boxes'] = inputs['boxes']
- for i, bbox in enumerate(output['boxes']):
+ keypoints = inputs['preds']
+ boxes = inputs['boxes']
+ for i, bbox in enumerate(boxes):
center, scale = bbox[:2], bbox[2:4]
- output['boxes'][i][:4] = bbox_cs2xyxy(center, scale)
- output['boxes'] = output['boxes'][:, :4]
- return output
-
- def __call__(self, inputs, keep_inputs=False):
- if isinstance(inputs, str):
- inputs = [inputs]
-
- results_list = []
- for i in range(0, len(inputs), self.batch_size):
- batch = inputs[i:max(len(inputs) - 1, i + self.batch_size)]
- # hand det and return source image
- det_results = self.detection_predictor(batch, keep_inputs=True)
- # hand keypoints
- batch_outputs = self.preprocess(det_results)
- batch_outputs = self.forward(batch_outputs)
- results = self.postprocess(batch_outputs)
- if keep_inputs:
- results = {'inputs': batch, 'results': results}
- # if dump, the outputs will not added to the return value to prevent taking up too much memory
- if self.save_results:
- self.dump([results], self.save_path, mode='ab+')
- else:
- results_list.append(results)
-
- return results_list
+ boxes[i][:4] = bbox_cs2xyxy(center, scale)
+ boxes = boxes[:, :4]
+ # TODO: support multi bboxes for a single sample
+ assert len(keypoints.shape) == 3
+ assert len(boxes.shape) == 2
+ batch_outputs = []
+ batch_size = keypoints.shape[0]
+ keypoints = np.split(keypoints, batch_size)
+ boxes = np.split(boxes, batch_size)
+ for i in range(batch_size):
+ batch_outputs.append({
+ 'keypoints': keypoints[i],
+ 'boxes': boxes[i]
+ })
+ return batch_outputs
def show_result(self,
image_path,
diff --git a/easycv/predictors/segmentation.py b/easycv/predictors/segmentation.py
index 6916817b..51365653 100644
--- a/easycv/predictors/segmentation.py
+++ b/easycv/predictors/segmentation.py
@@ -5,22 +5,25 @@ import numpy as np
import torch
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
-from torchvision.transforms import Compose
from easycv.core.visualization.image import imshow_bboxes
-from easycv.datasets.registry import PIPELINES
-from easycv.file import io
-from easycv.models import build_model
from easycv.predictors.builder import PREDICTORS
-from easycv.predictors.interface import PredictorInterface
-from easycv.utils.checkpoint import load_checkpoint
-from easycv.utils.config_tools import mmcv_config_fromfile
-from easycv.utils.registry import build_from_cfg
from .base import PredictorV2
@PREDICTORS.register_module()
class SegmentationPredictor(PredictorV2):
+ """Predictor for Segmentation.
+
+ Args:
+        model_path (str): Path of the model file.
+        config_file (Optional[str]): Config file path for the model and processor to init. Defaults to None.
+        batch_size (int): Batch size for forward.
+        device (str): Supports 'cuda' or 'cpu'; if None, the device is detected automatically.
+        save_results (bool): Whether to save prediction results.
+ save_path (str): File path for saving results, only valid when `save_results` is True.
+ pipelines (list[dict]): Data pipeline configs.
+ """
def __init__(self,
model_path,
@@ -28,20 +31,21 @@ class SegmentationPredictor(PredictorV2):
batch_size=1,
device=None,
save_results=False,
- save_path=None):
- """Predict pipeline for Segmentation
+ save_path=None,
+ pipelines=None,
+ *args,
+ **kwargs):
- Args:
- model_path (str): Path of model path
- config_file (str): config file path for model and processor to init. Defaults to None.
- """
super(SegmentationPredictor, self).__init__(
model_path,
config_file,
batch_size=batch_size,
device=device,
save_results=save_results,
- save_path=save_path)
+ save_path=save_path,
+ pipelines=pipelines,
+ *args,
+ **kwargs)
self.CLASSES = self.cfg.CLASSES
self.PALETTE = self.cfg.PALETTE
@@ -123,71 +127,61 @@ class SegmentationPredictor(PredictorV2):
@PREDICTORS.register_module()
-class Mask2formerPredictor(PredictorInterface):
+class Mask2formerPredictor(SegmentationPredictor):
+ """Predictor for Mask2former.
- def __init__(self, model_path, model_config=None):
- """init model
+ Args:
+        model_path (str): Path of the model file.
+        config_file (Optional[str]): Config file path for the model and processor to init. Defaults to None.
+        batch_size (int): Batch size for forward.
+        device (str): Supports 'cuda' or 'cpu'; if None, the device is detected automatically.
+        save_results (bool): Whether to save prediction results.
+ save_path (str): File path for saving results, only valid when `save_results` is True.
+ pipelines (list[dict]): Data pipeline configs.
+ """
- Args:
- model_path (str): Path of model path
- model_config (config, optional): config string for model to init. Defaults to None.
+ def __init__(self,
+ model_path,
+ config_file=None,
+ batch_size=1,
+ device=None,
+ save_results=False,
+ save_path=None,
+ pipelines=None,
+ task_mode='panoptic',
+ *args,
+ **kwargs):
+ super(Mask2formerPredictor, self).__init__(
+ model_path,
+ config_file,
+ batch_size=batch_size,
+ device=device,
+ save_results=save_results,
+ save_path=save_path,
+ pipelines=pipelines,
+ *args,
+ **kwargs)
+ self.task_mode = task_mode
+
+ def forward(self, inputs):
+ """Model forward.
"""
- self.model_path = model_path
+ with torch.no_grad():
+ outputs = self.model(**inputs, mode='test', encode=False)
+ return outputs
- self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
- self.model = None
- with io.open(self.model_path, 'rb') as infile:
- checkpoint = torch.load(infile, map_location='cpu')
-
- assert 'meta' in checkpoint and 'config' in checkpoint[
- 'meta'], 'meta.config is missing from checkpoint'
-
- self.cfg = checkpoint['meta']['config']
- self.classes = len(self.cfg.PALETTE)
- self.class_name = self.cfg.CLASSES
- # build model
- self.model = build_model(self.cfg.model)
-
- self.ckpt = load_checkpoint(
- self.model, self.model_path, map_location=self.device)
- self.model.to(self.device)
- self.model.eval()
-
- # build pipeline
- test_pipeline = self.cfg.test_pipeline
- pipeline = [build_from_cfg(p, PIPELINES) for p in test_pipeline]
- self.pipeline = Compose(pipeline)
-
- def predict(self, input_data_list, mode='panoptic'):
- """
- Args:
- input_data_list: a list of numpy array(in rgb order), each array is a sample
- to be predicted
- """
- output_list = []
- for idx, img in enumerate(input_data_list):
- output = {}
- if not isinstance(img, np.ndarray):
- img = np.asarray(img)
- data_dict = {'img': img}
- ori_shape = img.shape
- data_dict = self.pipeline(data_dict)
- img = data_dict['img']
- img[0] = torch.unsqueeze(img[0], 0).to(self.device)
- img_metas = [[
- img_meta._data for img_meta in data_dict['img_metas']
- ]]
- img_metas[0][0]['ori_shape'] = ori_shape
- res = self.model.forward_test(img, img_metas, encode=False)
- if mode == 'panoptic':
- output['pan'] = res['pan_results'][0]
- elif mode == 'instance':
- output['segms'] = res['detection_masks'][0]
- output['bboxes'] = res['detection_boxes'][0]
- output['scores'] = res['detection_scores'][0]
- output['labels'] = res['detection_classes'][0]
- output_list.append(output)
- return output_list
+ def postprocess(self, inputs):
+ output = {}
+ if self.task_mode == 'panoptic':
+ output['pan'] = inputs['pan_results'][0]
+ elif self.task_mode == 'instance':
+ output['segms'] = inputs['detection_masks'][0]
+ output['bboxes'] = inputs['detection_boxes'][0]
+ output['scores'] = inputs['detection_scores'][0]
+ output['labels'] = inputs['detection_classes'][0]
+ else:
+            raise ValueError(f'Unsupported task_mode: {self.task_mode}')
+ return output
def show_panoptic(self, img, pan_mask):
pan_label = np.unique(pan_mask)
@@ -214,147 +208,6 @@ class Mask2formerPredictor(PredictorInterface):
return instance_result
-@PREDICTORS.register_module()
-class SegFormerPredictor(PredictorInterface):
-
- def __init__(self, model_path, model_config):
- """init model
-
- Args:
- model_path (str): Path of model path
- model_config (config): config string for model to init. Defaults to None.
- """
- self.model_path = model_path
-
- self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
- self.model = None
- with io.open(self.model_path, 'rb') as infile:
- checkpoint = torch.load(infile, map_location='cpu')
-
- self.cfg = mmcv_config_fromfile(model_config)
- self.CLASSES = self.cfg.CLASSES
- self.PALETTE = self.cfg.PALETTE
- # build model
- self.model = build_model(self.cfg.model)
-
- self.ckpt = load_checkpoint(
- self.model, self.model_path, map_location=self.device)
- self.model.to(self.device)
- self.model.eval()
-
- # build pipeline
- test_pipeline = self.cfg.test_pipeline
- pipeline = [build_from_cfg(p, PIPELINES) for p in test_pipeline]
- self.pipeline = Compose(pipeline)
-
- def predict(self, input_data_list):
- """
- using session run predict a number of samples using batch_size
-
- Args:
- input_data_list: a list of numpy array(in rgb order), each array is a sample
- to be predicted
- use a fixed number if you do not want to adjust batch_size in runtime
- """
- output_list = []
- for idx, img in enumerate(input_data_list):
- if type(img) is not np.ndarray:
- img = np.asarray(img)
-
- ori_img_shape = img.shape[:2]
-
- data_dict = {'img': img}
- data_dict['ori_shape'] = ori_img_shape
- data_dict = self.pipeline(data_dict)
- img = data_dict['img']
- img = torch.unsqueeze(img[0], 0).to(self.device)
- data_dict.pop('img')
-
- with torch.no_grad():
- out = self.model([img],
- mode='test',
- img_metas=[[data_dict['img_metas'][0]._data]])
-
- output_list.append(out)
-
- return output_list
-
- def show_result(self,
- img,
- result,
- palette=None,
- win_name='',
- show=False,
- wait_time=0,
- out_file=None,
- opacity=0.5):
- """Draw `result` over `img`.
-
- Args:
- img (str or Tensor): The image to be displayed.
- result (Tensor): The semantic segmentation results to draw over
- `img`.
- palette (list[list[int]]] | np.ndarray | None): The palette of
- segmentation map. If None is given, random palette will be
- generated. Default: None
- win_name (str): The window name.
- wait_time (int): Value of waitKey param.
- Default: 0.
- show (bool): Whether to show the image.
- Default: False.
- out_file (str or None): The filename to write the image.
- Default: None.
- opacity(float): Opacity of painted segmentation map.
- Default 0.5.
- Must be in (0, 1] range.
- Returns:
- img (Tensor): Only if not `show` or `out_file`
- """
-
- img = mmcv.imread(img)
- img = img.copy()
- seg = result[0]
- if palette is None:
- if self.PALETTE is None:
- # Get random state before set seed,
- # and restore random state later.
- # It will prevent loss of randomness, as the palette
- # may be different in each iteration if not specified.
- # See: https://github.com/open-mmlab/mmdetection/issues/5844
- state = np.random.get_state()
- np.random.seed(42)
- # random palette
- palette = np.random.randint(
- 0, 255, size=(len(self.CLASSES), 3))
- np.random.set_state(state)
- else:
- palette = self.PALETTE
- palette = np.array(palette)
- assert palette.shape[0] == len(self.CLASSES)
- assert palette.shape[1] == 3
- assert len(palette.shape) == 2
- assert 0 < opacity <= 1.0
- color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
- for label, color in enumerate(palette):
- color_seg[seg == label, :] = color
- # convert to BGR
- color_seg = color_seg[..., ::-1]
-
- img = img * (1 - opacity) + color_seg * opacity
- img = img.astype(np.uint8)
- # if out_file specified, do not show image in window
- if out_file is not None:
- show = False
-
- if show:
- mmcv.imshow(img, win_name, wait_time)
- if out_file is not None:
- mmcv.imwrite(img, out_file)
-
- if not (show or out_file):
- return img
-
-
def _get_bias_color(base, max_dist=30):
"""Get different colors for each masks.
diff --git a/easycv/utils/checkpoint.py b/easycv/utils/checkpoint.py
index c583d9a0..4c987c83 100644
--- a/easycv/utils/checkpoint.py
+++ b/easycv/utils/checkpoint.py
@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
import os
import torch
@@ -8,6 +9,7 @@ from mmcv.runner.checkpoint import get_state_dict, weights_to_cpu
from torch.optim import Optimizer
from easycv.file import io
+from easycv.file.utils import is_url_path
from easycv.framework.errors import TypeError
from easycv.utils.constant import CACHE_DIR
@@ -32,28 +34,40 @@ def load_checkpoint(model,
Returns:
dict or OrderedDict: The loaded checkpoint.
"""
- if not filename.startswith('oss://'):
- return mmcv_load_checkpoint(
- model,
- filename,
- map_location=map_location,
- strict=strict,
- logger=logger)
- else:
+ if filename.startswith('oss://'):
_, fname = os.path.split(filename)
cache_file = os.path.join(CACHE_DIR, fname)
+ if not os.path.exists(CACHE_DIR):
+ os.makedirs(CACHE_DIR)
if not os.path.exists(cache_file):
- print(f'download checkpoint from {filename} to {cache_file}')
+ logging.info(
+ f'download checkpoint from {filename} to {cache_file}')
io.copy(filename, cache_file)
if torch.distributed.is_available(
) and torch.distributed.is_initialized():
torch.distributed.barrier()
- return mmcv_load_checkpoint(
- model,
- cache_file,
- map_location=map_location,
- strict=strict,
- logger=logger)
+ filename = cache_file
+ elif is_url_path(filename):
+ from torch.hub import urlparse, download_url_to_file
+ parts = urlparse(filename)
+ base_name = os.path.basename(parts.path)
+ cache_file = os.path.join(CACHE_DIR, base_name)
+ if not os.path.exists(CACHE_DIR):
+ os.makedirs(CACHE_DIR)
+ if not os.path.exists(cache_file):
+ logging.info(
+ f'download checkpoint from {filename} to {cache_file}')
+ download_url_to_file(filename, cache_file)
+ if torch.distributed.is_available(
+ ) and torch.distributed.is_initialized():
+ torch.distributed.barrier()
+ filename = cache_file
+ return mmcv_load_checkpoint(
+ model,
+ filename,
+ map_location=map_location,
+ strict=strict,
+ logger=logger)
def save_checkpoint(model, filename, optimizer=None, meta=None):
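
With the change above, load_checkpoint caches both oss:// and plain http(s) checkpoints under CACHE_DIR before handing the local file to mmcv_load_checkpoint. A hedged sketch of a call (the URL and model are placeholders; the keyword arguments mirror mmcv's load_checkpoint):

    import torch.nn as nn
    from easycv.utils.checkpoint import load_checkpoint

    model = nn.Linear(8, 2)  # stand-in module; use a real EasyCV model in practice
    ckpt = load_checkpoint(
        model,
        'https://example.com/checkpoints/model.pth',  # placeholder URL, cached to CACHE_DIR
        map_location='cpu',
        strict=False)
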
diff --git a/easycv/utils/constant.py b/easycv/utils/constant.py
index 981a8bbb..87afc813 100644
--- a/easycv/utils/constant.py
+++ b/easycv/utils/constant.py
@@ -1,4 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
-CACHE_DIR = '.easycv_cache'
+import os
+
+CACHE_DIR = os.path.expanduser('~/.cache/easycv/')
MAX_READ_IMAGE_TRY_TIMES = 10
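
Since CACHE_DIR now expands to an absolute per-user directory instead of the relative '.easycv_cache', downstream code can create and reuse it regardless of the working directory; a small check:

    import os
    from easycv.utils.constant import CACHE_DIR

    print(CACHE_DIR)                       # e.g. /home/<user>/.cache/easycv/
    os.makedirs(CACHE_DIR, exist_ok=True)  # safe to call from any working directory
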
diff --git a/easycv/utils/misc.py b/easycv/utils/misc.py
index 8e544b96..cce21293 100644
--- a/easycv/utils/misc.py
+++ b/easycv/utils/misc.py
@@ -1,12 +1,12 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+import functools
+import inspect
import logging
+import warnings
from functools import partial
import mmcv
import numpy as np
-from six.moves import map, zip
-
-from easycv.models.backbones.repvgg_yolox_backbone import RepVGGBlock
def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True):
@@ -79,6 +79,8 @@ def reparameterize_models(model):
Args:
model: nn.Module
"""
+ from easycv.models.backbones.repvgg_yolox_backbone import RepVGGBlock
+
reparameterize_count = 0
for layer in model.modules():
if isinstance(layer, RepVGGBlock):
@@ -89,3 +91,31 @@ def reparameterize_models(model):
.format(reparameterize_count))
print('reparam:', reparameterize_count)
return model
+
+
+def deprecated(reason):
+    """Decorator to mark functions or classes as deprecated.
+
+    Calling the decorated object emits a DeprecationWarning with the
+    given reason.
+    """
+
+ def decorator(func1):
+ if inspect.isclass(func1):
+ fmt1 = 'Call to deprecated class {name} ({reason}).'
+ else:
+ fmt1 = 'Call to deprecated function {name} ({reason}).'
+
+ @functools.wraps(func1)
+ def new_func1(*args, **kwargs):
+ warnings.simplefilter('always', DeprecationWarning)
+ warnings.warn(
+ fmt1.format(name=func1.__name__, reason=reason),
+ category=DeprecationWarning,
+ stacklevel=2)
+ warnings.simplefilter('default', DeprecationWarning)
+ return func1(*args, **kwargs)
+
+ return new_func1
+
+ return decorator
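
A short usage example for the new deprecated decorator (the function name and reason are illustrative):

    from easycv.utils.misc import deprecated

    @deprecated(reason='use DetectionPredictor instead')
    def old_predict(x):
        return x

    old_predict(1)  # warns: Call to deprecated function old_predict (use DetectionPredictor instead).
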
diff --git a/tests/datasets/segmentation/test_seg_raw_dataset.py b/tests/datasets/segmentation/test_seg_raw_dataset.py
index 9ad7d489..b9e5a628 100644
--- a/tests/datasets/segmentation/test_seg_raw_dataset.py
+++ b/tests/datasets/segmentation/test_seg_raw_dataset.py
@@ -7,9 +7,7 @@ from tests.ut_config import (IMG_NORM_CFG_255, SEG_DATA_SMALL_RAW_LOCAL,
from easycv.core.evaluation.builder import build_evaluator
from easycv.datasets.builder import build_datasource
-from easycv.datasets.segmentation.data_sources.raw import SegSourceRaw
from easycv.datasets.segmentation.raw import SegDataset
-from easycv.file import io
class SegDatasetTest(unittest.TestCase):
diff --git a/tests/predictors/test_classifier.py b/tests/predictors/test_classifier.py
index 8aef4778..c546dfe6 100644
--- a/tests/predictors/test_classifier.py
+++ b/tests/predictors/test_classifier.py
@@ -8,14 +8,57 @@ import unittest
import cv2
import torch
-
-from easycv.predictors.classifier import TorchClassifier
-
+from easycv.predictors.builder import build_predictor
from easycv.utils.test_util import clean_up, get_tmp_dir
+from easycv.utils.config_tools import mmcv_config_fromfile
from tests.ut_config import (PRETRAINED_MODEL_RESNET50_WITHOUTHEAD,
IMAGENET_LABEL_TXT, TEST_IMAGES_DIR)
+class ClassificationPredictorTest(unittest.TestCase):
+
+ def setUp(self):
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+ def test_single(self):
+ checkpoint = PRETRAINED_MODEL_RESNET50_WITHOUTHEAD
+ config_file = 'configs/classification/imagenet/resnet/imagenet_resnet50_jpg.py'
+ cfg = mmcv_config_fromfile(config_file)
+ predict_op = build_predictor(
+ dict(
+ **cfg.predict,
+ model_path=checkpoint,
+ config_file=config_file,
+ label_map_path=IMAGENET_LABEL_TXT))
+ img_path = os.path.join(TEST_IMAGES_DIR, 'catb.jpg')
+
+ results = predict_op([img_path])[0]
+ self.assertListEqual(results['class'], [283])
+ self.assertListEqual(results['class_name'], ['"Persian cat",'])
+ self.assertEqual(len(results['class_probs']), 1000)
+
+ def test_batch(self):
+ checkpoint = PRETRAINED_MODEL_RESNET50_WITHOUTHEAD
+ config_file = 'configs/classification/imagenet/resnet/imagenet_resnet50_jpg.py'
+ cfg = mmcv_config_fromfile(config_file)
+ predict_op = build_predictor(
+ dict(
+ **cfg.predict,
+ model_path=checkpoint,
+ config_file=config_file,
+ label_map_path=IMAGENET_LABEL_TXT,
+ batch_size=3))
+ img_path = os.path.join(TEST_IMAGES_DIR, 'catb.jpg')
+
+ num_imgs = 4
+ results = predict_op([img_path] * num_imgs)
+ self.assertEqual(len(results), num_imgs)
+ for res in results:
+ self.assertListEqual(res['class'], [283])
+ self.assertListEqual(res['class_name'], ['"Persian cat",'])
+ self.assertEqual(len(res['class_probs']), 1000)
+
+
class TorchClassifierTest(unittest.TestCase):
def setUp(self):
@@ -62,6 +105,8 @@ class TorchClassifierTest(unittest.TestCase):
output_ckpt = f'{self.tmp_dir}/export.pth'
torch.save(output_dict, output_ckpt)
+ from easycv.predictors.classifier import TorchClassifier
+
fe = TorchClassifier(
output_ckpt, topk=topk, label_map_path=IMAGENET_LABEL_TXT)
diff --git a/tests/predictors/test_detector.py b/tests/predictors/test_detector.py
index c3be2ed6..1b160a01 100644
--- a/tests/predictors/test_detector.py
+++ b/tests/predictors/test_detector.py
@@ -4,11 +4,11 @@ isort:skip_file
"""
import os
import unittest
-
+import tempfile
import numpy as np
from PIL import Image
-from easycv.predictors.detector import TorchYoloXPredictor, DetrPredictor
+from easycv.predictors.detector import TorchYoloXPredictor, DetectionPredictor
from tests.ut_config import (PRETRAINED_MODEL_YOLOXS_EXPORT,
PRETRAINED_MODEL_YOLOXS_EXPORT_OLD,
PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_JIT,
@@ -154,25 +154,18 @@ class DetectorTest(unittest.TestCase):
[510.37033, 268.4982, 527.67017, 273.04935]]),
decimal=1)
- def test_vitdet_detector(self):
- model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100_export.pth'
- img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg'
- out_file = './result.jpg'
- vitdet = DetrPredictor(model_path)
- output = vitdet.predict(img)
- vitdet.visualize(img, output, out_file=out_file)
-
+ def _detection_detector_assert(self, output):
self.assertIn('detection_boxes', output)
self.assertIn('detection_scores', output)
self.assertIn('detection_classes', output)
self.assertIn('detection_masks', output)
self.assertIn('img_metas', output)
- self.assertEqual(len(output['detection_boxes'][0]), 33)
- self.assertEqual(len(output['detection_scores'][0]), 33)
- self.assertEqual(len(output['detection_classes'][0]), 33)
+ self.assertEqual(len(output['detection_boxes']), 33)
+ self.assertEqual(len(output['detection_scores']), 33)
+ self.assertEqual(len(output['detection_classes']), 33)
self.assertListEqual(
- output['detection_classes'][0].tolist(),
+ output['detection_classes'].tolist(),
np.array([
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 7, 7, 13, 13, 13, 56
@@ -180,7 +173,7 @@ class DetectorTest(unittest.TestCase):
dtype=np.int32).tolist())
assert_array_almost_equal(
- output['detection_scores'][0],
+ output['detection_scores'],
np.array([
0.9975854158401489, 0.9965696334838867, 0.9922919869422913,
0.9833580851554871, 0.983080267906189, 0.970454752445221,
@@ -198,7 +191,7 @@ class DetectorTest(unittest.TestCase):
decimal=2)
assert_array_almost_equal(
- output['detection_boxes'][0],
+ output['detection_boxes'],
np.array([[
294.22674560546875, 116.6078109741211, 379.4328918457031,
150.14097595214844
@@ -333,6 +326,32 @@ class DetectorTest(unittest.TestCase):
]]),
decimal=1)
+ def test_detection_detector_single(self):
+ model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100_export.pth'
+ img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg'
+ vitdet = DetectionPredictor(model_path, score_threshold=0.0)
+ output = vitdet(img)
+ output = output[0]
+ with tempfile.NamedTemporaryFile(suffix='.jpg') as tmp_file:
+ tmp_save_path = tmp_file.name
+ vitdet.visualize(img, output, out_file=tmp_save_path)
+ self._detection_detector_assert(output)
+
+ def test_detection_detector_batch(self):
+ model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100_export.pth'
+ img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg'
+ vitdet = DetectionPredictor(
+ model_path, score_threshold=0.0, batch_size=2)
+ num_samples = 3
+ images = [img] * num_samples
+ outputs = vitdet(images)
+ self.assertEqual(len(outputs), num_samples)
+ for output in outputs:
+ with tempfile.NamedTemporaryFile(suffix='.jpg') as tmp_file:
+ tmp_save_path = tmp_file.name
+ vitdet.visualize(img, output, out_file=tmp_save_path)
+ self._detection_detector_assert(output)
+
if __name__ == '__main__':
unittest.main()
diff --git a/tests/predictors/test_detector_blade.py b/tests/predictors/test_detector_blade.py
index 3f3aae65..143425a3 100644
--- a/tests/predictors/test_detector_blade.py
+++ b/tests/predictors/test_detector_blade.py
@@ -3,22 +3,14 @@
isort:skip_file
"""
import os
-import tempfile
import unittest
-import cv2
import numpy as np
from PIL import Image
from easycv.predictors.detector import TorchYoloXPredictor
-from tests.ut_config import (PRETRAINED_MODEL_YOLOXS_EXPORT,
- PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_JIT,
- PRETRAINED_MODEL_YOLOXS_PRE_NOTRT_JIT,
- PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_BLADE,
+from tests.ut_config import (PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_BLADE,
PRETRAINED_MODEL_YOLOXS_PRE_NOTRT_BLADE,
DET_DATA_SMALL_COCO_LOCAL)
-from easycv.utils.test_util import benchmark
-import logging
-import pandas as pd
import torch
from numpy.testing import assert_array_almost_equal
@@ -37,7 +29,6 @@ class DetectorTest(unittest.TestCase):
input_data_list = [np.asarray(Image.open(img))]
blade_path = PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_BLADE
- # blade_path = '/home/zouxinyi.zxy/easycv_nfs/pretrained_models/detection/infer_yolox/debug_blade.pt.blade'
predictor_blade = TorchYoloXPredictor(
model_path=blade_path, score_thresh=0.5)
diff --git a/tests/predictors/test_face_keypoints_predictor.py b/tests/predictors/test_face_keypoints_predictor.py
index 67482e51..3f62319a 100644
--- a/tests/predictors/test_face_keypoints_predictor.py
+++ b/tests/predictors/test_face_keypoints_predictor.py
@@ -19,7 +19,7 @@ class FaceKeypointsPredictorWithoutDetectorTest(unittest.TestCase):
def test_single(self):
predict_pipeline = FaceKeypointsPredictor(
model_path=self.model_path, config_file=self.model_config_path)
- output = predict_pipeline(self.image_path)[0][0]
+ output = predict_pipeline(self.image_path)[0]
output_keypoints = output['point']
output_pose = output['pose']
img = cv2.imread(self.image_path)
@@ -38,18 +38,10 @@ class FaceKeypointsPredictorWithoutDetectorTest(unittest.TestCase):
total_samples = 3
output = predict_pipeline([self.image_path] * total_samples)
- self.assertEqual(len(output), 2)
- self.assertEqual(len(output[0]), 2)
- self.assertEqual(len(output[1]), 1)
- self.assertEqual(output[0][0]['point'].shape[0], 106)
- self.assertEqual(output[0][0]['point'].shape[1], 2)
- self.assertEqual(output[0][0]['pose'].shape[0], 3)
- self.assertEqual(output[0][1]['point'].shape[0], 106)
- self.assertEqual(output[0][1]['point'].shape[1], 2)
- self.assertEqual(output[0][1]['pose'].shape[0], 3)
- self.assertEqual(output[1][0]['point'].shape[0], 106)
- self.assertEqual(output[1][0]['point'].shape[1], 2)
- self.assertEqual(output[1][0]['pose'].shape[0], 3)
+ self.assertEqual(len(output), total_samples)
+ for out in output:
+ self.assertEqual(out['point'].shape, (106, 2))
+ self.assertEqual(out['pose'].shape, (3, ))
if __name__ == '__main__':
diff --git a/tests/predictors/test_hand_keypoints_predictor.py b/tests/predictors/test_hand_keypoints_predictor.py
index b2bca4cf..4a325098 100644
--- a/tests/predictors/test_hand_keypoints_predictor.py
+++ b/tests/predictors/test_hand_keypoints_predictor.py
@@ -39,6 +39,37 @@ class HandKeypointsPredictorTest(unittest.TestCase):
self.assertEqual(keypoints.shape[1], 21)
self.assertEqual(keypoints.shape[2], 3)
+ def test_batch(self):
+ config = mmcv_config_fromfile(self.model_config_path)
+ predict_pipeline = HandKeypointsPredictor(
+ model_path=self.model_path,
+ config_file=config,
+ batch_size=2,
+ detection_predictor_config=dict(
+ type='DetectionPredictor',
+ model_path=MM_DEFAULT_HAND_DETECTION_SSDLITE_MODEL_PATH,
+ config_file=MM_DEFAULT_HAND_DETECTION_SSDLITE_CONFIG_FILE,
+ score_threshold=0.5))
+
+ num_samples = 4
+ outputs = predict_pipeline(
+ [self.image_path] * num_samples, keep_inputs=True)
+ base_keypoints = outputs[0]['keypoints']
+ base_boxes = outputs[0]['boxes']
+ for output in outputs:
+ keypoints = output['keypoints']
+ boxes = output['boxes']
+ image_show = predict_pipeline.show_result(
+ self.image_path,
+ keypoints,
+ boxes,
+ save_path=self.save_image_path)
+ self.assertEqual(keypoints.shape, (1, 21, 3))
+ self.assertEqual(boxes.shape, (1, 4))
+ self.assertListEqual(keypoints.tolist(), base_keypoints.tolist())
+ self.assertListEqual(boxes.tolist(), base_boxes.tolist())
+ self.assertEqual(output['inputs'], self.image_path)
+
if __name__ == '__main__':
unittest.main()
diff --git a/tests/predictors/test_segmentation.py b/tests/predictors/test_segmentation.py
index e84a3e1a..5b36a2fb 100644
--- a/tests/predictors/test_segmentation.py
+++ b/tests/predictors/test_segmentation.py
@@ -8,6 +8,7 @@ import unittest
import numpy as np
from PIL import Image
from tests.ut_config import (MODEL_CONFIG_SEGFORMER,
+ PRETRAINED_MODEL_MASK2FORMER_DIR,
PRETRAINED_MODEL_SEGFORMER, TEST_IMAGES_DIR)
from easycv.predictors.segmentation import SegmentationPredictor
@@ -31,14 +32,14 @@ class SegmentationPredictorTest(unittest.TestCase):
outputs = predict_pipeline(img_path, keep_inputs=True)
self.assertEqual(len(outputs), 1)
- self.assertEqual(outputs[0]['inputs'], [img_path])
+ results = outputs[0]
+ self.assertEqual(results['inputs'], img_path)
- results = outputs[0]['results']
self.assertListEqual(
- list(img.shape)[:2], list(results['seg_pred'][0].shape))
- self.assertListEqual(results['seg_pred'][0][1, :10].tolist(),
+ list(img.shape)[:2], list(results['seg_pred'].shape))
+ self.assertListEqual(results['seg_pred'][1, :10].tolist(),
[161 for i in range(10)])
- self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(),
+ self.assertListEqual(results['seg_pred'][-1, -10:].tolist(),
[133 for i in range(10)])
def test_batch(self):
@@ -56,19 +57,15 @@ class SegmentationPredictorTest(unittest.TestCase):
total_samples = 3
outputs = predict_pipeline(
[img_path] * total_samples, keep_inputs=True)
- self.assertEqual(len(outputs), 2)
+ self.assertEqual(len(outputs), 3)
- self.assertEqual(outputs[0]['inputs'], [img_path] * 2)
- self.assertEqual(outputs[1]['inputs'], [img_path] * 1)
- self.assertEqual(len(outputs[0]['results']['seg_pred']), 2)
- self.assertEqual(len(outputs[1]['results']['seg_pred']), 1)
-
- for result in [outputs[0]['results'], outputs[1]['results']]:
+ for i in range(len(outputs)):
+ self.assertEqual(outputs[i]['inputs'], img_path)
self.assertListEqual(
- list(img.shape)[:2], list(result['seg_pred'][0].shape))
- self.assertListEqual(result['seg_pred'][0][1, :10].tolist(),
+ list(img.shape)[:2], list(outputs[i]['seg_pred'].shape))
+ self.assertListEqual(outputs[i]['seg_pred'][1, :10].tolist(),
[161 for i in range(10)])
- self.assertListEqual(result['seg_pred'][0][-1, -10:].tolist(),
+ self.assertListEqual(outputs[i]['seg_pred'][-1, -10:].tolist(),
[133 for i in range(10)])
def test_dump(self):
@@ -91,17 +88,47 @@ class SegmentationPredictorTest(unittest.TestCase):
total_samples = 3
outputs = predict_pipeline(
- [img_path] * total_samples, keep_inputs=True)
+ [img_path] * total_samples, keep_inputs=False)
self.assertEqual(outputs, [])
with open(tmp_path, 'rb') as f:
results = pickle.loads(f.read())
- self.assertIn('inputs', results[0])
- self.assertIn('results', results[0])
+ for res in results:
+ self.assertNotIn('inputs', res)
+ self.assertIn('seg_pred', res)
shutil.rmtree(temp_dir, ignore_errors=True)
+@unittest.skipIf(True, 'WIP')
+class Mask2formerPredictorTest(unittest.TestCase):
+
+ def test_single(self):
+ import cv2
+ from easycv.predictors.segmentation import Mask2formerPredictor
+ pan_ckpt = os.path.join(PRETRAINED_MODEL_MASK2FORMER_DIR,
+ 'mask2former_pan_export.pth')
+ instance_ckpt = os.path.join(PRETRAINED_MODEL_MASK2FORMER_DIR,
+ 'mask2former_r50_instance.pth')
+ img_path = os.path.join(TEST_IMAGES_DIR, 'mask2former.jpg')
+
+        # panoptic
+        predictor = Mask2formerPredictor(
+            model_path=pan_ckpt, task_mode='panoptic')
+ img = cv2.imread(img_path)
+ predict_out = predictor([img])
+ pan_img = predictor.show_panoptic(img, predict_out[0]['pan'])
+ cv2.imwrite('pan_out.jpg', pan_img)
+
+ # instance
+ predictor = Mask2formerPredictor(
+            model_path=instance_ckpt, task_mode='instance')
+        img = cv2.imread(img_path)
+        predict_out = predictor([img])
+ instance_img = predictor.show_instance(img, **predict_out[0])
+ cv2.imwrite('instance_out.jpg', instance_img)
+
+
if __name__ == '__main__':
unittest.main()
diff --git a/tests/predictors/test_segmentor.py b/tests/predictors/test_segmentor.py
deleted file mode 100644
index 1ca3eece..00000000
--- a/tests/predictors/test_segmentor.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-"""
-isort:skip_file
-"""
-import os
-import unittest
-
-import numpy as np
-from PIL import Image
-
-from tests.ut_config import TEST_IMAGES_DIR
-from tests.ut_config import (PRETRAINED_MODEL_SEGFORMER,
- MODEL_CONFIG_SEGFORMER)
-from easycv.predictors.segmentation import SegFormerPredictor
-
-
-class SegmentorTest(unittest.TestCase):
-
- def setUp(self):
- print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
-
- def test_segformer_detector(self):
- segmentation_model_path = PRETRAINED_MODEL_SEGFORMER
- segmentation_model_config = MODEL_CONFIG_SEGFORMER
-
- img = os.path.join(TEST_IMAGES_DIR, '000000289059.jpg')
- if not os.path.exists(img):
- img = './data/test/segmentation/coco_stuff_164k/val2017/000000289059.jpg'
-
- input_data_list = [np.asarray(Image.open(img))]
- predictor = SegFormerPredictor(
- model_path=segmentation_model_path,
- model_config=segmentation_model_config)
-
- output = predictor.predict(input_data_list)[0]
- self.assertIn('seg_pred', output)
-
- self.assertListEqual(
- list(input_data_list[0].shape)[:2],
- list(output['seg_pred'][0].shape))
- self.assertListEqual(output['seg_pred'][0][1, :10].tolist(),
- [161 for i in range(10)])
- self.assertListEqual(output['seg_pred'][0][-1, -10:].tolist(),
- [133 for i in range(10)])
-
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/tests/ut_config.py b/tests/ut_config.py
index e053b630..42c8620b 100644
--- a/tests/ut_config.py
+++ b/tests/ut_config.py
@@ -120,10 +120,10 @@ PRETRAINED_MODEL_YOLOX_COMPRESSION = os.path.join(
BASE_LOCAL_PATH, 'pretrained_models/compression/yolox_compression.pth')
PRETRAINED_MODEL_MAE = os.path.join(
BASE_LOCAL_PATH, 'pretrained_models/classification/vit/mae_vit_b_1600.pth')
-PRETRAINED_MODEL_MASK2FORMER = os.path.join(
- BASE_LOCAL_PATH,
- 'pretrained_models/segmentation/mask2former/mask2former_r50_instance.pth')
-
+PRETRAINED_MODEL_MASK2FORMER_DIR = os.path.join(
+ BASE_LOCAL_PATH, 'pretrained_models/segmentation/mask2former/')
+PRETRAINED_MODEL_MASK2FORMER = os.path.join(PRETRAINED_MODEL_MASK2FORMER_DIR,
+ 'mask2former_r50_instance.pth')
PRETRAINED_MODEL_SEGFORMER = os.path.join(
BASE_LOCAL_PATH,
'pretrained_models/segmentation/segformer/segformer_b0/SegmentationEvaluator_mIoU_best.pth'
diff --git a/thirdparty/u2sod/sodpredictor.py b/thirdparty/u2sod/sodpredictor.py
index a336c1ac..ff981379 100644
--- a/thirdparty/u2sod/sodpredictor.py
+++ b/thirdparty/u2sod/sodpredictor.py
@@ -21,6 +21,7 @@ except:
from easycv.predictors.builder import build_predictor, PREDICTORS
+from easycv.utils.constant import CACHE_DIR
def normPRED(d):
@@ -47,8 +48,8 @@ class SODPredictor(object):
"""
def load_url_weights(name, url_index="http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/evtorch_thirdparty/u2net_sod/", map_location=None):
- os.makedirs('.easycv_cache', exist_ok=True)
- local_model = os.path.join('.easycv_cache', name+'.pth')
+ os.makedirs(CACHE_DIR, exist_ok=True)
+ local_model = os.path.join(CACHE_DIR, name+'.pth')
if os.path.exists(local_model):
weights = torch.load(local_model)
if weights is not None: