Align SPOS and DetNAS to MMRazor2.0
parent
2d5e8bc675
commit
6c920c88ee
|
@ -0,0 +1,12 @@
|
||||||
|
#!/usr/bin/env sh
|
||||||
|
|
||||||
|
|
||||||
|
MKL_NUM_THREADS=4
|
||||||
|
OMP_NUM_THREADS=1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# bash tools/slurm_train.sh mm_model detnas_train configs/nas/detnas/detnas_supernet_shufflenetv2_coco_1x_2.0_frcnn.py /mnt/lustre/dongpeijie/checkpoints/tests/detnas_pretrain_test
|
||||||
|
|
||||||
|
|
||||||
|
bash tools/slurm_test.sh mm_model angle_test configs/nas/spos/spos_subnet_mobilenet_proxyless_gpu_8xb128_in1k_2.0.py /mnt/lustre/dongpeijie/spos_angelnas_flops_0.49G_acc_75.98_20220307-54f4698f_2.0.pth
|
|
@ -0,0 +1,56 @@
|
||||||
|
#!/usr/bin/env sh
|
||||||
|
|
||||||
|
|
||||||
|
MKL_NUM_THREADS=4
|
||||||
|
OMP_NUM_THREADS=1
|
||||||
|
|
||||||
|
# train
|
||||||
|
# srun --partition=mm_model \
|
||||||
|
# --job-name=spos_train \
|
||||||
|
# --gres=gpu:8 \
|
||||||
|
# --ntasks=8 \
|
||||||
|
# --ntasks-per-node=8 \
|
||||||
|
# --cpus-per-task=8 \
|
||||||
|
# --kill-on-bad-exit=1 \
|
||||||
|
# python tools/train.py configs/nas/spos/spos_supernet_shufflenetv2_8xb128_in1k_2.0_example.py
|
||||||
|
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_train configs/nas/spos/spos_supernet_shufflenetv2_8xb128_in1k_2.0_example.py ./work_dir/spos
|
||||||
|
|
||||||
|
# SPOS test
|
||||||
|
# srun --partition=mm_model \
|
||||||
|
# --job-name=spos_test \
|
||||||
|
# --gres=gpu:1 \
|
||||||
|
# --ntasks=1 \
|
||||||
|
# --ntasks-per-node=1 \
|
||||||
|
# --cpus-per-task=8 \
|
||||||
|
# --kill-on-bad-exit=1 \
|
||||||
|
# python tools/test.py configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example.py "/mnt/lustre/dongpeijie/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_2.0.pth"
|
||||||
|
|
||||||
|
# DetNAS train
|
||||||
|
# srun --partition=mm_model \
|
||||||
|
# --job-name=detnas_train \
|
||||||
|
# --gres=gpu:8 \
|
||||||
|
# --ntasks=8 \
|
||||||
|
# --ntasks-per-node=8 \
|
||||||
|
# --cpus-per-task=8 \
|
||||||
|
# --kill-on-bad-exit=1 \
|
||||||
|
# python tools/train.py configs/nas/detnas/detnas_supernet_shufflenetv2_coco_1x_2.0_frcnn.py
|
||||||
|
|
||||||
|
# bash tools/slurm_train.sh mm_model detnas_train configs/nas/detnas/detnas_supernet_shufflenetv2_coco_1x_2.0_frcnn.py ./work_dir/detnas_pretrain
|
||||||
|
|
||||||
|
# DetNAS test
|
||||||
|
# srun --partition=mm_model \
|
||||||
|
# --job-name=detnas_test \
|
||||||
|
# --gres=gpu:1 \
|
||||||
|
# --ntasks=1 \
|
||||||
|
# --ntasks-per-node=1 \
|
||||||
|
# --cpus-per-task=8 \
|
||||||
|
# --kill-on-bad-exit=1 \
|
||||||
|
# python tools/test.py configs/nas/detnas/detnas_subnet_shufflenetv2_8xb128_in1k_2.0_frcnn.py "/mnt/lustre/dongpeijie/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco_bbox_backbone_flops-0.34M_mAP-37.5_20211222-67fea61f_2.0.pth"
|
||||||
|
|
||||||
|
|
||||||
|
# CREAM Test
|
||||||
|
# bash tools/slurm_test.sh mm_model cream_test configs/nas/cream/cream_14_subnet_mobilenet.py '/mnt/lustre/dongpeijie/14_2.0.pth'
|
||||||
|
|
||||||
|
# CREAM Train
|
||||||
|
bash tools/slurm_train.sh mm_model cream_train configs/nas/cream/cream_14_subnet_mobilenet.py
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/usr/bin/env sh
|
||||||
|
|
||||||
|
|
||||||
|
MKL_NUM_THREADS=4
|
||||||
|
OMP_NUM_THREADS=1
|
||||||
|
|
||||||
|
bash tools/slurm_test.sh mm_model spos_test configs/nas/darts/darts_subnet_1xb96_cifar10_2.0.py '/mnt/lustre/dongpeijie/darts_subnetnet_1xb96_cifar10_acc-97.32_20211222-e5727921_2.0.pth'
|
|
@ -0,0 +1,31 @@
|
||||||
|
#!/usr/bin/env sh
|
||||||
|
|
||||||
|
|
||||||
|
MKL_NUM_THREADS=4
|
||||||
|
OMP_NUM_THREADS=1
|
||||||
|
|
||||||
|
# DetNAS train
|
||||||
|
# srun --partition=mm_model \
|
||||||
|
# --job-name=detnas_train \
|
||||||
|
# --gres=gpu:8 \
|
||||||
|
# --ntasks=8 \
|
||||||
|
# --ntasks-per-node=8 \
|
||||||
|
# --cpus-per-task=8 \
|
||||||
|
# --kill-on-bad-exit=1 \
|
||||||
|
# python tools/train.py configs/nas/detnas/detnas_supernet_shufflenetv2_coco_1x_2.0_frcnn.py
|
||||||
|
|
||||||
|
# bash tools/slurm_train.sh mm_model detnas_train configs/nas/detnas/detnas_supernet_shufflenetv2_coco_1x_2.0_frcnn.py /mnt/lustre/dongpeijie/checkpoints/tests/detnas_pretrain_test
|
||||||
|
|
||||||
|
|
||||||
|
# bash tools/slurm_test.sh mm_model detnas_test configs/nas/detnas/detnas_supernet_shufflenetv2_coco_1x_2.0_frcnn.py /mnt/lustre/dongpeijie/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco_bbox_backbone_flops-0.34M_mAP-37.5_20211222-67fea61f_2.0.pth
|
||||||
|
|
||||||
|
# DetNAS test
|
||||||
|
srun --partition=mm_model \
|
||||||
|
--job-name=detnas_test \
|
||||||
|
--gres=gpu:1 \
|
||||||
|
--ntasks=1 \
|
||||||
|
--ntasks-per-node=1 \
|
||||||
|
--cpus-per-task=8 \
|
||||||
|
--kill-on-bad-exit=1 \
|
||||||
|
--quotatype=auto \
|
||||||
|
python tools/test.py configs/nas/detnas/detnas_subnet_shufflenetv2_8xb128_in1k_2.0_frcnn.py "/mnt/lustre/dongpeijie/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco_bbox_backbone_flops-0.34M_mAP-37.5_20211222-67fea61f_2.0.pth" --launcher=slurm
|
|
@ -0,0 +1,51 @@
|
||||||
|
#!/usr/bin/env sh
|
||||||
|
|
||||||
|
|
||||||
|
MKL_NUM_THREADS=4
|
||||||
|
OMP_NUM_THREADS=1
|
||||||
|
|
||||||
|
# train
|
||||||
|
# srun --partition=mm_model \
|
||||||
|
# --job-name=spos_train \
|
||||||
|
# --gres=gpu:8 \
|
||||||
|
# --ntasks=8 \
|
||||||
|
# --ntasks-per-node=8 \
|
||||||
|
# --cpus-per-task=8 \
|
||||||
|
# --kill-on-bad-exit=1 \
|
||||||
|
# python tools/train.py configs/nas/spos/spos_supernet_shufflenetv2_8xb128_in1k_2.0_example.py
|
||||||
|
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_train configs/nas/spos/spos_supernet_shufflenetv2_8xb128_in1k_2.0_example.py /mnt/lustre/dongpeijie/checkpoints/work_dirs/spos_format_output
|
||||||
|
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_retrain configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example.py /mnt/lustre/dongpeijie/checkpoints/work_dirs/spos_retrain_detnas_with_ceph
|
||||||
|
|
||||||
|
# 55% wrong settings of PolyLR
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_retrain_w_cj configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example.py /mnt/lustre/dongpeijie/checkpoints/work_dirs/spos_retrain_detnas_with_ceph
|
||||||
|
|
||||||
|
# fix setting of PolyLR and rerun with colorjittor
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_retrain_w_cj configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example.py /mnt/lustre/dongpeijie/checkpoints/work_dirs/retrain_detnas_spos_with_colorjittor
|
||||||
|
|
||||||
|
# fix setting of PolyLR and rerun w/o colorjittor
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_retrain_wo_cj configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example_wo_colorjittor.py /mnt/lustre/dongpeijie/checkpoints/work_dirs/retrain_detnas_spos_wo_colorjittor
|
||||||
|
|
||||||
|
# fix setting of optimizer decay[wo cj] (paramwise_cfg)
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_retrain_fix_decay_wo_cj configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example_wo_colorjittor.py /mnt/lustre/dongpeijie/checkpoints/work_dirs/retrain_detnas_spos_retrain_fix_decay_wo_cj
|
||||||
|
|
||||||
|
# fix setting of optimizer decay[with cj] (paramwise_cfg)
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_retrain_fix_decay_w_cj configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example.py /mnt/lustre/dongpeijie/checkpoints/work_dirs/retrain_detnas_spos_retrain_fix_decay_w_cj
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# SPOS test
|
||||||
|
# srun --partition=mm_model \
|
||||||
|
# --job-name=spos_test \
|
||||||
|
# --gres=gpu:1 \
|
||||||
|
# --ntasks=1 \
|
||||||
|
# --ntasks-per-node=1 \
|
||||||
|
# --cpus-per-task=8 \
|
||||||
|
# --kill-on-bad-exit=1 \
|
||||||
|
# python tools/test.py configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example.py "/mnt/lustre/dongpeijie/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_2.0.pth"
|
||||||
|
|
||||||
|
|
||||||
|
bash tools/slurm_test.sh mm_model spos_test configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example.py '/mnt/lustre/dongpeijie/detnas_subnet_shufflenetv2_8xb128_in1k_acc-74.08_20211223-92e9b66a_2.0.pth'
|
||||||
|
|
||||||
|
# bash tools/slurm_train.sh mm_model spos_retrain configs/nas/spos/spos_subnet_shufflenetv2_8xb128_in1k_2.0_example.py /mnt/lustre/dongpeijie/checkpoints/work_dirs/spos_retrain_detnas_spos
|
|
@ -0,0 +1,76 @@
|
||||||
|
se_cfg = dict(
|
||||||
|
ratio=4,
|
||||||
|
divisor=1,
|
||||||
|
act_cfg=(dict(type='HSwish'),
|
||||||
|
dict(
|
||||||
|
type='HSigmoid', bias=3, divisor=6, min_value=0,
|
||||||
|
max_value=1)))
|
||||||
|
|
||||||
|
_FIRST_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
_OTHER_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k3e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 4 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, stride, mutable cfg.
|
||||||
|
[16, 1, 1, _FIRST_STAGE_MUTABLE],
|
||||||
|
[24, 1, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[40, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[80, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[96, 3, 1, _OTHER_STAGE_MUTABLE],
|
||||||
|
[192, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[320, 1, 1, _OTHER_STAGE_MUTABLE]
|
||||||
|
]
|
|
@ -0,0 +1,76 @@
|
||||||
|
se_cfg = dict(
|
||||||
|
ratio=4,
|
||||||
|
divisor=1,
|
||||||
|
act_cfg=(dict(type='HSwish'),
|
||||||
|
dict(
|
||||||
|
type='HSigmoid', bias=3, divisor=6, min_value=0,
|
||||||
|
max_value=1)))
|
||||||
|
|
||||||
|
_FIRST_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
_OTHER_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k3e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 4 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, stride, mutable cfg.
|
||||||
|
[16, 1, 1, _FIRST_STAGE_MUTABLE],
|
||||||
|
[24, 1, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[40, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[80, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[96, 1, 1, _OTHER_STAGE_MUTABLE],
|
||||||
|
[192, 1, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[320, 1, 1, _OTHER_STAGE_MUTABLE]
|
||||||
|
]
|
|
@ -0,0 +1,76 @@
|
||||||
|
se_cfg = dict(
|
||||||
|
ratio=4,
|
||||||
|
divisor=1,
|
||||||
|
act_cfg=(dict(type='HSwish'),
|
||||||
|
dict(
|
||||||
|
type='HSigmoid', bias=3, divisor=6, min_value=0,
|
||||||
|
max_value=1)))
|
||||||
|
|
||||||
|
_FIRST_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
_OTHER_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k3e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 4 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, stride, mutable cfg.
|
||||||
|
[16, 1, 1, _FIRST_STAGE_MUTABLE],
|
||||||
|
[24, 1, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[40, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[80, 3, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[96, 4, 1, _OTHER_STAGE_MUTABLE],
|
||||||
|
[192, 3, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[320, 1, 1, _OTHER_STAGE_MUTABLE]
|
||||||
|
]
|
|
@ -0,0 +1,76 @@
|
||||||
|
se_cfg = dict(
|
||||||
|
ratio=4,
|
||||||
|
divisor=1,
|
||||||
|
act_cfg=(dict(type='HSwish'),
|
||||||
|
dict(
|
||||||
|
type='HSigmoid', bias=3, divisor=6, min_value=0,
|
||||||
|
max_value=1)))
|
||||||
|
|
||||||
|
_FIRST_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
_OTHER_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k3e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 4 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, stride, mutable cfg.
|
||||||
|
[16, 1, 1, _FIRST_STAGE_MUTABLE],
|
||||||
|
[24, 1, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[40, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[80, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[96, 3, 1, _OTHER_STAGE_MUTABLE],
|
||||||
|
[192, 2, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[320, 1, 1, _OTHER_STAGE_MUTABLE]
|
||||||
|
]
|
|
@ -0,0 +1,76 @@
|
||||||
|
se_cfg = dict(
|
||||||
|
ratio=4,
|
||||||
|
divisor=1,
|
||||||
|
act_cfg=(dict(type='HSwish'),
|
||||||
|
dict(
|
||||||
|
type='HSigmoid', bias=3, divisor=6, min_value=0,
|
||||||
|
max_value=1)))
|
||||||
|
|
||||||
|
_FIRST_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
_OTHER_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k3e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 4 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, stride, mutable cfg.
|
||||||
|
[16, 1, 1, _FIRST_STAGE_MUTABLE],
|
||||||
|
[24, 4, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[40, 4, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[80, 5, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[96, 4, 1, _OTHER_STAGE_MUTABLE],
|
||||||
|
[192, 4, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[320, 1, 1, _OTHER_STAGE_MUTABLE]
|
||||||
|
]
|
|
@ -0,0 +1,76 @@
|
||||||
|
se_cfg = dict(
|
||||||
|
ratio=4,
|
||||||
|
divisor=1,
|
||||||
|
act_cfg=(dict(type='HSwish'),
|
||||||
|
dict(
|
||||||
|
type='HSigmoid', bias=3, divisor=6, min_value=0,
|
||||||
|
max_value=1)))
|
||||||
|
|
||||||
|
_FIRST_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
_OTHER_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k3e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k5e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish')),
|
||||||
|
mb_k7e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='HSwish'))))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 4 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, stride, mutable cfg.
|
||||||
|
[16, 1, 1, _FIRST_STAGE_MUTABLE],
|
||||||
|
[24, 5, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[40, 5, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[80, 5, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[96, 6, 1, _OTHER_STAGE_MUTABLE],
|
||||||
|
[192, 6, 2, _OTHER_STAGE_MUTABLE],
|
||||||
|
[320, 1, 1, _OTHER_STAGE_MUTABLE]
|
||||||
|
]
|
|
@ -0,0 +1,11 @@
|
||||||
|
modules:
|
||||||
|
backbone.layer1.0: depthsepconv
|
||||||
|
backbone.layer2.0: mb_k3e4_se
|
||||||
|
backbone.layer3.0: mb_k5e6_se
|
||||||
|
backbone.layer3.1: mb_k5e6_se
|
||||||
|
backbone.layer4.0: mb_k5e6_se
|
||||||
|
backbone.layer4.1: mb_k5e6_se
|
||||||
|
backbone.layer5.0: mb_k3e6_se
|
||||||
|
backbone.layer6.0: mb_k5e6_se
|
||||||
|
backbone.layer7.0: convbnact
|
||||||
|
channels:
|
|
@ -0,0 +1,8 @@
|
||||||
|
_base_ = ['./cream_14_supernet_mobilenet.py']
|
||||||
|
|
||||||
|
# FIXME: you may replace this with the mutable_cfg searched by yourself
|
||||||
|
fix_subnet = 'configs/nas/cream/CREAM_14_MOBILENET_IN1k_2.0.yaml' # noqa: E501
|
||||||
|
|
||||||
|
model = dict(fix_subnet=fix_subnet)
|
||||||
|
|
||||||
|
find_unused_parameters = False
|
|
@ -0,0 +1,241 @@
|
||||||
|
# dataset settings
|
||||||
|
dataset_type = 'ImageNet'
|
||||||
|
|
||||||
|
preprocess_cfg = dict(
|
||||||
|
# RGB format normalization parameters
|
||||||
|
mean=[123.675, 116.28, 103.53],
|
||||||
|
std=[58.395, 57.12, 57.375],
|
||||||
|
# convert image from BGR to RGB
|
||||||
|
to_rgb=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# file_client_args = dict(
|
||||||
|
# backend='petrel',
|
||||||
|
# path_mapping=dict({
|
||||||
|
# './data/imagenet': 's3://openmmlab/datasets/classification/imagenet',
|
||||||
|
# 'data/imagenet': 's3://openmmlab/datasets/classification/imagenet'
|
||||||
|
# }))
|
||||||
|
|
||||||
|
train_pipeline = [
|
||||||
|
dict(type='LoadImageFromFile'),
|
||||||
|
dict(type='RandomResizedCrop', scale=224),
|
||||||
|
dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
|
||||||
|
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
test_pipeline = [
|
||||||
|
dict(type='LoadImageFromFile'),
|
||||||
|
dict(
|
||||||
|
type='ResizeEdge',
|
||||||
|
scale=73,
|
||||||
|
edge='short',
|
||||||
|
backend='pillow',
|
||||||
|
interpolation='bicubic'),
|
||||||
|
dict(type='CenterCrop', crop_size=64),
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
train_dataloader = dict(
|
||||||
|
batch_size=128,
|
||||||
|
num_workers=5,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_root='/mnt/cache/share/images',
|
||||||
|
ann_file='meta/train.txt',
|
||||||
|
data_prefix='train',
|
||||||
|
pipeline=train_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=True),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# /mnt/lustre/share_data/wangjiaqi/data/imagenet',
|
||||||
|
|
||||||
|
val_dataloader = dict(
|
||||||
|
batch_size=128,
|
||||||
|
num_workers=5,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_root='/mnt/cache/share/images',
|
||||||
|
ann_file='meta/val.txt',
|
||||||
|
data_prefix='val',
|
||||||
|
pipeline=test_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=False),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
val_evaluator = dict(type='Accuracy', topk=(1, 5))
|
||||||
|
|
||||||
|
# If you want standard test, please manually configure the test dataset
|
||||||
|
test_dataloader = val_dataloader
|
||||||
|
test_evaluator = val_evaluator
|
||||||
|
|
||||||
|
# scheduler
|
||||||
|
|
||||||
|
# optimizer
|
||||||
|
optim_wrapper = dict(
|
||||||
|
optimizer=dict(type='SGD', lr=0.5, momentum=0.9, weight_decay=4e-5),
|
||||||
|
clip_grad=None)
|
||||||
|
|
||||||
|
# leanring policy
|
||||||
|
param_scheduler = [
|
||||||
|
dict(type='PolyLR', power=1.0, eta_min=0.0, by_epoch=False),
|
||||||
|
]
|
||||||
|
|
||||||
|
# train, val, test setting
|
||||||
|
train_cfg = dict(by_epoch=False, max_iters=300000)
|
||||||
|
val_cfg = dict()
|
||||||
|
test_cfg = dict()
|
||||||
|
|
||||||
|
# runtime
|
||||||
|
|
||||||
|
# defaults to use registries in mmrazor
|
||||||
|
default_scope = 'mmcls'
|
||||||
|
|
||||||
|
# configure default hooks
|
||||||
|
default_hooks = dict(
|
||||||
|
timer=dict(type='IterTimerHook'),
|
||||||
|
logger=dict(type='LoggerHook', interval=100),
|
||||||
|
param_scheduler=dict(type='ParamSchedulerHook'),
|
||||||
|
checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=1000),
|
||||||
|
sampler_seed=dict(type='DistSamplerSeedHook'),
|
||||||
|
visualization=dict(type='VisualizationHook', enable=False),
|
||||||
|
)
|
||||||
|
|
||||||
|
# configure environment
|
||||||
|
env_cfg = dict(
|
||||||
|
cudnn_benchmark=False,
|
||||||
|
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
|
||||||
|
dist_cfg=dict(backend='nccl'),
|
||||||
|
)
|
||||||
|
|
||||||
|
# set visualizer
|
||||||
|
vis_backends = [dict(type='LocalVisBackend')]
|
||||||
|
visualizer = dict(
|
||||||
|
type='ClsVisualizer', vis_backends=vis_backends, name='visualizer')
|
||||||
|
|
||||||
|
# set log level
|
||||||
|
log_level = 'INFO'
|
||||||
|
|
||||||
|
# load from which checkpoint
|
||||||
|
load_from = None
|
||||||
|
|
||||||
|
# whether to resume training from the loaded checkpoint
|
||||||
|
resume = False
|
||||||
|
|
||||||
|
se_cfg = dict(
|
||||||
|
ratio=4,
|
||||||
|
divisor=8,
|
||||||
|
act_cfg=(dict(type='ReLU'),
|
||||||
|
dict(
|
||||||
|
type='HSigmoid', bias=3, divisor=6, min_value=0,
|
||||||
|
max_value=1)))
|
||||||
|
|
||||||
|
_FIRST_STAGE_MUTABLE = dict( # DepthwiseSep
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
depthsepconv=dict(
|
||||||
|
type='DepthwiseSeparableConv',
|
||||||
|
dw_kernel_size=3,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='Swish'))))
|
||||||
|
|
||||||
|
_MIDDLE_STAGE_MUTABLE = dict(
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='Swish')),
|
||||||
|
mb_k3e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='Swish')),
|
||||||
|
mb_k5e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='Swish')),
|
||||||
|
mb_k5e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='Swish')),
|
||||||
|
mb_k7e4_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=4,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='Swish')),
|
||||||
|
mb_k7e6_se=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=6,
|
||||||
|
se_cfg=se_cfg,
|
||||||
|
norm_cfg=dict(type='BN'),
|
||||||
|
act_cfg=dict(type='Swish'))))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 4 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, stride, mutable cfg.
|
||||||
|
[16, 1, 1, _FIRST_STAGE_MUTABLE],
|
||||||
|
[24, 1, 2, _MIDDLE_STAGE_MUTABLE],
|
||||||
|
[40, 2, 2, _MIDDLE_STAGE_MUTABLE],
|
||||||
|
[80, 2, 2, _MIDDLE_STAGE_MUTABLE],
|
||||||
|
[96, 1, 1, _MIDDLE_STAGE_MUTABLE],
|
||||||
|
[192, 1, 2, _MIDDLE_STAGE_MUTABLE],
|
||||||
|
]
|
||||||
|
|
||||||
|
norm_cfg = dict(type='BN')
|
||||||
|
supernet = dict(
|
||||||
|
_scope_='mmcls',
|
||||||
|
type='ImageClassifier',
|
||||||
|
data_preprocessor=preprocess_cfg,
|
||||||
|
backbone=dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='SearchableMobileNet',
|
||||||
|
arch_setting=arch_setting,
|
||||||
|
first_channels=16,
|
||||||
|
last_channels=320,
|
||||||
|
widen_factor=1.0,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=dict(type='Swish'),
|
||||||
|
out_indices=(6, ),
|
||||||
|
),
|
||||||
|
neck=dict(type='GlobalAveragePooling'),
|
||||||
|
head=dict(
|
||||||
|
type='mmrazor.CreamClsHead',
|
||||||
|
num_classes=1000,
|
||||||
|
in_channels=320,
|
||||||
|
num_features=1280,
|
||||||
|
act_cfg=dict(type='Swish'),
|
||||||
|
loss=dict(
|
||||||
|
type='LabelSmoothLoss',
|
||||||
|
num_classes=1000,
|
||||||
|
label_smooth_val=0.1,
|
||||||
|
mode='original',
|
||||||
|
loss_weight=1.0),
|
||||||
|
topk=(1, 5),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
mutator = dict(type='mmrazor.OneShotModuleMutator')
|
||||||
|
|
||||||
|
model = dict(
|
||||||
|
type='mmrazor.SPOS',
|
||||||
|
architecture=supernet,
|
||||||
|
mutator=mutator,
|
||||||
|
)
|
||||||
|
|
||||||
|
find_unused_parameters = True
|
|
@ -1,116 +0,0 @@
|
||||||
normal_n2:
|
|
||||||
chosen:
|
|
||||||
- normal_n2_p1
|
|
||||||
- normal_n2_p0
|
|
||||||
normal_n3:
|
|
||||||
chosen:
|
|
||||||
- normal_n3_p0
|
|
||||||
- normal_n3_p1
|
|
||||||
normal_n4:
|
|
||||||
chosen:
|
|
||||||
- normal_n4_p0
|
|
||||||
- normal_n4_p1
|
|
||||||
normal_n5:
|
|
||||||
chosen:
|
|
||||||
- normal_n5_p2
|
|
||||||
- normal_n5_p0
|
|
||||||
reduce_n2:
|
|
||||||
chosen:
|
|
||||||
- reduce_n2_p0
|
|
||||||
- reduce_n2_p1
|
|
||||||
reduce_n3:
|
|
||||||
chosen:
|
|
||||||
- reduce_n3_p1
|
|
||||||
- reduce_n3_p2
|
|
||||||
reduce_n4:
|
|
||||||
chosen:
|
|
||||||
- reduce_n4_p2
|
|
||||||
- reduce_n4_p0
|
|
||||||
reduce_n5:
|
|
||||||
chosen:
|
|
||||||
- reduce_n5_p1
|
|
||||||
- reduce_n5_p2
|
|
||||||
normal_n2_p0:
|
|
||||||
chosen:
|
|
||||||
- sep_conv_3x3
|
|
||||||
normal_n2_p1:
|
|
||||||
chosen:
|
|
||||||
- sep_conv_3x3
|
|
||||||
normal_n3_p0:
|
|
||||||
chosen:
|
|
||||||
- sep_conv_3x3
|
|
||||||
normal_n3_p1:
|
|
||||||
chosen:
|
|
||||||
- sep_conv_3x3
|
|
||||||
normal_n3_p2:
|
|
||||||
chosen:
|
|
||||||
- sep_conv_3x3
|
|
||||||
normal_n4_p0:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
normal_n4_p1:
|
|
||||||
chosen:
|
|
||||||
- sep_conv_3x3
|
|
||||||
normal_n4_p2:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
normal_n4_p3:
|
|
||||||
chosen:
|
|
||||||
- sep_conv_3x3
|
|
||||||
normal_n5_p0:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
normal_n5_p1:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
normal_n5_p2:
|
|
||||||
chosen:
|
|
||||||
- dil_conv_3x3
|
|
||||||
normal_n5_p3:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
normal_n5_p4:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
reduce_n2_p0:
|
|
||||||
chosen:
|
|
||||||
- max_pool_3x3
|
|
||||||
reduce_n2_p1:
|
|
||||||
chosen:
|
|
||||||
- max_pool_3x3
|
|
||||||
reduce_n3_p0:
|
|
||||||
chosen:
|
|
||||||
- max_pool_3x3
|
|
||||||
reduce_n3_p1:
|
|
||||||
chosen:
|
|
||||||
- max_pool_3x3
|
|
||||||
reduce_n3_p2:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
reduce_n4_p0:
|
|
||||||
chosen:
|
|
||||||
- max_pool_3x3
|
|
||||||
reduce_n4_p1:
|
|
||||||
chosen:
|
|
||||||
- max_pool_3x3
|
|
||||||
reduce_n4_p2:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
reduce_n4_p3:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
reduce_n5_p0:
|
|
||||||
chosen:
|
|
||||||
- max_pool_3x3
|
|
||||||
reduce_n5_p1:
|
|
||||||
chosen:
|
|
||||||
- max_pool_3x3
|
|
||||||
reduce_n5_p2:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
reduce_n5_p3:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
||||||
reduce_n5_p4:
|
|
||||||
chosen:
|
|
||||||
- skip_connect
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
modules:
|
||||||
|
normal_n2:
|
||||||
|
- normal_n2_p0
|
||||||
|
- normal_n2_p1
|
||||||
|
normal_n2_p0:
|
||||||
|
- sep_conv_3x3
|
||||||
|
normal_n2_p1:
|
||||||
|
- sep_conv_3x3
|
||||||
|
normal_n3:
|
||||||
|
- normal_n3_p0
|
||||||
|
- normal_n3_p1
|
||||||
|
normal_n3_p0:
|
||||||
|
- skip_connect
|
||||||
|
normal_n3_p1:
|
||||||
|
- sep_conv_5x5
|
||||||
|
normal_n4:
|
||||||
|
- normal_n4_p0
|
||||||
|
- normal_n4_p1
|
||||||
|
normal_n4_p0:
|
||||||
|
- sep_conv_3x3
|
||||||
|
normal_n4_p1:
|
||||||
|
- skip_connect
|
||||||
|
normal_n5:
|
||||||
|
- normal_n5_p0
|
||||||
|
- normal_n5_p1
|
||||||
|
normal_n5_p0:
|
||||||
|
- skip_connect
|
||||||
|
normal_n5_p1:
|
||||||
|
- skip_connect
|
||||||
|
reduce_n2:
|
||||||
|
- reduce_n2_p0
|
||||||
|
- reduce_n2_p1
|
||||||
|
reduce_n2_p0:
|
||||||
|
- max_pool_3x3
|
||||||
|
reduce_n2_p1:
|
||||||
|
- sep_conv_3x3
|
||||||
|
reduce_n3:
|
||||||
|
- reduce_n3_p0
|
||||||
|
- reduce_n3_p2
|
||||||
|
reduce_n3_p0:
|
||||||
|
- max_pool_3x3
|
||||||
|
reduce_n3_p2:
|
||||||
|
- dil_conv_5x5
|
||||||
|
reduce_n4:
|
||||||
|
- reduce_n4_p0
|
||||||
|
- reduce_n4_p2
|
||||||
|
reduce_n4_p0:
|
||||||
|
- max_pool_3x3
|
||||||
|
reduce_n4_p2:
|
||||||
|
- skip_connect
|
||||||
|
reduce_n5:
|
||||||
|
- reduce_n5_p0
|
||||||
|
- reduce_n5_p2
|
||||||
|
reduce_n5_p0:
|
||||||
|
- max_pool_3x3
|
||||||
|
reduce_n5_p2:
|
||||||
|
- skip_connect
|
||||||
|
channels:
|
|
@ -0,0 +1,196 @@
|
||||||
|
# dataset settings
|
||||||
|
dataset_type = 'CIFAR10'
|
||||||
|
preprocess_cfg = dict(
|
||||||
|
# RGB format normalization parameters
|
||||||
|
mean=[125.307, 122.961, 113.8575],
|
||||||
|
std=[51.5865, 50.847, 51.255],
|
||||||
|
# loaded images are already RGB format
|
||||||
|
to_rgb=False)
|
||||||
|
|
||||||
|
train_pipeline = [
|
||||||
|
dict(type='RandomCrop', crop_size=32, padding=4),
|
||||||
|
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
dict(
|
||||||
|
type='Cutout',
|
||||||
|
magnitude_key='shape',
|
||||||
|
magnitude_range=(1, 16),
|
||||||
|
pad_val=0,
|
||||||
|
prob=0.5),
|
||||||
|
]
|
||||||
|
|
||||||
|
test_pipeline = [
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
train_dataloader = dict(
|
||||||
|
batch_size=96,
|
||||||
|
num_workers=2,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_prefix='/mnt/cache/share_data/dongpeijie/data/cifar10',
|
||||||
|
test_mode=False,
|
||||||
|
pipeline=train_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=True),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
val_dataloader = dict(
|
||||||
|
batch_size=16,
|
||||||
|
num_workers=2,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_prefix='/mnt/cache/share_data/dongpeijie/data/cifar10/',
|
||||||
|
test_mode=True,
|
||||||
|
pipeline=test_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=False),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
val_evaluator = dict(type='Accuracy', topk=(1, ))
|
||||||
|
|
||||||
|
test_dataloader = val_dataloader
|
||||||
|
test_evaluator = val_evaluator
|
||||||
|
|
||||||
|
# optimizer
|
||||||
|
optim_wrapper = dict(
|
||||||
|
architecture=dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=3e-4),
|
||||||
|
mutator=dict(type='Adam', lr=3e-4, weight_decay=1e-3),
|
||||||
|
clip_grad=dict(max_norm=5, norm_type=2))
|
||||||
|
|
||||||
|
# leanring policy
|
||||||
|
param_scheduler = [
|
||||||
|
dict(
|
||||||
|
type='CosineAnnealingLR',
|
||||||
|
T_max=600,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=0,
|
||||||
|
end=600,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# train, val, test setting
|
||||||
|
train_cfg = dict(by_epoch=True, max_epochs=600)
|
||||||
|
val_cfg = dict(interval=1) # validate each epoch
|
||||||
|
test_cfg = dict()
|
||||||
|
|
||||||
|
# defaults to use registries in mmcls
|
||||||
|
default_scope = 'mmcls'
|
||||||
|
|
||||||
|
# configure default hooks
|
||||||
|
default_hooks = dict(
|
||||||
|
timer=dict(type='IterTimerHook'),
|
||||||
|
logger=dict(type='LoggerHook', interval=100),
|
||||||
|
param_scheduler=dict(type='ParamSchedulerHook'),
|
||||||
|
checkpoint=dict(
|
||||||
|
type='CheckpointHook', interval=1, save_last=True, max_keep_ckpts=3),
|
||||||
|
sampler_seed=dict(type='DistSamplerSeedHook'),
|
||||||
|
visualization=dict(type='VisualizationHook', enable=False),
|
||||||
|
)
|
||||||
|
|
||||||
|
# configure environment
|
||||||
|
env_cfg = dict(
|
||||||
|
cudnn_benchmark=False,
|
||||||
|
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
|
||||||
|
dist_cfg=dict(backend='nccl'),
|
||||||
|
)
|
||||||
|
|
||||||
|
# set visualizer
|
||||||
|
visualizer = None
|
||||||
|
|
||||||
|
# set log level
|
||||||
|
log_level = 'INFO'
|
||||||
|
|
||||||
|
# load from which checkpoint
|
||||||
|
load_from = None
|
||||||
|
|
||||||
|
# whether to resume training from the loaded checkpoint
|
||||||
|
resume = False
|
||||||
|
|
||||||
|
# model
|
||||||
|
norm_cfg = dict(type='BN', affine=True)
|
||||||
|
mutable_cfg = dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='mmrazor.DiffMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
zero=dict(type='mmrazor.DartsZero'),
|
||||||
|
skip_connect=dict(
|
||||||
|
type='mmrazor.DartsSkipConnect',
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
use_drop_path=True),
|
||||||
|
max_pool_3x3=dict(
|
||||||
|
type='mmrazor.DartsPoolBN',
|
||||||
|
pool_type='max',
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
use_drop_path=True),
|
||||||
|
avg_pool_3x3=dict(
|
||||||
|
type='mmrazor.DartsPoolBN',
|
||||||
|
pool_type='avg',
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
use_drop_path=True),
|
||||||
|
sep_conv_3x3=dict(
|
||||||
|
type='mmrazor.DartsSepConv',
|
||||||
|
kernel_size=3,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
use_drop_path=True),
|
||||||
|
sep_conv_5x5=dict(
|
||||||
|
type='mmrazor.DartsSepConv',
|
||||||
|
kernel_size=5,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
use_drop_path=True),
|
||||||
|
dil_conv_3x3=dict(
|
||||||
|
type='mmrazor.DartsDilConv',
|
||||||
|
kernel_size=3,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
use_drop_path=True),
|
||||||
|
dil_conv_5x5=dict(
|
||||||
|
type='mmrazor.DartsDilConv',
|
||||||
|
kernel_size=5,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
use_drop_path=True),
|
||||||
|
))
|
||||||
|
|
||||||
|
route_cfg = dict(
|
||||||
|
type='mmrazor.DiffChoiceRoute',
|
||||||
|
with_arch_param=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
supernet = dict(
|
||||||
|
type='mmcls.ImageClassifier',
|
||||||
|
data_preprocessor=preprocess_cfg,
|
||||||
|
backbone=dict(
|
||||||
|
type='mmrazor.DartsBackbone',
|
||||||
|
in_channels=3,
|
||||||
|
base_channels=36,
|
||||||
|
num_layers=20,
|
||||||
|
num_nodes=4,
|
||||||
|
stem_multiplier=3,
|
||||||
|
auxliary=True,
|
||||||
|
aux_channels=128,
|
||||||
|
aux_out_channels=768,
|
||||||
|
out_indices=(19, ),
|
||||||
|
mutable_cfg=mutable_cfg,
|
||||||
|
route_cfg=route_cfg),
|
||||||
|
neck=dict(type='mmcls.GlobalAveragePooling'),
|
||||||
|
head=dict(
|
||||||
|
type='mmrazor.DartsSubnetClsHead',
|
||||||
|
num_classes=10,
|
||||||
|
in_channels=576,
|
||||||
|
aux_in_channels=768,
|
||||||
|
loss=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0),
|
||||||
|
aux_loss=dict(type='mmcls.CrossEntropyLoss', loss_weight=0.4),
|
||||||
|
topk=(1, 5),
|
||||||
|
cal_acc=True),
|
||||||
|
)
|
||||||
|
|
||||||
|
mutator = dict(type='mmrazor.DiffModuleMutator')
|
||||||
|
|
||||||
|
fix_subnet = 'configs/nas/darts/DARTS_SUBNET_CIFAR_PAPER_ALIAS.yaml'
|
||||||
|
|
||||||
|
model = dict(
|
||||||
|
type='mmrazor.SPOS',
|
||||||
|
architecture=supernet,
|
||||||
|
mutator=mutator,
|
||||||
|
fix_subnet=fix_subnet,
|
||||||
|
)
|
||||||
|
|
||||||
|
find_unused_parameter = False
|
|
@ -0,0 +1,163 @@
|
||||||
|
# dataset settings
|
||||||
|
dataset_type = 'CIFAR10'
|
||||||
|
preprocess_cfg = dict(
|
||||||
|
# RGB format normalization parameters
|
||||||
|
mean=[125.307, 122.961, 113.8575],
|
||||||
|
std=[51.5865, 50.847, 51.255],
|
||||||
|
# loaded images are already RGB format
|
||||||
|
to_rgb=False)
|
||||||
|
|
||||||
|
train_pipeline = [
|
||||||
|
dict(type='RandomCrop', crop_size=32, padding=4),
|
||||||
|
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
test_pipeline = [
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
train_dataloader = dict(
|
||||||
|
batch_size=16,
|
||||||
|
num_workers=2,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_prefix='/mnt/cache/share_data/dongpeijie/data/cifar10',
|
||||||
|
test_mode=False,
|
||||||
|
pipeline=train_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=True),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
val_dataloader = dict(
|
||||||
|
batch_size=16,
|
||||||
|
num_workers=2,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_prefix='/mnt/cache/share_data/dongpeijie/data/cifar10/',
|
||||||
|
test_mode=True,
|
||||||
|
pipeline=test_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=False),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
val_evaluator = dict(type='Accuracy', topk=(1, ))
|
||||||
|
|
||||||
|
test_dataloader = val_dataloader
|
||||||
|
test_evaluator = val_evaluator
|
||||||
|
|
||||||
|
# optimizer
|
||||||
|
optim_wrapper = dict(
|
||||||
|
architecture=dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=3e-4),
|
||||||
|
mutator=dict(type='Adam', lr=3e-4, weight_decay=1e-3),
|
||||||
|
clip_grad=None)
|
||||||
|
|
||||||
|
# leanring policy
|
||||||
|
param_scheduler = [
|
||||||
|
dict(
|
||||||
|
type='CosineAnnealingLR',
|
||||||
|
T_max=50,
|
||||||
|
by_epoch=True,
|
||||||
|
min_lr=1e-3,
|
||||||
|
begin=0,
|
||||||
|
end=50,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
# train, val, test setting
|
||||||
|
train_cfg = dict(by_epoch=True, max_epochs=50)
|
||||||
|
val_cfg = dict(interval=1) # validate each epoch
|
||||||
|
test_cfg = dict()
|
||||||
|
|
||||||
|
# defaults to use registries in mmcls
|
||||||
|
default_scope = 'mmcls'
|
||||||
|
|
||||||
|
# configure default hooks
|
||||||
|
default_hooks = dict(
|
||||||
|
timer=dict(type='IterTimerHook'),
|
||||||
|
logger=dict(type='LoggerHook', interval=100),
|
||||||
|
param_scheduler=dict(type='ParamSchedulerHook'),
|
||||||
|
checkpoint=dict(
|
||||||
|
type='CheckpointHook', interval=1, save_last=True, max_keep_ckpts=3),
|
||||||
|
sampler_seed=dict(type='DistSamplerSeedHook'),
|
||||||
|
visualization=dict(type='VisualizationHook', enable=False),
|
||||||
|
)
|
||||||
|
|
||||||
|
# configure environment
|
||||||
|
env_cfg = dict(
|
||||||
|
cudnn_benchmark=False,
|
||||||
|
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
|
||||||
|
dist_cfg=dict(backend='nccl'),
|
||||||
|
)
|
||||||
|
|
||||||
|
# set visualizer
|
||||||
|
visualizer = None
|
||||||
|
|
||||||
|
# set log level
|
||||||
|
log_level = 'INFO'
|
||||||
|
|
||||||
|
# load from which checkpoint
|
||||||
|
load_from = None
|
||||||
|
|
||||||
|
# whether to resume training from the loaded checkpoint
|
||||||
|
resume = False
|
||||||
|
|
||||||
|
# model
|
||||||
|
norm_cfg = dict(type='BN', affine=False)
|
||||||
|
mutable_cfg = dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='mmrazor.DiffMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
zero=dict(type='mmrazor.DartsZero'),
|
||||||
|
skip_connect=dict(type='mmrazor.DartsSkipConnect', norm_cfg=norm_cfg),
|
||||||
|
max_pool_3x3=dict(
|
||||||
|
type='mmrazor.DartsPoolBN', pool_type='max', norm_cfg=norm_cfg),
|
||||||
|
avg_pool_3x3=dict(
|
||||||
|
type='mmrazor.DartsPoolBN', pool_type='avg', norm_cfg=norm_cfg),
|
||||||
|
sep_conv_3x3=dict(
|
||||||
|
type='mmrazor.DartsSepConv', kernel_size=3, norm_cfg=norm_cfg),
|
||||||
|
sep_conv_5x5=dict(
|
||||||
|
type='mmrazor.DartsSepConv', kernel_size=5, norm_cfg=norm_cfg),
|
||||||
|
dil_conv_3x3=dict(
|
||||||
|
type='mmrazor.DartsDilConv', kernel_size=3, norm_cfg=norm_cfg),
|
||||||
|
dil_conv_5x5=dict(
|
||||||
|
type='mmrazor.DartsDilConv', kernel_size=5, norm_cfg=norm_cfg),
|
||||||
|
))
|
||||||
|
|
||||||
|
route_cfg = dict(
|
||||||
|
type='mmrazor.DiffChoiceRoute',
|
||||||
|
with_arch_param=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
supernet = dict(
|
||||||
|
type='mmcls.ImageClassifier',
|
||||||
|
backbone=dict(
|
||||||
|
type='mmrazor.DartsBackbone',
|
||||||
|
in_channels=3,
|
||||||
|
base_channels=36,
|
||||||
|
num_layers=20,
|
||||||
|
num_nodes=4,
|
||||||
|
stem_multiplier=3,
|
||||||
|
auxliary=False,
|
||||||
|
out_indices=(19, ),
|
||||||
|
mutable_cfg=mutable_cfg,
|
||||||
|
route_cfg=route_cfg),
|
||||||
|
neck=dict(type='mmcls.GlobalAveragePooling'),
|
||||||
|
head=dict(
|
||||||
|
type='mmrazor.DartsSubnetClsHead',
|
||||||
|
num_classes=10,
|
||||||
|
in_channels=576,
|
||||||
|
aux_in_channels=768,
|
||||||
|
loss=dict(type='mmcls.CrossEntropyLoss', loss_weight=1.0),
|
||||||
|
aux_loss=dict(type='mmcls.CrossEntropyLoss', loss_weight=0.4),
|
||||||
|
topk=(1, 5),
|
||||||
|
cal_acc=True),
|
||||||
|
)
|
||||||
|
|
||||||
|
mutator = dict(type='mmrazor.DiffModuleMutator')
|
||||||
|
|
||||||
|
model = dict(
|
||||||
|
type='mmrazor.SPOS',
|
||||||
|
architecture=supernet,
|
||||||
|
mutator=mutator,
|
||||||
|
)
|
||||||
|
|
||||||
|
find_unused_parameter = True
|
|
@ -1,60 +0,0 @@
|
||||||
stage_0_block_0:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_0_block_1:
|
|
||||||
chosen:
|
|
||||||
- shuffle_5x5
|
|
||||||
stage_0_block_2:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_0_block_3:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_1_block_0:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_1_block_1:
|
|
||||||
chosen:
|
|
||||||
- shuffle_5x5
|
|
||||||
stage_1_block_2:
|
|
||||||
chosen:
|
|
||||||
- shuffle_5x5
|
|
||||||
stage_1_block_3:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_2_block_0:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_2_block_1:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_2_block_2:
|
|
||||||
chosen:
|
|
||||||
- shuffle_5x5
|
|
||||||
stage_2_block_3:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_2_block_4:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_2_block_5:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_2_block_6:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_2_block_7:
|
|
||||||
chosen:
|
|
||||||
- shuffle_5x5
|
|
||||||
stage_3_block_0:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_3_block_1:
|
|
||||||
chosen:
|
|
||||||
- shuffle_5x5
|
|
||||||
stage_3_block_2:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_3_block_3:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
modules:
|
||||||
|
backbone.layers.0.0: shuffle_5x5
|
||||||
|
backbone.layers.0.1: shuffle_3x3
|
||||||
|
backbone.layers.0.2: shuffle_3x3
|
||||||
|
backbone.layers.0.3: shuffle_3x3
|
||||||
|
backbone.layers.1.0: shuffle_xception
|
||||||
|
backbone.layers.1.1: shuffle_3x3
|
||||||
|
backbone.layers.1.2: shuffle_xception
|
||||||
|
backbone.layers.1.3: shuffle_7x7
|
||||||
|
backbone.layers.2.0: shuffle_7x7
|
||||||
|
backbone.layers.2.1: shuffle_7x7
|
||||||
|
backbone.layers.2.2: shuffle_xception
|
||||||
|
backbone.layers.2.3: shuffle_xception
|
||||||
|
backbone.layers.2.4: shuffle_3x3
|
||||||
|
backbone.layers.2.5: shuffle_7x7
|
||||||
|
backbone.layers.2.6: shuffle_5x5
|
||||||
|
backbone.layers.2.7: shuffle_xception
|
||||||
|
backbone.layers.3.0: shuffle_7x7
|
||||||
|
backbone.layers.3.1: shuffle_7x7
|
||||||
|
backbone.layers.3.2: shuffle_7x7
|
||||||
|
backbone.layers.3.3: shuffle_5x5
|
||||||
|
channels:
|
|
@ -0,0 +1,22 @@
|
||||||
|
modules:
|
||||||
|
backbone.layers.0.0: shuffle_5x5
|
||||||
|
backbone.layers.0.1: shuffle_3x3
|
||||||
|
backbone.layers.0.2: shuffle_3x3
|
||||||
|
backbone.layers.0.3: shuffle_3x3
|
||||||
|
backbone.layers.1.0: shuffle_xception
|
||||||
|
backbone.layers.1.1: shuffle_3x3
|
||||||
|
backbone.layers.1.2: shuffle_xception
|
||||||
|
backbone.layers.1.3: shuffle_7x7
|
||||||
|
backbone.layers.2.0: shuffle_7x7
|
||||||
|
backbone.layers.2.1: shuffle_7x7
|
||||||
|
backbone.layers.2.2: shuffle_xception
|
||||||
|
backbone.layers.2.3: shuffle_xception
|
||||||
|
backbone.layers.2.4: shuffle_3x3
|
||||||
|
backbone.layers.2.5: shuffle_7x7
|
||||||
|
backbone.layers.2.6: shuffle_5x5
|
||||||
|
backbone.layers.2.7: shuffle_xception
|
||||||
|
backbone.layers.3.0: shuffle_7x7
|
||||||
|
backbone.layers.3.1: shuffle_7x7
|
||||||
|
backbone.layers.3.2: shuffle_7x7
|
||||||
|
backbone.layers.3.3: shuffle_5x5
|
||||||
|
channels:
|
|
@ -1,20 +0,0 @@
|
||||||
_base_ = ['./detnas_supernet_frcnn_shufflenetv2_fpn_1x_coco.py']
|
|
||||||
|
|
||||||
data = dict(
|
|
||||||
samples_per_gpu=128,
|
|
||||||
workers_per_gpu=8,
|
|
||||||
)
|
|
||||||
|
|
||||||
algorithm = dict(bn_training_mode=True)
|
|
||||||
|
|
||||||
searcher = dict(
|
|
||||||
type='EvolutionSearcher',
|
|
||||||
metrics='bbox',
|
|
||||||
score_key='bbox_mAP',
|
|
||||||
constraints=dict(flops=300 * 1e6),
|
|
||||||
candidate_pool_size=50,
|
|
||||||
candidate_top_k=10,
|
|
||||||
max_epoch=20,
|
|
||||||
num_mutation=20,
|
|
||||||
num_crossover=20,
|
|
||||||
)
|
|
|
@ -1,6 +0,0 @@
|
||||||
_base_ = ['./detnas_supernet_frcnn_shufflenetv2_fpn_1x_coco.py']
|
|
||||||
|
|
||||||
# FIXME: you may replace this with the mutable_cfg searched by yourself
|
|
||||||
mutable_cfg = 'https://download.openmmlab.com/mmrazor/v0.1/nas/detnas/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco_bbox_backbone_flops-0.34M_mAP-37.5_20211222-67fea61f_mutable_cfg.yaml' # noqa: E501
|
|
||||||
|
|
||||||
algorithm = dict(retraining=True, mutable_cfg=mutable_cfg)
|
|
|
@ -1,8 +0,0 @@
|
||||||
_base_ = [
|
|
||||||
'../spos/spos_subnet_shufflenetv2_8xb128_in1k.py',
|
|
||||||
]
|
|
||||||
|
|
||||||
# FIXME: you may replace this with the mutable_cfg searched by yourself
|
|
||||||
mutable_cfg = 'https://download.openmmlab.com/mmrazor/v0.1/nas/detnas/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco_bbox_backbone_flops-0.34M_mAP-37.5_20211222-67fea61f_mutable_cfg.yaml' # noqa: E501
|
|
||||||
|
|
||||||
algorithm = dict(mutable_cfg=mutable_cfg)
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
_base_ = ['./detnas_supernet_shufflenetv2_coco_1x_2.0_frcnn.py']
|
||||||
|
|
||||||
|
# FIXME: you may replace this with the mutable_cfg searched by yourself
|
||||||
|
fix_subnet = 'configs/nas/detnas/DETNAS_FRCNN_SHUFFLENETV2_340M_COCO_MMRAZOR_2.0.yaml' # noqa: E501
|
||||||
|
|
||||||
|
model = dict(fix_subnet=fix_subnet)
|
||||||
|
|
||||||
|
find_unused_parameters = False
|
|
@ -1,144 +0,0 @@
|
||||||
_base_ = [
|
|
||||||
'../../_base_/datasets/mmdet/coco_detection.py',
|
|
||||||
'../../_base_/schedules/mmdet/schedule_1x.py',
|
|
||||||
'../../_base_/mmdet_runtime.py'
|
|
||||||
]
|
|
||||||
|
|
||||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
|
||||||
model = dict(
|
|
||||||
type='mmdet.FasterRCNN',
|
|
||||||
backbone=dict(
|
|
||||||
type='mmcls.SearchableShuffleNetV2',
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
out_indices=(0, 1, 2, 3),
|
|
||||||
widen_factor=1.0,
|
|
||||||
with_last_layer=False),
|
|
||||||
neck=dict(
|
|
||||||
type='FPN',
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
in_channels=[64, 160, 320, 640],
|
|
||||||
out_channels=256,
|
|
||||||
num_outs=5),
|
|
||||||
rpn_head=dict(
|
|
||||||
type='RPNHead',
|
|
||||||
in_channels=256,
|
|
||||||
feat_channels=256,
|
|
||||||
anchor_generator=dict(
|
|
||||||
type='AnchorGenerator',
|
|
||||||
scales=[8],
|
|
||||||
ratios=[0.5, 1.0, 2.0],
|
|
||||||
strides=[4, 8, 16, 32, 64]),
|
|
||||||
bbox_coder=dict(
|
|
||||||
type='DeltaXYWHBBoxCoder',
|
|
||||||
target_means=[.0, .0, .0, .0],
|
|
||||||
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
|
||||||
loss_cls=dict(
|
|
||||||
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
|
||||||
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
|
||||||
roi_head=dict(
|
|
||||||
type='StandardRoIHead',
|
|
||||||
bbox_roi_extractor=dict(
|
|
||||||
type='SingleRoIExtractor',
|
|
||||||
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
|
||||||
out_channels=256,
|
|
||||||
featmap_strides=[4, 8, 16, 32]),
|
|
||||||
bbox_head=dict(
|
|
||||||
type='Shared4Conv1FCBBoxHead',
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
in_channels=256,
|
|
||||||
fc_out_channels=1024,
|
|
||||||
roi_feat_size=7,
|
|
||||||
num_classes=80,
|
|
||||||
bbox_coder=dict(
|
|
||||||
type='DeltaXYWHBBoxCoder',
|
|
||||||
target_means=[0., 0., 0., 0.],
|
|
||||||
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
|
||||||
reg_class_agnostic=False,
|
|
||||||
loss_cls=dict(
|
|
||||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
|
||||||
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
|
|
||||||
train_cfg=dict(
|
|
||||||
rpn=dict(
|
|
||||||
assigner=dict(
|
|
||||||
type='MaxIoUAssigner',
|
|
||||||
pos_iou_thr=0.7,
|
|
||||||
neg_iou_thr=0.3,
|
|
||||||
min_pos_iou=0.3,
|
|
||||||
match_low_quality=True,
|
|
||||||
ignore_iof_thr=-1),
|
|
||||||
sampler=dict(
|
|
||||||
type='RandomSampler',
|
|
||||||
num=256,
|
|
||||||
pos_fraction=0.5,
|
|
||||||
neg_pos_ub=-1,
|
|
||||||
add_gt_as_proposals=False),
|
|
||||||
allowed_border=-1,
|
|
||||||
pos_weight=-1,
|
|
||||||
debug=False),
|
|
||||||
rpn_proposal=dict(
|
|
||||||
nms_pre=2000,
|
|
||||||
max_per_img=1000,
|
|
||||||
nms=dict(type='nms', iou_threshold=0.7),
|
|
||||||
min_bbox_size=0),
|
|
||||||
rcnn=dict(
|
|
||||||
assigner=dict(
|
|
||||||
type='MaxIoUAssigner',
|
|
||||||
pos_iou_thr=0.5,
|
|
||||||
neg_iou_thr=0.5,
|
|
||||||
min_pos_iou=0.5,
|
|
||||||
match_low_quality=False,
|
|
||||||
ignore_iof_thr=-1),
|
|
||||||
sampler=dict(
|
|
||||||
type='RandomSampler',
|
|
||||||
num=512,
|
|
||||||
pos_fraction=0.25,
|
|
||||||
neg_pos_ub=-1,
|
|
||||||
add_gt_as_proposals=True),
|
|
||||||
pos_weight=-1,
|
|
||||||
debug=False)),
|
|
||||||
test_cfg=dict(
|
|
||||||
rpn=dict(
|
|
||||||
nms_pre=1000,
|
|
||||||
max_per_img=1000,
|
|
||||||
nms=dict(type='nms', iou_threshold=0.7),
|
|
||||||
min_bbox_size=0),
|
|
||||||
rcnn=dict(
|
|
||||||
score_thr=0.05,
|
|
||||||
nms=dict(type='nms', iou_threshold=0.5),
|
|
||||||
max_per_img=100)
|
|
||||||
# soft-nms is also supported for rcnn testing
|
|
||||||
# e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
mutator = dict(
|
|
||||||
type='OneShotModuleMutator',
|
|
||||||
placeholder_mapping=dict(
|
|
||||||
all_blocks=dict(
|
|
||||||
type='OneShotMutableOP',
|
|
||||||
choices=dict(
|
|
||||||
shuffle_3x3=dict(
|
|
||||||
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
|
||||||
shuffle_5x5=dict(
|
|
||||||
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=5),
|
|
||||||
shuffle_7x7=dict(
|
|
||||||
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=7),
|
|
||||||
shuffle_xception=dict(
|
|
||||||
type='ShuffleXception',
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
),
|
|
||||||
))))
|
|
||||||
|
|
||||||
algorithm = dict(
|
|
||||||
type='DetNAS',
|
|
||||||
architecture=dict(
|
|
||||||
type='MMDetArchitecture',
|
|
||||||
model=model,
|
|
||||||
),
|
|
||||||
mutator=mutator,
|
|
||||||
pruner=None,
|
|
||||||
distiller=None,
|
|
||||||
retraining=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
find_unused_parameters = True
|
|
|
@ -1,5 +0,0 @@
|
||||||
_base_ = [
|
|
||||||
'../spos/spos_supernet_shufflenetv2_8xb128_in1k.py',
|
|
||||||
]
|
|
||||||
|
|
||||||
runner = dict(max_iters=300000)
|
|
|
@ -0,0 +1,87 @@
|
||||||
|
_base_ = [
|
||||||
|
'mmdet::_base_/models/faster_rcnn_r50_fpn.py',
|
||||||
|
'mmdet::_base_/datasets/coco_detection.py',
|
||||||
|
'mmdet::_base_/schedules/schedule_1x.py',
|
||||||
|
'mmdet::_base_/default_runtime.py'
|
||||||
|
]
|
||||||
|
|
||||||
|
data_root = '/mnt/lustre/share_data/zhangwenwei/data/coco/'
|
||||||
|
|
||||||
|
_base_.train_dataloader.dataset.data_root = data_root
|
||||||
|
|
||||||
|
visualizer = None
|
||||||
|
|
||||||
|
log_level = 'INFO'
|
||||||
|
load_from = '/mnt/lustre/dongpeijie/detnas_subnet_shufflenetv2_8xb128_in1k_acc-74.08_20211223-92e9b66a_2.0.pth' # noqa: E501
|
||||||
|
resume = False
|
||||||
|
|
||||||
|
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||||
|
# model settings
|
||||||
|
_STAGE_MUTABLE = dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='mmrazor.OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
shuffle_3x3=dict(
|
||||||
|
type='mmrazor.ShuffleBlock', kernel_size=3, norm_cfg=norm_cfg),
|
||||||
|
shuffle_5x5=dict(
|
||||||
|
type='mmrazor.ShuffleBlock', kernel_size=5, norm_cfg=norm_cfg),
|
||||||
|
shuffle_7x7=dict(
|
||||||
|
type='mmrazor.ShuffleBlock', kernel_size=7, norm_cfg=norm_cfg),
|
||||||
|
shuffle_xception=dict(
|
||||||
|
type='mmrazor.ShuffleXception', norm_cfg=norm_cfg),
|
||||||
|
))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 3 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, mutable_cfg.
|
||||||
|
[64, 4, _STAGE_MUTABLE],
|
||||||
|
[160, 4, _STAGE_MUTABLE],
|
||||||
|
[320, 8, _STAGE_MUTABLE],
|
||||||
|
[640, 4, _STAGE_MUTABLE],
|
||||||
|
]
|
||||||
|
|
||||||
|
supernet = _base_.model
|
||||||
|
|
||||||
|
supernet.backbone = dict(
|
||||||
|
type='mmrazor.SearchableShuffleNetV2',
|
||||||
|
arch_setting=arch_setting,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
out_indices=(0, 1, 2, 3),
|
||||||
|
widen_factor=1.0,
|
||||||
|
with_last_layer=False)
|
||||||
|
|
||||||
|
supernet.neck = dict(
|
||||||
|
type='FPN',
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
in_channels=[64, 160, 320, 640],
|
||||||
|
out_channels=256,
|
||||||
|
num_outs=5)
|
||||||
|
|
||||||
|
supernet.roi_head.bbox_head = dict(
|
||||||
|
type='Shared4Conv1FCBBoxHead',
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
in_channels=256,
|
||||||
|
fc_out_channels=1024,
|
||||||
|
roi_feat_size=7,
|
||||||
|
num_classes=80,
|
||||||
|
bbox_coder=dict(
|
||||||
|
type='DeltaXYWHBBoxCoder',
|
||||||
|
target_means=[0., 0., 0., 0.],
|
||||||
|
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
||||||
|
reg_class_agnostic=False,
|
||||||
|
loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
||||||
|
loss_bbox=dict(type='L1Loss', loss_weight=1.0))
|
||||||
|
|
||||||
|
mutator = dict(type='mmrazor.OneShotModuleMutator')
|
||||||
|
|
||||||
|
fix_subnet = 'configs/nas/detnas/DETNAS_FRCNN_SHUFFLENETV2_340M_COCO_MMRAZOR_2.0.yaml' # noqa: E501
|
||||||
|
|
||||||
|
model = dict(
|
||||||
|
_delete_=True,
|
||||||
|
type='mmrazor.SPOS',
|
||||||
|
architecture=supernet,
|
||||||
|
mutator=mutator,
|
||||||
|
fix_subnet=fix_subnet,
|
||||||
|
)
|
||||||
|
|
||||||
|
find_unused_parameters = True
|
|
@ -0,0 +1,114 @@
|
||||||
|
_base_ = [
|
||||||
|
'mmdet::faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py',
|
||||||
|
'mmdet::datasets/coco_detection.py', 'mmdet::schedules/schedule_1x.py',
|
||||||
|
'mmdet::default_runtime.py'
|
||||||
|
]
|
||||||
|
|
||||||
|
data_root = '/mnt/lustre/share_data/zhangwenwei/data/coco/'
|
||||||
|
|
||||||
|
train_dataloader = dict(dataset=dict(data_root=data_root, ))
|
||||||
|
|
||||||
|
visualizer = None
|
||||||
|
# custom_hooks = [dict(type='DetVisualizationHook', interval=10)]
|
||||||
|
|
||||||
|
log_level = 'INFO'
|
||||||
|
load_from = None
|
||||||
|
resume = False
|
||||||
|
|
||||||
|
# TODO: support auto scaling lr
|
||||||
|
|
||||||
|
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||||
|
# model settings
|
||||||
|
_STAGE_MUTABLE = dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='mmrazor.OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
shuffle_3x3=dict(
|
||||||
|
type='mmrazor.ShuffleBlock', kernel_size=3, norm_cfg=norm_cfg),
|
||||||
|
shuffle_5x5=dict(
|
||||||
|
type='mmrazor.ShuffleBlock', kernel_size=5, norm_cfg=norm_cfg),
|
||||||
|
shuffle_7x7=dict(
|
||||||
|
type='mmrazor.ShuffleBlock', kernel_size=7, norm_cfg=norm_cfg),
|
||||||
|
shuffle_xception=dict(
|
||||||
|
type='mmrazor.ShuffleXception', norm_cfg=norm_cfg),
|
||||||
|
))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 3 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, mutable_cfg.
|
||||||
|
[64, 4, _STAGE_MUTABLE],
|
||||||
|
[160, 4, _STAGE_MUTABLE],
|
||||||
|
[320, 8, _STAGE_MUTABLE],
|
||||||
|
[640, 4, _STAGE_MUTABLE],
|
||||||
|
]
|
||||||
|
|
||||||
|
supernet = dict(
|
||||||
|
type='RetinaNet',
|
||||||
|
data_preprocessor=dict(
|
||||||
|
type='DetDataPreprocessor',
|
||||||
|
mean=[123.675, 116.28, 103.53],
|
||||||
|
std=[58.395, 57.12, 57.375],
|
||||||
|
bgr_to_rgb=True,
|
||||||
|
pad_size_divisor=32),
|
||||||
|
backbone=dict(
|
||||||
|
type='mmrazor.SearchableShuffleNetV2',
|
||||||
|
arch_setting=arch_setting,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
out_indices=(0, 1, 2, 3),
|
||||||
|
widen_factor=1.0,
|
||||||
|
with_last_layer=False),
|
||||||
|
neck=dict(
|
||||||
|
type='FPN',
|
||||||
|
in_channels=[64, 160, 320, 640],
|
||||||
|
out_channels=256,
|
||||||
|
num_outs=5),
|
||||||
|
bbox_head=dict(
|
||||||
|
type='RetinaHead',
|
||||||
|
num_classes=80,
|
||||||
|
in_channels=256,
|
||||||
|
stacked_convs=4,
|
||||||
|
feat_channels=256,
|
||||||
|
anchor_generator=dict(
|
||||||
|
type='AnchorGenerator',
|
||||||
|
octave_base_scale=4,
|
||||||
|
scales_per_octave=3,
|
||||||
|
ratios=[0.5, 1.0, 2.0],
|
||||||
|
strides=[8, 16, 32, 64, 128]),
|
||||||
|
bbox_coder=dict(
|
||||||
|
type='DeltaXYWHBBoxCoder',
|
||||||
|
target_means=[.0, .0, .0, .0],
|
||||||
|
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
||||||
|
loss_cls=dict(
|
||||||
|
type='FocalLoss',
|
||||||
|
use_sigmoid=True,
|
||||||
|
gamma=2.0,
|
||||||
|
alpha=0.25,
|
||||||
|
loss_weight=1.0),
|
||||||
|
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
||||||
|
# model training and testing settings
|
||||||
|
train_cfg=dict(
|
||||||
|
assigner=dict(
|
||||||
|
type='MaxIoUAssigner',
|
||||||
|
pos_iou_thr=0.5,
|
||||||
|
neg_iou_thr=0.4,
|
||||||
|
min_pos_iou=0,
|
||||||
|
ignore_iof_thr=-1),
|
||||||
|
allowed_border=-1,
|
||||||
|
pos_weight=-1,
|
||||||
|
debug=False),
|
||||||
|
test_cfg=dict(
|
||||||
|
nms_pre=1000,
|
||||||
|
min_bbox_size=0,
|
||||||
|
score_thr=0.05,
|
||||||
|
nms=dict(type='nms', iou_threshold=0.5),
|
||||||
|
max_per_img=100))
|
||||||
|
|
||||||
|
mutator = dict(type='mmrazor.OneShotModuleMutator')
|
||||||
|
|
||||||
|
model = dict(
|
||||||
|
type='mmrazor.SPOS',
|
||||||
|
architecture=supernet,
|
||||||
|
mutator=mutator,
|
||||||
|
)
|
||||||
|
|
||||||
|
find_unused_parameters = True
|
|
@ -0,0 +1,24 @@
|
||||||
|
modules:
|
||||||
|
backbone.layer1.0: mb_k3e1
|
||||||
|
backbone.layer2.0: mb_k5e3
|
||||||
|
backbone.layer2.1: mb_k5e3
|
||||||
|
backbone.layer2.2: identity
|
||||||
|
backbone.layer2.3: mb_k3e3
|
||||||
|
backbone.layer3.0: mb_k3e3
|
||||||
|
backbone.layer3.1: identity
|
||||||
|
backbone.layer3.2: identity
|
||||||
|
backbone.layer3.3: mb_k3e3
|
||||||
|
backbone.layer4.0: mb_k7e6
|
||||||
|
backbone.layer4.1: identity
|
||||||
|
backbone.layer4.2: mb_k7e3
|
||||||
|
backbone.layer4.3: mb_k7e3
|
||||||
|
backbone.layer5.0: mb_k3e3
|
||||||
|
backbone.layer5.1: mb_k3e3
|
||||||
|
backbone.layer5.2: mb_k7e3
|
||||||
|
backbone.layer5.3: mb_k5e3
|
||||||
|
backbone.layer6.0: mb_k5e6
|
||||||
|
backbone.layer6.1: mb_k7e3
|
||||||
|
backbone.layer6.2: mb_k7e3
|
||||||
|
backbone.layer6.3: mb_k7e3
|
||||||
|
backbone.layer7.0: mb_k5e6
|
||||||
|
channels:
|
|
@ -1,66 +0,0 @@
|
||||||
stage_0_block_0:
|
|
||||||
chosen:
|
|
||||||
- mb_k3e1
|
|
||||||
stage_1_block_0:
|
|
||||||
chosen:
|
|
||||||
- mb_k5e3
|
|
||||||
stage_1_block_1:
|
|
||||||
chosen:
|
|
||||||
- mb_k5e3
|
|
||||||
stage_1_block_2:
|
|
||||||
chosen:
|
|
||||||
- identity
|
|
||||||
stage_1_block_3:
|
|
||||||
chosen:
|
|
||||||
- mb_k3e3
|
|
||||||
stage_2_block_0:
|
|
||||||
chosen:
|
|
||||||
- mb_k3e3
|
|
||||||
stage_2_block_1:
|
|
||||||
chosen:
|
|
||||||
- identity
|
|
||||||
stage_2_block_2:
|
|
||||||
chosen:
|
|
||||||
- identity
|
|
||||||
stage_2_block_3:
|
|
||||||
chosen:
|
|
||||||
- mb_k3e3
|
|
||||||
stage_3_block_0:
|
|
||||||
chosen:
|
|
||||||
- mb_k7e6
|
|
||||||
stage_3_block_1:
|
|
||||||
chosen:
|
|
||||||
- identity
|
|
||||||
stage_3_block_2:
|
|
||||||
chosen:
|
|
||||||
- mb_k7e3
|
|
||||||
stage_3_block_3:
|
|
||||||
chosen:
|
|
||||||
- mb_k7e3
|
|
||||||
stage_4_block_0:
|
|
||||||
chosen:
|
|
||||||
- mb_k3e3
|
|
||||||
stage_4_block_1:
|
|
||||||
chosen:
|
|
||||||
- mb_k3e3
|
|
||||||
stage_4_block_2:
|
|
||||||
chosen:
|
|
||||||
- mb_k7e3
|
|
||||||
stage_4_block_3:
|
|
||||||
chosen:
|
|
||||||
- mb_k5e3
|
|
||||||
stage_5_block_0:
|
|
||||||
chosen:
|
|
||||||
- mb_k5e6
|
|
||||||
stage_5_block_1:
|
|
||||||
chosen:
|
|
||||||
- mb_k7e3
|
|
||||||
stage_5_block_2:
|
|
||||||
chosen:
|
|
||||||
- mb_k7e3
|
|
||||||
stage_5_block_3:
|
|
||||||
chosen:
|
|
||||||
- mb_k7e3
|
|
||||||
stage_6_block_0:
|
|
||||||
chosen:
|
|
||||||
- mb_k5e6
|
|
|
@ -1,60 +0,0 @@
|
||||||
stage_0_block_0:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_0_block_1:
|
|
||||||
chosen:
|
|
||||||
- shuffle_5x5
|
|
||||||
stage_0_block_2:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_0_block_3:
|
|
||||||
chosen:
|
|
||||||
- shuffle_5x5
|
|
||||||
stage_1_block_0:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_1_block_1:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_1_block_2:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_1_block_3:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_2_block_0:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_2_block_1:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_2_block_2:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_2_block_3:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_2_block_4:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_2_block_5:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_2_block_6:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_2_block_7:
|
|
||||||
chosen:
|
|
||||||
- shuffle_3x3
|
|
||||||
stage_3_block_0:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_3_block_1:
|
|
||||||
chosen:
|
|
||||||
- shuffle_7x7
|
|
||||||
stage_3_block_2:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
||||||
stage_3_block_3:
|
|
||||||
chosen:
|
|
||||||
- shuffle_xception
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
modules:
|
||||||
|
backbone.layers.0.0: shuffle_7x7
|
||||||
|
backbone.layers.0.1: shuffle_3x3
|
||||||
|
backbone.layers.0.2: shuffle_7x7
|
||||||
|
backbone.layers.0.3: shuffle_3x3
|
||||||
|
backbone.layers.1.0: shuffle_xception
|
||||||
|
backbone.layers.1.1: shuffle_5x5
|
||||||
|
backbone.layers.1.2: shuffle_5x5
|
||||||
|
backbone.layers.1.3: shuffle_3x3
|
||||||
|
backbone.layers.2.0: shuffle_3x3
|
||||||
|
backbone.layers.2.1: shuffle_5x5
|
||||||
|
backbone.layers.2.2: shuffle_3x3
|
||||||
|
backbone.layers.2.3: shuffle_5x5
|
||||||
|
backbone.layers.2.4: shuffle_3x3
|
||||||
|
backbone.layers.2.5: shuffle_xception
|
||||||
|
backbone.layers.2.6: shuffle_5x5
|
||||||
|
backbone.layers.2.7: shuffle_7x7
|
||||||
|
backbone.layers.3.0: shuffle_7x7
|
||||||
|
backbone.layers.3.1: shuffle_3x3
|
||||||
|
backbone.layers.3.2: shuffle_5x5
|
||||||
|
backbone.layers.3.3: shuffle_xception
|
||||||
|
channels:
|
|
@ -1,20 +0,0 @@
|
||||||
_base_ = ['./spos_supernet_mobilenet_proxyless_gpu_8xb128_in1k.py']
|
|
||||||
|
|
||||||
data = dict(
|
|
||||||
samples_per_gpu=512,
|
|
||||||
workers_per_gpu=16,
|
|
||||||
)
|
|
||||||
|
|
||||||
algorithm = dict(bn_training_mode=True)
|
|
||||||
|
|
||||||
searcher = dict(
|
|
||||||
type='EvolutionSearcher',
|
|
||||||
candidate_pool_size=50,
|
|
||||||
candidate_top_k=10,
|
|
||||||
constraints=dict(flops=465 * 1e6),
|
|
||||||
metrics='accuracy',
|
|
||||||
score_key='accuracy_top-1',
|
|
||||||
max_epoch=20,
|
|
||||||
num_mutation=25,
|
|
||||||
num_crossover=25,
|
|
||||||
mutate_prob=0.1)
|
|
|
@ -1,20 +0,0 @@
|
||||||
_base_ = ['./spos_supernet_shufflenetv2_8xb128_in1k.py']
|
|
||||||
|
|
||||||
data = dict(
|
|
||||||
samples_per_gpu=2048,
|
|
||||||
workers_per_gpu=16,
|
|
||||||
)
|
|
||||||
|
|
||||||
algorithm = dict(bn_training_mode=True)
|
|
||||||
|
|
||||||
searcher = dict(
|
|
||||||
type='EvolutionSearcher',
|
|
||||||
candidate_pool_size=50,
|
|
||||||
candidate_top_k=10,
|
|
||||||
constraints=dict(flops=330 * 1e6),
|
|
||||||
metrics='accuracy',
|
|
||||||
score_key='accuracy_top-1',
|
|
||||||
max_epoch=20,
|
|
||||||
num_mutation=25,
|
|
||||||
num_crossover=25,
|
|
||||||
mutate_prob=0.1)
|
|
|
@ -1,27 +0,0 @@
|
||||||
_base_ = [
|
|
||||||
'./spos_subnet_mobilenet_proxyless_gpu_8xb128_in1k.py',
|
|
||||||
]
|
|
||||||
|
|
||||||
img_norm_cfg = dict(mean=[0., 0., 0.], std=[1., 1., 1.], to_rgb=False)
|
|
||||||
train_pipeline = [
|
|
||||||
dict(type='LoadImageFromFile'),
|
|
||||||
dict(type='RandomResizedCrop', size=224),
|
|
||||||
dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
|
|
||||||
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
|
|
||||||
dict(type='Normalize', **img_norm_cfg),
|
|
||||||
dict(type='ImageToTensor', keys=['img']),
|
|
||||||
dict(type='ToTensor', keys=['gt_label']),
|
|
||||||
dict(type='Collect', keys=['img', 'gt_label'])
|
|
||||||
]
|
|
||||||
test_pipeline = [
|
|
||||||
dict(type='LoadImageFromFile'),
|
|
||||||
dict(type='Resize', size=(256, -1)),
|
|
||||||
dict(type='CenterCrop', crop_size=224),
|
|
||||||
dict(type='Normalize', **img_norm_cfg),
|
|
||||||
dict(type='ImageToTensor', keys=['img']),
|
|
||||||
dict(type='Collect', keys=['img'])
|
|
||||||
]
|
|
||||||
data = dict(
|
|
||||||
train=dict(pipeline=train_pipeline),
|
|
||||||
val=dict(pipeline=test_pipeline),
|
|
||||||
test=dict(pipeline=test_pipeline))
|
|
|
@ -1,13 +0,0 @@
|
||||||
_base_ = [
|
|
||||||
'./spos_supernet_mobilenet_proxyless_gpu_8xb128_in1k.py',
|
|
||||||
]
|
|
||||||
|
|
||||||
# FIXME: you may replace this with the mutable_cfg searched by yourself
|
|
||||||
mutable_cfg = 'https://download.openmmlab.com/mmrazor/v0.1/nas/spos/spos_mobilenet_subnet/spos_angelnas_flops_0.49G_acc_75.98_20220307-54f4698f_mutable_cfg.yaml' # noqa: E501
|
|
||||||
|
|
||||||
algorithm = dict(retraining=True, mutable_cfg=mutable_cfg)
|
|
||||||
evaluation = dict(interval=10000, metric='accuracy')
|
|
||||||
checkpoint_config = dict(interval=30000)
|
|
||||||
|
|
||||||
runner = dict(max_iters=300000)
|
|
||||||
find_unused_parameters = False
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
_base_ = ['./spos_supernet_mobilenet_proxyless_gpu_8xb128_in1k_2.0.py']
|
||||||
|
|
||||||
|
# FIXME: you may replace this with the mutable_cfg searched by yourself
|
||||||
|
fix_subnet = 'configs/nas/spos/AngleNAS_SHUFFLENETV2_IN1k_2.0.yaml' # noqa: E501
|
||||||
|
|
||||||
|
model = dict(fix_subnet=fix_subnet)
|
||||||
|
|
||||||
|
find_unused_parameters = False
|
|
@ -1,11 +0,0 @@
|
||||||
_base_ = [
|
|
||||||
'./spos_supernet_shufflenetv2_8xb128_in1k.py',
|
|
||||||
]
|
|
||||||
|
|
||||||
# FIXME: you may replace this with the mutable_cfg searched by yourself
|
|
||||||
mutable_cfg = 'https://download.openmmlab.com/mmrazor/v0.1/nas/spos/spos_shufflenetv2_subnet_8xb128_in1k/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-454627be_mutable_cfg.yaml' # noqa: E501
|
|
||||||
|
|
||||||
algorithm = dict(retraining=True, mutable_cfg=mutable_cfg)
|
|
||||||
|
|
||||||
runner = dict(max_iters=300000)
|
|
||||||
find_unused_parameters = False
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
_base_ = ['./spos_supernet_shufflenetv2_8xb128_in1k_2.0_example.py']
|
||||||
|
|
||||||
|
# FIXME: you may replace this with the mutable_cfg searched by yourself
|
||||||
|
# fix_subnet = 'configs/nas/spos/SPOS_SHUFFLENETV2_330M_IN1k_PAPER_2.0.yaml' # noqa: E501
|
||||||
|
fix_subnet = 'configs/nas/detnas/DetNAS_SPOS_SHUFFLENETV2_330M_IN1k_PAPER_2.0.yaml' # noqa: E501
|
||||||
|
|
||||||
|
model = dict(fix_subnet=fix_subnet)
|
||||||
|
|
||||||
|
find_unused_parameters = False
|
|
@ -1,101 +0,0 @@
|
||||||
_base_ = [
|
|
||||||
'../../_base_/datasets/mmcls/imagenet_bs128_colorjittor.py',
|
|
||||||
'../../_base_/schedules/mmcls/imagenet_bs1024_spos.py',
|
|
||||||
'../../_base_/mmcls_runtime.py'
|
|
||||||
]
|
|
||||||
norm_cfg = dict(type='BN')
|
|
||||||
model = dict(
|
|
||||||
type='mmcls.ImageClassifier',
|
|
||||||
backbone=dict(
|
|
||||||
type='SearchableMobileNet',
|
|
||||||
first_channels=40,
|
|
||||||
last_channels=1728,
|
|
||||||
widen_factor=1.0,
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
arch_setting_type='proxyless_gpu'),
|
|
||||||
neck=dict(type='GlobalAveragePooling'),
|
|
||||||
head=dict(
|
|
||||||
type='LinearClsHead',
|
|
||||||
num_classes=1000,
|
|
||||||
in_channels=1728,
|
|
||||||
loss=dict(
|
|
||||||
type='LabelSmoothLoss',
|
|
||||||
num_classes=1000,
|
|
||||||
label_smooth_val=0.1,
|
|
||||||
mode='original',
|
|
||||||
loss_weight=1.0),
|
|
||||||
topk=(1, 5),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
mutator = dict(
|
|
||||||
type='OneShotModuleMutator',
|
|
||||||
placeholder_mapping=dict(
|
|
||||||
searchable_blocks=dict(
|
|
||||||
type='OneShotMutableOP',
|
|
||||||
choices=dict(
|
|
||||||
mb_k3e3=dict(
|
|
||||||
type='MBBlock',
|
|
||||||
kernel_size=3,
|
|
||||||
expand_ratio=3,
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
act_cfg=dict(type='ReLU6')),
|
|
||||||
mb_k5e3=dict(
|
|
||||||
type='MBBlock',
|
|
||||||
kernel_size=5,
|
|
||||||
expand_ratio=3,
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
act_cfg=dict(type='ReLU6')),
|
|
||||||
mb_k7e3=dict(
|
|
||||||
type='MBBlock',
|
|
||||||
kernel_size=7,
|
|
||||||
expand_ratio=3,
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
act_cfg=dict(type='ReLU6')),
|
|
||||||
mb_k3e6=dict(
|
|
||||||
type='MBBlock',
|
|
||||||
kernel_size=3,
|
|
||||||
expand_ratio=6,
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
act_cfg=dict(type='ReLU6')),
|
|
||||||
mb_k5e6=dict(
|
|
||||||
type='MBBlock',
|
|
||||||
kernel_size=5,
|
|
||||||
expand_ratio=6,
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
act_cfg=dict(type='ReLU6')),
|
|
||||||
mb_k7e6=dict(
|
|
||||||
type='MBBlock',
|
|
||||||
kernel_size=7,
|
|
||||||
expand_ratio=6,
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
act_cfg=dict(type='ReLU6')),
|
|
||||||
identity=dict(type='Identity'))),
|
|
||||||
first_blocks=dict(
|
|
||||||
type='OneShotMutableOP',
|
|
||||||
choices=dict(
|
|
||||||
mb_k3e1=dict(
|
|
||||||
type='MBBlock',
|
|
||||||
kernel_size=3,
|
|
||||||
expand_ratio=1,
|
|
||||||
norm_cfg=norm_cfg,
|
|
||||||
act_cfg=dict(type='ReLU6')), ))))
|
|
||||||
|
|
||||||
algorithm = dict(
|
|
||||||
type='SPOS',
|
|
||||||
architecture=dict(
|
|
||||||
type='MMClsArchitecture',
|
|
||||||
model=model,
|
|
||||||
),
|
|
||||||
mutator=mutator,
|
|
||||||
distiller=None,
|
|
||||||
retraining=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
runner = dict(max_iters=150000)
|
|
||||||
evaluation = dict(interval=10000, metric='accuracy')
|
|
||||||
|
|
||||||
# checkpoint saving
|
|
||||||
checkpoint_config = dict(interval=30000)
|
|
||||||
|
|
||||||
find_unused_parameters = True
|
|
|
@ -0,0 +1,245 @@
|
||||||
|
# dataset settings
|
||||||
|
dataset_type = 'ImageNet'
|
||||||
|
preprocess_cfg = dict(
|
||||||
|
# RGB format normalization parameters
|
||||||
|
mean=[0., 0., 0.],
|
||||||
|
std=[1., 1., 1.],
|
||||||
|
# convert image from BGR to RGB
|
||||||
|
to_rgb=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
file_client_args = dict(
|
||||||
|
backend='petrel',
|
||||||
|
path_mapping=dict({
|
||||||
|
'./data/imagenet':
|
||||||
|
'sproject:s3://openmmlab/datasets/classification/imagenet',
|
||||||
|
'data/imagenet':
|
||||||
|
'sproject:s3://openmmlab/datasets/classification/imagenet'
|
||||||
|
}))
|
||||||
|
|
||||||
|
train_pipeline = [
|
||||||
|
dict(type='LoadImageFromFile', file_client_args=file_client_args),
|
||||||
|
dict(type='RandomResizedCrop', scale=224),
|
||||||
|
dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
|
||||||
|
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
test_pipeline = [
|
||||||
|
dict(type='LoadImageFromFile', file_client_args=file_client_args),
|
||||||
|
dict(
|
||||||
|
type='ResizeEdge',
|
||||||
|
scale=256,
|
||||||
|
edge='short',
|
||||||
|
backend='pillow',
|
||||||
|
interpolation='bicubic'),
|
||||||
|
dict(type='CenterCrop', crop_size=224),
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
train_dataloader = dict(
|
||||||
|
batch_size=128,
|
||||||
|
num_workers=8,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_root='/mnt/cache/share/images',
|
||||||
|
ann_file='meta/train.txt',
|
||||||
|
data_prefix='train',
|
||||||
|
pipeline=train_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=True),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# /mnt/lustre/share_data/wangjiaqi/data/imagenet',
|
||||||
|
|
||||||
|
val_dataloader = dict(
|
||||||
|
batch_size=128,
|
||||||
|
num_workers=8,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_root='/mnt/cache/share/images',
|
||||||
|
ann_file='meta/val.txt',
|
||||||
|
data_prefix='val',
|
||||||
|
pipeline=test_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=False),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
val_evaluator = dict(type='Accuracy', topk=(1, 5))
|
||||||
|
|
||||||
|
# If you want standard test, please manually configure the test dataset
|
||||||
|
test_dataloader = val_dataloader
|
||||||
|
test_evaluator = val_evaluator
|
||||||
|
|
||||||
|
# scheduler
|
||||||
|
|
||||||
|
# optimizer
|
||||||
|
optim_wrapper = dict(
|
||||||
|
optimizer=dict(type='SGD', lr=0.5, momentum=0.9, weight_decay=4e-5),
|
||||||
|
clip_grad=None)
|
||||||
|
|
||||||
|
# leanring policy
|
||||||
|
param_scheduler = [
|
||||||
|
dict(type='PolyLR', power=1.0, eta_min=0.0, by_epoch=False, end=300000),
|
||||||
|
]
|
||||||
|
|
||||||
|
# train, val, test setting
|
||||||
|
train_cfg = dict(by_epoch=False, max_iters=300000)
|
||||||
|
val_cfg = dict()
|
||||||
|
test_cfg = dict()
|
||||||
|
|
||||||
|
# runtime
|
||||||
|
|
||||||
|
# defaults to use registries in mmrazor
|
||||||
|
default_scope = 'mmcls'
|
||||||
|
|
||||||
|
log_processor = dict(
|
||||||
|
window_size=100,
|
||||||
|
by_epoch=False,
|
||||||
|
custom_cfg=[
|
||||||
|
dict(
|
||||||
|
data_src='loss',
|
||||||
|
log_name='loss_large_window',
|
||||||
|
method_name='mean',
|
||||||
|
window_size=100)
|
||||||
|
])
|
||||||
|
|
||||||
|
# configure default hooks
|
||||||
|
default_hooks = dict(
|
||||||
|
timer=dict(type='IterTimerHook'),
|
||||||
|
logger=dict(type='LoggerHook', interval=100),
|
||||||
|
param_scheduler=dict(type='ParamSchedulerHook'),
|
||||||
|
checkpoint=dict(
|
||||||
|
type='CheckpointHook',
|
||||||
|
by_epoch=False,
|
||||||
|
interval=10000,
|
||||||
|
save_last=True,
|
||||||
|
max_keep_ckpts=3),
|
||||||
|
sampler_seed=dict(type='DistSamplerSeedHook'),
|
||||||
|
visualization=dict(type='VisualizationHook', enable=False),
|
||||||
|
)
|
||||||
|
|
||||||
|
# configure environment
|
||||||
|
env_cfg = dict(
|
||||||
|
cudnn_benchmark=False,
|
||||||
|
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
|
||||||
|
dist_cfg=dict(backend='nccl'),
|
||||||
|
)
|
||||||
|
|
||||||
|
# set visualizer
|
||||||
|
visualizer = None
|
||||||
|
# dict(type='ClsVisualizer', vis_backends=vis_backends, name='visualizer')
|
||||||
|
# vis_backends = [dict(type='LocalVisBackend')]
|
||||||
|
|
||||||
|
# set log level
|
||||||
|
log_level = 'INFO'
|
||||||
|
|
||||||
|
# load from which checkpoint
|
||||||
|
load_from = None
|
||||||
|
|
||||||
|
# whether to resume training from the loaded checkpoint
|
||||||
|
resume = False
|
||||||
|
|
||||||
|
# model
|
||||||
|
norm_cfg = dict(type='BN')
|
||||||
|
_STAGE_MUTABLE = dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e3=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=3,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=dict(type='ReLU6')),
|
||||||
|
mb_k5e3=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=3,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=dict(type='ReLU6')),
|
||||||
|
mb_k7e3=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=3,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=dict(type='ReLU6')),
|
||||||
|
mb_k3e6=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=6,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=dict(type='ReLU6')),
|
||||||
|
mb_k5e6=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=5,
|
||||||
|
expand_ratio=6,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=dict(type='ReLU6')),
|
||||||
|
mb_k7e6=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=7,
|
||||||
|
expand_ratio=6,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=dict(type='ReLU6')),
|
||||||
|
identity=dict(type='Identity'),
|
||||||
|
))
|
||||||
|
|
||||||
|
_FIRST_MUTABLE = dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
mb_k3e1=dict(
|
||||||
|
type='MBBlock',
|
||||||
|
kernel_size=3,
|
||||||
|
expand_ratio=1,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=dict(type='ReLU6')), ))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 3 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, mutable_cfg.
|
||||||
|
[24, 1, 1, _FIRST_MUTABLE],
|
||||||
|
[32, 4, 2, _STAGE_MUTABLE],
|
||||||
|
[56, 4, 2, _STAGE_MUTABLE],
|
||||||
|
[112, 4, 2, _STAGE_MUTABLE],
|
||||||
|
[128, 4, 1, _STAGE_MUTABLE],
|
||||||
|
[256, 4, 2, _STAGE_MUTABLE],
|
||||||
|
[432, 1, 1, _STAGE_MUTABLE]
|
||||||
|
]
|
||||||
|
|
||||||
|
norm_cfg = dict(type='BN')
|
||||||
|
supernet = dict(
|
||||||
|
type='ImageClassifier',
|
||||||
|
data_preprocessor=preprocess_cfg,
|
||||||
|
backbone=dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='SearchableMobileNet',
|
||||||
|
first_channels=40,
|
||||||
|
last_channels=1728,
|
||||||
|
widen_factor=1.0,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
arch_setting=arch_setting),
|
||||||
|
neck=dict(type='GlobalAveragePooling'),
|
||||||
|
head=dict(
|
||||||
|
type='LinearClsHead',
|
||||||
|
num_classes=1000,
|
||||||
|
in_channels=1728,
|
||||||
|
loss=dict(
|
||||||
|
type='LabelSmoothLoss',
|
||||||
|
num_classes=1000,
|
||||||
|
label_smooth_val=0.1,
|
||||||
|
mode='original',
|
||||||
|
loss_weight=1.0),
|
||||||
|
topk=(1, 5),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
mutator = dict(type='mmrazor.OneShotModuleMutator')
|
||||||
|
|
||||||
|
model = dict(
|
||||||
|
type='mmrazor.SPOS',
|
||||||
|
architecture=supernet,
|
||||||
|
mutator=mutator,
|
||||||
|
)
|
||||||
|
|
||||||
|
find_unused_parameters = True
|
|
@ -1,59 +0,0 @@
|
||||||
_base_ = [
|
|
||||||
'../../_base_/datasets/mmcls/imagenet_bs128_colorjittor.py',
|
|
||||||
'../../_base_/schedules/mmcls/imagenet_bs1024_spos.py',
|
|
||||||
'../../_base_/mmcls_runtime.py'
|
|
||||||
]
|
|
||||||
norm_cfg = dict(type='BN')
|
|
||||||
model = dict(
|
|
||||||
type='mmcls.ImageClassifier',
|
|
||||||
backbone=dict(
|
|
||||||
type='SearchableShuffleNetV2', widen_factor=1.0, norm_cfg=norm_cfg),
|
|
||||||
neck=dict(type='GlobalAveragePooling'),
|
|
||||||
head=dict(
|
|
||||||
type='LinearClsHead',
|
|
||||||
num_classes=1000,
|
|
||||||
in_channels=1024,
|
|
||||||
loss=dict(
|
|
||||||
type='LabelSmoothLoss',
|
|
||||||
num_classes=1000,
|
|
||||||
label_smooth_val=0.1,
|
|
||||||
mode='original',
|
|
||||||
loss_weight=1.0),
|
|
||||||
topk=(1, 5),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
mutator = dict(
|
|
||||||
type='OneShotModuleMutator',
|
|
||||||
placeholder_mapping=dict(
|
|
||||||
all_blocks=dict(
|
|
||||||
type='OneShotMutableOP',
|
|
||||||
choices=dict(
|
|
||||||
shuffle_3x3=dict(
|
|
||||||
type='ShuffleBlock', kernel_size=3, norm_cfg=norm_cfg),
|
|
||||||
shuffle_5x5=dict(
|
|
||||||
type='ShuffleBlock', kernel_size=5, norm_cfg=norm_cfg),
|
|
||||||
shuffle_7x7=dict(
|
|
||||||
type='ShuffleBlock', kernel_size=7, norm_cfg=norm_cfg),
|
|
||||||
shuffle_xception=dict(
|
|
||||||
type='ShuffleXception', norm_cfg=norm_cfg),
|
|
||||||
))))
|
|
||||||
|
|
||||||
algorithm = dict(
|
|
||||||
type='SPOS',
|
|
||||||
architecture=dict(
|
|
||||||
type='MMClsArchitecture',
|
|
||||||
model=model,
|
|
||||||
),
|
|
||||||
mutator=mutator,
|
|
||||||
distiller=None,
|
|
||||||
retraining=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
runner = dict(max_iters=150000)
|
|
||||||
evaluation = dict(interval=1000, metric='accuracy')
|
|
||||||
|
|
||||||
# checkpoint saving
|
|
||||||
checkpoint_config = dict(interval=1000)
|
|
||||||
|
|
||||||
find_unused_parameters = True
|
|
|
@ -0,0 +1,214 @@
|
||||||
|
# dataset settings
|
||||||
|
dataset_type = 'ImageNet'
|
||||||
|
preprocess_cfg = dict(
|
||||||
|
# RGB format normalization parameters
|
||||||
|
mean=[123.675, 116.28, 103.53],
|
||||||
|
std=[58.395, 57.12, 57.375],
|
||||||
|
# convert image from BGR to RGB
|
||||||
|
to_rgb=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
file_client_args = dict(
|
||||||
|
backend='petrel',
|
||||||
|
path_mapping=dict({
|
||||||
|
'./data/imagenet':
|
||||||
|
'sproject:s3://openmmlab/datasets/classification/imagenet',
|
||||||
|
'data/imagenet':
|
||||||
|
'sproject:s3://openmmlab/datasets/classification/imagenet'
|
||||||
|
}))
|
||||||
|
|
||||||
|
train_pipeline = [
|
||||||
|
dict(type='LoadImageFromFile', file_client_args=file_client_args),
|
||||||
|
dict(type='RandomResizedCrop', scale=224),
|
||||||
|
dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
|
||||||
|
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
test_pipeline = [
|
||||||
|
dict(type='LoadImageFromFile', file_client_args=file_client_args),
|
||||||
|
dict(type='ResizeEdge', scale=256, edge='short', backend='cv2'),
|
||||||
|
dict(type='CenterCrop', crop_size=224),
|
||||||
|
dict(type='PackClsInputs'),
|
||||||
|
]
|
||||||
|
|
||||||
|
train_dataloader = dict(
|
||||||
|
batch_size=128,
|
||||||
|
num_workers=5,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_root='/mnt/cache/share/images',
|
||||||
|
ann_file='meta/train.txt',
|
||||||
|
data_prefix='train',
|
||||||
|
pipeline=train_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=True),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# /mnt/lustre/share_data/wangjiaqi/data/imagenet',
|
||||||
|
|
||||||
|
val_dataloader = dict(
|
||||||
|
batch_size=128,
|
||||||
|
num_workers=5,
|
||||||
|
dataset=dict(
|
||||||
|
type=dataset_type,
|
||||||
|
data_root='/mnt/cache/share/images',
|
||||||
|
ann_file='meta/val.txt',
|
||||||
|
data_prefix='val',
|
||||||
|
pipeline=test_pipeline),
|
||||||
|
sampler=dict(type='DefaultSampler', shuffle=False),
|
||||||
|
persistent_workers=True,
|
||||||
|
)
|
||||||
|
val_evaluator = dict(type='Accuracy', topk=(1, 5))
|
||||||
|
|
||||||
|
# If you want standard test, please manually configure the test dataset
|
||||||
|
test_dataloader = val_dataloader
|
||||||
|
test_evaluator = val_evaluator
|
||||||
|
|
||||||
|
# scheduler
|
||||||
|
|
||||||
|
# optimizer
|
||||||
|
optim_wrapper = dict(
|
||||||
|
optimizer=dict(type='SGD', lr=0.5, momentum=0.9, weight_decay=4e-5),
|
||||||
|
clip_grad=None)
|
||||||
|
|
||||||
|
# leanring policy
|
||||||
|
param_scheduler = [
|
||||||
|
dict(type='PolyLR', power=1.0, eta_min=0.0, by_epoch=False, end=300000),
|
||||||
|
]
|
||||||
|
|
||||||
|
# train, val, test setting
|
||||||
|
train_cfg = dict(by_epoch=False, max_iters=300000)
|
||||||
|
val_cfg = dict()
|
||||||
|
test_cfg = dict()
|
||||||
|
|
||||||
|
# runtime
|
||||||
|
|
||||||
|
# defaults to use registries in mmrazor
|
||||||
|
default_scope = 'mmcls'
|
||||||
|
|
||||||
|
log_processor = dict(
|
||||||
|
window_size=100,
|
||||||
|
by_epoch=False,
|
||||||
|
custom_cfg=[
|
||||||
|
dict(
|
||||||
|
data_src='loss',
|
||||||
|
log_name='loss_large_window',
|
||||||
|
method_name='mean',
|
||||||
|
window_size=100)
|
||||||
|
])
|
||||||
|
|
||||||
|
# configure default hooks
|
||||||
|
default_hooks = dict(
|
||||||
|
# record the time of every iteration.
|
||||||
|
timer=dict(type='IterTimerHook'),
|
||||||
|
|
||||||
|
# print log every 100 iterations.
|
||||||
|
logger=dict(type='LoggerHook', interval=100),
|
||||||
|
|
||||||
|
# enable the parameter scheduler.
|
||||||
|
param_scheduler=dict(type='ParamSchedulerHook'),
|
||||||
|
|
||||||
|
# save checkpoint per epoch.
|
||||||
|
checkpoint=dict(
|
||||||
|
type='CheckpointHook',
|
||||||
|
by_epoch=False,
|
||||||
|
interval=10000,
|
||||||
|
save_last=True,
|
||||||
|
max_keep_ckpts=3),
|
||||||
|
|
||||||
|
# set sampler seed in distributed evrionment.
|
||||||
|
sampler_seed=dict(type='DistSamplerSeedHook'),
|
||||||
|
|
||||||
|
# validation results visualization, set True to enable it.
|
||||||
|
visualization=dict(type='VisualizationHook', enable=False),
|
||||||
|
)
|
||||||
|
|
||||||
|
# configure environment
|
||||||
|
env_cfg = dict(
|
||||||
|
# whether to enable cudnn benchmark
|
||||||
|
cudnn_benchmark=False,
|
||||||
|
|
||||||
|
# set multi process parameters
|
||||||
|
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
|
||||||
|
|
||||||
|
# set distributed parameters
|
||||||
|
dist_cfg=dict(backend='nccl'),
|
||||||
|
)
|
||||||
|
|
||||||
|
# set visualizer
|
||||||
|
visualizer = None
|
||||||
|
# dict(type='ClsVisualizer', vis_backends=vis_backends, name='visualizer')
|
||||||
|
# vis_backends = [dict(type='LocalVisBackend')]
|
||||||
|
|
||||||
|
# set log level
|
||||||
|
log_level = 'INFO'
|
||||||
|
|
||||||
|
# load from which checkpoint
|
||||||
|
load_from = None
|
||||||
|
|
||||||
|
# "/mnt/lustre/dongpeijie/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d.pth"
|
||||||
|
|
||||||
|
# whether to resume training from the loaded checkpoint
|
||||||
|
resume = False
|
||||||
|
|
||||||
|
# model
|
||||||
|
|
||||||
|
_STAGE_MUTABLE = dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='OneShotMutableOP',
|
||||||
|
candidates=dict(
|
||||||
|
shuffle_3x3=dict(
|
||||||
|
type='ShuffleBlock', kernel_size=3, norm_cfg=dict(type='BN')),
|
||||||
|
shuffle_5x5=dict(
|
||||||
|
type='ShuffleBlock', kernel_size=5, norm_cfg=dict(type='BN')),
|
||||||
|
shuffle_7x7=dict(
|
||||||
|
type='ShuffleBlock', kernel_size=7, norm_cfg=dict(type='BN')),
|
||||||
|
shuffle_xception=dict(
|
||||||
|
type='ShuffleXception', norm_cfg=dict(type='BN')),
|
||||||
|
))
|
||||||
|
|
||||||
|
arch_setting = [
|
||||||
|
# Parameters to build layers. 3 parameters are needed to construct a
|
||||||
|
# layer, from left to right: channel, num_blocks, mutable_cfg.
|
||||||
|
[64, 4, _STAGE_MUTABLE],
|
||||||
|
[160, 4, _STAGE_MUTABLE],
|
||||||
|
[320, 8, _STAGE_MUTABLE],
|
||||||
|
[640, 4, _STAGE_MUTABLE],
|
||||||
|
]
|
||||||
|
|
||||||
|
norm_cfg = dict(type='BN')
|
||||||
|
supernet = dict(
|
||||||
|
type='ImageClassifier',
|
||||||
|
data_preprocessor=preprocess_cfg,
|
||||||
|
backbone=dict(
|
||||||
|
_scope_='mmrazor',
|
||||||
|
type='SearchableShuffleNetV2',
|
||||||
|
widen_factor=1.0,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
arch_setting=arch_setting),
|
||||||
|
neck=dict(type='GlobalAveragePooling'),
|
||||||
|
head=dict(
|
||||||
|
type='LinearClsHead',
|
||||||
|
num_classes=1000,
|
||||||
|
in_channels=1024,
|
||||||
|
loss=dict(
|
||||||
|
type='LabelSmoothLoss',
|
||||||
|
num_classes=1000,
|
||||||
|
label_smooth_val=0.1,
|
||||||
|
mode='original',
|
||||||
|
loss_weight=1.0),
|
||||||
|
topk=(1, 5),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
mutator = dict(type='mmrazor.OneShotModuleMutator')
|
||||||
|
|
||||||
|
model = dict(
|
||||||
|
type='mmrazor.SPOS',
|
||||||
|
architecture=supernet,
|
||||||
|
mutator=mutator,
|
||||||
|
# fix_subnet='configs/nas/spos/SPOS_SHUFFLENETV2_330M_IN1k_PAPER_2.0.yaml'
|
||||||
|
)
|
||||||
|
|
||||||
|
find_unused_parameters = True
|
|
@ -0,0 +1,372 @@
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from mmengine.config import Config
|
||||||
|
|
||||||
|
from mmrazor.core import * # noqa: F401,F403
|
||||||
|
from mmrazor.models import * # noqa: F401,F403
|
||||||
|
from mmrazor.registry import MODELS
|
||||||
|
from mmrazor.utils import register_all_modules
|
||||||
|
|
||||||
|
|
||||||
|
def convert_spos_key(old_path, new_path):
    """Convert an MMRazor 1.x SPOS checkpoint to the 2.0 key layout.

    Renames ``choices`` -> ``_candidates``, strips legacy ``architecture.`` /
    ``model.`` prefixes, then re-prefixes every key with ``architecture.``
    (in 2.0 all supernet weights live under the algorithm's ``architecture``).

    Args:
        old_path (str): Path of the 1.x checkpoint to read.
        new_path (str): Path to write the converted checkpoint to.
    """
    # map_location='cpu' so GPU-saved checkpoints load on CPU-only hosts
    # (consistent with convert_cream_key below).
    old_dict = torch.load(old_path, map_location='cpu')
    new_dict = {'meta': old_dict['meta'], 'state_dict': {}}

    # Old substring -> new substring; applied in order to every key.
    mapping = {
        'choices': '_candidates',
        'architecture.': '',
        'model.': '',
    }

    for k, v in old_dict['state_dict'].items():
        new_key = k
        for _from, _to in mapping.items():
            new_key = new_key.replace(_from, _to)

        # Every weight is re-rooted under `architecture.` in 2.0.
        new_key = f'architecture.{new_key}'

        new_dict['state_dict'][new_key] = v

    torch.save(new_dict, new_path)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_detnas_key(old_path, new_path):
    """Convert an MMRazor 1.x DetNAS checkpoint to the 2.0 key layout.

    Renames ``choices`` -> ``_candidates`` and strips the legacy ``model.``
    prefix. Unlike the SPOS converter, no ``architecture.`` prefix is added.

    Args:
        old_path (str): Path of the 1.x checkpoint to read.
        new_path (str): Path to write the converted checkpoint to.
    """
    # map_location='cpu' so GPU-saved checkpoints load on CPU-only hosts
    # (consistent with convert_cream_key below).
    old_dict = torch.load(old_path, map_location='cpu')
    new_dict = {'meta': old_dict['meta'], 'state_dict': {}}

    # Old substring -> new substring; applied in order to every key.
    mapping = {
        'choices': '_candidates',
        'model.': '',
    }

    for k, v in old_dict['state_dict'].items():
        new_key = k
        for _from, _to in mapping.items():
            new_key = new_key.replace(_from, _to)

        new_dict['state_dict'][new_key] = v
    torch.save(new_dict, new_path)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_anglenas_key(old_path, new_path):
    """Convert an AngleNAS checkpoint to the MMRazor 2.0 key layout.

    The old file stores a bare state dict (no ``meta``/``state_dict``
    wrapper); keys are renamed (``choices`` -> ``_candidates``, ``model.``
    prefix stripped, ``mbv2`` -> ``mb``) and wrapped under ``state_dict``.

    Args:
        old_path (str): Path of the AngleNAS checkpoint to read.
        new_path (str): Path to write the converted checkpoint to.
    """
    # map_location='cpu' so GPU-saved checkpoints load on CPU-only hosts
    # (consistent with convert_cream_key below).
    old_dict = torch.load(old_path, map_location='cpu')
    new_dict = {'state_dict': {}}

    # Old substring -> new substring; applied in order to every key.
    mapping = {
        'choices': '_candidates',
        'model.': '',
        'mbv2': 'mb',
    }

    for k, v in old_dict.items():
        new_key = k
        for _from, _to in mapping.items():
            new_key = new_key.replace(_from, _to)

        new_dict['state_dict'][new_key] = v
    torch.save(new_dict, new_path)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_darts_key(old_path, new_path):
    """Convert an MMRazor 1.x DARTS checkpoint to the 2.0 key layout.

    Builds the 2.0 subnet model from its config, renames the old keys,
    then validates the result with a strict ``load_state_dict`` before
    saving. Prints both the model's and the converted dict's key lists
    for manual inspection.
    """
    old_dict = torch.load(old_path)
    new_dict = {'meta': old_dict['meta'], 'state_dict': {}}
    # Reference model used only to verify the converted keys load strictly.
    cfg = Config.fromfile(
        'configs/nas/darts/darts_subnet_1xb96_cifar10_2.0.py')
    # import ipdb; ipdb.set_trace()
    model = MODELS.build(cfg.model)

    print('============> module name')
    for name, module in model.state_dict().items():
        print(name)

    # Old substring -> new substring; applied in order to every key.
    mapping = {
        'choices': '_candidates',
        'model.': '',
        'edges': 'route',
    }

    for k, v in old_dict['state_dict'].items():
        new_key = k
        for _from, _to in mapping.items():
            new_key = new_key.replace(_from, _to)
        # Old keys contain a spurious '.0.' sequential index, e.g.:
        # cells.0.nodes.0.edges.choices.normal_n2_p1.0.choices.sep_conv_3x3.conv1.2.weight
        # Drop it from whichever of the two possible positions it occupies.
        splited_list = new_key.split('.')
        if len(splited_list) > 10 and splited_list[-6] == '0':
            del splited_list[-6]
            new_key = '.'.join(splited_list)
        elif len(splited_list) > 10 and splited_list[-5] == '0':
            del splited_list[-5]
            new_key = '.'.join(splited_list)

        new_dict['state_dict'][new_key] = v

    print('============> new dict')
    for key, v in new_dict['state_dict'].items():
        print(key)

    # Strict load: raises if any converted key does not match the 2.0 model.
    model.load_state_dict(new_dict['state_dict'], strict=True)

    torch.save(new_dict, new_path)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_cream_key(old_path, new_path):
    """Convert a Cream checkpoint to the MMRazor 2.0 key layout.

    Unlike the simple substring converters above, Cream and razor use
    structurally different module names, so keys are matched positionally:
    both key sets are grouped into 'first' (stem), 'middle' (blocks, keyed
    by stage index) and 'last' (head), then matched group-by-group via the
    per-group name-fragment tables below. The result is validated with a
    strict ``load_state_dict`` before saving.
    """
    # Always load to CPU so GPU-saved checkpoints convert on any host.
    old_dict = torch.load(old_path, map_location=torch.device('cpu'))
    new_dict = {'state_dict': {}}  # noqa: F841

    ordered_old_dict = OrderedDict(old_dict['state_dict'])

    # Reference 2.0 model whose key names are the conversion target.
    cfg = Config.fromfile('configs/nas/cream/cream_14_subnet_mobilenet.py')
    model = MODELS.build(cfg.model)

    model_name_list = []
    model_module_list = []

    # TODO show structure of model and checkpoint
    print('=' * 30, 'the key of model')
    for k, v in model.state_dict().items():
        print(k)

    print('=' * 30, 'the key of ckpt')
    for k, v in ordered_old_dict.items():
        print(k)

    # final mapping dict: cream key -> razor key
    mapping = {}

    # Name-fragment correspondence for middle (inverted-residual) blocks:
    # razor fragment -> cream fragment.
    middle_razor2cream = {  # noqa: F841
        # point-wise expansion
        'expand_conv.conv.weight': 'conv_pw.weight',
        'expand_conv.bn.weight': 'bn1.weight',
        'expand_conv.bn.bias': 'bn1.bias',
        'expand_conv.bn.running_mean': 'bn1.running_mean',
        'expand_conv.bn.running_var': 'bn1.running_var',
        'expand_conv.bn.num_batches_tracked': 'bn1.num_batches_tracked',

        # se
        'se.conv1.conv.weight': 'se.conv_reduce.weight',
        'se.conv1.conv.bias': 'se.conv_reduce.bias',
        'se.conv2.conv.weight': 'se.conv_expand.weight',
        'se.conv2.conv.bias': 'se.conv_expand.bias',

        # depth-wise conv
        'depthwise_conv.conv.weight': 'conv_dw.weight',
        'depthwise_conv.bn.weight': 'bn2.weight',
        'depthwise_conv.bn.bias': 'bn2.bias',
        'depthwise_conv.bn.running_mean': 'bn2.running_mean',
        'depthwise_conv.bn.running_var': 'bn2.running_var',
        'depthwise_conv.bn.num_batches_tracked': 'bn2.num_batches_tracked',

        # point-wise linear projection
        'linear_conv.conv.weight': 'conv_pwl.weight',
        'linear_conv.bn.weight': 'bn3.weight',
        'linear_conv.bn.bias': 'bn3.bias',
        'linear_conv.bn.running_mean': 'bn3.running_mean',
        'linear_conv.bn.running_var': 'bn3.running_var',
        'linear_conv.bn.num_batches_tracked': 'bn3.num_batches_tracked',

    }

    # Correspondence for the first (DepthSepConv) stage.
    first_razor2cream = {
        # for first depthsepconv dw
        'conv_dw.conv.weight': 'conv_dw.weight',
        'conv_dw.bn.weight': 'bn1.weight',
        'conv_dw.bn.bias': 'bn1.bias',
        'conv_dw.bn.running_mean': 'bn1.running_mean',
        'conv_dw.bn.running_var': 'bn1.running_var',
        'conv_dw.bn.num_batches_tracked': 'bn1.num_batches_tracked',

        # for first depthsepconv pw
        'conv_pw.conv.weight': 'conv_pw.weight',
        'conv_pw.bn.weight': 'bn2.weight',
        'conv_pw.bn.bias': 'bn2.bias',
        'conv_pw.bn.running_mean': 'bn2.running_mean',
        'conv_pw.bn.running_var': 'bn2.running_var',
        'conv_pw.bn.num_batches_tracked': 'bn2.num_batches_tracked',

        # se
        'se.conv1.conv.weight': 'se.conv_reduce.weight',
        'se.conv1.conv.bias': 'se.conv_reduce.bias',
        'se.conv2.conv.weight': 'se.conv_expand.weight',
        'se.conv2.conv.bias': 'se.conv_expand.bias',
    }

    # Correspondence for the last ConvBnAct stage.
    last_razor2cream = {
        # for last convbnact
        'conv2.conv.weight': 'conv.weight',
        'conv2.bn.weight': 'bn1.weight',
        'conv2.bn.bias': 'bn1.bias',
        'conv2.bn.running_mean': 'bn1.running_mean',
        'conv2.bn.running_var': 'bn1.running_var',
        'conv2.bn.num_batches_tracked': 'bn1.num_batches_tracked',
    }

    # Invert the tables: lookups below go cream fragment -> razor fragment.
    middle_cream2razor = {v: k for k, v in middle_razor2cream.items()}
    first_cream2razor = {v: k for k, v in first_razor2cream.items()}
    last_cream2razor = {v: k for k, v in last_razor2cream.items()}

    # 1. group the razor's module names
    grouped_razor_module_name = {
        'middle': {},
        'first': [],
        'last': [],
    }

    for name, module in model.state_dict().items():
        # Strip the 'backbone.' prefix for classification purposes only.
        tmp_name: str = name.split(
            'backbone.')[1] if 'backbone' in name else name
        model_name_list.append(tmp_name)
        model_module_list.append(module)

        # NOTE(review): the `len(tmp_name) <= 35` cutoff distinguishes the
        # stem conv from longer block-internal names containing 'conv1' —
        # brittle; verify against the actual model key list printed above.
        if 'conv1' in tmp_name and len(tmp_name) <= 35:
            # belong to stem conv
            grouped_razor_module_name['first'].append(name)
        elif 'head' in tmp_name:
            # belong to last linear
            grouped_razor_module_name['last'].append(name)
        else:
            # middle
            if tmp_name.startswith('layer'):
                # e.g. 'layer2.0....' -> stage key '2.0'
                key_of_middle = tmp_name[5:8]
                if key_of_middle not in grouped_razor_module_name['middle']:
                    grouped_razor_module_name['middle'][key_of_middle] = [name]
                else:
                    grouped_razor_module_name['middle'][key_of_middle].append(
                        name)
            elif tmp_name.startswith('conv2'):
                # Final ConvBnAct is treated as pseudo-stage '7.0'.
                key_of_middle = '7.0'
                if key_of_middle not in grouped_razor_module_name['middle']:
                    grouped_razor_module_name['middle'][key_of_middle] = [name]
                else:
                    grouped_razor_module_name['middle'][key_of_middle].append(
                        name)

    # 2. group the cream's module names
    grouped_cream_module_name = {
        'middle': {},
        'first': [],
        'last': [],
    }

    for k in ordered_old_dict.keys():
        if 'classifier' in k or 'conv_head' in k:
            # last conv
            grouped_cream_module_name['last'].append(k)
        elif 'blocks' in k:
            # middle blocks: 'blocks.X.Y....' -> stage key 'X.Y'
            key_of_middle = k[7:10]
            if key_of_middle not in grouped_cream_module_name['middle']:
                grouped_cream_module_name['middle'][key_of_middle] = [k]
            else:
                grouped_cream_module_name['middle'][key_of_middle].append(k)
        else:
            # first blocks
            grouped_cream_module_name['first'].append(k)

    # 4. process the first modules
    for cream_item in grouped_cream_module_name['first']:
        if 'conv_stem' in cream_item:
            # get corresponding item from razor
            for razor_item in grouped_razor_module_name['first']:
                if 'conv.weight' in razor_item:
                    mapping[cream_item] = razor_item
                    # Remove so the same razor key is never matched twice.
                    grouped_razor_module_name['first'].remove(razor_item)
                    break
        else:
            # Match by the trailing fragment (e.g. 'weight', 'running_mean').
            kws = cream_item.split('.')[-1]
            # get corresponding item from razor
            for razor_item in grouped_razor_module_name['first']:
                if kws in razor_item:
                    mapping[cream_item] = razor_item
                    grouped_razor_module_name['first'].remove(razor_item)

    # 5. process the last modules
    for cream_item in grouped_cream_module_name['last']:
        if 'classifier' in cream_item:
            kws = cream_item.split('.')[-1]
            for razor_item in grouped_razor_module_name['last']:
                if 'fc' in razor_item:
                    if kws in razor_item:
                        mapping[cream_item] = razor_item
                        grouped_razor_module_name['last'].remove(razor_item)
                        break

        elif 'conv_head' in cream_item:
            kws = cream_item.split('.')[-1]
            for razor_item in grouped_razor_module_name['last']:
                if 'head.conv2' in razor_item:
                    if kws in razor_item:
                        mapping[cream_item] = razor_item
                        grouped_razor_module_name['last'].remove(razor_item)

    # 6. process the middle modules
    for cream_group_id, cream_items in grouped_cream_module_name[
            'middle'].items():
        # get the corresponding group from razor: razor stages are offset
        # by one relative to cream blocks (e.g. cream '0.0' -> razor '1.0').
        razor_group_id: str = str(float(cream_group_id) + 1)
        razor_items: list = grouped_razor_module_name['middle'][razor_group_id]

        # Stage 1 is a DepthSepConv, stage 7 the final ConvBnAct; everything
        # in between is a standard inverted-residual block.
        if int(razor_group_id[0]) == 1:
            key_cream2razor = first_cream2razor
        elif int(razor_group_id[0]) == 7:
            key_cream2razor = last_cream2razor
        else:
            key_cream2razor = middle_cream2razor

        # matching razor items and cream items
        for cream_item in cream_items:
            # traverse all of key_cream2razor
            for cream_match, razor_match in key_cream2razor.items():
                if cream_match in cream_item:
                    # traverse razor_items to get the corresponding razor name
                    for razor_item in razor_items:
                        if razor_match in razor_item:
                            mapping[cream_item] = razor_item
                            break

    print('=' * 100)
    print('length of mapping: ', len(mapping.keys()))
    for k, v in mapping.items():
        print(k, '\t=>\t', v)
    print('#' * 100)

    # TODO DELETE this print
    print('**' * 20)
    for c, cm, r, rm in zip(ordered_old_dict.keys(), ordered_old_dict.values(),
                            model_name_list, model_module_list):
        print(f'{c}: shape {cm.shape} => {r}: shape {rm.shape}')
    print('**' * 20)

    # Rewrite every checkpoint entry under its matched razor name.
    # KeyError here means a cream key was never matched above.
    for k, v in ordered_old_dict.items():
        print(f'Mapping from {k} to {mapping[k]}......')
        new_dict['state_dict'][mapping[k]] = v

    # Strict load validates the full conversion before saving.
    model.load_state_dict(new_dict['state_dict'], strict=True)

    torch.save(new_dict, new_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Register all mmrazor modules so MODELS.build works inside the
    # converters that instantiate a reference model (darts / cream).
    register_all_modules(True)
    # old_path = '/mnt/lustre/dongpeijie/detnas_subnet_shufflenetv2_8xb128_in1k_acc-74.08_20211223-92e9b66a.pth'  # noqa: E501
    # new_path = '/mnt/lustre/dongpeijie/detnas_subnet_shufflenetv2_8xb128_in1k_acc-74.08_20211223-92e9b66a_2.0.pth'  # noqa: E501
    # convert_spos_key(old_path, new_path)

    # old_path = '/mnt/lustre/dongpeijie/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco_bbox_backbone_flops-0.34M_mAP-37.5_20211222-67fea61f.pth'  # noqa: E501
    # new_path = '/mnt/lustre/dongpeijie/detnas_subnet_frcnn_shufflenetv2_fpn_1x_coco_bbox_backbone_flops-0.34M_mAP-37.5_20211222-67fea61f_2.0.pth'  # noqa: E501
    # convert_detnas_key(old_path, new_path)

    # old_path = './data/14.pth.tar'
    # new_path = './data/14_2.0.pth'
    # old_path = '/mnt/lustre/dongpeijie/14.pth.tar'
    # new_path = '/mnt/lustre/dongpeijie/14_2.0.pth'
    # convert_cream_key(old_path, new_path)

    # old_path = '/mnt/lustre/dongpeijie/darts_subnetnet_1xb96_cifar10_acc-97.32_20211222-e5727921.pth'  # noqa: E501
    # new_path = '/mnt/lustre/dongpeijie/darts_subnetnet_1xb96_cifar10_acc-97.32_20211222-e5727921_2.0.pth'  # noqa: E501
    # convert_darts_key(old_path, new_path)

    # Currently active conversion; swap the commented pairs above to run a
    # different converter. Paths are cluster-specific.
    old_path = '/mnt/lustre/dongpeijie/spos_angelnas_flops_0.49G_acc_75.98_20220307-54f4698f.pth'  # noqa: E501
    new_path = '/mnt/lustre/dongpeijie/spos_angelnas_flops_0.49G_acc_75.98_20220307-54f4698f_2.0.pth'  # noqa: E501
    convert_anglenas_key(old_path, new_path)
|
|
@ -0,0 +1,280 @@
|
||||||
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
|
import copy
|
||||||
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from mmengine import BaseDataElement
|
||||||
|
from mmengine.model import BaseModel
|
||||||
|
from mmengine.optim import OptimWrapper, OptimWrapperDict
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn.modules.batchnorm import _BatchNorm
|
||||||
|
|
||||||
|
from mmrazor.models.mutators import DiffModuleMutator
|
||||||
|
from mmrazor.models.subnet import (SINGLE_MUTATOR_RANDOM_SUBNET, FixSubnet,
|
||||||
|
FixSubnetMixin)
|
||||||
|
from mmrazor.registry import MODELS
|
||||||
|
from ..base import BaseAlgorithm, LossResults
|
||||||
|
|
||||||
|
VALID_FIX_SUBNET = Union[str, FixSubnet, Dict[str, Dict[str, Any]]]
|
||||||
|
|
||||||
|
|
||||||
|
@MODELS.register_module()
class Darts(BaseAlgorithm, FixSubnetMixin):
    """Implementation of `DARTS <https://arxiv.org/abs/1806.09055>`_

    DARTS means Differentiable Architecture Search, a classic NAS algorithm.
    :class:`Darts` implements the APIs required by the DARTS, as well as the
    supernet training and subnet retraining logic for each iter.

    Args:
        architecture (dict|:obj:`BaseModel`): The config of :class:`BaseModel`
            or built model. Corresponding to supernet in NAS algorithm.
        mutator (dict|:obj:`DiffModuleMutator`): The config of
            :class:`DiffModuleMutator` or built mutator.
        fix_subnet (str | dict | :obj:`FixSubnet`): The path of yaml file or
            loaded dict or built :obj:`FixSubnet`.
        unroll (bool): Whether to use the second-order (unrolled) gradient
            approximation when updating architecture parameters.
            Defaults to False.
        norm_training (bool): Whether to set norm layers to training mode,
            namely, not freeze running stats (mean and var). Note: Effect on
            Batch Norm and its variants only. Defaults to False.
        data_preprocessor (dict, optional): The pre-process config of
            :class:`BaseDataPreprocessor`. Defaults to None.
        init_cfg (dict): Init config for ``BaseModule``.

    Note:
        Darts has two training mode: supernet training and subnet retraining.
        If `fix_subnet` is None, it means supernet training.
        If `fix_subnet` is not None, it means subnet training.

    Note:
        During supernet training, since each op is not fully trained, the
        statistics of :obj:_BatchNorm are inaccurate. This problem affects the
        evaluation of the performance of each subnet in the search phase. There
        are usually two ways to solve this problem, both need to set
        `norm_training` to True:

        1) Using a large batch size, BNs use the mean and variance of the
           current batch during forward.
        2) Recalibrate the statistics of BN before searching.
    """

    def __init__(self,
                 architecture: Union[BaseModel, Dict],
                 mutator: Optional[Union[DiffModuleMutator, Dict]] = None,
                 fix_subnet: Optional[VALID_FIX_SUBNET] = None,
                 unroll: bool = False,
                 norm_training: bool = False,
                 data_preprocessor: Optional[Union[dict, nn.Module]] = None,
                 init_cfg: Optional[dict] = None):
        super().__init__(architecture, data_preprocessor, init_cfg)

        # Darts has two training mode: supernet training and subnet retraining.
        # fix_subnet is not None, means subnet retraining.
        if fix_subnet:
            # According to fix_subnet, delete the unchosen part of supernet
            self.load_fix_subnet(fix_subnet, prefix='architecture.')
            self.is_supernet = False
        else:
            assert mutator is not None, \
                'mutator cannot be None when fix_subnet is None.'
            if isinstance(mutator, DiffModuleMutator):
                self.mutator = mutator
            elif isinstance(mutator, dict):
                self.mutator = MODELS.build(mutator)
            else:
                raise TypeError('mutator should be a `dict` or '
                                f'`DiffModuleMutator` instance, but got '
                                f'{type(mutator)}')

            # Mutator is an essential component of the NAS algorithm. It
            # provides some APIs commonly used by NAS.
            # Before using it, you must do some preparations according to
            # the supernet.
            self.mutator.prepare_from_supernet(self.architecture)
            self.is_supernet = True

        self.norm_training = norm_training
        self.unroll = unroll

    def sample_subnet(self) -> SINGLE_MUTATOR_RANDOM_SUBNET:
        """Random sample subnet by mutator."""
        return self.mutator.sample_choices()

    def set_subnet(self, subnet: SINGLE_MUTATOR_RANDOM_SUBNET):
        """Set the subnet sampled by :meth:sample_subnet."""
        self.mutator.set_choices(subnet)

    def loss(
        self,
        batch_inputs: torch.Tensor,
        data_samples: Optional[List[BaseDataElement]] = None,
    ) -> LossResults:
        """Calculate losses from a batch of inputs and data samples."""
        if self.is_supernet:
            # Supernet mode: resample and activate a random subnet before
            # every forward; the forward itself is identical to subnet mode.
            random_subnet = self.sample_subnet()
            self.set_subnet(random_subnet)
            return self.architecture(batch_inputs, data_samples, mode='loss')
        else:
            return self.architecture(batch_inputs, data_samples, mode='loss')

    def train(self, mode=True):
        """Convert the model into eval mode while keep normalization layer
        unfreezed."""

        super().train(mode)
        # In eval mode, optionally keep BN layers collecting batch statistics
        # (see the class-level Note about inaccurate supernet BN stats).
        if self.norm_training and not mode:
            for module in self.architecture.modules():
                if isinstance(module, _BatchNorm):
                    module.training = True

    def train_step(self, data: List[dict],
                   optim_wrapper: OptimWrapper) -> Dict[str, torch.Tensor]:
        """The iteration step during training.

        This method defines an iteration step during training, except for the
        back propagation and optimizer updating, which are done in an optimizer
        hook. Note that in some complicated cases or models, the whole process
        including back propagation and optimizer updating are also defined in
        this method, such as GAN.
        Args:
            data (dict): The output of dataloader.
            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
                runner is passed to ``train_step()``. This argument is unused
                and reserved.
        Returns:
            dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
                ``num_samples``.
                ``loss`` is a tensor for back propagation, which can be a
                weighted sum of multiple losses.
                ``log_vars`` contains all the variables to be sent to the
                logger.
                ``num_samples`` indicates the batch size (when the model is
                DDP, it means the batch size on each GPU), which is used for
                averaging the logs.
        """
        # Bi-level optimization path: one batch for the architecture
        # parameters, one for the supernet weights, each with its own
        # optimizer inside the OptimWrapperDict.
        # NOTE(review): this branch is still work-in-progress — the
        # `.backward()` calls are commented out and `self.loss` is invoked
        # with a raw data dict rather than (batch_inputs, data_samples);
        # confirm before relying on unroll/bi-level training.
        if isinstance(data, (tuple, list)) and isinstance(
                optim_wrapper, OptimWrapperDict):
            assert len(data) == len(optim_wrapper), \
                f'The length of data {len(data)} should be equal to that of optimizers {len(optim_wrapper)}.'  # noqa: E501

            # TODO check the order of data
            train_supernet_data, train_arch_data = data

            # TODO mutator optimizer zero_grad
            optim_wrapper.zero_grad()

            if self.unroll:
                # Second-order approximation: virtual step + hessian term.
                self._unrolled_backward(train_arch_data, train_supernet_data,
                                        optim_wrapper)  # TODO optimizer
            else:
                # TODO process the input
                arch_loss = self.loss(train_arch_data)  # noqa: F841
                # arch_loss.backward()

            # TODO mutator optimizer step
            optim_wrapper.step()

            model_loss = self.loss(train_supernet_data)

            # TODO optimizer architecture zero_grad
            optim_wrapper.zero_grad()
            # model_loss.backward()

            # Gradient clipping on supernet weights, as in the DARTS paper.
            nn.utils.clip_grad_norm_(
                self.architecture.parameters(), max_norm=5, norm_type=2)

            # TODO optimizer architecture step
            optim_wrapper.step()

            outputs = dict(
                loss=model_loss,
                num_samples=len(train_supernet_data['img'].data))
        else:
            # Single-optimizer path (e.g. subnet retraining): defer to the
            # default BaseModel.train_step.
            outputs = super().train_step(data, optim_wrapper)

        return outputs

    def _unrolled_backward(self, train_arch_data, train_supernet_data,
                           optimizer):
        """Compute unrolled loss and backward its gradients."""
        backup_params = copy.deepcopy(tuple(self.architecture.parameters()))

        # do virtual step on training data
        lr = optimizer['architecture'].param_groups[0]['lr']
        momentum = optimizer['architecture'].param_groups[0]['momentum']
        weight_decay = optimizer['architecture'].param_groups[0][
            'weight_decay']
        self._compute_virtual_model(train_supernet_data, lr, momentum,
                                    weight_decay, optimizer)

        # calculate unrolled loss on validation data
        # keep gradients for model here for compute hessian
        losses = self(**train_arch_data)
        loss, _ = self._parse_losses(losses)
        w_model, w_arch = tuple(self.architecture.parameters()), tuple(
            self.mutator.parameters())
        w_grads = torch.autograd.grad(loss, w_model + w_arch)
        d_model, d_arch = w_grads[:len(w_model)], w_grads[len(w_model):]

        # compute hessian and final gradients
        hessian = self._compute_hessian(backup_params, d_model,
                                        train_supernet_data)
        with torch.no_grad():
            for param, d, h in zip(w_arch, d_arch, hessian):
                # gradient = dalpha - lr * hessian
                param.grad = d - lr * h

        # restore weights
        self._restore_weights(backup_params)

    def _compute_virtual_model(self, data, lr, momentum, weight_decay,
                               optimizer):
        """Compute unrolled weights w`"""
        # don't need zero_grad, using autograd to calculate gradients
        losses = self(**data)
        loss, _ = self._parse_losses(losses)
        gradients = torch.autograd.grad(loss, self.architecture.parameters())
        with torch.no_grad():
            for w, g in zip(self.architecture.parameters(), gradients):
                m = optimizer['architecture'].state[w].get(
                    'momentum_buffer', 0.)
                # NOTE(review): `w = w - ...` rebinds the local name only and
                # does NOT modify the parameter in place, so the "virtual
                # step" has no effect on the model; an in-place update
                # (e.g. `w.sub_(...)`) may be intended — confirm.
                w = w - lr * (momentum * m + g + weight_decay * w)

    def _restore_weights(self, backup_params):
        # Copy the backed-up parameter values back into the live model.
        with torch.no_grad():
            for param, backup in zip(self.architecture.parameters(),
                                     backup_params):
                param.copy_(backup)

    def _compute_hessian(self, backup_params, dw, data):
        """
        dw = dw` { L_val(w`, alpha) }
        w+ = w + eps * dw
        w- = w - eps * dw
        hessian = (dalpha { L_trn(w+, alpha) } \
            - dalpha { L_trn(w-, alpha) }) / (2*eps)
        eps = 0.01 / ||dw||
        """
        self._restore_weights(backup_params)
        norm = torch.cat([w.view(-1) for w in dw]).norm()
        eps = 0.01 / norm
        if norm < 1E-8:
            print(
                'In computing hessian, norm is smaller than 1E-8, \
                cause eps to be %.6f.', norm.item())

        dalphas = []
        # Perturb weights to w+ first, then (via -2*eps) to w-.
        for e in [eps, -2. * eps]:
            # w+ = w + eps*dw`, w- = w - eps*dw`
            with torch.no_grad():
                for p, d in zip(self.architecture.parameters(), dw):
                    p += e * d

            losses = self(**data)
            loss, _ = self._parse_losses(losses)
            dalphas.append(
                torch.autograd.grad(loss, tuple(self.mutator.parameters())))
        # dalpha { L_trn(w+) }, # dalpha { L_trn(w-) }
        dalpha_pos, dalpha_neg = dalphas
        # Central finite difference approximation of the hessian-vector term.
        hessian = [(p - n) / (2. * eps)
                   for p, n in zip(dalpha_pos, dalpha_neg)]
        return hessian
|
|
@ -1,3 +1,4 @@
|
||||||
# Copyright (c) OpenMMLab. All rights reserved.
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
from .backbones import * # noqa: F401,F403
|
from .backbones import * # noqa: F401,F403
|
||||||
|
from .components import * # noqa: F401,F403
|
||||||
from .dynamic_op import * # noqa: F401,F403
|
from .dynamic_op import * # noqa: F401,F403
|
||||||
|
|
|
@ -4,6 +4,7 @@ from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
from mmcls.models.backbones.base_backbone import BaseBackbone
|
||||||
from mmcv.cnn import build_activation_layer, build_norm_layer
|
from mmcv.cnn import build_activation_layer, build_norm_layer
|
||||||
from torch import Tensor
|
from torch import Tensor
|
||||||
|
|
||||||
|
@ -126,12 +127,8 @@ class Node(nn.Module):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
edges = nn.ModuleDict()
|
edges = nn.ModuleDict()
|
||||||
for i in range(num_prev_nodes):
|
for i in range(num_prev_nodes):
|
||||||
if i < num_downsample_nodes:
|
stride = 2 if i < num_downsample_nodes else 1
|
||||||
stride = 2
|
edge_id = f'{node_id}_p{i}'
|
||||||
else:
|
|
||||||
stride = 1
|
|
||||||
|
|
||||||
edge_id = '{}_p{}'.format(node_id, i)
|
|
||||||
|
|
||||||
module_kwargs = dict(
|
module_kwargs = dict(
|
||||||
in_channels=channels,
|
in_channels=channels,
|
||||||
|
@ -143,13 +140,14 @@ class Node(nn.Module):
|
||||||
mutable_cfg.update(alias=edge_id)
|
mutable_cfg.update(alias=edge_id)
|
||||||
edges.add_module(edge_id, MODELS.build(mutable_cfg))
|
edges.add_module(edge_id, MODELS.build(mutable_cfg))
|
||||||
|
|
||||||
|
route_cfg.update(alias=node_id)
|
||||||
route_cfg.update(edges=edges)
|
route_cfg.update(edges=edges)
|
||||||
self.edges = MODELS.build(route_cfg)
|
self.route = MODELS.build(route_cfg)
|
||||||
|
|
||||||
def forward(self, prev_nodes: Union[List[Tensor],
|
def forward(self, prev_nodes: Union[List[Tensor],
|
||||||
Tuple[Tensor]]) -> Tensor:
|
Tuple[Tensor]]) -> Tensor:
|
||||||
"""Forward with the previous nodes list."""
|
"""Forward with the previous nodes list."""
|
||||||
return self.edges(prev_nodes)
|
return self.route(prev_nodes)
|
||||||
|
|
||||||
|
|
||||||
class Cell(nn.Module):
|
class Cell(nn.Module):
|
||||||
|
@ -223,8 +221,7 @@ class Cell(nn.Module):
|
||||||
cur_tensor = node(tensors)
|
cur_tensor = node(tensors)
|
||||||
tensors.append(cur_tensor)
|
tensors.append(cur_tensor)
|
||||||
|
|
||||||
output = torch.cat(tensors[2:], dim=1)
|
return torch.cat(tensors[2:], dim=1)
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
class AuxiliaryModule(nn.Module):
|
class AuxiliaryModule(nn.Module):
|
||||||
|
@ -263,7 +260,7 @@ class AuxiliaryModule(nn.Module):
|
||||||
|
|
||||||
|
|
||||||
@MODELS.register_module()
|
@MODELS.register_module()
|
||||||
class DartsBackbone(nn.Module, FixSubnetMixin):
|
class DartsBackbone(BaseBackbone, FixSubnetMixin):
|
||||||
"""Backbone of Differentiable Architecture Search (DARTS).
|
"""Backbone of Differentiable Architecture Search (DARTS).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -348,7 +345,7 @@ class DartsBackbone(nn.Module, FixSubnetMixin):
|
||||||
prev_reduction, reduction = reduction, False
|
prev_reduction, reduction = reduction, False
|
||||||
# Reduce featuremap size and double channels in 1/3
|
# Reduce featuremap size and double channels in 1/3
|
||||||
# and 2/3 layer.
|
# and 2/3 layer.
|
||||||
if i == self.num_layers // 3 or i == 2 * self.num_layers // 3:
|
if i in [self.num_layers // 3, 2 * self.num_layers // 3]:
|
||||||
self.out_channels *= 2
|
self.out_channels *= 2
|
||||||
reduction = True
|
reduction = True
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@ class SearchableMobileNet(BaseBackbone, FixSubnetMixin):
|
||||||
Excamples:
|
Excamples:
|
||||||
>>> mutable_cfg = dict(
|
>>> mutable_cfg = dict(
|
||||||
... type='OneShotMutableOP',
|
... type='OneShotMutableOP',
|
||||||
... candidate_ops=dict(
|
... candidates=dict(
|
||||||
... mb_k3e1=dict(
|
... mb_k3e1=dict(
|
||||||
... type='MBBlock',
|
... type='MBBlock',
|
||||||
... kernel_size=3,
|
... kernel_size=3,
|
||||||
|
@ -87,7 +87,7 @@ class SearchableMobileNet(BaseBackbone, FixSubnetMixin):
|
||||||
]
|
]
|
||||||
) -> None:
|
) -> None:
|
||||||
for index in out_indices:
|
for index in out_indices:
|
||||||
if index not in range(0, 8):
|
if index not in range(8):
|
||||||
raise ValueError('the item in out_indices must in '
|
raise ValueError('the item in out_indices must in '
|
||||||
f'range(0, 8). But received {index}')
|
f'range(0, 8). But received {index}')
|
||||||
|
|
||||||
|
@ -147,6 +147,7 @@ class SearchableMobileNet(BaseBackbone, FixSubnetMixin):
|
||||||
conv_cfg=self.conv_cfg,
|
conv_cfg=self.conv_cfg,
|
||||||
norm_cfg=self.norm_cfg,
|
norm_cfg=self.norm_cfg,
|
||||||
act_cfg=self.act_cfg)
|
act_cfg=self.act_cfg)
|
||||||
|
|
||||||
self.add_module('conv2', layer)
|
self.add_module('conv2', layer)
|
||||||
self.layers.append('conv2')
|
self.layers.append('conv2')
|
||||||
|
|
||||||
|
|
|
@ -48,7 +48,7 @@ class SearchableShuffleNetV2(BaseBackbone, FixSubnetMixin):
|
||||||
Excamples:
|
Excamples:
|
||||||
>>> mutable_cfg = dict(
|
>>> mutable_cfg = dict(
|
||||||
... type='OneShotMutableOP',
|
... type='OneShotMutableOP',
|
||||||
... candidate_ops=dict(
|
... candidates=dict(
|
||||||
... shuffle_3x3=dict(
|
... shuffle_3x3=dict(
|
||||||
... type='ShuffleBlock',
|
... type='ShuffleBlock',
|
||||||
... kernel_size=3,
|
... kernel_size=3,
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
|
from .heads import CreamClsHead
|
||||||
|
|
||||||
|
__all__ = ['CreamClsHead']
|
|
@ -0,0 +1,4 @@
|
||||||
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
|
from .cream_head import CreamClsHead
|
||||||
|
|
||||||
|
__all__ = ['CreamClsHead']
|
|
@ -0,0 +1,72 @@
|
||||||
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
|
|
||||||
|
from typing import Dict, Optional, Tuple
|
||||||
|
|
||||||
|
from mmcls.models.heads import LinearClsHead
|
||||||
|
from mmcv.cnn import ConvModule
|
||||||
|
from torch import Tensor, nn
|
||||||
|
|
||||||
|
from mmrazor.registry import MODELS
|
||||||
|
|
||||||
|
|
||||||
|
@MODELS.register_module()
|
||||||
|
class CreamClsHead(LinearClsHead):
|
||||||
|
"""Linear classifier head for cream.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
num_classes (int): Number of categories excluding the background
|
||||||
|
category.
|
||||||
|
in_channels (int): Number of channels in the input feature map.
|
||||||
|
num_features (int): Number of features in the conv2d.
|
||||||
|
act_cfg (dict): Config dict for activation layer.
|
||||||
|
Default: dict(type='ReLU6').
|
||||||
|
init_cfg (dict, optional): the config to control the initialization.
|
||||||
|
Defaults to ``dict(type='Normal', layer='Linear', std=0.01)``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
num_classes: int,
|
||||||
|
in_channels: int,
|
||||||
|
num_features: int = 1280,
|
||||||
|
act_cfg: Dict = dict(type='ReLU6'),
|
||||||
|
init_cfg: Optional[dict] = dict(
|
||||||
|
type='Normal', layer='Linear', std=0.01),
|
||||||
|
**kwargs):
|
||||||
|
super().__init__(
|
||||||
|
num_classes=num_classes,
|
||||||
|
in_channels=in_channels,
|
||||||
|
init_cfg=init_cfg,
|
||||||
|
**kwargs)
|
||||||
|
|
||||||
|
layer = ConvModule(
|
||||||
|
in_channels=self.in_channels,
|
||||||
|
out_channels=num_features,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
conv_cfg=None,
|
||||||
|
norm_cfg=None,
|
||||||
|
act_cfg=act_cfg)
|
||||||
|
|
||||||
|
self.add_module('conv2', layer)
|
||||||
|
|
||||||
|
self.fc = nn.Linear(num_features, self.num_classes)
|
||||||
|
|
||||||
|
# def pre_logits(self, feats: Tuple[Tensor]) -> Tensor:
|
||||||
|
# """The process before the final classification head.
|
||||||
|
|
||||||
|
# The input ``feats`` is a tuple of tensor, and each tensor is the
|
||||||
|
# feature of a backbone stage. In ``LinearClsHead``, we just obtain the
|
||||||
|
# feature of the last stage.
|
||||||
|
# """
|
||||||
|
# # The LinearClsHead doesn't have other module, just return after
|
||||||
|
# # unpacking.
|
||||||
|
# return feats[-1]
|
||||||
|
|
||||||
|
def forward(self, feats: Tuple[Tensor]) -> Tensor:
|
||||||
|
"""The forward process."""
|
||||||
|
logits = self.pre_logits(feats)
|
||||||
|
logits = logits.unsqueeze(-1).unsqueeze(-1)
|
||||||
|
logits = self.conv2(logits)
|
||||||
|
logits = logits.flatten(1)
|
||||||
|
return self.fc(logits)
|
|
@ -99,7 +99,7 @@ class DiffMutableOP(DiffMutableModule[str, str]):
|
||||||
DARTS. Search the best module by learnable parameters `arch_param`.
|
DARTS. Search the best module by learnable parameters `arch_param`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
candidate_ops (dict[str, dict]): the configs for the candidate
|
candidates (dict[str, dict]): the configs for the candidate
|
||||||
operations.
|
operations.
|
||||||
module_kwargs (dict[str, dict], optional): Module initialization named
|
module_kwargs (dict[str, dict], optional): Module initialization named
|
||||||
arguments. Defaults to None.
|
arguments. Defaults to None.
|
||||||
|
@ -110,23 +110,29 @@ class DiffMutableOP(DiffMutableModule[str, str]):
|
||||||
and `Pretrained`.
|
and `Pretrained`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, candidate_ops: Dict[str, Dict], **kwargs) -> None:
|
def __init__(
|
||||||
super().__init__(**kwargs)
|
self,
|
||||||
assert len(candidate_ops) >= 1, \
|
candidates: Dict[str, Dict],
|
||||||
|
module_kwargs: Optional[Dict[str, Dict]] = None,
|
||||||
|
alias: Optional[str] = None,
|
||||||
|
init_cfg: Optional[Dict] = None,
|
||||||
|
) -> None:
|
||||||
|
super().__init__(
|
||||||
|
module_kwargs=module_kwargs, alias=alias, init_cfg=init_cfg)
|
||||||
|
assert len(candidates) >= 1, \
|
||||||
f'Number of candidate op must greater than or equal to 1, ' \
|
f'Number of candidate op must greater than or equal to 1, ' \
|
||||||
f'but got: {len(candidate_ops)}'
|
f'but got: {len(candidates)}'
|
||||||
|
|
||||||
self._is_fixed = False
|
self._is_fixed = False
|
||||||
self._candidate_ops = self._build_ops(candidate_ops,
|
self._candidates = self._build_ops(candidates, self.module_kwargs)
|
||||||
self.module_kwargs)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _build_ops(candidate_ops: Dict[str, Dict],
|
def _build_ops(candidates: Dict[str, Dict],
|
||||||
module_kwargs: Optional[Dict[str, Dict]]) -> nn.ModuleDict:
|
module_kwargs: Optional[Dict[str, Dict]]) -> nn.ModuleDict:
|
||||||
"""Build candidate operations based on candidate_ops configures.
|
"""Build candidate operations based on candidates configures.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
candidate_ops (dict[str, dict]): the configs for the candidate
|
candidates (dict[str, dict]): the configs for the candidate
|
||||||
operations.
|
operations.
|
||||||
module_kwargs (dict[str, dict], optional): Module initialization
|
module_kwargs (dict[str, dict], optional): Module initialization
|
||||||
named arguments.
|
named arguments.
|
||||||
|
@ -137,7 +143,7 @@ class DiffMutableOP(DiffMutableModule[str, str]):
|
||||||
is the corresponding candidate operation.
|
is the corresponding candidate operation.
|
||||||
"""
|
"""
|
||||||
ops = nn.ModuleDict()
|
ops = nn.ModuleDict()
|
||||||
for name, op_cfg in candidate_ops.items():
|
for name, op_cfg in candidates.items():
|
||||||
assert name not in ops
|
assert name not in ops
|
||||||
if module_kwargs is not None:
|
if module_kwargs is not None:
|
||||||
op_cfg.update(module_kwargs)
|
op_cfg.update(module_kwargs)
|
||||||
|
@ -154,7 +160,7 @@ class DiffMutableOP(DiffMutableModule[str, str]):
|
||||||
Returns:
|
Returns:
|
||||||
Tensor: the result of forward the fixed operation.
|
Tensor: the result of forward the fixed operation.
|
||||||
"""
|
"""
|
||||||
return self._candidate_ops[self._chosen](x)
|
return sum(self._candidates[choice](x) for choice in self._chosen)
|
||||||
|
|
||||||
def forward_arch_param(self,
|
def forward_arch_param(self,
|
||||||
x: Any,
|
x: Any,
|
||||||
|
@ -180,7 +186,7 @@ class DiffMutableOP(DiffMutableModule[str, str]):
|
||||||
|
|
||||||
# forward based on probs
|
# forward based on probs
|
||||||
outputs = list()
|
outputs = list()
|
||||||
for prob, module in zip(probs, self._candidate_ops.values()):
|
for prob, module in zip(probs, self._candidates.values()):
|
||||||
if prob > 0.:
|
if prob > 0.:
|
||||||
outputs.append(prob * module(x))
|
outputs.append(prob * module(x))
|
||||||
|
|
||||||
|
@ -197,11 +203,11 @@ class DiffMutableOP(DiffMutableModule[str, str]):
|
||||||
Tensor: the result of forward all of the ``choice`` operation.
|
Tensor: the result of forward all of the ``choice`` operation.
|
||||||
"""
|
"""
|
||||||
outputs = list()
|
outputs = list()
|
||||||
for op in self._candidate_ops.values():
|
for op in self._candidates.values():
|
||||||
outputs.append(op(x))
|
outputs.append(op(x))
|
||||||
return sum(outputs)
|
return sum(outputs)
|
||||||
|
|
||||||
def fix_chosen(self, chosen: str) -> None:
|
def fix_chosen(self, chosen: Union[str, List[str]]) -> None:
|
||||||
"""Fix mutable with `choice`. This operation would convert `unfixed`
|
"""Fix mutable with `choice`. This operation would convert `unfixed`
|
||||||
mode to `fixed` mode. The :attr:`is_fixed` will be set to True and only
|
mode to `fixed` mode. The :attr:`is_fixed` will be set to True and only
|
||||||
the selected operations can be retained.
|
the selected operations can be retained.
|
||||||
|
@ -215,9 +221,12 @@ class DiffMutableOP(DiffMutableModule[str, str]):
|
||||||
'The mode of current MUTABLE is `fixed`. '
|
'The mode of current MUTABLE is `fixed`. '
|
||||||
'Please do not call `fix_chosen` function again.')
|
'Please do not call `fix_chosen` function again.')
|
||||||
|
|
||||||
|
if isinstance(chosen, str):
|
||||||
|
chosen = [chosen]
|
||||||
|
|
||||||
for c in self.choices:
|
for c in self.choices:
|
||||||
if c != chosen:
|
if c not in chosen:
|
||||||
self._candidate_ops.pop(c)
|
self._candidates.pop(c)
|
||||||
|
|
||||||
self._chosen = chosen
|
self._chosen = chosen
|
||||||
self.is_fixed = True
|
self.is_fixed = True
|
||||||
|
@ -225,7 +234,7 @@ class DiffMutableOP(DiffMutableModule[str, str]):
|
||||||
@property
|
@property
|
||||||
def choices(self) -> List[str]:
|
def choices(self) -> List[str]:
|
||||||
"""list: all choices. """
|
"""list: all choices. """
|
||||||
return list(self._candidate_ops.keys())
|
return list(self._candidates.keys())
|
||||||
|
|
||||||
|
|
||||||
@MODELS.register_module()
|
@MODELS.register_module()
|
||||||
|
@ -241,6 +250,7 @@ class DiffChoiceRoute(DiffMutableModule[str, List[str]]):
|
||||||
with_arch_param (bool): whether forward with arch_param. When set to
|
with_arch_param (bool): whether forward with arch_param. When set to
|
||||||
`True`, a differentiable way is adopted. When set to `False`,
|
`True`, a differentiable way is adopted. When set to `False`,
|
||||||
a non-differentiable way is adopted.
|
a non-differentiable way is adopted.
|
||||||
|
alias (str, optional): alias of the `DiffChoiceRoute`.
|
||||||
init_cfg (dict, optional): initialization configuration dict for
|
init_cfg (dict, optional): initialization configuration dict for
|
||||||
``BaseModule``. OpenMMLab has implement 6 initializers including
|
``BaseModule``. OpenMMLab has implement 6 initializers including
|
||||||
`Constant`, `Xavier`, `Normal`, `Uniform`, `Kaiming`,
|
`Constant`, `Xavier`, `Normal`, `Uniform`, `Kaiming`,
|
||||||
|
@ -274,16 +284,17 @@ class DiffChoiceRoute(DiffMutableModule[str, List[str]]):
|
||||||
self,
|
self,
|
||||||
edges: nn.ModuleDict,
|
edges: nn.ModuleDict,
|
||||||
with_arch_param: bool = False,
|
with_arch_param: bool = False,
|
||||||
|
alias: Optional[str] = None,
|
||||||
init_cfg: Optional[Dict] = None,
|
init_cfg: Optional[Dict] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(init_cfg=init_cfg)
|
super().__init__(alias=alias, init_cfg=init_cfg)
|
||||||
assert len(edges) >= 1, \
|
assert len(edges) >= 1, \
|
||||||
f'Number of edges must greater than or equal to 1, ' \
|
f'Number of edges must greater than or equal to 1, ' \
|
||||||
f'but got: {len(edges)}'
|
f'but got: {len(edges)}'
|
||||||
|
|
||||||
self._with_arch_param = with_arch_param
|
self._with_arch_param = with_arch_param
|
||||||
self._is_fixed = False
|
self._is_fixed = False
|
||||||
self._edges: nn.ModuleDict = edges
|
self._candidates: nn.ModuleDict = edges
|
||||||
|
|
||||||
def forward_fixed(self, inputs: Union[List, Tuple]) -> Tensor:
|
def forward_fixed(self, inputs: Union[List, Tuple]) -> Tensor:
|
||||||
"""Forward when the mutable is in `fixed` mode.
|
"""Forward when the mutable is in `fixed` mode.
|
||||||
|
@ -302,7 +313,7 @@ class DiffChoiceRoute(DiffMutableModule[str, List[str]]):
|
||||||
outputs = list()
|
outputs = list()
|
||||||
for choice, x in zip(self._unfixed_choices, inputs):
|
for choice, x in zip(self._unfixed_choices, inputs):
|
||||||
if choice in self._chosen:
|
if choice in self._chosen:
|
||||||
outputs.append(self._edges[choice](x))
|
outputs.append(self._candidates[choice](x))
|
||||||
return sum(outputs)
|
return sum(outputs)
|
||||||
|
|
||||||
def forward_arch_param(self,
|
def forward_arch_param(self,
|
||||||
|
@ -319,15 +330,16 @@ class DiffChoiceRoute(DiffMutableModule[str, List[str]]):
|
||||||
Returns:
|
Returns:
|
||||||
Tensor: the result of forward with ``arch_param``.
|
Tensor: the result of forward with ``arch_param``.
|
||||||
"""
|
"""
|
||||||
assert len(x) == len(self._edges), \
|
assert len(x) == len(self._candidates), \
|
||||||
f'Length of `edges` {len(self._edges)} should be same as ' \
|
f'Length of `edges` {len(self._candidates)} should be ' \
|
||||||
f'the length of inputs {len(x)}.'
|
f'same as the length of inputs {len(x)}.'
|
||||||
|
|
||||||
if self._with_arch_param:
|
if self._with_arch_param:
|
||||||
probs = self.compute_arch_probs(arch_param=arch_param)
|
probs = self.compute_arch_probs(arch_param=arch_param)
|
||||||
|
|
||||||
outputs = list()
|
outputs = list()
|
||||||
for prob, module, input in zip(probs, self._edges.values(), x):
|
for prob, module, input in zip(probs, self._candidates.values(),
|
||||||
|
x):
|
||||||
if prob > 0:
|
if prob > 0:
|
||||||
# prob may equal to 0 in gumbel softmax.
|
# prob may equal to 0 in gumbel softmax.
|
||||||
outputs.append(prob * module(input))
|
outputs.append(prob * module(input))
|
||||||
|
@ -346,12 +358,12 @@ class DiffChoiceRoute(DiffMutableModule[str, List[str]]):
|
||||||
Returns:
|
Returns:
|
||||||
Tensor: the result of forward all of the ``choice`` operation.
|
Tensor: the result of forward all of the ``choice`` operation.
|
||||||
"""
|
"""
|
||||||
assert len(x) == len(self._edges), \
|
assert len(x) == len(self._candidates), \
|
||||||
f'Lenght of edges {len(self._edges)} should be same as ' \
|
f'Lenght of edges {len(self._candidates)} should be same as ' \
|
||||||
f'the length of inputs {len(x)}.'
|
f'the length of inputs {len(x)}.'
|
||||||
|
|
||||||
outputs = list()
|
outputs = list()
|
||||||
for op, input in zip(self._edges.values(), x):
|
for op, input in zip(self._candidates.values(), x):
|
||||||
outputs.append(op(input))
|
outputs.append(op(input))
|
||||||
|
|
||||||
return sum(outputs)
|
return sum(outputs)
|
||||||
|
@ -373,7 +385,7 @@ class DiffChoiceRoute(DiffMutableModule[str, List[str]]):
|
||||||
|
|
||||||
for c in self.choices:
|
for c in self.choices:
|
||||||
if c not in chosen:
|
if c not in chosen:
|
||||||
self._edges.pop(c)
|
self._candidates.pop(c)
|
||||||
|
|
||||||
self._chosen = chosen
|
self._chosen = chosen
|
||||||
self.is_fixed = True
|
self.is_fixed = True
|
||||||
|
@ -381,7 +393,7 @@ class DiffChoiceRoute(DiffMutableModule[str, List[str]]):
|
||||||
@property
|
@property
|
||||||
def choices(self) -> List[CHOSEN_TYPE]:
|
def choices(self) -> List[CHOSEN_TYPE]:
|
||||||
"""list: all choices. """
|
"""list: all choices. """
|
||||||
return list(self._edges.keys())
|
return list(self._candidates.keys())
|
||||||
|
|
||||||
|
|
||||||
@MODELS.register_module()
|
@MODELS.register_module()
|
||||||
|
@ -413,10 +425,14 @@ class GumbelChoiceRoute(DiffChoiceRoute):
|
||||||
tau: float = 1.0,
|
tau: float = 1.0,
|
||||||
hard: bool = True,
|
hard: bool = True,
|
||||||
with_arch_param: bool = False,
|
with_arch_param: bool = False,
|
||||||
|
alias: Optional[str] = None,
|
||||||
init_cfg: Optional[Dict] = None,
|
init_cfg: Optional[Dict] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(
|
super().__init__(
|
||||||
edges=edges, with_arch_param=with_arch_param, init_cfg=init_cfg)
|
edges=edges,
|
||||||
|
with_arch_param=with_arch_param,
|
||||||
|
alias=alias,
|
||||||
|
init_cfg=init_cfg)
|
||||||
self.tau = tau
|
self.tau = tau
|
||||||
self.hard = hard
|
self.hard = hard
|
||||||
|
|
||||||
|
|
|
@ -100,7 +100,7 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
blocks.
|
blocks.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
candidate_ops (dict[str, dict]): the configs for the candidate
|
candidates (dict[str, dict]): the configs for the candidate
|
||||||
operations.
|
operations.
|
||||||
module_kwargs (dict[str, dict], optional): Module initialization named
|
module_kwargs (dict[str, dict], optional): Module initialization named
|
||||||
arguments. Defaults to None.
|
arguments. Defaults to None.
|
||||||
|
@ -114,13 +114,13 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
>>> import torch
|
>>> import torch
|
||||||
>>> from mmrazor.models.mutables import OneShotMutableOP
|
>>> from mmrazor.models.mutables import OneShotMutableOP
|
||||||
|
|
||||||
>>> candidate_ops = nn.ModuleDict({
|
>>> candidates = nn.ModuleDict({
|
||||||
... 'conv3x3': nn.Conv2d(32, 32, 3, 1, 1),
|
... 'conv3x3': nn.Conv2d(32, 32, 3, 1, 1),
|
||||||
... 'conv5x5': nn.Conv2d(32, 32, 5, 1, 2),
|
... 'conv5x5': nn.Conv2d(32, 32, 5, 1, 2),
|
||||||
... 'conv7x7': nn.Conv2d(32, 32, 7, 1, 3)})
|
... 'conv7x7': nn.Conv2d(32, 32, 7, 1, 3)})
|
||||||
|
|
||||||
>>> input = torch.randn(1, 32, 64, 64)
|
>>> input = torch.randn(1, 32, 64, 64)
|
||||||
>>> op = OneShotMutableOP(candidate_ops)
|
>>> op = OneShotMutableOP(candidates)
|
||||||
|
|
||||||
>>> op.choices
|
>>> op.choices
|
||||||
['conv3x3', 'conv5x5', 'conv7x7']
|
['conv3x3', 'conv5x5', 'conv7x7']
|
||||||
|
@ -131,7 +131,7 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
|
|
||||||
>>> op.current_choice = 'conv3x3'
|
>>> op.current_choice = 'conv3x3'
|
||||||
>>> unfix_output = op.forward(input)
|
>>> unfix_output = op.forward(input)
|
||||||
>>> torch.all(unfixed_output == candidate_ops['conv3x3'](input))
|
>>> torch.all(unfixed_output == candidates['conv3x3'](input))
|
||||||
True
|
True
|
||||||
|
|
||||||
>>> op.fix_chosen('conv3x3')
|
>>> op.fix_chosen('conv3x3')
|
||||||
|
@ -147,36 +147,41 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
True
|
True
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, candidate_ops: Union[Dict[str, Dict], nn.ModuleDict],
|
def __init__(
|
||||||
**kwargs) -> None:
|
self,
|
||||||
super().__init__(**kwargs)
|
candidates: Union[Dict[str, Dict], nn.ModuleDict],
|
||||||
assert len(candidate_ops) >= 1, \
|
module_kwargs: Optional[Dict[str, Dict]] = None,
|
||||||
|
alias: Optional[str] = None,
|
||||||
|
init_cfg: Optional[Dict] = None,
|
||||||
|
) -> None:
|
||||||
|
super().__init__(
|
||||||
|
module_kwargs=module_kwargs, alias=alias, init_cfg=init_cfg)
|
||||||
|
assert len(candidates) >= 1, \
|
||||||
f'Number of candidate op must greater than 1, ' \
|
f'Number of candidate op must greater than 1, ' \
|
||||||
f'but got: {len(candidate_ops)}'
|
f'but got: {len(candidates)}'
|
||||||
|
|
||||||
self._chosen: Optional[str] = None
|
self._chosen: Optional[str] = None
|
||||||
if isinstance(candidate_ops, dict):
|
if isinstance(candidates, dict):
|
||||||
self._candidate_ops = self._build_ops(candidate_ops,
|
self._candidates = self._build_ops(candidates, self.module_kwargs)
|
||||||
self.module_kwargs)
|
elif isinstance(candidates, nn.ModuleDict):
|
||||||
elif isinstance(candidate_ops, nn.ModuleDict):
|
self._candidates = candidates
|
||||||
self._candidate_ops = candidate_ops
|
|
||||||
else:
|
else:
|
||||||
raise TypeError('candidata_ops should be a `dict` or '
|
raise TypeError('candidata_ops should be a `dict` or '
|
||||||
f'`nn.ModuleDict` instance, but got '
|
f'`nn.ModuleDict` instance, but got '
|
||||||
f'{type(candidate_ops)}')
|
f'{type(candidates)}')
|
||||||
|
|
||||||
assert len(self._candidate_ops) >= 1, \
|
assert len(self._candidates) >= 1, \
|
||||||
f'Number of candidate op must greater than or equal to 1, ' \
|
f'Number of candidate op must greater than or equal to 1, ' \
|
||||||
f'but got {len(self._candidate_ops)}'
|
f'but got {len(self._candidates)}'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _build_ops(
|
def _build_ops(
|
||||||
candidate_ops: Union[Dict[str, Dict], nn.ModuleDict],
|
candidates: Union[Dict[str, Dict], nn.ModuleDict],
|
||||||
module_kwargs: Optional[Dict[str, Dict]] = None) -> nn.ModuleDict:
|
module_kwargs: Optional[Dict[str, Dict]] = None) -> nn.ModuleDict:
|
||||||
"""Build candidate operations based on choice configures.
|
"""Build candidate operations based on choice configures.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
candidate_ops (dict[str, dict] | :obj:`nn.ModuleDict`): the configs
|
candidates (dict[str, dict] | :obj:`nn.ModuleDict`): the configs
|
||||||
for the candidate operations or nn.ModuleDict.
|
for the candidate operations or nn.ModuleDict.
|
||||||
module_kwargs (dict[str, dict], optional): Module initialization
|
module_kwargs (dict[str, dict], optional): Module initialization
|
||||||
named arguments.
|
named arguments.
|
||||||
|
@ -186,11 +191,11 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
the name of each choice in configs and the value of ``ops``
|
the name of each choice in configs and the value of ``ops``
|
||||||
is the corresponding candidate operation.
|
is the corresponding candidate operation.
|
||||||
"""
|
"""
|
||||||
if isinstance(candidate_ops, nn.ModuleDict):
|
if isinstance(candidates, nn.ModuleDict):
|
||||||
return candidate_ops
|
return candidates
|
||||||
|
|
||||||
ops = nn.ModuleDict()
|
ops = nn.ModuleDict()
|
||||||
for name, op_cfg in candidate_ops.items():
|
for name, op_cfg in candidates.items():
|
||||||
assert name not in ops
|
assert name not in ops
|
||||||
if module_kwargs is not None:
|
if module_kwargs is not None:
|
||||||
op_cfg.update(module_kwargs)
|
op_cfg.update(module_kwargs)
|
||||||
|
@ -207,7 +212,7 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
Returns:
|
Returns:
|
||||||
Tensor: the result of forward the fixed operation.
|
Tensor: the result of forward the fixed operation.
|
||||||
"""
|
"""
|
||||||
return self._candidate_ops[self._chosen](x)
|
return self._candidates[self._chosen](x)
|
||||||
|
|
||||||
def forward_choice(self, x: Any, choice: str) -> Tensor:
|
def forward_choice(self, x: Any, choice: str) -> Tensor:
|
||||||
"""Forward with the `unfixed` mutable and current choice is not None.
|
"""Forward with the `unfixed` mutable and current choice is not None.
|
||||||
|
@ -221,7 +226,7 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
Tensor: the result of forward the ``choice`` operation.
|
Tensor: the result of forward the ``choice`` operation.
|
||||||
"""
|
"""
|
||||||
assert isinstance(choice, str) and choice in self.choices
|
assert isinstance(choice, str) and choice in self.choices
|
||||||
return self._candidate_ops[choice](x)
|
return self._candidates[choice](x)
|
||||||
|
|
||||||
def forward_all(self, x: Any) -> Tensor:
|
def forward_all(self, x: Any) -> Tensor:
|
||||||
"""Forward all choices. Used to calculate FLOPs.
|
"""Forward all choices. Used to calculate FLOPs.
|
||||||
|
@ -233,7 +238,9 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
Returns:
|
Returns:
|
||||||
Tensor: the result of forward all of the ``choice`` operation.
|
Tensor: the result of forward all of the ``choice`` operation.
|
||||||
"""
|
"""
|
||||||
outputs = [op(x) for op in self._candidate_ops.values()]
|
outputs = list()
|
||||||
|
for op in self._candidates.values():
|
||||||
|
outputs.append(op(x))
|
||||||
return sum(outputs)
|
return sum(outputs)
|
||||||
|
|
||||||
def fix_chosen(self, chosen: str) -> None:
|
def fix_chosen(self, chosen: str) -> None:
|
||||||
|
@ -251,7 +258,7 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
|
|
||||||
for c in self.choices:
|
for c in self.choices:
|
||||||
if c != chosen:
|
if c != chosen:
|
||||||
self._candidate_ops.pop(c)
|
self._candidates.pop(c)
|
||||||
|
|
||||||
self._chosen = chosen
|
self._chosen = chosen
|
||||||
self.is_fixed = True
|
self.is_fixed = True
|
||||||
|
@ -263,7 +270,7 @@ class OneShotMutableOP(OneShotMutableModule[str, str]):
|
||||||
@property
|
@property
|
||||||
def choices(self) -> List[str]:
|
def choices(self) -> List[str]:
|
||||||
"""list: all choices. """
|
"""list: all choices. """
|
||||||
return list(self._candidate_ops.keys())
|
return list(self._candidates.keys())
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def num_choices(self):
|
def num_choices(self):
|
||||||
|
@ -275,7 +282,7 @@ class OneShotProbMutableOP(OneShotMutableOP):
|
||||||
"""Sampling candidate operation according to probability.
|
"""Sampling candidate operation according to probability.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
candidate_ops (dict[str, dict]): the configs for the candidate
|
candidates (dict[str, dict]): the configs for the candidate
|
||||||
operations.
|
operations.
|
||||||
choice_probs (list): the probability of sampling each
|
choice_probs (list): the probability of sampling each
|
||||||
candidate operation.
|
candidate operation.
|
||||||
|
@ -289,13 +296,13 @@ class OneShotProbMutableOP(OneShotMutableOP):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
candidate_ops: Dict[str, Dict],
|
candidates: Dict[str, Dict],
|
||||||
choice_probs: list = None,
|
choice_probs: list = None,
|
||||||
module_kwargs: Optional[Dict[str, Dict]] = None,
|
module_kwargs: Optional[Dict[str, Dict]] = None,
|
||||||
alias: Optional[str] = None,
|
alias: Optional[str] = None,
|
||||||
init_cfg: Optional[Dict] = None) -> None:
|
init_cfg: Optional[Dict] = None) -> None:
|
||||||
super().__init__(
|
super().__init__(
|
||||||
candidate_ops=candidate_ops,
|
candidates=candidates,
|
||||||
module_kwargs=module_kwargs,
|
module_kwargs=module_kwargs,
|
||||||
alias=alias,
|
alias=alias,
|
||||||
init_cfg=init_cfg)
|
init_cfg=init_cfg)
|
||||||
|
@ -306,5 +313,7 @@ class OneShotProbMutableOP(OneShotMutableOP):
|
||||||
|
|
||||||
def sample_choice(self) -> str:
|
def sample_choice(self) -> str:
|
||||||
"""Sampling with probabilities."""
|
"""Sampling with probabilities."""
|
||||||
assert len(self.choice_probs) == len(self._candidate_ops.keys())
|
assert len(self.choice_probs) == len(self._candidates.keys())
|
||||||
return random.choices(self.choices, weights=self.choice_probs, k=1)[0]
|
choice = random.choices(
|
||||||
|
self.choices, weights=self.choice_probs, k=1)[0]
|
||||||
|
return choice
|
||||||
|
|
|
@ -2,10 +2,12 @@
|
||||||
from .common import Identity
|
from .common import Identity
|
||||||
from .darts_series import (DartsDilConv, DartsPoolBN, DartsSepConv,
|
from .darts_series import (DartsDilConv, DartsPoolBN, DartsSepConv,
|
||||||
DartsSkipConnect, DartsZero)
|
DartsSkipConnect, DartsZero)
|
||||||
|
from .efficientnet_series import ConvBnAct, DepthwiseSeparableConv
|
||||||
from .mobilenet_series import MBBlock
|
from .mobilenet_series import MBBlock
|
||||||
from .shufflenet_series import ShuffleBlock, ShuffleXception
|
from .shufflenet_series import ShuffleBlock, ShuffleXception
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'ShuffleBlock', 'ShuffleXception', 'DartsPoolBN', 'DartsDilConv',
|
'ShuffleBlock', 'ShuffleXception', 'DartsPoolBN', 'DartsDilConv',
|
||||||
'DartsSepConv', 'DartsSkipConnect', 'DartsZero', 'MBBlock', 'Identity'
|
'DartsSepConv', 'DartsSkipConnect', 'DartsZero', 'MBBlock', 'Identity',
|
||||||
|
'ConvBnAct', 'DepthwiseSeparableConv'
|
||||||
]
|
]
|
||||||
|
|
|
@ -27,10 +27,7 @@ class DartsPoolBN(BaseOP):
|
||||||
self.kernel_size, self.stride, 1, count_include_pad=False)
|
self.kernel_size, self.stride, 1, count_include_pad=False)
|
||||||
self.bn = build_norm_layer(self.norm_cfg, self.out_channels)[1]
|
self.bn = build_norm_layer(self.norm_cfg, self.out_channels)[1]
|
||||||
|
|
||||||
if use_drop_path:
|
self.drop_path = DropPath() if use_drop_path else None
|
||||||
self.drop_path = DropPath()
|
|
||||||
else:
|
|
||||||
self.drop_path = None
|
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
out = self.pool(x)
|
out = self.pool(x)
|
||||||
|
@ -69,10 +66,7 @@ class DartsDilConv(BaseOP):
|
||||||
self.in_channels, self.out_channels, 1, stride=1, bias=False),
|
self.in_channels, self.out_channels, 1, stride=1, bias=False),
|
||||||
build_norm_layer(self.norm_cfg, self.in_channels)[1])
|
build_norm_layer(self.norm_cfg, self.in_channels)[1])
|
||||||
|
|
||||||
if use_drop_path:
|
self.drop_path = DropPath() if use_drop_path else None
|
||||||
self.drop_path = DropPath()
|
|
||||||
else:
|
|
||||||
self.drop_path = None
|
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
out = self.conv1(x)
|
out = self.conv1(x)
|
||||||
|
@ -122,10 +116,7 @@ class DartsSepConv(BaseOP):
|
||||||
self.out_channels, self.out_channels, 1, stride=1, bias=False),
|
self.out_channels, self.out_channels, 1, stride=1, bias=False),
|
||||||
build_norm_layer(self.norm_cfg, self.out_channels)[1])
|
build_norm_layer(self.norm_cfg, self.out_channels)[1])
|
||||||
|
|
||||||
if use_drop_path:
|
self.drop_path = DropPath() if use_drop_path else None
|
||||||
self.drop_path = DropPath()
|
|
||||||
else:
|
|
||||||
self.drop_path = None
|
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
out = self.conv1(x)
|
out = self.conv1(x)
|
||||||
|
@ -163,10 +154,7 @@ class DartsSkipConnect(BaseOP):
|
||||||
bias=False)
|
bias=False)
|
||||||
self.bn = build_norm_layer(self.norm_cfg, self.out_channels)[1]
|
self.bn = build_norm_layer(self.norm_cfg, self.out_channels)[1]
|
||||||
|
|
||||||
if use_drop_path:
|
self.drop_path = DropPath() if use_drop_path else None
|
||||||
self.drop_path = DropPath()
|
|
||||||
else:
|
|
||||||
self.drop_path = None
|
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
if self.stride > 1:
|
if self.stride > 1:
|
||||||
|
|
|
@ -0,0 +1,160 @@
|
||||||
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
import torch.nn as nn
|
||||||
|
from mmcls.models.utils import SELayer
|
||||||
|
from mmcv.cnn import ConvModule
|
||||||
|
|
||||||
|
from mmrazor.registry import MODELS
|
||||||
|
from .base import BaseOP
|
||||||
|
|
||||||
|
|
||||||
|
@MODELS.register_module()
|
||||||
|
class ConvBnAct(BaseOP):
|
||||||
|
"""ConvBnAct block from timm.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
in_channels (int): number of in channels.
|
||||||
|
out_channels (int): number of out channels.
|
||||||
|
kernel_size (int): kernel size of convolution.
|
||||||
|
stride (int, optional): stride of convolution. Defaults to 1.
|
||||||
|
dilation (int, optional): dilation rate of convolution. Defaults to 1.
|
||||||
|
padding (int, optional): padding size of convolution. Defaults to 0.
|
||||||
|
skip (bool, optional): whether using skip connect. Defaults to False.
|
||||||
|
conv_cfg (Optional[dict], optional): Config dict for convolution layer.
|
||||||
|
Default: None, which means using conv2d.
|
||||||
|
norm_cfg (Dict, optional): Config dict for normalization layer.
|
||||||
|
Default: dict(type='BN').
|
||||||
|
act_cfg (Dict, optional):Config dict for activation layer.
|
||||||
|
Default: dict(type='ReLU').
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
in_channels: int,
|
||||||
|
out_channels: int,
|
||||||
|
kernel_size: int,
|
||||||
|
stride: int = 1,
|
||||||
|
dilation: int = 1,
|
||||||
|
padding: int = 0,
|
||||||
|
skip: bool = False,
|
||||||
|
conv_cfg: Optional[dict] = None,
|
||||||
|
se_cfg: Dict = None,
|
||||||
|
norm_cfg: Dict = dict(type='BN'),
|
||||||
|
act_cfg: Dict = dict(type='ReLU')):
|
||||||
|
super().__init__(
|
||||||
|
in_channels=in_channels, out_channels=out_channels, stride=stride)
|
||||||
|
self.has_residual = skip and stride == 1 \
|
||||||
|
and in_channels == out_channels
|
||||||
|
self.with_se = se_cfg is not None
|
||||||
|
|
||||||
|
if self.with_se:
|
||||||
|
assert isinstance(se_cfg, dict)
|
||||||
|
self.se = SELayer(self.out_channels, **se_cfg)
|
||||||
|
|
||||||
|
self.convModule = ConvModule(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=out_channels,
|
||||||
|
kernel_size=kernel_size,
|
||||||
|
stride=stride,
|
||||||
|
dilation=dilation,
|
||||||
|
padding=padding,
|
||||||
|
conv_cfg=conv_cfg,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=act_cfg)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
"""Forward function."""
|
||||||
|
shortcut = x
|
||||||
|
x = self.convModule(x)
|
||||||
|
if self.has_residual:
|
||||||
|
x += shortcut
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
@MODELS.register_module()
|
||||||
|
class DepthwiseSeparableConv(BaseOP):
|
||||||
|
"""DepthwiseSeparable block Used for DS convs in MobileNet-V1 and in the
|
||||||
|
place of IR blocks that have no expansion (factor of 1.0). This is an
|
||||||
|
alternative to having a IR with an optional first pw conv.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
in_channels (int): number of in channels.
|
||||||
|
out_channels (int): number of out channels.
|
||||||
|
dw_kernel_size (int, optional): the kernel size of depth-wise
|
||||||
|
convolution. Defaults to 3.
|
||||||
|
stride (int, optional): stride of convolution.
|
||||||
|
Defaults to 1.
|
||||||
|
dilation (int, optional): dilation rate of convolution.
|
||||||
|
Defaults to 1.
|
||||||
|
noskip (bool, optional): whether use skip connection.
|
||||||
|
Defaults to False.
|
||||||
|
pw_kernel_size (int, optional): kernel size of point wise convolution.
|
||||||
|
Defaults to 1.
|
||||||
|
pw_act (bool, optional): whether using activation in point-wise
|
||||||
|
convolution. Defaults to False.
|
||||||
|
se_cfg (Dict, optional): _description_. Defaults to None.
|
||||||
|
conv_cfg (Optional[dict], optional): Config dict for convolution layer.
|
||||||
|
Default: None, which means using conv2d.
|
||||||
|
norm_cfg (Dict, optional): Config dict for normalization layer.
|
||||||
|
Default: dict(type='BN').
|
||||||
|
act_cfg (Dict, optional):Config dict for activation layer.
|
||||||
|
Default: dict(type='ReLU').
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
in_channels: int,
|
||||||
|
out_channels: int,
|
||||||
|
dw_kernel_size: int = 3,
|
||||||
|
stride: int = 1,
|
||||||
|
dilation: int = 1,
|
||||||
|
noskip: bool = False,
|
||||||
|
pw_kernel_size: int = 1,
|
||||||
|
pw_act: bool = False,
|
||||||
|
conv_cfg: Optional[dict] = None,
|
||||||
|
se_cfg: Dict = None,
|
||||||
|
norm_cfg: Dict = dict(type='BN'),
|
||||||
|
act_cfg: Dict = dict(type='ReLU')):
|
||||||
|
|
||||||
|
super().__init__(
|
||||||
|
in_channels=in_channels, out_channels=out_channels, stride=stride)
|
||||||
|
self.has_residual = (stride == 1
|
||||||
|
and in_channels == out_channels) and not noskip
|
||||||
|
self.has_pw_act = pw_act # activation after point-wise conv
|
||||||
|
|
||||||
|
self.se_cfg = se_cfg
|
||||||
|
|
||||||
|
self.conv_dw = ConvModule(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=in_channels,
|
||||||
|
kernel_size=dw_kernel_size,
|
||||||
|
stride=stride,
|
||||||
|
dilation=dilation,
|
||||||
|
padding=dw_kernel_size // 2,
|
||||||
|
groups=in_channels,
|
||||||
|
conv_cfg=conv_cfg,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=act_cfg,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Squeeze-and-excitation
|
||||||
|
self.se = SELayer(out_channels, **
|
||||||
|
se_cfg) if self.se_cfg else nn.Identity()
|
||||||
|
|
||||||
|
self.conv_pw = ConvModule(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=out_channels,
|
||||||
|
kernel_size=pw_kernel_size,
|
||||||
|
padding=pw_kernel_size // 2,
|
||||||
|
conv_cfg=conv_cfg,
|
||||||
|
norm_cfg=norm_cfg,
|
||||||
|
act_cfg=act_cfg if self.has_pw_act else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
shortcut = x
|
||||||
|
x = self.conv_dw(x)
|
||||||
|
x = self.se(x)
|
||||||
|
x = self.conv_pw(x)
|
||||||
|
if self.has_residual:
|
||||||
|
x += shortcut
|
||||||
|
return x
|
|
@ -65,10 +65,10 @@ class FlopsEstimator:
|
||||||
... def __init__(self) -> None:
|
... def __init__(self) -> None:
|
||||||
... super().__init__()
|
... super().__init__()
|
||||||
...
|
...
|
||||||
... candidate_ops = nn.ModuleDict({
|
... candidates = nn.ModuleDict({
|
||||||
... 'conv3x3': nn.Conv2d(3, 32, 3),
|
... 'conv3x3': nn.Conv2d(3, 32, 3),
|
||||||
... 'conv5x5': nn.Conv2d(3, 32, 5)})
|
... 'conv5x5': nn.Conv2d(3, 32, 5)})
|
||||||
... self.op = OneShotMutableOP(candidate_ops)
|
... self.op = OneShotMutableOP(candidates)
|
||||||
... self.op.current_choice = 'conv3x3'
|
... self.op.current_choice = 'conv3x3'
|
||||||
...
|
...
|
||||||
... def forward(self, x: Tensor) -> Tensor:
|
... def forward(self, x: Tensor) -> Tensor:
|
||||||
|
|
|
@ -90,12 +90,19 @@ class FixSubnetMixin:
|
||||||
# In the corresponding mutable, it will check whether the `chosen`
|
# In the corresponding mutable, it will check whether the `chosen`
|
||||||
# format is correct.
|
# format is correct.
|
||||||
if isinstance(module, BaseMutable):
|
if isinstance(module, BaseMutable):
|
||||||
mutable_name = name.lstrip(prefix)
|
if getattr(module, 'alias', None):
|
||||||
assert mutable_name in fix_modules, \
|
alias = module.alias
|
||||||
f'{mutable_name} is not in fix_modules {fix_modules}, '\
|
assert alias in fix_modules, \
|
||||||
'please check your `fix_subnet`.'
|
f'The alias {alias} is not in fix_modules ' \
|
||||||
|
f'{fix_modules}, please check your `fix_subnet`.'
|
||||||
chosen = fix_modules.get(mutable_name, None)
|
chosen = fix_modules.get(alias, None)
|
||||||
|
else:
|
||||||
|
mutable_name = name.lstrip(prefix)
|
||||||
|
assert mutable_name in fix_modules, \
|
||||||
|
f'The module name {mutable_name} is not in ' \
|
||||||
|
f'fix_modules {fix_modules} ' \
|
||||||
|
'please check your `fix_subnet`.'
|
||||||
|
chosen = fix_modules.get(mutable_name, None)
|
||||||
module.fix_chosen(chosen)
|
module.fix_chosen(chosen)
|
||||||
|
|
||||||
# TODO support load fix channels after mr #29 merged
|
# TODO support load fix channels after mr #29 merged
|
||||||
|
|
|
@ -19,7 +19,7 @@ class TestDartsBackbone(TestCase):
|
||||||
def setUp(self) -> None:
|
def setUp(self) -> None:
|
||||||
self.mutable_cfg = dict(
|
self.mutable_cfg = dict(
|
||||||
type='DiffMutableOP',
|
type='DiffMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
torch_conv2d_3x3=dict(
|
torch_conv2d_3x3=dict(
|
||||||
type='torchConv2d',
|
type='torchConv2d',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
@ -96,17 +96,17 @@ class TestDartsBackbone(TestCase):
|
||||||
tmp_dict = dict()
|
tmp_dict = dict()
|
||||||
|
|
||||||
for key, _ in model.named_modules():
|
for key, _ in model.named_modules():
|
||||||
node_type = key.split('._candidate_ops')[0].split('.')[-1].split(
|
node_type = key.split('._candidates')[0].split('.')[-1].split(
|
||||||
'_')[0]
|
'_')[0]
|
||||||
if node_type not in ['normal', 'reduce']:
|
if node_type not in ['normal', 'reduce']:
|
||||||
# not supported type
|
# not supported type
|
||||||
continue
|
continue
|
||||||
|
|
||||||
node_name = key.split('._candidate_ops')[0].split('.')[-1]
|
node_name = key.split('._candidates')[0].split('.')[-1]
|
||||||
if node_name not in tmp_dict.keys():
|
if node_name not in tmp_dict.keys():
|
||||||
tmp_dict[node_name] = [key.split('._candidate_ops')[0]]
|
tmp_dict[node_name] = [key.split('._candidates')[0]]
|
||||||
else:
|
else:
|
||||||
current_key = key.split('._candidate_ops')[0]
|
current_key = key.split('._candidates')[0]
|
||||||
if current_key not in tmp_dict[node_name]:
|
if current_key not in tmp_dict[node_name]:
|
||||||
tmp_dict[node_name].append(current_key)
|
tmp_dict[node_name].append(current_key)
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ class TestDiffOP(TestCase):
|
||||||
def test_forward_arch_param(self):
|
def test_forward_arch_param(self):
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='DiffMutableOP',
|
type='DiffMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
torch_conv2d_3x3=dict(
|
torch_conv2d_3x3=dict(
|
||||||
type='torchConv2d',
|
type='torchConv2d',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
@ -56,7 +56,7 @@ class TestDiffOP(TestCase):
|
||||||
def test_forward_fixed(self):
|
def test_forward_fixed(self):
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='DiffMutableOP',
|
type='DiffMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
torch_conv2d_3x3=dict(
|
torch_conv2d_3x3=dict(
|
||||||
type='torchConv2d',
|
type='torchConv2d',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
@ -84,7 +84,7 @@ class TestDiffOP(TestCase):
|
||||||
def test_forward(self):
|
def test_forward(self):
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='DiffMutableOP',
|
type='DiffMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
torch_conv2d_3x3=dict(
|
torch_conv2d_3x3=dict(
|
||||||
type='torchConv2d',
|
type='torchConv2d',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
@ -119,7 +119,7 @@ class TestDiffOP(TestCase):
|
||||||
def test_property(self):
|
def test_property(self):
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='DiffMutableOP',
|
type='DiffMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
torch_conv2d_3x3=dict(
|
torch_conv2d_3x3=dict(
|
||||||
type='torchConv2d',
|
type='torchConv2d',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
@ -158,7 +158,7 @@ class TestDiffOP(TestCase):
|
||||||
def test_module_kwargs(self):
|
def test_module_kwargs(self):
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='DiffMutableOP',
|
type='DiffMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
torch_conv2d_3x3=dict(
|
torch_conv2d_3x3=dict(
|
||||||
type='torchConv2d',
|
type='torchConv2d',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
|
|
@ -15,7 +15,7 @@ class TestMutables(TestCase):
|
||||||
norm_cfg = dict(type='BN', requires_grad=True)
|
norm_cfg = dict(type='BN', requires_grad=True)
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='OneShotMutableOP',
|
type='OneShotMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
shuffle_3x3=dict(
|
shuffle_3x3=dict(
|
||||||
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
||||||
shuffle_5x5=dict(
|
shuffle_5x5=dict(
|
||||||
|
@ -80,7 +80,7 @@ class TestMutables(TestCase):
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='OneShotProbMutableOP',
|
type='OneShotProbMutableOP',
|
||||||
choice_probs=[0.1, 0.2, 0.3, 0.4],
|
choice_probs=[0.1, 0.2, 0.3, 0.4],
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
shuffle_3x3=dict(
|
shuffle_3x3=dict(
|
||||||
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
||||||
shuffle_5x5=dict(
|
shuffle_5x5=dict(
|
||||||
|
@ -142,7 +142,7 @@ class TestMutables(TestCase):
|
||||||
norm_cfg = dict(type='BN', requires_grad=True)
|
norm_cfg = dict(type='BN', requires_grad=True)
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='OneShotMutableOP',
|
type='OneShotMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
shuffle_3x3=dict(
|
shuffle_3x3=dict(
|
||||||
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
||||||
shuffle_5x5=dict(
|
shuffle_5x5=dict(
|
||||||
|
@ -165,7 +165,7 @@ class TestMutables(TestCase):
|
||||||
norm_cfg = dict(type='BN', requires_grad=True)
|
norm_cfg = dict(type='BN', requires_grad=True)
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='OneShotMutableOP',
|
type='OneShotMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
shuffle_3x3=dict(
|
shuffle_3x3=dict(
|
||||||
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
type='ShuffleBlock', norm_cfg=norm_cfg, kernel_size=3),
|
||||||
shuffle_5x5=dict(
|
shuffle_5x5=dict(
|
||||||
|
@ -189,7 +189,7 @@ class TestMutables(TestCase):
|
||||||
norm_cfg = dict(type='BN', requires_grad=True)
|
norm_cfg = dict(type='BN', requires_grad=True)
|
||||||
op_cfg = dict(
|
op_cfg = dict(
|
||||||
type='OneShotMutableOP',
|
type='OneShotMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
shuffle_3x3=dict(
|
shuffle_3x3=dict(
|
||||||
type='ShuffleBlock',
|
type='ShuffleBlock',
|
||||||
norm_cfg=norm_cfg,
|
norm_cfg=norm_cfg,
|
||||||
|
@ -221,9 +221,9 @@ class TestMutables(TestCase):
|
||||||
output = op.forward_all(input)
|
output = op.forward_all(input)
|
||||||
assert output is not None
|
assert output is not None
|
||||||
|
|
||||||
def test_candidate_ops(self):
|
def test_candidates(self):
|
||||||
|
|
||||||
candidate_ops = nn.ModuleDict({
|
candidates = nn.ModuleDict({
|
||||||
'conv3x3': nn.Conv2d(32, 32, 3, 1, 1),
|
'conv3x3': nn.Conv2d(32, 32, 3, 1, 1),
|
||||||
'conv5x5': nn.Conv2d(32, 32, 5, 1, 2),
|
'conv5x5': nn.Conv2d(32, 32, 5, 1, 2),
|
||||||
'conv7x7': nn.Conv2d(32, 32, 7, 1, 3),
|
'conv7x7': nn.Conv2d(32, 32, 7, 1, 3),
|
||||||
|
@ -231,7 +231,7 @@ class TestMutables(TestCase):
|
||||||
'avgpool3x3': nn.AvgPool2d(3, 1, 1),
|
'avgpool3x3': nn.AvgPool2d(3, 1, 1),
|
||||||
})
|
})
|
||||||
|
|
||||||
op_cfg = dict(type='OneShotMutableOP', candidate_ops=candidate_ops)
|
op_cfg = dict(type='OneShotMutableOP', candidates=candidates)
|
||||||
|
|
||||||
op = MODELS.build(op_cfg)
|
op = MODELS.build(op_cfg)
|
||||||
|
|
||||||
|
|
|
@ -72,12 +72,12 @@ class SearchableModelAlias(nn.Module):
|
||||||
return self.slayer3(x)
|
return self.slayer3(x)
|
||||||
|
|
||||||
|
|
||||||
class TestDiffMutator(TestCase):
|
class TestDiffModuleMutator(TestCase):
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.MUTABLE_CFG = dict(
|
self.MUTABLE_CFG = dict(
|
||||||
type='DiffMutableOP',
|
type='DiffMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
torch_conv2d_3x3=dict(
|
torch_conv2d_3x3=dict(
|
||||||
type='torchConv2d',
|
type='torchConv2d',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
|
|
@ -30,7 +30,7 @@ MUTATOR_CFG = dict(type='OneShotModuleMutator')
|
||||||
|
|
||||||
MUTABLE_CFG = dict(
|
MUTABLE_CFG = dict(
|
||||||
type='OneShotMutableOP',
|
type='OneShotMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
choice1=dict(
|
choice1=dict(
|
||||||
type='MBBlock',
|
type='MBBlock',
|
||||||
in_channels=3,
|
in_channels=3,
|
||||||
|
|
|
@ -13,7 +13,7 @@ from mmrazor.registry import MODELS
|
||||||
|
|
||||||
_FIRST_STAGE_MUTABLE = dict(
|
_FIRST_STAGE_MUTABLE = dict(
|
||||||
type='OneShotMutableOP',
|
type='OneShotMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
mb_k3e1=dict(
|
mb_k3e1=dict(
|
||||||
type='MBBlock',
|
type='MBBlock',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
@ -23,7 +23,7 @@ _FIRST_STAGE_MUTABLE = dict(
|
||||||
|
|
||||||
_OTHER_STAGE_MUTABLE = dict(
|
_OTHER_STAGE_MUTABLE = dict(
|
||||||
type='OneShotMutableOP',
|
type='OneShotMutableOP',
|
||||||
candidate_ops=dict(
|
candidates=dict(
|
||||||
mb_k3e3=dict(
|
mb_k3e3=dict(
|
||||||
type='MBBlock',
|
type='MBBlock',
|
||||||
kernel_size=3,
|
kernel_size=3,
|
||||||
|
|
|
@ -3,6 +3,10 @@ import argparse
|
||||||
import os
|
import os
|
||||||
import os.path as osp
|
import os.path as osp
|
||||||
|
|
||||||
|
from mmcls.core import * # noqa: F401,F403
|
||||||
|
from mmcls.datasets import * # noqa: F401,F403
|
||||||
|
from mmcls.metrics import * # noqa: F401,F403
|
||||||
|
from mmcls.models import * # noqa: F401,F403
|
||||||
# TODO import mmcls and mmseg
|
# TODO import mmcls and mmseg
|
||||||
from mmdet.core import * # noqa: F401,F403
|
from mmdet.core import * # noqa: F401,F403
|
||||||
from mmdet.datasets import * # noqa: F401,F403
|
from mmdet.datasets import * # noqa: F401,F403
|
||||||
|
|
|
@ -38,7 +38,6 @@ def parse_args():
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
register_all_modules(False)
|
register_all_modules(False)
|
||||||
|
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# load config
|
# load config
|
||||||
|
|
Loading…
Reference in New Issue