diff --git a/.circleci/test.yml b/.circleci/test.yml index 560e344f..67ce1b27 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -31,7 +31,7 @@ jobs: name: Check docstring coverage command: | pip install interrogate - interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 60 mmcls + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 60 mmcls build_cpu: parameters: # The python version must match available image tags in @@ -42,8 +42,6 @@ jobs: type: string torchvision: type: string - mmcv: - type: string docker: - image: cimg/python:<< parameters.python >> resource_class: large @@ -57,31 +55,32 @@ jobs: - run: name: Configure Python & pip command: | - python -m pip install --upgrade pip - python -m pip install wheel + pip install --upgrade pip + pip install wheel - run: name: Install PyTorch command: | python -V - python -m pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html - run: name: Install mmcls dependencies command: | - python -m pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main - python -m pip install << parameters.mmcv >> - python -m pip install timm - python -m pip install -r requirements.txt + pip install git+https://github.com/open-mmlab/mmengine.git@main + pip install -U openmim + mim install 'mmcv >= 2.0.0rc1' + pip install timm + pip install -r requirements.txt python -c 'import mmcv; print(mmcv.__version__)' - run: name: Build and install command: | - python -m pip install -e . + pip install -e . 
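The dependency step above replaces pinned mmcv wheel URLs with `mim install 'mmcv >= 2.0.0rc1'`, and the trailing `python -c 'import mmcv; print(mmcv.__version__)'` only prints whatever version was resolved. A slightly stricter local sanity check is sketched below; it is not part of the CI and assumes nothing beyond the `packaging` library:

```python
# Sketch of a stricter post-install check than the version print in the CI
# step above: fail fast if the mim-resolved mmcv predates 2.0.0rc1.
from packaging.version import parse

import mmcv

assert parse(mmcv.__version__) >= parse('2.0.0rc1'), (
    f'mmcv {mmcv.__version__} is too old for mmcls 1.x; '
    "re-run `mim install 'mmcv >= 2.0.0rc1'`")
print(mmcv.__version__)
```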
- run: name: Run unittests command: | - python -m coverage run --branch --source mmcls -m pytest tests/ - python -m coverage xml - python -m coverage report -m + coverage run --branch --source mmcls -m pytest tests/ + coverage xml + coverage report -m build_cuda: machine: @@ -96,15 +95,13 @@ jobs: cudnn: type: integer default: 7 - mmcv: - type: string steps: - checkout - run: # Cloning repos in VM since Docker doesn't have access to the private key name: Clone Repos command: | - git clone -b main --depth 1 ssh://git@github.com/open-mmlab/mmengine.git /home/circleci/mmengine + git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine - run: name: Build Docker image command: | @@ -114,7 +111,8 @@ jobs: name: Install mmcls dependencies command: | docker exec mmcls pip install -e /mmengine - docker exec mmcls pip install << parameters.mmcv >> + docker exec mmcls pip install -U openmim + docker exec mmcls mim install 'mmcv >= 2.0.0rc1' docker exec mmcls pip install -r requirements.txt docker exec mmcls python -c 'import mmcv; print(mmcv.__version__)' - run: @@ -124,7 +122,7 @@ jobs: - run: name: Run unittests command: | - docker exec mmcls python -m pytest tests/ --ignore tests/test_models/test_backbones/test_timm_backbone.py + docker exec mmcls python -m pytest tests/ -k 'not timm' # Invoke jobs via workflows # See: https://circleci.com/docs/2.0/configuration-reference/#workflows @@ -138,6 +136,7 @@ workflows: branches: ignore: - dev-1.x + - 1.x pr_stage_test: when: not: @@ -154,15 +153,13 @@ workflows: torch: 1.6.0 torchvision: 0.7.0 python: 3.6.9 # The lowest python 3.6.x version available on CircleCI images - mmcv: https://download.openmmlab.com/mmcv/dev-2.x/cpu/torch1.6.0/mmcv_full-2.0.0rc0-cp36-cp36m-manylinux1_x86_64.whl requires: - lint - build_cpu: name: maximum_version_cpu - torch: 1.9.0 # TODO: Update the version after mmcv provides more pre-compiled packages. 
- torchvision: 0.10.0 + torch: 1.12.1 + torchvision: 0.13.1 python: 3.9.0 - mmcv: https://download.openmmlab.com/mmcv/dev-2.x/cpu/torch1.9.0/mmcv_full-2.0.0rc0-cp39-cp39-manylinux1_x86_64.whl requires: - minimum_version_cpu - hold: @@ -175,7 +172,6 @@ workflows: # Use double quotation mark to explicitly specify its type # as string instead of number cuda: "10.2" - mmcv: https://download.openmmlab.com/mmcv/dev-2.x/cu102/torch1.8.0/mmcv_full-2.0.0rc0-cp37-cp37m-manylinux1_x86_64.whl requires: - hold merge_stage_test: @@ -188,7 +184,6 @@ workflows: torch: 1.6.0 # Use double quotation mark to explicitly specify its type # as string instead of number - mmcv: https://download.openmmlab.com/mmcv/dev-2.x/cu101/torch1.6.0/mmcv_full-2.0.0rc0-cp37-cp37m-manylinux1_x86_64.whl cuda: "10.1" filters: branches: diff --git a/.dev_scripts/benchmark_regression/1-benchmark_valid.py b/.dev_scripts/benchmark_regression/1-benchmark_valid.py index a9e33c64..eb1541a1 100644 --- a/.dev_scripts/benchmark_regression/1-benchmark_valid.py +++ b/.dev_scripts/benchmark_regression/1-benchmark_valid.py @@ -2,15 +2,17 @@ import logging import re import tempfile from argparse import ArgumentParser +from collections import OrderedDict from pathlib import Path from time import time -from typing import OrderedDict import mmcv import numpy as np import torch -from mmengine import Config, MMLogger, Runner -from mmengine.dataset import Compose +from mmengine import Config, DictAction, MMLogger +from mmengine.dataset import Compose, default_collate +from mmengine.fileio import FileClient +from mmengine.runner import Runner from modelindex.load_model_index import load from rich.console import Console from rich.table import Table @@ -52,6 +54,16 @@ def parse_args(): '--flops-str', action='store_true', help='Output FLOPs and params counts in a string form.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') args = parser.parse_args() return args @@ -62,6 +74,8 @@ def inference(config_file, checkpoint, work_dir, args, exp_name): cfg.load_from = checkpoint cfg.log_level = 'WARN' cfg.experiment_name = exp_name + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) # build the data pipeline test_dataset = cfg.test_dataloader.dataset @@ -72,7 +86,8 @@ def inference(config_file, checkpoint, work_dir, args, exp_name): test_dataset.pipeline.insert(1, dict(type='Resize', scale=32)) data = Compose(test_dataset.pipeline)({'img_path': args.img}) - resolution = tuple(data['inputs'].shape[1:]) + data = default_collate([data]) + resolution = tuple(data['inputs'].shape[-2:]) runner: Runner = Runner.from_cfg(cfg) model = runner.model @@ -83,26 +98,30 @@ def inference(config_file, checkpoint, work_dir, args, exp_name): if args.inference_time: time_record = [] for _ in range(10): + model.val_step(data) # warmup before profiling + torch.cuda.synchronize() start = time() - model.val_step([data]) + model.val_step(data) + torch.cuda.synchronize() time_record.append((time() - start) * 1000) result['time_mean'] = np.mean(time_record[1:-1]) result['time_std'] = np.std(time_record[1:-1]) else: - model.val_step([data]) + model.val_step(data) result['model'] = config_file.stem if args.flops: - from mmcv.cnn.utils import get_model_complexity_info + from fvcore.nn import FlopCountAnalysis, parameter_count + from fvcore.nn.print_model_statistics import _format_size + _format_size = _format_size if args.flops_str else lambda x: x with torch.no_grad(): if hasattr(model, 'extract_feat'): model.forward = model.extract_feat - flops, params = get_model_complexity_info( - model, - input_shape=(3, ) + resolution, - print_per_layer_stat=False, - as_strings=args.flops_str) + model.to('cpu') + inputs = (torch.randn((1, 3, *resolution)), ) + flops = _format_size(FlopCountAnalysis(model, inputs).total()) + params = _format_size(parameter_count(model)['']) result['flops'] = flops if args.flops_str else int(flops) result['params'] = params if args.flops_str else int(params) else: @@ -184,7 +203,6 @@ def main(args): if args.checkpoint_root is not None: root = args.checkpoint_root if 's3://' in args.checkpoint_root: - from mmcv.fileio import FileClient from petrel_client.common.exception import AccessDeniedError file_client = FileClient.infer_client(uri=root) checkpoint = file_client.join_path( diff --git a/.dev_scripts/benchmark_regression/2-benchmark_test.py b/.dev_scripts/benchmark_regression/2-benchmark_test.py index 380e2519..cd49bd09 100644 --- a/.dev_scripts/benchmark_regression/2-benchmark_test.py +++ b/.dev_scripts/benchmark_regression/2-benchmark_test.py @@ -62,6 +62,12 @@ def parse_args(): action='store_true', help='Summarize benchmark test results.') parser.add_argument('--save', action='store_true', help='Save the summary') + parser.add_argument( + '--cfg-options', + nargs='+', + type=str, + default=[], + help='Config options for all config files.') args = parser.parse_args() return args @@ -76,7 +82,7 @@ def create_test_job_batch(commands, model_info, args, port, script_name): http_prefix = 'https://download.openmmlab.com/mmclassification/' if 's3://' in args.checkpoint_root: - from mmcv.fileio import FileClient + from mmengine.fileio import FileClient from petrel_client.common.exception import AccessDeniedError file_client = FileClient.infer_client(uri=args.checkpoint_root) checkpoint = 
file_client.join_path( @@ -125,6 +131,7 @@ def create_test_job_batch(commands, model_info, args, port, script_name): f'--work-dir={work_dir} ' f'--out={result_file} ' f'--cfg-option dist_params.port={port} ' + f'{" ".join(args.cfg_options)} ' f'--launcher={launcher}\n') with open(work_dir / 'job.sh', 'w') as f: diff --git a/.dev_scripts/benchmark_regression/3-benchmark_train.py b/.dev_scripts/benchmark_regression/3-benchmark_train.py index 6d78be50..9e240cc5 100644 --- a/.dev_scripts/benchmark_regression/3-benchmark_train.py +++ b/.dev_scripts/benchmark_regression/3-benchmark_train.py @@ -3,22 +3,48 @@ import json import os import os.path as osp import re +from collections import OrderedDict from datetime import datetime from pathlib import Path from zipfile import ZipFile +import yaml from modelindex.load_model_index import load from rich.console import Console from rich.syntax import Syntax from rich.table import Table console = Console() +MMCLS_ROOT = Path(__file__).absolute().parents[2] +CYCLE_LEVELS = ['month', 'quarter', 'half-year', 'no-training'] METRICS_MAP = { 'Top 1 Accuracy': 'accuracy/top1', 'Top 5 Accuracy': 'accuracy/top5' } +class RangeAction(argparse.Action): + + def __call__(self, parser, namespace, values: str, option_string): + matches = re.match(r'([><=]*)([-\w]+)', values) + if matches is None: + raise ValueError(f'Unavailable range option {values}') + symbol, range_str = matches.groups() + assert range_str in CYCLE_LEVELS, \ + f'{range_str} is not in {CYCLE_LEVELS}.' + level = CYCLE_LEVELS.index(range_str) + symbol = symbol or '<=' + ranges = set() + if '=' in symbol: + ranges.add(level) + if '>' in symbol: + ranges.update(range(level + 1, len(CYCLE_LEVELS))) + if '<' in symbol: + ranges.update(range(level)) + assert len(ranges) > 0, 'No range is selected.' + setattr(namespace, self.dest, ranges) + + def parse_args(): parser = argparse.ArgumentParser( description='Train models (in bench_train.yml) and compare accuracy.') @@ -32,6 +58,14 @@ def parse_args(): parser.add_argument('--port', type=int, default=29666, help='dist port') parser.add_argument( '--models', nargs='+', type=str, help='Specify model names to run.') + parser.add_argument( + '--range', + type=str, + default={0}, + action=RangeAction, + metavar='{month,quarter,half-year,no-training}', + help='The training benchmark range, "no-training" means all models ' "including those we haven't trained.") + parser.add_argument( + '--work-dir', + default='work_dirs/benchmark_train', @@ -63,18 +97,33 @@ def parse_args(): '--save', action='store_true', help='Save the summary and archive log files.') + parser.add_argument( + '--cfg-options', + nargs='+', + type=str, + default=[], + help='Config options for all config files.') args = parser.parse_args() return args +def get_gpu_number(model_info): + config = osp.basename(model_info.config) + matches = re.match(r'.*[-_](\d+)xb(\d+).*', config) + if matches is None: + raise ValueError( + f'Cannot get GPU numbers from the config name {config}') + gpus = int(matches.groups()[0]) + return gpus + + def create_train_job_batch(commands, model_info, args, port, script_name): fname = model_info.name - assert 'Gpus' in model_info.data, \ - f"Haven't specify gpu numbers for {fname}" - gpus = model_info.data['Gpus'] + gpus = get_gpu_number(model_info) + gpus_per_node = min(gpus, 8) config = Path(model_info.config) assert config.exists(), f'"{fname}": {config} not found.'
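For context on the `1-benchmark_valid.py` hunk further up: the new timing loop warms the model up and brackets each measured `val_step` call with `torch.cuda.synchronize()`, so asynchronously queued CUDA kernels no longer leak into or out of the timed window. The general pattern, as a standalone sketch (hypothetical `fn`, CUDA device assumed, not code from the repo):

```python
from time import time

import torch

def time_on_gpu(fn, warmup=1, repeats=10):
    """Millisecond timings of `fn`, robust to CUDA's async execution."""
    for _ in range(warmup):
        fn()  # warm up allocator, cudnn autotuning, etc.
    records = []
    for _ in range(repeats):
        torch.cuda.synchronize()  # drain queued kernels before timing starts
        start = time()
        fn()
        torch.cuda.synchronize()  # wait for fn's kernels to actually finish
        records.append((time() - start) * 1000)
    return records
```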
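The `RangeAction` added to `3-benchmark_train.py` above maps a value such as `>=quarter` onto a set of `CYCLE_LEVELS` indices. The same parsing logic, extracted into a self-contained sketch with its expected outputs as assertions:

```python
import re

CYCLE_LEVELS = ['month', 'quarter', 'half-year', 'no-training']

def parse_range(values: str) -> set:
    # Mirrors RangeAction.__call__ above, extracted for illustration.
    symbol, range_str = re.match(r'([><=]*)([-\w]+)', values).groups()
    level = CYCLE_LEVELS.index(range_str)
    symbol = symbol or '<='  # a bare level selects it and all faster cycles
    ranges = set()
    if '=' in symbol:
        ranges.add(level)
    if '>' in symbol:
        ranges.update(range(level + 1, len(CYCLE_LEVELS)))
    if '<' in symbol:
        ranges.update(range(level))
    return ranges

assert parse_range('quarter') == {0, 1}       # '<=' is implied
assert parse_range('>=quarter') == {1, 2, 3}
assert parse_range('>month') == {1, 2, 3}
```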
@@ -101,15 +150,17 @@ def create_train_job_batch(commands, model_info, args, port, script_name): f'#SBATCH --output {work_dir}/job.%j.out\n' f'#SBATCH --partition={args.partition}\n' f'#SBATCH --job-name {job_name}\n' - f'#SBATCH --gres=gpu:8\n' + f'#SBATCH --gres=gpu:{gpus_per_node}\n' f'{mail_cfg}{quota_cfg}' - f'#SBATCH --ntasks-per-node=8\n' + f'#SBATCH --ntasks-per-node={gpus_per_node}\n' f'#SBATCH --ntasks={gpus}\n' f'#SBATCH --cpus-per-task=5\n\n' f'{runner} -u {script_name} {config} ' f'--work-dir={work_dir} --cfg-option ' - f'dist_params.port={port} ' - f'checkpoint_config.max_keep_ckpts=10 ' + f'env_cfg.dist_cfg.port={port} ' + f'{" ".join(args.cfg_options)} ' + f'default_hooks.checkpoint.max_keep_ckpts=2 ' + f'default_hooks.checkpoint.save_best="auto" ' f'--launcher={launcher}\n') with open(work_dir / 'job.sh', 'w') as f: @@ -124,33 +175,16 @@ def create_train_job_batch(commands, model_info, args, port, script_name): return work_dir / 'job.sh' -def train(args): - models_cfg = load(str(Path(__file__).parent / 'bench_train.yml')) - models_cfg.build_models_with_collections() - models = {model.name: model for model in models_cfg.models} - +def train(models, args): script_name = osp.join('tools', 'train.py') port = args.port commands = [] - if args.models: - patterns = [re.compile(pattern) for pattern in args.models] - filter_models = {} - for k, v in models.items(): - if any([re.match(pattern, k) for pattern in patterns]): - filter_models[k] = v - if len(filter_models) == 0: - print('No model found, please specify models in:') - print('\n'.join(models.keys())) - return - models = filter_models for model_info in models.values(): - months = model_info.data.get('Months', range(1, 13)) - if datetime.now().month in months: - script_path = create_train_job_batch(commands, model_info, args, - port, script_name) - port += 1 + script_path = create_train_job_batch(commands, model_info, args, port, + script_name) + port += 1 command_str = '\n'.join(commands) @@ -245,12 +279,14 @@ def show_summary(summary_data): metric = summary[metric_key] expect = metric['expect'] last = metric['last'] + last_epoch = metric['last_epoch'] last_color = set_color(last, expect) best = metric['best'] best_color = set_color(best, expect) best_epoch = metric['best_epoch'] row.append(f'{expect:.2f}') - row.append(f'[{last_color}]{last:.2f}[/{last_color}]') + row.append( + f'[{last_color}]{last:.2f}[/{last_color}] ({last_epoch})') row.append( f'[{best_color}]{best:.2f}[/{best_color}] ({best_epoch})') table.add_row(*row) @@ -258,25 +294,11 @@ def show_summary(summary_data): console.print(table) -def summary(args): - models_cfg = load(str(Path(__file__).parent / 'bench_train.yml')) - models = {model.name: model for model in models_cfg.models} +def summary(models, args): work_dir = Path(args.work_dir) dir_map = {p.name: p for p in work_dir.iterdir() if p.is_dir()} - if args.models: - patterns = [re.compile(pattern) for pattern in args.models] - filter_models = {} - for k, v in models.items(): - if any([re.match(pattern, k) for pattern in patterns]): - filter_models[k] = v - if len(filter_models) == 0: - print('No model found, please specify models in:') - print('\n'.join(models.keys())) - return - models = filter_models - summary_data = {} for model_name, model_info in models.items(): @@ -287,17 +309,19 @@ def summary(args): # Skip if not found any vis_data folder. 
sub_dir = dir_map[model_name] - vis_folders = [d for d in sub_dir.iterdir() if d.is_dir()] - if len(vis_folders) == 0: - continue - log_file = sorted(vis_folders)[-1] / 'vis_data' / 'scalars.json' - if not log_file.exists(): + log_files = [f for f in sub_dir.glob('*/vis_data/scalars.json')] + if len(log_files) == 0: continue + log_file = sorted(log_files)[-1] # parse train log with open(log_file) as f: json_logs = [json.loads(s) for s in f.readlines()] - val_logs = [log for log in json_logs if 'loss' not in log] + val_logs = [ + log for log in json_logs + # TODO: need a better method to extract validate log + if 'loss' not in log and 'accuracy/top1' in log + ] if len(val_logs) == 0: continue @@ -320,9 +344,10 @@ def summary(args): summary[key_yml] = dict( expect=expect_result, last=last, + last_epoch=len(val_logs), best=best, - best_epoch=best_epoch) - summary_data[model_name] = summary + best_epoch=best_epoch + 1) + summary_data[model_name].update(summary) show_summary(summary_data) if args.save: @@ -332,10 +357,39 @@ def summary(args): def main(): args = parse_args() + model_index_file = MMCLS_ROOT / 'model-index.yml' + model_index = load(str(model_index_file)) + model_index.build_models_with_collections() + all_models = {model.name: model for model in model_index.models} + + with open(Path(__file__).parent / 'bench_train.yml', 'r') as f: + train_items = yaml.safe_load(f) + models = OrderedDict() + for item in train_items: + name = item['Name'] + model_info = all_models[name] + model_info.cycle = item.get('Cycle', None) + cycle = getattr(model_info, 'cycle', 'month') + cycle_level = CYCLE_LEVELS.index(cycle) + if cycle_level in args.range: + models[name] = model_info + + if args.models: + patterns = [re.compile(pattern) for pattern in args.models] + filter_models = {} + for k, v in models.items(): + if any([re.match(pattern, k) for pattern in patterns]): + filter_models[k] = v + if len(filter_models) == 0: + print('No model found, please specify models in:') + print('\n'.join(models.keys())) + return + models = filter_models + if args.summary: - summary(args) + summary(models, args) else: - train(args) + train(models, args) if __name__ == '__main__': diff --git a/.dev_scripts/benchmark_regression/bench_train.yml b/.dev_scripts/benchmark_regression/bench_train.yml index c7326849..9f6e11eb 100644 --- a/.dev_scripts/benchmark_regression/bench_train.yml +++ b/.dev_scripts/benchmark_regression/bench_train.yml @@ -1,88 +1,86 @@ -Models: - - Name: resnet50 - Results: - - Dataset: ImageNet-1k - Metrics: - Top 1 Accuracy: 76.55 - Top 5 Accuracy: 93.06 - Config: configs/resnet/resnet50_8xb32_in1k.py - Gpus: 8 +- Name: mobilenet-v2_8xb32_in1k + Cycle: month - - Name: seresnet50 - Results: - - Dataset: ImageNet-1k - Metrics: - Top 1 Accuracy: 77.74 - Top 5 Accuracy: 93.84 - Config: configs/seresnet/seresnet50_8xb32_in1k.py - Gpus: 8 +- Name: resnet50_8xb32_in1k + Cycle: month - - Name: vit-base - Results: - - Dataset: ImageNet-1k - Metrics: - Top 1 Accuracy: 82.37 - Top 5 Accuracy: 96.15 - Config: configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py - Gpus: 32 +- Name: seresnet50_8xb32_in1k + Cycle: month - - Name: mobilenetv2 - Results: - - Dataset: ImageNet-1k - Metrics: - Top 1 Accuracy: 71.86 - Top 5 Accuracy: 90.42 - Config: configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py - Gpus: 8 +- Name: swin-small_16xb64_in1k + Cycle: month - - Name: swin_tiny - Results: - - Dataset: ImageNet - Metrics: - Top 1 Accuracy: 81.18 - Top 5 Accuracy: 95.61 - Weights: 
https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth - Config: configs/swin_transformer/swin-tiny_16xb64_in1k.py - Gpus: 16 +- Name: vit-base-p16_pt-32xb128-mae_in1k + Cycle: month - - Name: vgg16 - Results: - - Dataset: ImageNet-1k - Metrics: - Top 1 Accuracy: 71.62 - Top 5 Accuracy: 90.49 - Config: configs/vgg/vgg16_8xb32_in1k.py - Gpus: 8 - Months: - - 1 - - 4 - - 7 - - 10 +- Name: resnet50_8xb256-rsb-a1-600e_in1k + Cycle: quarter - - Name: shufflenet_v2 - Results: - - Dataset: ImageNet-1k - Metrics: - Top 1 Accuracy: 69.55 - Top 5 Accuracy: 88.92 - Config: configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py - Gpus: 16 - Months: - - 2 - - 5 - - 8 - - 11 +- Name: resnext50-32x4d_8xb32_in1k + Cycle: quarter - - Name: resnet-rsb - Results: - - Dataset: ImageNet-1k - Metrics: - Top 1 Accuracy: 80.12 - Top 5 Accuracy: 94.78 - Config: configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py - Gpus: 8 - Months: - - 3 - - 6 - - 9 - - 12 +- Name: shufflenet-v2-1x_16xb64_in1k + Cycle: quarter + +- Name: vgg16_8xb32_in1k + Cycle: quarter + +- Name: shufflenet-v1-1x_16xb64_in1k + Cycle: half-year + +- Name: t2t-vit-t-14_8xb64_in1k + Cycle: half-year + +- Name: regnetx-1.6gf_8xb128_in1k + Cycle: half-year + +- Name: van-small_8xb128_in1k + Cycle: no-training + +- Name: res2net50-w14-s8_3rdparty_8xb32_in1k + Cycle: no-training + +- Name: repvgg-A2_3rdparty_4xb64-coslr-120e_in1k + Cycle: no-training + +- Name: tnt-small-p16_3rdparty_in1k + Cycle: no-training + +- Name: mlp-mixer-base-p16_3rdparty_64xb64_in1k + Cycle: no-training + +- Name: conformer-small-p16_3rdparty_8xb128_in1k + Cycle: no-training + +- Name: twins-pcpvt-base_3rdparty_8xb128_in1k + Cycle: no-training + +- Name: efficientnet-b0_3rdparty_8xb32_in1k + Cycle: no-training + +- Name: convnext-small_3rdparty_32xb128_in1k + Cycle: no-training + +- Name: hrnet-w18_3rdparty_8xb32_in1k + Cycle: no-training + +- Name: repmlp-base_3rdparty_8xb64_in1k + Cycle: no-training + +- Name: wide-resnet50_3rdparty_8xb32_in1k + Cycle: no-training + +- Name: cspresnet50_3rdparty_8xb32_in1k + Cycle: no-training + +- Name: convmixer-768-32_10xb64_in1k + Cycle: no-training + +- Name: densenet169_4xb256_in1k + Cycle: no-training + +- Name: poolformer-s24_3rdparty_32xb128_in1k + Cycle: no-training + +- Name: inception-v3_3rdparty_8xb32_in1k + Cycle: no-training diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..0c1cd14d --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,27 @@ +name: lint + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install pre-commit hook + run: | + pip install pre-commit + pre-commit install + - name: Linting + run: pre-commit run --all-files + - name: Check docstring coverage + run: | + pip install interrogate + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 60 mmcls diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml new file mode 100644 index 00000000..67b154a6 --- /dev/null +++ b/.github/workflows/pr_stage_test.yml @@ -0,0 +1,87 @@ +name: pr_stage_test + +on: + pull_request: + paths-ignore: + - 'README.md' + - 'README_zh-CN.md' + - 
'docs/**' + - 'demo/**' + - 'tools/**' + - 'configs/**' + - '.dev_scripts/**' + - '.circleci/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-18.04 + strategy: + matrix: + python-version: [3.7] + include: + - torch: 1.8.1 + torchvision: 0.9.1 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install pip --upgrade + - name: Install PyTorch + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install mmcls dependencies + run: | + pip install git+https://github.com/open-mmlab/mmengine.git@main + pip install -U openmim + mim install 'mmcv >= 2.0.0rc1' + pip install -r requirements.txt + - name: Build and install + run: pip install -e . + - name: Run unittests and generate coverage report + run: | + coverage run --branch --source mmcls -m pytest tests/ -k 'not timm' + coverage xml + coverage report -m + # Upload coverage report for python3.7 && pytorch1.8.1 cpu + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1.0.14 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false + + build_windows: + runs-on: windows-2022 + strategy: + matrix: + python-version: [3.7] + platform: [cu111] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install pip --upgrade + - name: Install PyTorch + run: pip install torch==1.8.2+${{matrix.platform}} torchvision==0.9.2+${{matrix.platform}} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html + - name: Install mmcls dependencies + run: | + pip install git+https://github.com/open-mmlab/mmengine.git@main + pip install -U openmim + mim install 'mmcv >= 2.0.0rc1' + pip install -r requirements.txt + - name: Build and install + run: pip install -e .
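Both CI systems deselect the timm-dependent tests by name (`pytest -k 'not timm'`, used in the unittest steps here and in `.circleci/test.yml` above). A hypothetical `conftest.py` alternative, not part of this patch, would skip them only when the optional dependency is actually absent:

```python
# conftest.py (hypothetical): skip timm-dependent tests when timm is missing,
# instead of deselecting them by name with -k 'not timm'.
import importlib.util

import pytest

def pytest_collection_modifyitems(config, items):
    if importlib.util.find_spec('timm') is not None:
        return  # timm is available; run everything
    skip_timm = pytest.mark.skip(reason='timm is not installed')
    for item in items:
        if 'timm' in item.nodeid:
            item.add_marker(skip_timm)
```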
+ - name: Run unittests + run: | + pytest tests/ -k 'not timm' --ignore tests/test_models/test_backbones diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml new file mode 100644 index 00000000..08936cb2 --- /dev/null +++ b/.github/workflows/publish-to-pypi.yml @@ -0,0 +1,22 @@ +name: deploy + +on: push + +jobs: + build-n-publish: + runs-on: ubuntu-latest + if: startsWith(github.event.ref, 'refs/tags') + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Build MMClassification + run: | + pip install wheel + python setup.py sdist bdist_wheel + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} diff --git a/.github/workflows/test_mim.yml b/.github/workflows/test_mim.yml new file mode 100644 index 00000000..f437e4e5 --- /dev/null +++ b/.github/workflows/test_mim.yml @@ -0,0 +1,44 @@ +name: test-mim + +on: + push: + paths: + - 'model-index.yml' + - 'configs/**' + + pull_request: + paths: + - 'model-index.yml' + - 'configs/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build_cpu: + runs-on: ubuntu-18.04 + strategy: + matrix: + python-version: [3.7] + torch: [1.8.0] + include: + - torch: 1.8.0 + torch_version: torch1.8 + torchvision: 0.9.0 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: pip install pip --upgrade + - name: Install PyTorch + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install openmim + run: pip install openmim + - name: Build and install + run: mim install -e . + - name: test commands of mim + run: mim search mmcls diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d19d5f6..c55af0f2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,10 +44,10 @@ repos: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] - repo: https://github.com/open-mmlab/pre-commit-hooks - rev: v0.2.0 + rev: v0.4.0 hooks: - id: check-copyright - args: ["mmcls", "tests", "demo", "tools"] + args: ["mmcls", "tests", "demo", "tools", "--excludes", "mmcls/.mim/", "--ignore-file-not-found-error"] # - repo: local # hooks: # - id: clang-format diff --git a/README.md b/README.md index 4e2280de..30ffebac 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,11 @@ The `1.x` branch works with **PyTorch 1.6+**. ## What's new +v1.0.0rc1 was released on 30/9/2022. + +- Support MViT, EdgeNeXt, Swin-Transformer V2, EfficientFormer and MobileOne. +- Support BEiT-style transformer layer. + v1.0.0rc0 was released on 31/8/2022. This release introduced a brand new and flexible training & test engine, but it's still in progress.
Welcome @@ -115,6 +120,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea - [x] [MobileNetV2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mobilenet_v2) - [x] [MobileNetV3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mobilenet_v3) - [x] [Swin-Transformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/swin_transformer) +- [x] [Swin-Transformer V2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/swin_transformer_v2) - [x] [RepVGG](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/repvgg) - [x] [Vision-Transformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/vision_transformer) - [x] [Transformer-in-Transformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/tnt) @@ -125,6 +131,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea - [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/t2t_vit) - [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/twins) - [x] [EfficientNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/efficientnet) +- [x] [EdgeNeXt](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/edgenext) - [x] [ConvNeXt](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/convnext) - [x] [HRNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/hrnet) - [x] [VAN](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/van) @@ -132,6 +139,9 @@ Results and models are available in the [model zoo](https://mmclassification.rea - [x] [CSPNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/cspnet) - [x] [PoolFormer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/poolformer) - [x] [Inception V3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/inception_v3) +- [x] [MobileOne](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mobileone) +- [x] [EfficientFormer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/efficientformer) +- [x] [MViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mvit) diff --git a/README_zh-CN.md b/README_zh-CN.md index e626e269..0b3ae763 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,6 +57,11 @@ MMClassification 是一款基于 PyTorch 的开源图像分类工具箱,是 [O ## 更新日志 +2022/9/30 发布了 v1.0.0rc1 版本 + +- 支持了 MViT,EdgeNeXt,Swin-Transformer V2,EfficientFormer,MobileOne 等主干网络。 +- 支持了 BEiT 风格的 transformer 层。 + 2022/8/31 发布了 v1.0.0rc0 版本 这个版本引入一个全新的,可扩展性强的训练和测试引擎,但目前仍在开发中。欢迎根据[文档](https://mmclassification.readthedocs.io/zh_CN/1.x/)进行试用。 @@ -114,6 +119,7 @@ mim install -e . - [x] [MobileNetV2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mobilenet_v2) - [x] [MobileNetV3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mobilenet_v3) - [x] [Swin-Transformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/swin_transformer) +- [x] [Swin-Transformer V2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/swin_transformer_v2) - [x] [RepVGG](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/repvgg) - [x] [Vision-Transformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/vision_transformer) - [x] [Transformer-in-Transformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/tnt) @@ -124,6 +130,7 @@ mim install -e . 
- [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/t2t_vit) - [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/twins) - [x] [EfficientNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/efficientnet) +- [x] [EdgeNeXt](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/edgenext) - [x] [ConvNeXt](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/convnext) - [x] [HRNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/hrnet) - [x] [VAN](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/van) @@ -131,6 +138,9 @@ mim install -e . - [x] [CSPNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/cspnet) - [x] [PoolFormer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/poolformer) - [x] [Inception V3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/inception_v3) +- [x] [MobileOne](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mobileone) +- [x] [EfficientFormer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/efficientformer) +- [x] [MViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mvit) diff --git a/configs/_base_/datasets/imagenet_bs64_edgenext_256.py b/configs/_base_/datasets/imagenet_bs64_edgenext_256.py new file mode 100644 index 00000000..18f78064 --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs64_edgenext_256.py @@ -0,0 +1,83 @@ +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +bgr_mean = data_preprocessor['mean'][::-1] +bgr_std = data_preprocessor['std'][::-1] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=256, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict( + type='RandAugment', + policies='timm_increasing', + num_policies=2, + total_level=10, + magnitude_level=9, + magnitude_std=0.5, + hparams=dict( + pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=bgr_mean, + fill_std=bgr_std), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='ResizeEdge', + scale=292, + edge='short', + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=256), + dict(type='PackClsInputs') +] + +train_dataloader = dict( + batch_size=64, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/train.txt', + data_prefix='train', + pipeline=train_pipeline), + sampler=dict(type='DefaultSampler', shuffle=True), + persistent_workers=True, +) + +val_dataloader = dict( + batch_size=64, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/val.txt', + data_prefix='val', + pipeline=test_pipeline), + sampler=dict(type='DefaultSampler', shuffle=False), + persistent_workers=True, +) +val_evaluator = dict(type='Accuracy', topk=(1, 5)) + +# If you want standard test, please manually configure the test dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/imagenet_bs64_swin_256.py b/configs/_base_/datasets/imagenet_bs64_swin_256.py new 
file mode 100644 index 00000000..db32589e --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs64_swin_256.py @@ -0,0 +1,83 @@ +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +bgr_mean = data_preprocessor['mean'][::-1] +bgr_std = data_preprocessor['std'][::-1] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=256, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict( + type='RandAugment', + policies='timm_increasing', + num_policies=2, + total_level=10, + magnitude_level=9, + magnitude_std=0.5, + hparams=dict( + pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=bgr_mean, + fill_std=bgr_std), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='ResizeEdge', + scale=292, # ( 256 / 224 * 256 ) + edge='short', + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=256), + dict(type='PackClsInputs'), +] + +train_dataloader = dict( + batch_size=64, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/train.txt', + data_prefix='train', + pipeline=train_pipeline), + sampler=dict(type='DefaultSampler', shuffle=True), + persistent_workers=True, +) + +val_dataloader = dict( + batch_size=64, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/val.txt', + data_prefix='val', + pipeline=test_pipeline), + sampler=dict(type='DefaultSampler', shuffle=False), + persistent_workers=True, +) +val_evaluator = dict(type='Accuracy', topk=(1, 5)) + +# If you want standard test, please manually configure the test dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator diff --git a/configs/_base_/models/edgenext/edgenext-base.py b/configs/_base_/models/edgenext/edgenext-base.py new file mode 100644 index 00000000..37839729 --- /dev/null +++ b/configs/_base_/models/edgenext/edgenext-base.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='EdgeNeXt', + arch='base', + out_indices=(3, ), + drop_path_rate=0.1, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=584, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/edgenext/edgenext-small.py b/configs/_base_/models/edgenext/edgenext-small.py new file mode 100644 index 00000000..e1f7e172 --- /dev/null +++ b/configs/_base_/models/edgenext/edgenext-small.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='EdgeNeXt', + arch='small', + out_indices=(3, ), + drop_path_rate=0.1, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=304, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + 
)) diff --git a/configs/_base_/models/edgenext/edgenext-xsmall.py b/configs/_base_/models/edgenext/edgenext-xsmall.py new file mode 100644 index 00000000..69c7d0d6 --- /dev/null +++ b/configs/_base_/models/edgenext/edgenext-xsmall.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='EdgeNeXt', + arch='xsmall', + out_indices=(3, ), + drop_path_rate=0.1, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=192, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/edgenext/edgenext-xxsmall.py b/configs/_base_/models/edgenext/edgenext-xxsmall.py new file mode 100644 index 00000000..fb688195 --- /dev/null +++ b/configs/_base_/models/edgenext/edgenext-xxsmall.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='EdgeNeXt', + arch='xxsmall', + out_indices=(3, ), + drop_path_rate=0.1, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=168, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/efficientformer-l1.py b/configs/_base_/models/efficientformer-l1.py new file mode 100644 index 00000000..37dc62cd --- /dev/null +++ b/configs/_base_/models/efficientformer-l1.py @@ -0,0 +1,18 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='EfficientFormer', + arch='l1', + drop_path_rate=0, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.), + dict(type='Constant', layer=['LayerScale'], val=1e-5) + ]), + neck=dict(type='GlobalAveragePooling', dim=1), + head=dict( + type='EfficientFormerClsHead', in_channels=448, num_classes=1000)) diff --git a/configs/_base_/models/mobileone/mobileone_s0.py b/configs/_base_/models/mobileone/mobileone_s0.py new file mode 100644 index 00000000..39624e55 --- /dev/null +++ b/configs/_base_/models/mobileone/mobileone_s0.py @@ -0,0 +1,19 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='MobileOne', + arch='s0', + out_indices=(3, ), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1024, + loss=dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + ), + topk=(1, 5), + )) diff --git a/configs/_base_/models/mobileone/mobileone_s1.py b/configs/_base_/models/mobileone/mobileone_s1.py new file mode 100644 index 00000000..cea7762e --- /dev/null +++ b/configs/_base_/models/mobileone/mobileone_s1.py @@ -0,0 +1,19 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='MobileOne', + arch='s1', + out_indices=(3, ), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + ), + topk=(1, 5), + )) diff --git a/configs/_base_/models/mobileone/mobileone_s2.py b/configs/_base_/models/mobileone/mobileone_s2.py new file mode 100644 index 00000000..dfae0e1f --- /dev/null +++ 
b/configs/_base_/models/mobileone/mobileone_s2.py @@ -0,0 +1,19 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='MobileOne', + arch='s2', + out_indices=(3, ), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=2048, + loss=dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + ), + topk=(1, 5), + )) diff --git a/configs/_base_/models/mobileone/mobileone_s3.py b/configs/_base_/models/mobileone/mobileone_s3.py new file mode 100644 index 00000000..81356753 --- /dev/null +++ b/configs/_base_/models/mobileone/mobileone_s3.py @@ -0,0 +1,19 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='MobileOne', + arch='s3', + out_indices=(3, ), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=2048, + loss=dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + ), + topk=(1, 5), + )) diff --git a/configs/_base_/models/mobileone/mobileone_s4.py b/configs/_base_/models/mobileone/mobileone_s4.py new file mode 100644 index 00000000..282eec8b --- /dev/null +++ b/configs/_base_/models/mobileone/mobileone_s4.py @@ -0,0 +1,19 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='MobileOne', + arch='s4', + out_indices=(3, ), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=2048, + loss=dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + ), + topk=(1, 5), + )) diff --git a/configs/_base_/models/mvit/mvitv2-base.py b/configs/_base_/models/mvit/mvitv2-base.py new file mode 100644 index 00000000..137b15b2 --- /dev/null +++ b/configs/_base_/models/mvit/mvitv2-base.py @@ -0,0 +1,19 @@ +model = dict( + type='ImageClassifier', + backbone=dict(type='MViT', arch='base', drop_path_rate=0.3), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + in_channels=768, + num_classes=1000, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ])) diff --git a/configs/_base_/models/mvit/mvitv2-large.py b/configs/_base_/models/mvit/mvitv2-large.py new file mode 100644 index 00000000..4a1abd2b --- /dev/null +++ b/configs/_base_/models/mvit/mvitv2-large.py @@ -0,0 +1,23 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='MViT', + arch='large', + drop_path_rate=0.5, + dim_mul_in_attention=False), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + in_channels=1152, + num_classes=1000, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) 
+ ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ])) diff --git a/configs/_base_/models/mvit/mvitv2-small.py b/configs/_base_/models/mvit/mvitv2-small.py new file mode 100644 index 00000000..e3640c3d --- /dev/null +++ b/configs/_base_/models/mvit/mvitv2-small.py @@ -0,0 +1,19 @@ +model = dict( + type='ImageClassifier', + backbone=dict(type='MViT', arch='small', drop_path_rate=0.1), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + in_channels=768, + num_classes=1000, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ])) diff --git a/configs/_base_/models/mvit/mvitv2-tiny.py b/configs/_base_/models/mvit/mvitv2-tiny.py new file mode 100644 index 00000000..3b7fc690 --- /dev/null +++ b/configs/_base_/models/mvit/mvitv2-tiny.py @@ -0,0 +1,19 @@ +model = dict( + type='ImageClassifier', + backbone=dict(type='MViT', arch='tiny', drop_path_rate=0.1), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + in_channels=768, + num_classes=1000, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ])) diff --git a/configs/_base_/models/swin_transformer_v2/base_256.py b/configs/_base_/models/swin_transformer_v2/base_256.py new file mode 100644 index 00000000..0e00004e --- /dev/null +++ b/configs/_base_/models/swin_transformer_v2/base_256.py @@ -0,0 +1,26 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='SwinTransformerV2', + arch='base', + img_size=256, + drop_path_rate=0.5), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1024, + init_cfg=None, # suppress the default init_cfg of LinearClsHead. + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + cal_acc=False), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ]), +) diff --git a/configs/_base_/models/swin_transformer_v2/base_384.py b/configs/_base_/models/swin_transformer_v2/base_384.py new file mode 100644 index 00000000..5fb9aead --- /dev/null +++ b/configs/_base_/models/swin_transformer_v2/base_384.py @@ -0,0 +1,17 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='SwinTransformerV2', + arch='base', + img_size=384, + drop_path_rate=0.2), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1024, + init_cfg=None, # suppress the default init_cfg of LinearClsHead. 
+ loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + cal_acc=False)) diff --git a/configs/_base_/models/swin_transformer_v2/large_256.py b/configs/_base_/models/swin_transformer_v2/large_256.py new file mode 100644 index 00000000..fe557c32 --- /dev/null +++ b/configs/_base_/models/swin_transformer_v2/large_256.py @@ -0,0 +1,16 @@ +# model settings +# Only for evaluation +model = dict( + type='ImageClassifier', + backbone=dict( + type='SwinTransformerV2', + arch='large', + img_size=256, + drop_path_rate=0.2), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1536, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5))) diff --git a/configs/_base_/models/swin_transformer_v2/large_384.py b/configs/_base_/models/swin_transformer_v2/large_384.py new file mode 100644 index 00000000..a626c407 --- /dev/null +++ b/configs/_base_/models/swin_transformer_v2/large_384.py @@ -0,0 +1,16 @@ +# model settings +# Only for evaluation +model = dict( + type='ImageClassifier', + backbone=dict( + type='SwinTransformerV2', + arch='large', + img_size=384, + drop_path_rate=0.2), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1536, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5))) diff --git a/configs/_base_/models/swin_transformer_v2/small_256.py b/configs/_base_/models/swin_transformer_v2/small_256.py new file mode 100644 index 00000000..5c296dc9 --- /dev/null +++ b/configs/_base_/models/swin_transformer_v2/small_256.py @@ -0,0 +1,26 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='SwinTransformerV2', + arch='small', + img_size=256, + drop_path_rate=0.3), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=768, + init_cfg=None, # suppress the default init_cfg of LinearClsHead. + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + cal_acc=False), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ]), +) diff --git a/configs/_base_/models/swin_transformer_v2/tiny_256.py b/configs/_base_/models/swin_transformer_v2/tiny_256.py new file mode 100644 index 00000000..73059428 --- /dev/null +++ b/configs/_base_/models/swin_transformer_v2/tiny_256.py @@ -0,0 +1,26 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='SwinTransformerV2', + arch='tiny', + img_size=256, + drop_path_rate=0.2), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=768, + init_cfg=None, # suppress the default init_cfg of LinearClsHead. + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + cal_acc=False), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) 
+ ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ]), +) diff --git a/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py b/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py index 3a2797bc..5b52ea6e 100644 --- a/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py +++ b/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py @@ -23,18 +23,11 @@ param_scheduler = [ type='LinearLR', start_factor=1e-3, by_epoch=True, - begin=0, end=20, # update by iter convert_to_iter_based=True), # main learning rate scheduler - dict( - type='CosineAnnealingLR', - T_max=280, - eta_min=1e-5, - by_epoch=True, - begin=20, - end=300) + dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20) ] # train, val, test setting diff --git a/configs/_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py b/configs/_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py index 18b8554d..cf38d473 100644 --- a/configs/_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py +++ b/configs/_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py @@ -6,14 +6,8 @@ optim_wrapper = dict( # learning policy param_scheduler = [ - dict( - type='ConstantLR', - factor=0.1, - by_epoch=True, - begin=0, - end=5, - convert_to_iter_based=True), - dict(type='PolyLR', eta_min=0, by_epoch=True, begin=5, end=300) + dict(type='ConstantLR', factor=0.1, by_epoch=False, begin=0, end=5000), + dict(type='PolyLR', eta_min=0, by_epoch=False, begin=5000) ] # train, val, test setting diff --git a/configs/_base_/schedules/imagenet_bs2048_AdamW.py b/configs/_base_/schedules/imagenet_bs2048_AdamW.py index 13011a5a..bbfae8ef 100644 --- a/configs/_base_/schedules/imagenet_bs2048_AdamW.py +++ b/configs/_base_/schedules/imagenet_bs2048_AdamW.py @@ -11,25 +11,22 @@ optim_wrapper = dict( ) # learning policy +warmup_epochs = 15 # about 10000 iterations for ImageNet-1k param_scheduler = [ # warm up learning rate scheduler dict( type='LinearLR', start_factor=1e-3, by_epoch=True, - begin=0, - # about 10000 iterations for ImageNet-1k - end=15, + end=warmup_epochs, # update by iter convert_to_iter_based=True), # main learning rate scheduler dict( type='CosineAnnealingLR', - T_max=285, eta_min=1e-5, by_epoch=True, - begin=15, - end=300) + begin=warmup_epochs) ] # train, val, test setting diff --git a/configs/conformer/README.md b/configs/conformer/README.md index 5b7d96b7..c0825847 100644 --- a/configs/conformer/README.md +++ b/configs/conformer/README.md @@ -16,12 +16,12 @@ Within Convolutional Neural Network (CNN), the convolution operations are good a ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-------------------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------------: | :-----------------------------------------------------------------------: | -| Conformer-tiny-p16\* | 23.52 | 4.90 | 81.31 | 95.60 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-tiny-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth) | -| Conformer-small-p32\* | 38.85 | 7.09 | 81.96 | 96.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-small-p32_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth) | 
-| Conformer-small-p16\* | 37.67 | 10.31 | 83.32 | 96.46 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-small-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth) | -| Conformer-base-p16\* | 83.29 | 22.89 | 83.82 | 96.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-------------------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------: | :------------------------------------------------------------------------------------------------: | +| Conformer-tiny-p16\* | 23.52 | 4.90 | 81.31 | 95.60 | [config](./conformer-tiny-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth) | +| Conformer-small-p32\* | 38.85 | 7.09 | 81.96 | 96.02 | [config](./conformer-small-p32_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth) | +| Conformer-small-p16\* | 37.67 | 10.31 | 83.32 | 96.46 | [config](./conformer-small-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth) | +| Conformer-base-p16\* | 83.29 | 22.89 | 83.82 | 96.59 | [config](./conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) | *Models with * are converted from the [official repo](https://github.com/pengzhiliang/Conformer). The config files of these models are only for validation. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/convmixer/README.md b/configs/convmixer/README.md index 763bad3c..d3ffa8af 100644 --- a/configs/convmixer/README.md +++ b/configs/convmixer/README.md @@ -20,11 +20,11 @@ Although convolutional networks have been the dominant architecture for vision t ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-----------------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------------: | :------------------------------------------------------------------------: | -| ConvMixer-768/32\* | 21.11 | 19.62 | 80.16 | 95.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convmixer/convmixer-768-32_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-768-32_3rdparty_10xb64_in1k_20220323-bca1f7b8.pth) | -| ConvMixer-1024/20\* | 24.38 | 5.55 | 76.94 | 93.36 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convmixer/convmixer-1024-20_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1024-20_3rdparty_10xb64_in1k_20220323-48f8aeba.pth) | -| ConvMixer-1536/20\* | 51.63 | 48.71 | 81.37 | 95.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convmixer/convmixer-1536-20_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1536_20_3rdparty_10xb64_in1k_20220323-ea5786f3.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-----------------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------: | :----------------------------------------------------------------------------------------------------: | +| ConvMixer-768/32\* | 21.11 | 19.62 | 80.16 | 95.08 | [config](./convmixer-768-32_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-768-32_3rdparty_10xb64_in1k_20220323-bca1f7b8.pth) | +| ConvMixer-1024/20\* | 24.38 | 5.55 | 76.94 | 93.36 | [config](./convmixer-1024-20_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1024-20_3rdparty_10xb64_in1k_20220323-48f8aeba.pth) | +| ConvMixer-1536/20\* | 51.63 | 48.71 | 81.37 | 95.61 | [config](./convmixer-1536-20_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1536_20_3rdparty_10xb64_in1k_20220323-ea5786f3.pth) | *Models with * are converted from the [official repo](https://github.com/locuslab/convmixer). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/convmixer/metafile.yml b/configs/convmixer/metafile.yml index 7831d746..7f75d527 100644 --- a/configs/convmixer/metafile.yml +++ b/configs/convmixer/metafile.yml @@ -15,7 +15,7 @@ Models: Metadata: FLOPs: 19623051264 Parameters: 21110248 - In Collections: ConvMixer + In Collection: ConvMixer Results: - Dataset: ImageNet-1k Metrics: @@ -31,7 +31,7 @@ Models: Metadata: FLOPs: 5550112768 Parameters: 24383464 - In Collections: ConvMixer + In Collection: ConvMixer Results: - Dataset: ImageNet-1k Metrics: @@ -47,7 +47,7 @@ Models: Metadata: FLOPs: 48713170944 Parameters: 51625960 - In Collections: ConvMixer + In Collection: ConvMixer Results: - Dataset: ImageNet-1k Metrics: diff --git a/configs/convnext/README.md b/configs/convnext/README.md index 7db81366..077931ff 100644 --- a/configs/convnext/README.md +++ b/configs/convnext/README.md @@ -20,15 +20,15 @@ The "Roaring 20s" of visual recognition began with the introduction of Vision Tr ### ImageNet-1k -| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-----------: | :----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------------: | :---------------------------------------------------------------------: | -| ConvNeXt-T\* | From scratch | 28.59 | 4.46 | 82.05 | 95.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth) | -| ConvNeXt-S\* | From scratch | 50.22 | 8.69 | 83.13 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth) | -| ConvNeXt-B\* | From scratch | 88.59 | 15.36 | 83.85 | 96.74 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | -| ConvNeXt-B\* | ImageNet-21k | 88.59 | 15.36 | 85.81 | 97.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | -| ConvNeXt-L\* | From scratch | 197.77 | 34.37 | 84.30 | 96.89 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | -| ConvNeXt-L\* | ImageNet-21k | 197.77 | 34.37 | 86.61 | 98.04 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | -| ConvNeXt-XL\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-xlarge_64xb64_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-----------: | :----------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------: | :------------------------------------------------------------------------------------------------: | +| ConvNeXt-T\* | From scratch | 28.59 | 4.46 | 82.05 | 95.86 | [config](./convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth) | +| ConvNeXt-S\* | From scratch | 50.22 | 8.69 | 83.13 | 96.44 | [config](./convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth) | +| ConvNeXt-B\* | From scratch | 88.59 | 15.36 | 83.85 | 96.74 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | +| ConvNeXt-B\* | ImageNet-21k | 88.59 | 15.36 | 85.81 | 97.86 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | +| ConvNeXt-L\* | From scratch | 197.77 | 34.37 | 84.30 | 96.89 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | +| ConvNeXt-L\* | ImageNet-21k | 197.77 | 34.37 | 86.61 | 98.04 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | +| ConvNeXt-XL\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](./convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | *Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/convnext/metafile.yml b/configs/convnext/metafile.yml index 823f3327..74b0a041 100644 --- a/configs/convnext/metafile.yml +++ b/configs/convnext/metafile.yml @@ -18,7 +18,7 @@ Models: Metadata: FLOPs: 4457472768 Parameters: 28589128 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -35,7 +35,7 @@ Models: Training Data: ImageNet-1k FLOPs: 4457472768 Parameters: 28589128 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -49,9 +49,10 @@ Models: Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-small_3rdparty_32xb128_in1k Metadata: + Training Data: ImageNet-1k FLOPs: 8687008512 Parameters: 50223688 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -68,7 +69,7 @@ Models: Training Data: ImageNet-1k FLOPs: 8687008512 Parameters: 50223688 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -82,9 +83,10 @@ Models: Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-base_3rdparty_32xb128_in1k Metadata: + Training Data: ImageNet-1k FLOPs: 15359124480 Parameters: 88591464 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -101,7 +103,7 @@ Models: Training Data: ImageNet-1k FLOPs: 15359124480 Parameters: 88591464 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -118,7 +120,7 @@ Models: Training Data: ImageNet-21k FLOPs: 15359124480 Parameters: 88591464 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: null Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth Converted From: @@ -131,7 +133,7 @@ Models: - ImageNet-1k FLOPs: 15359124480 Parameters: 88591464 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -145,9 +147,10 @@ Models: Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-large_3rdparty_64xb64_in1k Metadata: + Training Data: ImageNet-1k FLOPs: 34368026112 Parameters: 197767336 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -164,7 +167,7 @@ Models: Training Data: ImageNet-21k FLOPs: 34368026112 Parameters: 197767336 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: null Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth Converted From: @@ -177,7 +180,7 @@ Models: - ImageNet-1k FLOPs: 34368026112 Parameters: 197767336 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: @@ -194,7 +197,7 @@ Models: Training Data: ImageNet-21k FLOPs: 60929820672 Parameters: 350196968 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: null Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth Converted From: @@ -207,7 +210,7 @@ Models: - ImageNet-1k FLOPs: 60929820672 Parameters: 350196968 - In Collections: ConvNeXt + In Collection: ConvNeXt Results: - Dataset: ImageNet-1k Metrics: diff --git a/configs/cspnet/README.md b/configs/cspnet/README.md index 10eb9d0d..63267da5 100644 --- a/configs/cspnet/README.md +++ b/configs/cspnet/README.md @@ -20,11 +20,11 @@ Neural networks have enabled 
state-of-the-art approaches to achieve incredible results ### ImageNet-1k -| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :------------: | :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------------: | :---------------------------------------------------------------------: | -| CSPDarkNet50\* | From scratch | 27.64 | 5.04 | 80.05 | 95.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspdarknet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth) | -| CSPResNet50\* | From scratch | 21.62 | 3.48 | 79.55 | 94.68 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnet50_3rdparty_8xb32_in1k_20220329-dd6dddfb.pth) | -| CSPResNeXt50\* | From scratch | 20.57 | 3.11 | 79.96 | 94.96 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspresnext50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnext50_3rdparty_8xb32_in1k_20220329-2cc84d21.pth) | +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------: | :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------: | :---------------------------------------------------------------------------------------------------: | +| CSPDarkNet50\* | From scratch | 27.64 | 5.04 | 80.05 | 95.07 | [config](./cspdarknet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth) | +| CSPResNet50\* | From scratch | 21.62 | 3.48 | 79.55 | 94.68 | [config](./cspresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnet50_3rdparty_8xb32_in1k_20220329-dd6dddfb.pth) | +| CSPResNeXt50\* | From scratch | 20.57 | 3.11 | 79.96 | 94.96 | [config](./cspresnext50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnext50_3rdparty_8xb32_in1k_20220329-2cc84d21.pth) | *Models with * are converted from the [timm repo](https://github.com/rwightman/pytorch-image-models). The config files of these models are only for inference.
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/cspnet/metafile.yml b/configs/cspnet/metafile.yml index 8c4a78ed..815ca56a 100644 --- a/configs/cspnet/metafile.yml +++ b/configs/cspnet/metafile.yml @@ -17,7 +17,7 @@ Models: Metadata: FLOPs: 5040000000 Parameters: 27640000 - In Collections: CSPNet + In Collection: CSPNet Results: - Dataset: ImageNet-1k Metrics: @@ -34,7 +34,7 @@ Models: Training Data: ImageNet-1k FLOPs: 3480000000 Parameters: 21620000 - In Collections: CSPNet + In Collection: CSPNet Results: - Dataset: ImageNet-1k Metrics: @@ -50,7 +50,7 @@ Models: Metadata: FLOPs: 3110000000 Parameters: 20570000 - In Collections: CSPNet + In Collection: CSPNet Results: - Dataset: ImageNet-1k Metrics: diff --git a/configs/deit/README.md b/configs/deit/README.md index e3103658..d3a32cc0 100644 --- a/configs/deit/README.md +++ b/configs/deit/README.md @@ -18,17 +18,17 @@ Recently, neural networks purely based on attention were shown to address image The teacher of the distilled version DeiT is RegNetY-16GF. -| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-------------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------: | :--------------------------------------------------------------: | -| DeiT-tiny | From scratch | 5.72 | 1.08 | 74.50 | 92.24 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-tiny_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.log.json) | -| DeiT-tiny distilled\* | From scratch | 5.72 | 1.08 | 74.51 | 91.90 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-tiny-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny-distilled_3rdparty_pt-4xb256_in1k_20211216-c429839a.pth) | -| DeiT-small | From scratch | 22.05 | 4.24 | 80.69 | 95.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-small_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.log.json) | -| DeiT-small distilled\* | From scratch | 22.05 | 4.24 | 81.17 | 95.40 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-small-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small-distilled_3rdparty_pt-4xb256_in1k_20211216-4de1d725.pth) | -| DeiT-base | From scratch | 86.57 | 16.86 | 81.76 | 95.81 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-base_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.log.json) | -| DeiT-base\* | From scratch | 86.57 | 16.86 | 81.79 | 95.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-base_pt-16xb64_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_pt-16xb64_in1k_20211124-6f40c188.pth) | -| DeiT-base distilled\* | From scratch | 86.57 | 16.86 | 83.33 | 96.49 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-base-distilled_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_pt-16xb64_in1k_20211216-42891296.pth) | -| DeiT-base 384px\* | ImageNet-1k | 86.86 | 49.37 | 83.04 | 96.31 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-base_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_ft-16xb32_in1k-384px_20211124-822d02f2.pth) | -| DeiT-base distilled 384px\* | ImageNet-1k | 86.86 | 49.37 | 85.55 | 97.35 | [config](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit/deit-base-distilled_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_ft-16xb32_in1k-384px_20211216-e48d6000.pth) | +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-------------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------: | :---------------------------------------------------------------------: | +| DeiT-tiny | From scratch | 5.72 | 1.08 | 74.50 | 92.24 | [config](./deit-tiny_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.log.json) | +| DeiT-tiny distilled\* | From scratch | 5.72 | 1.08 | 74.51 | 91.90 | [config](./deit-tiny-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny-distilled_3rdparty_pt-4xb256_in1k_20211216-c429839a.pth) | +| DeiT-small | From scratch | 22.05 | 4.24 | 80.69 | 95.06 | [config](./deit-small_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.log.json) | +| DeiT-small distilled\* | From scratch | 22.05 | 4.24 | 81.17 | 95.40 | [config](./deit-small-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small-distilled_3rdparty_pt-4xb256_in1k_20211216-4de1d725.pth) | +| DeiT-base | From scratch | 86.57 | 16.86 | 81.76 | 95.81 | [config](./deit-base_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.log.json) | +| DeiT-base\* | From scratch | 86.57 | 16.86 | 81.79 | 95.59 | [config](./deit-base_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_pt-16xb64_in1k_20211124-6f40c188.pth) | +| DeiT-base distilled\* | From scratch | 86.57 | 16.86 | 83.33 | 96.49 | [config](./deit-base-distilled_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_pt-16xb64_in1k_20211216-42891296.pth) | +| DeiT-base 384px\* | ImageNet-1k | 86.86 | 49.37 | 83.04 | 96.31 | 
[config](./deit-base_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_ft-16xb32_in1k-384px_20211124-822d02f2.pth) | +| DeiT-base distilled 384px\* | ImageNet-1k | 86.86 | 49.37 | 85.55 | 97.35 | [config](./deit-base-distilled_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_ft-16xb32_in1k-384px_20211216-e48d6000.pth) | *Models with * are converted from the [official repo](https://github.com/facebookresearch/deit). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/densenet/README.md b/configs/densenet/README.md index c0d3e1fe..b30f1dcf 100644 --- a/configs/densenet/README.md +++ b/configs/densenet/README.md @@ -16,12 +16,12 @@ Recent work has shown that convolutional networks can be substantially deeper, m ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------------------: | :---------------------------------------------------------------------------: | -| DenseNet121\* | 7.98 | 2.88 | 74.96 | 92.21 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/densenet/densenet121_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet121_4xb256_in1k_20220426-07450f99.pth) | -| DenseNet169\* | 14.15 | 3.42 | 76.08 | 93.11 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/densenet/densenet169_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet169_4xb256_in1k_20220426-a2889902.pth) | -| DenseNet201\* | 20.01 | 4.37 | 77.32 | 93.64 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/densenet/densenet201_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet201_4xb256_in1k_20220426-05cae4ef.pth) | -| DenseNet161\* | 28.68 | 7.82 | 77.61 | 93.83 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/densenet/densenet161_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet161_4xb256_in1k_20220426-ee6a80a9.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------: | :----------------------------------------------------------------------------------------------------------------: | +| DenseNet121\* | 7.98 | 2.88 | 74.96 | 92.21 | [config](./densenet121_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet121_4xb256_in1k_20220426-07450f99.pth) | +| DenseNet169\* | 14.15 | 3.42 | 76.08 | 93.11 | [config](./densenet169_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet169_4xb256_in1k_20220426-a2889902.pth) | +| DenseNet201\* | 20.01 | 4.37 | 77.32 | 93.64 | [config](./densenet201_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet201_4xb256_in1k_20220426-05cae4ef.pth) | +| DenseNet161\* | 28.68 | 7.82 | 77.61 | 93.83 | [config](./densenet161_4xb256_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet161_4xb256_in1k_20220426-ee6a80a9.pth) | *Models with * are converted from [pytorch](https://pytorch.org/vision/stable/models.html), guided by [original repo](https://github.com/liuzhuang13/DenseNet). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/densenet/metafile.yml b/configs/densenet/metafile.yml index 84366b23..ea93537a 100644 --- a/configs/densenet/metafile.yml +++ b/configs/densenet/metafile.yml @@ -14,7 +14,7 @@ Models: Metadata: FLOPs: 2881695488 Parameters: 7978856 - In Collections: DenseNet + In Collection: DenseNet Results: - Dataset: ImageNet-1k Metrics: @@ -30,7 +30,7 @@ Models: Metadata: FLOPs: 3416860160 Parameters: 14149480 - In Collections: DenseNet + In Collection: DenseNet Results: - Dataset: ImageNet-1k Metrics: @@ -46,7 +46,7 @@ Models: Metadata: FLOPs: 4365236736 Parameters: 20013928 - In Collections: DenseNet + In Collection: DenseNet Results: - Dataset: ImageNet-1k Metrics: @@ -62,7 +62,7 @@ Models: Metadata: FLOPs: 7816363968 Parameters: 28681000 - In Collections: DenseNet + In Collection: DenseNet Results: - Dataset: ImageNet-1k Metrics: diff --git a/configs/edgenext/README.md b/configs/edgenext/README.md new file mode 100644 index 00000000..e2ff2983 --- /dev/null +++ b/configs/edgenext/README.md @@ -0,0 +1,43 @@ +# EdgeNeXt + +> [EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications](https://arxiv.org/abs/2206.10589) + + + +## Abstract + + + +In the pursuit of achieving ever-increasing accuracy, large and complex neural networks are usually developed. Such models demand high computational resources and therefore cannot be deployed on edge devices. It is of great interest to build resource-efficient general purpose networks due to their usefulness in several application areas. In this work, we strive to effectively combine the strengths of both CNN and Transformer models and propose a new efficient hybrid architecture EdgeNeXt. Specifically in EdgeNeXt, we introduce split depth-wise transpose attention (SDTA) encoder that splits input tensors into multiple channel groups and utilizes depth-wise convolution along with self-attention across channel dimensions to implicitly increase the receptive field and encode multi-scale features. Our extensive experiments on classification, detection and segmentation tasks, reveal the merits of the proposed approach, outperforming state-of-the-art methods with comparatively lower compute requirements. Our EdgeNeXt model with 1.3M parameters achieves 71.2% top-1 accuracy on ImageNet-1K, outperforming MobileViT with an absolute gain of 2.2% with 28% reduction in FLOPs. Further, our EdgeNeXt model with 5.6M parameters achieves 79.4% top-1 accuracy on ImageNet-1K. + + + +
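The "self-attention across channel dimensions" mentioned in the abstract is what keeps SDTA affordable on high-resolution inputs: the attention map is computed between channels rather than between spatial tokens, so it is C x C instead of N x N and the cost grows only linearly with the number of tokens. Below is a rough, self-contained sketch of that one idea, written as an illustration only; it is not the implementation this PR adds, and the real `mmcls.models.backbones.edgenext` module also combines it with channel splitting and depth-wise convolutions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ChannelSelfAttention(nn.Module):
    """Toy transposed attention: attend across channels, not tokens."""

    def __init__(self, dim: int, num_heads: int = 4):
        super().__init__()
        self.num_heads = num_heads
        # A learnable temperature replaces the usual 1/sqrt(d) scaling.
        self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1))
        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape  # batch, spatial tokens, channels
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = qkv.permute(2, 0, 3, 4, 1)  # each (B, heads, C/heads, N)
        # L2-normalize over the token axis so the dot products are cosine-like.
        q, k = F.normalize(q, dim=-1), F.normalize(k, dim=-1)
        attn = (q @ k.transpose(-2, -1)) * self.temperature  # (B, heads, C/h, C/h)
        out = attn.softmax(dim=-1) @ v  # (B, heads, C/heads, N)
        out = out.permute(0, 3, 1, 2).reshape(B, N, C)
        return self.proj(out)


x = torch.randn(2, 56 * 56, 64)  # 3136 tokens, yet each head attends over a 16x16 map
print(ChannelSelfAttention(64)(x).shape)  # torch.Size([2, 3136, 64])
```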
+ +
+ +## Results and models + +### ImageNet-1k + +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------: | :--------------------------------------------------------------------------------------: | +| EdgeNeXt-Base-usi\* | From scratch | 18.51 | 3.84 | 83.67 | 96.7 | [config](./edgenext-base_8xb256-usi_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-base_3rdparty-usi_in1k_20220801-909e8939.pth) | +| EdgeNeXt-Base\* | From scratch | 18.51 | 3.84 | 82.48 | 96.2 | [config](./edgenext-base_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-base_3rdparty_in1k_20220801-9ade408b.pth) | +| EdgeNeXt-Small-usi\* | From scratch | 5.59 | 1.26 | 81.06 | 95.34 | [config](./edgenext-small_8xb256-usi_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-small_3rdparty-usi_in1k_20220801-ae6d8dd3.pth) | +| EdgeNeXt-Small\* | From scratch | 5.59 | 1.26 | 79.41 | 94.53 | [config](./edgenext-small_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-small_3rdparty_in1k_20220801-d00db5f8.pth) | +| EdgeNeXt-X-Small\* | From scratch | 2.34 | 0.538 | 74.86 | 92.31 | [config](./edgenext-xsmall_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-xsmall_3rdparty_in1k_20220801-974f9fe7.pth) | +| EdgeNeXt-XX-Small\* | From scratch | 1.33 | 0.261 | 71.2 | 89.91 | [config](./edgenext-xxsmall_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-xxsmall_3rdparty_in1k_20220801-7ca8a81d.pth) | + +*Models with * are converted from the [official repo](https://github.com/mmaaz60/EdgeNeXt). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +```bibtex +@article{Maaz2022EdgeNeXt, + title={EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications}, + author={Muhammad Maaz and Abdelrahman Shaker and Hisham Cholakkal and Salman Khan and Syed Waqas Zamir and Rao Muhammad Anwer and Fahad Shahbaz Khan}, + journal={arXiv preprint arXiv:2206.10589}, + year={2022} +} +``` diff --git a/configs/edgenext/edgenext-base_8xb256-usi_in1k.py b/configs/edgenext/edgenext-base_8xb256-usi_in1k.py new file mode 100644 index 00000000..f8de0926 --- /dev/null +++ b/configs/edgenext/edgenext-base_8xb256-usi_in1k.py @@ -0,0 +1,19 @@ +_base_ = ['./edgenext-base_8xb256_in1k.py'] + +# dataset setting + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='ResizeEdge', + scale=269, + edge='short', + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=256), + dict(type='PackClsInputs') +] + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +test_dataloader = val_dataloader diff --git a/configs/edgenext/edgenext-base_8xb256_in1k.py b/configs/edgenext/edgenext-base_8xb256_in1k.py new file mode 100644 index 00000000..5d0a75c6 --- /dev/null +++ b/configs/edgenext/edgenext-base_8xb256_in1k.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/edgenext/edgenext-base.py', + '../_base_/datasets/imagenet_bs64_edgenext_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=6e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/edgenext/edgenext-small_8xb256-usi_in1k.py b/configs/edgenext/edgenext-small_8xb256-usi_in1k.py new file mode 100644 index 00000000..bf996e27 --- /dev/null +++ b/configs/edgenext/edgenext-small_8xb256-usi_in1k.py @@ -0,0 +1,19 @@ +_base_ = ['./edgenext-small_8xb256_in1k.py'] + +# dataset setting + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='ResizeEdge', + scale=269, + edge='short', + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=256), + dict(type='PackClsInputs') +] + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +test_dataloader = val_dataloader diff --git a/configs/edgenext/edgenext-small_8xb256_in1k.py b/configs/edgenext/edgenext-small_8xb256_in1k.py new file mode 100644 index 00000000..f1d99bdc --- /dev/null +++ b/configs/edgenext/edgenext-small_8xb256_in1k.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/edgenext/edgenext-small.py', + '../_base_/datasets/imagenet_bs64_edgenext_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=6e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size.
+# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/edgenext/edgenext-xsmall_8xb256_in1k.py b/configs/edgenext/edgenext-xsmall_8xb256_in1k.py new file mode 100644 index 00000000..9d2326fc --- /dev/null +++ b/configs/edgenext/edgenext-xsmall_8xb256_in1k.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/edgenext/edgenext-xsmall.py', + '../_base_/datasets/imagenet_bs64_edgenext_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=6e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/edgenext/edgenext-xxsmall_8xb256_in1k.py b/configs/edgenext/edgenext-xxsmall_8xb256_in1k.py new file mode 100644 index 00000000..507c3cb5 --- /dev/null +++ b/configs/edgenext/edgenext-xxsmall_8xb256_in1k.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/edgenext/edgenext-xxsmall.py', + '../_base_/datasets/imagenet_bs64_edgenext_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=6e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/edgenext/metafile.yml b/configs/edgenext/metafile.yml new file mode 100644 index 00000000..0a332b52 --- /dev/null +++ b/configs/edgenext/metafile.yml @@ -0,0 +1,118 @@ +Collections: + - Name: EdgeNeXt + Metadata: + Training Data: ImageNet-1k + Architecture: + - SDTA + - 1x1 Convolution + - Channel Self-attention + Paper: + URL: https://arxiv.org/abs/2206.10589 + Title: 'EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications' + README: configs/edgenext/README.md + Code: + Version: v1.0.0rc1 + URL: https://github.com/open-mmlab/mmclassification/blob/v1.0.0rc1/mmcls/models/backbones/edgenext.py + +Models: + - Name: edgenext-xxsmall_3rdparty_in1k + Metadata: + FLOPs: 255640144 + Parameters: 1327216 + In Collection: EdgeNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 71.20 + Top 5 Accuracy: 89.91 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-xxsmall_3rdparty_in1k_20220801-7ca8a81d.pth + Config: configs/edgenext/edgenext-xxsmall_8xb256_in1k.py + Converted From: + Weights: https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_xxsmall.pth + Code: https://github.com/mmaaz60/EdgeNeXt + - Name: edgenext-xsmall_3rdparty_in1k + Metadata: + Training Data: ImageNet-1k + FLOPs: 529970560 + Parameters: 2336804 + In Collection: EdgeNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 74.86 + Top 5 Accuracy: 92.31 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-xsmall_3rdparty_in1k_20220801-974f9fe7.pth + Config: configs/edgenext/edgenext-xsmall_8xb256_in1k.py + Converted From: + Weights:
https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_xsmall.pth + Code: https://github.com/mmaaz60/EdgeNeXt + - Name: edgenext-small_3rdparty_in1k + Metadata: + Training Data: ImageNet-1k + FLOPs: 1249788000 + Parameters: 5586832 + In Collection: EdgeNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.41 + Top 5 Accuracy: 94.53 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-small_3rdparty_in1k_20220801-d00db5f8.pth + Config: configs/edgenext/edgenext-small_8xb256_in1k.py + Converted From: + Weights: https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_small.pth + Code: https://github.com/mmaaz60/EdgeNeXt + - Name: edgenext-small-usi_3rdparty_in1k + Metadata: + Training Data: ImageNet-1k + FLOPs: 1249788000 + Parameters: 5586832 + In Collection: EdgeNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.06 + Top 5 Accuracy: 95.34 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-small_3rdparty-usi_in1k_20220801-ae6d8dd3.pth + Config: configs/edgenext/edgenext-small_8xb256-usi_in1k.py + Converted From: + Weights: https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.1/edgenext_small_usi.pth + Code: https://github.com/mmaaz60/EdgeNeXt + - Name: edgenext-base_3rdparty_in1k + Metadata: + Training Data: ImageNet-1k + FLOPs: 3814395280 + Parameters: 18511292 + In Collection: EdgeNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.48 + Top 5 Accuracy: 96.2 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-base_3rdparty_in1k_20220801-9ade408b.pth + Config: configs/edgenext/edgenext-base_8xb256_in1k.py + Converted From: + Weights: https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.2/edgenext_base.pth + Code: https://github.com/mmaaz60/EdgeNeXt + - Name: edgenext-base_3rdparty-usi_in1k + Metadata: + Training Data: ImageNet-1k + FLOPs: 3814395280 + Parameters: 18511292 + In Collection: EdgeNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.67 + Top 5 Accuracy: 96.7 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/edgenext/edgenext-base_3rdparty-usi_in1k_20220801-909e8939.pth + Config: configs/edgenext/edgenext-base_8xb256-usi_in1k.py + Converted From: + Weights: https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.2/edgenext_base_usi.pth + Code: https://github.com/mmaaz60/EdgeNeXt diff --git a/configs/efficientformer/README.md b/configs/efficientformer/README.md new file mode 100644 index 00000000..60b9c0d0 --- /dev/null +++ b/configs/efficientformer/README.md @@ -0,0 +1,47 @@ +# EfficientFormer + +> [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) + + + +## Abstract + +Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks. However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still unsatisfactory. 
This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance? To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs. Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm. Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer. Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices. Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on iPhone 12 (compiled with CoreML), which runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1), and our largest model, EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can reach extremely low latency on mobile devices while maintaining high performance. + +
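Throughout this PR the converted checkpoints are published for inference and validation only, and the quickest way to sanity-check any of them is a one-off prediction. A minimal sketch, assuming the `init_model`/`inference_model` helpers from `mmcls.apis` and the demo image shipped with the repository; the config/checkpoint pair below is just one example from the tables in this PR:

```python
from mmcls.apis import inference_model, init_model

# Any config/checkpoint pair from the result tables should work here.
config = 'configs/efficientformer/efficientformer-l1_8xb128_in1k.py'
checkpoint = ('https://download.openmmlab.com/mmclassification/v0/'
              'efficientformer/efficientformer-l1_3rdparty_in1k_20220915-cc3e1ac6.pth')

model = init_model(config, checkpoint, device='cpu')
result = inference_model(model, 'demo/demo.JPEG')  # demo image in the repo root
print(result['pred_class'], result['pred_score'])
```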
+ +
+ +## Results and models + +### ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------: | :--------------------------------------------------------------------------------------------------: | +| EfficientFormer-l1\* | 12.19 | 1.30 | 80.46 | 94.99 | [config](./efficientformer-l1_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l1_3rdparty_in1k_20220915-cc3e1ac6.pth) | +| EfficientFormer-l3\* | 31.41 | 3.93 | 82.45 | 96.18 | [config](./efficientformer-l3_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l3_3rdparty_in1k_20220915-466793d6.pth) | +| EfficientFormer-l7\* | 82.23 | 10.16 | 83.40 | 96.60 | [config](./efficientformer-l7_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l7_3rdparty_in1k_20220915-185e30af.pth) | + +*Models with * are converted from the [official repo](https://github.com/snap-research/EfficientFormer). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2206.01191, + doi = {10.48550/ARXIV.2206.01191}, + + url = {https://arxiv.org/abs/2206.01191}, + + author = {Li, Yanyu and Yuan, Geng and Wen, Yang and Hu, Eric and Evangelidis, Georgios and Tulyakov, Sergey and Wang, Yanzhi and Ren, Jian}, + + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + + title = {EfficientFormer: Vision Transformers at MobileNet Speed}, + + publisher = {arXiv}, + + year = {2022}, + + copyright = {Creative Commons Attribution 4.0 International} +} +``` diff --git a/configs/efficientformer/efficientformer-l1_8xb128_in1k.py b/configs/efficientformer/efficientformer-l1_8xb128_in1k.py new file mode 100644 index 00000000..7f55dc65 --- /dev/null +++ b/configs/efficientformer/efficientformer-l1_8xb128_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/efficientformer-l1.py', + '../_base_/datasets/imagenet_bs128_poolformer_small_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] diff --git a/configs/efficientformer/efficientformer-l3_8xb128_in1k.py b/configs/efficientformer/efficientformer-l3_8xb128_in1k.py new file mode 100644 index 00000000..d8be5efa --- /dev/null +++ b/configs/efficientformer/efficientformer-l3_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = './efficientformer-l1_8xb128_in1k.py' + +model = dict(backbone=dict(arch='l3'), head=dict(in_channels=512)) diff --git a/configs/efficientformer/efficientformer-l7_8xb128_in1k.py b/configs/efficientformer/efficientformer-l7_8xb128_in1k.py new file mode 100644 index 00000000..c2252652 --- /dev/null +++ b/configs/efficientformer/efficientformer-l7_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = './efficientformer-l1_8xb128_in1k.py' + +model = dict(backbone=dict(arch='l7'), head=dict(in_channels=768)) diff --git a/configs/efficientformer/metafile.yml b/configs/efficientformer/metafile.yml new file mode 100644 index 00000000..d958e0a4 --- /dev/null +++ b/configs/efficientformer/metafile.yml @@ -0,0 +1,67 @@ +Collections: + - Name: EfficientFormer + Metadata: + Training Data: ImageNet-1k + Architecture: + - 
Pooling + - 1x1 Convolution + - LayerScale + - MetaFormer + Paper: + URL: https://arxiv.org/pdf/2206.01191.pdf + Title: "EfficientFormer: Vision Transformers at MobileNet Speed" + README: configs/efficientformer/README.md + Code: + Version: v1.0.0rc1 + URL: https://github.com/open-mmlab/mmclassification/blob/v1.0.0rc1/mmcls/models/backbones/efficientformer.py + +Models: + - Name: efficientformer-l1_3rdparty_8xb128_in1k + Metadata: + FLOPs: 1304601088 # 1.3G + Parameters: 12278696 # 12M + In Collection: EfficientFormer + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 80.46 + Top 5 Accuracy: 94.99 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l1_3rdparty_in1k_20220915-cc3e1ac6.pth + Config: configs/efficientformer/efficientformer-l1_8xb128_in1k.py + Converted From: + Weights: https://drive.google.com/file/d/11SbX-3cfqTOc247xKYubrAjBiUmr818y/view?usp=sharing + Code: https://github.com/snap-research/EfficientFormer + - Name: efficientformer-l3_3rdparty_8xb128_in1k + Metadata: + Training Data: ImageNet-1k + FLOPs: 3737045760 # 3.7G + Parameters: 31406000 # 31M + In Collection: EfficientFormer + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.45 + Top 5 Accuracy: 96.18 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l3_3rdparty_in1k_20220915-466793d6.pth + Config: configs/efficientformer/efficientformer-l3_8xb128_in1k.py + Converted From: + Weights: https://drive.google.com/file/d/1OyyjKKxDyMj-BcfInp4GlDdwLu3hc30m/view?usp=sharing + Code: https://github.com/snap-research/EfficientFormer + - Name: efficientformer-l7_3rdparty_8xb128_in1k + Metadata: + FLOPs: 10163951616 # 10.2G + Parameters: 82229328 # 82M + In Collection: EfficientFormer + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.40 + Top 5 Accuracy: 96.60 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l7_3rdparty_in1k_20220915-185e30af.pth + Config: configs/efficientformer/efficientformer-l7_8xb128_in1k.py + Converted From: + Weights: https://drive.google.com/file/d/1cVw-pctJwgvGafeouynqWWCwgkcoFMM5/view?usp=sharing + Code: https://github.com/snap-research/EfficientFormer diff --git a/configs/efficientnet/README.md b/configs/efficientnet/README.md index 832f5c6b..5742b667 100644 --- a/configs/efficientnet/README.md +++ b/configs/efficientnet/README.md @@ -20,31 +20,31 @@ In the result table, AA means trained with AutoAugment pre-processing, more details can be found in the paper. Note: In MMClassification, we support training with AutoAugment, but don't support AdvProp yet.
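For reference, a minimal sketch of what "training with AutoAugment" looks like in a pipeline config. The preset name `'imagenet'` and the exact transform signature are assumptions based on the 1.x `AutoAugment` transform, so check the released API before copying:

```python
# Hypothetical train pipeline enabling AutoAugment (not part of this diff).
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='RandomResizedCrop', scale=224),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='AutoAugment', policies='imagenet'),  # assumed preset name
    dict(type='PackClsInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
```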
-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :------------------------------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------: | :------------------------------------------------------------------: | -| EfficientNet-B0\* | 5.29 | 0.02 | 76.74 | 93.17 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32_in1k_20220119-a7e2a0b1.pth) | -| EfficientNet-B0 (AA)\* | 5.29 | 0.02 | 77.26 | 93.41 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa_in1k_20220119-8d939117.pth) | -| EfficientNet-B0 (AA + AdvProp)\* | 5.29 | 0.02 | 77.53 | 93.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth) | -| EfficientNet-B1\* | 7.79 | 0.03 | 78.68 | 94.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32_in1k_20220119-002556d9.pth) | -| EfficientNet-B1 (AA)\* | 7.79 | 0.03 | 79.20 | 94.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa_in1k_20220119-619d8ae3.pth) | -| EfficientNet-B1 (AA + AdvProp)\* | 7.79 | 0.03 | 79.52 | 94.43 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa-advprop_in1k_20220119-5715267d.pth) | -| EfficientNet-B2\* | 9.11 | 0.03 | 79.64 | 94.80 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32_in1k_20220119-ea374a30.pth) | -| EfficientNet-B2 (AA)\* | 9.11 | 0.03 | 80.21 | 94.96 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa_in1k_20220119-dd61e80b.pth) | -| EfficientNet-B2 (AA + AdvProp)\* | 9.11 | 0.03 | 80.45 | 95.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa-advprop_in1k_20220119-1655338a.pth) | -| EfficientNet-B3\* | 12.23 | 0.06 | 81.01 | 95.34 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32_in1k_20220119-4b4d7487.pth) | -| EfficientNet-B3 (AA)\* | 12.23 | 0.06 | 81.58 
| 95.67 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth) | -| EfficientNet-B3 (AA + AdvProp)\* | 12.23 | 0.06 | 81.81 | 95.69 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth) | -| EfficientNet-B4\* | 19.34 | 0.12 | 82.57 | 96.09 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32_in1k_20220119-81fd4077.pth) | -| EfficientNet-B4 (AA)\* | 19.34 | 0.12 | 82.95 | 96.26 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa_in1k_20220119-45b8bd2b.pth) | -| EfficientNet-B4 (AA + AdvProp)\* | 19.34 | 0.12 | 83.25 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa-advprop_in1k_20220119-38c2238c.pth) | -| EfficientNet-B5\* | 30.39 | 0.24 | 83.18 | 96.47 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32_in1k_20220119-e9814430.pth) | -| EfficientNet-B5 (AA)\* | 30.39 | 0.24 | 83.82 | 96.76 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa_in1k_20220119-2cab8b78.pth) | -| EfficientNet-B5 (AA + AdvProp)\* | 30.39 | 0.24 | 84.21 | 96.98 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa-advprop_in1k_20220119-f57a895a.pth) | -| EfficientNet-B6 (AA)\* | 43.04 | 0.41 | 84.05 | 96.82 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b6_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa_in1k_20220119-45b03310.pth) | -| EfficientNet-B6 (AA + AdvProp)\* | 43.04 | 0.41 | 84.74 | 97.14 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa-advprop_in1k_20220119-bfe3485e.pth) | -| EfficientNet-B7 (AA)\* | 66.35 | 0.72 | 84.38 | 96.88 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b7_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa_in1k_20220119-bf03951c.pth) | -| EfficientNet-B7 (AA + 
AdvProp)\* | 66.35 | 0.72 | 85.14 | 97.23 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa-advprop_in1k_20220119-c6dbff10.pth) | -| EfficientNet-B8 (AA + AdvProp)\* | 87.41 | 1.09 | 85.38 | 97.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b8_3rdparty_8xb32-aa-advprop_in1k_20220119-297ce1b7.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------------------------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------: | :-----------------------------------------------------------------------------------: | +| EfficientNet-B0\* | 5.29 | 0.02 | 76.74 | 93.17 | [config](./efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32_in1k_20220119-a7e2a0b1.pth) | +| EfficientNet-B0 (AA)\* | 5.29 | 0.02 | 77.26 | 93.41 | [config](./efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa_in1k_20220119-8d939117.pth) | +| EfficientNet-B0 (AA + AdvProp)\* | 5.29 | 0.02 | 77.53 | 93.61 | [config](./efficientnet-b0_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth) | +| EfficientNet-B1\* | 7.79 | 0.03 | 78.68 | 94.28 | [config](./efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32_in1k_20220119-002556d9.pth) | +| EfficientNet-B1 (AA)\* | 7.79 | 0.03 | 79.20 | 94.42 | [config](./efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa_in1k_20220119-619d8ae3.pth) | +| EfficientNet-B1 (AA + AdvProp)\* | 7.79 | 0.03 | 79.52 | 94.43 | [config](./efficientnet-b1_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa-advprop_in1k_20220119-5715267d.pth) | +| EfficientNet-B2\* | 9.11 | 0.03 | 79.64 | 94.80 | [config](./efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32_in1k_20220119-ea374a30.pth) | +| EfficientNet-B2 (AA)\* | 9.11 | 0.03 | 80.21 | 94.96 | [config](./efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa_in1k_20220119-dd61e80b.pth) | +| EfficientNet-B2 (AA + AdvProp)\* | 9.11 | 0.03 | 80.45 | 95.07 | [config](./efficientnet-b2_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa-advprop_in1k_20220119-1655338a.pth) | +| EfficientNet-B3\* | 12.23 | 0.06 | 81.01 | 95.34 | [config](./efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32_in1k_20220119-4b4d7487.pth) | +| EfficientNet-B3 (AA)\* | 12.23 | 0.06 | 81.58 | 95.67 | [config](./efficientnet-b3_8xb32_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth) | +| EfficientNet-B3 (AA + AdvProp)\* | 12.23 | 0.06 | 81.81 | 95.69 | [config](./efficientnet-b3_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth) | +| EfficientNet-B4\* | 19.34 | 0.12 | 82.57 | 96.09 | [config](./efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32_in1k_20220119-81fd4077.pth) | +| EfficientNet-B4 (AA)\* | 19.34 | 0.12 | 82.95 | 96.26 | [config](./efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa_in1k_20220119-45b8bd2b.pth) | +| EfficientNet-B4 (AA + AdvProp)\* | 19.34 | 0.12 | 83.25 | 96.44 | [config](./efficientnet-b4_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa-advprop_in1k_20220119-38c2238c.pth) | +| EfficientNet-B5\* | 30.39 | 0.24 | 83.18 | 96.47 | [config](./efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32_in1k_20220119-e9814430.pth) | +| EfficientNet-B5 (AA)\* | 30.39 | 0.24 | 83.82 | 96.76 | [config](./efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa_in1k_20220119-2cab8b78.pth) | +| EfficientNet-B5 (AA + AdvProp)\* | 30.39 | 0.24 | 84.21 | 96.98 | [config](./efficientnet-b5_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa-advprop_in1k_20220119-f57a895a.pth) | +| EfficientNet-B6 (AA)\* | 43.04 | 0.41 | 84.05 | 96.82 | [config](./efficientnet-b6_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa_in1k_20220119-45b03310.pth) | +| EfficientNet-B6 (AA + AdvProp)\* | 43.04 | 0.41 | 84.74 | 97.14 | [config](./efficientnet-b6_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa-advprop_in1k_20220119-bfe3485e.pth) | +| EfficientNet-B7 (AA)\* | 66.35 | 0.72 | 84.38 | 96.88 | [config](./efficientnet-b7_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa_in1k_20220119-bf03951c.pth) | +| EfficientNet-B7 (AA + AdvProp)\* | 66.35 | 0.72 | 85.14 | 97.23 | [config](./efficientnet-b7_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa-advprop_in1k_20220119-c6dbff10.pth) | +| EfficientNet-B8 (AA + AdvProp)\* | 87.41 | 1.09 | 85.38 | 97.28 | [config](./efficientnet-b8_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b8_3rdparty_8xb32-aa-advprop_in1k_20220119-297ce1b7.pth) | *Models with * are converted from the [official repo](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/efficientnet/metafile.yml b/configs/efficientnet/metafile.yml index c8bbf0dd..66e73e41 100644 --- a/configs/efficientnet/metafile.yml +++ b/configs/efficientnet/metafile.yml @@ -25,7 +25,7 @@ Models: Metadata: FLOPs: 16481180 Parameters: 5288548 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -41,7 +41,7 @@ Models: Metadata: FLOPs: 16481180 Parameters: 5288548 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -57,7 +57,7 @@ Models: Metadata: FLOPs: 16481180 Parameters: 5288548 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -73,7 +73,7 @@ Models: Metadata: FLOPs: 27052224 Parameters: 7794184 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -89,7 +89,7 @@ Models: Metadata: FLOPs: 27052224 Parameters: 7794184 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -105,7 +105,7 @@ Models: Metadata: FLOPs: 27052224 Parameters: 7794184 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -121,7 +121,7 @@ Models: Metadata: FLOPs: 34346386 Parameters: 9109994 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -137,7 +137,7 @@ Models: Metadata: FLOPs: 34346386 Parameters: 9109994 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -153,7 +153,7 @@ Models: Metadata: FLOPs: 34346386 Parameters: 9109994 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -169,7 +169,7 @@ Models: Metadata: FLOPs: 58641904 Parameters: 12233232 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -185,7 +185,7 @@ Models: Metadata: FLOPs: 58641904 Parameters: 12233232 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -201,7 +201,7 @@ Models: Metadata: FLOPs: 58641904 Parameters: 12233232 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -217,7 +217,7 @@ Models: Metadata: FLOPs: 121870624 Parameters: 19341616 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -233,7 +233,7 @@ Models: Metadata: FLOPs: 121870624 Parameters: 19341616 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -249,7 +249,7 @@ Models: Metadata: FLOPs: 121870624 Parameters: 19341616 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -265,7 +265,7 @@ Models: Metadata: FLOPs: 243879440 Parameters: 30389784 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -281,7 +281,7 @@ Models: Metadata: FLOPs: 243879440 Parameters: 30389784 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -297,7 +297,7 @@ Models: Metadata: FLOPs: 243879440 Parameters: 30389784 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -313,7 +313,7 @@ Models: Metadata: FLOPs: 412002408 Parameters: 
43040704 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -329,7 +329,7 @@ Models: Metadata: FLOPs: 412002408 Parameters: 43040704 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -345,7 +345,7 @@ Models: Metadata: FLOPs: 715526512 Parameters: 66347960 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -361,7 +361,7 @@ Models: Metadata: FLOPs: 715526512 Parameters: 66347960 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: @@ -377,7 +377,7 @@ Models: Metadata: FLOPs: 1092755326 Parameters: 87413142 - In Collections: EfficientNet + In Collection: EfficientNet Results: - Dataset: ImageNet-1k Metrics: diff --git a/configs/hrnet/README.md b/configs/hrnet/README.md index 0a30ccd1..5e75d28e 100644 --- a/configs/hrnet/README.md +++ b/configs/hrnet/README.md @@ -16,17 +16,17 @@ High-resolution representations are essential for position-sensitive vision prob ## ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :----------------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------------: | :-------------------------------------------------------------------------: | -| HRNet-W18\* | 21.30 | 4.33 | 76.75 | 93.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w18_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32_in1k_20220120-0c10b180.pth) | -| HRNet-W30\* | 37.71 | 8.17 | 78.19 | 94.22 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w30_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w30_3rdparty_8xb32_in1k_20220120-8aa3832f.pth) | -| HRNet-W32\* | 41.23 | 8.99 | 78.44 | 94.19 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w32_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w32_3rdparty_8xb32_in1k_20220120-c394f1ab.pth) | -| HRNet-W40\* | 57.55 | 12.77 | 78.94 | 94.47 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w40_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w40_3rdparty_8xb32_in1k_20220120-9a2dbfc5.pth) | -| HRNet-W44\* | 67.06 | 14.96 | 78.88 | 94.37 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w44_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w44_3rdparty_8xb32_in1k_20220120-35d07f73.pth) | -| HRNet-W48\* | 77.47 | 17.36 | 79.32 | 94.52 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32_in1k_20220120-e555ef50.pth) | -| HRNet-W64\* | 128.06 | 29.00 | 79.46 | 94.65 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w64_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w64_3rdparty_8xb32_in1k_20220120-19126642.pth) | -| HRNet-W18 (ssld)\* | 21.30 | 4.33 | 81.06 | 95.70 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w18_4xb32_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32-ssld_in1k_20220120-455f69ea.pth) | -| HRNet-W48 (ssld)\* | 77.47 | 17.36 | 83.63 | 96.79 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32-ssld_in1k_20220120-d0459c38.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :----------------: | :-------: | :------: | :-------: | :-------: | :---------------------------------: | :--------------------------------------------------------------------------------------------------------------: | +| HRNet-W18\* | 21.30 | 4.33 | 76.75 | 93.44 | [config](./hrnet-w18_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32_in1k_20220120-0c10b180.pth) | +| HRNet-W30\* | 37.71 | 8.17 | 78.19 | 94.22 | [config](./hrnet-w30_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w30_3rdparty_8xb32_in1k_20220120-8aa3832f.pth) | +| HRNet-W32\* | 41.23 | 8.99 | 78.44 | 94.19 | [config](./hrnet-w32_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w32_3rdparty_8xb32_in1k_20220120-c394f1ab.pth) | +| HRNet-W40\* | 57.55 | 12.77 | 78.94 | 94.47 | [config](./hrnet-w40_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w40_3rdparty_8xb32_in1k_20220120-9a2dbfc5.pth) | +| HRNet-W44\* | 67.06 | 14.96 | 78.88 | 94.37 | [config](./hrnet-w44_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w44_3rdparty_8xb32_in1k_20220120-35d07f73.pth) | +| HRNet-W48\* | 77.47 | 17.36 | 79.32 | 94.52 | [config](./hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32_in1k_20220120-e555ef50.pth) | +| HRNet-W64\* | 128.06 | 29.00 | 79.46 | 94.65 | [config](./hrnet-w64_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w64_3rdparty_8xb32_in1k_20220120-19126642.pth) | +| HRNet-W18 (ssld)\* | 21.30 | 4.33 | 81.06 | 95.70 | [config](./hrnet-w18_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32-ssld_in1k_20220120-455f69ea.pth) | +| HRNet-W48 (ssld)\* | 77.47 | 17.36 | 83.63 | 96.79 | [config](./hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32-ssld_in1k_20220120-d0459c38.pth) | *Models with * are converted from the [official repo](https://github.com/HRNet/HRNet-Image-Classification). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/inception_v3/README.md b/configs/inception_v3/README.md index e303b742..c77bf048 100644 --- a/configs/inception_v3/README.md +++ b/configs/inception_v3/README.md @@ -16,9 +16,9 @@ Convolutional networks are at the core of most state-of-the-art computer vision ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :------------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------------------: | :---------------------------------------------------------------------------: | -| Inception V3\* | 23.83 | 5.75 | 77.57 | 93.58 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/inception_v3/inception-v3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/inception-v3/inception-v3_3rdparty_8xb32_in1k_20220615-dcd4d910.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------: | :-------: | :------: | :-------: | :-------: | :------------------------------------: | :---------------------------------------------------------------------------------------------------------------: | +| Inception V3\* | 23.83 | 5.75 | 77.57 | 93.58 | [config](./inception-v3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/inception-v3/inception-v3_3rdparty_8xb32_in1k_20220615-dcd4d910.pth) | *Models with * are converted from the [official repo](https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py#L28). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/inception_v3/metafile.yml b/configs/inception_v3/metafile.yml index bf93bd2c..c127ca66 100644 --- a/configs/inception_v3/metafile.yml +++ b/configs/inception_v3/metafile.yml @@ -15,8 +15,8 @@ Collections: Title: "Rethinking the Inception Architecture for Computer Vision" README: configs/inception_v3/README.md Code: - URL: TODO - Version: TODO + URL: https://github.com/open-mmlab/mmclassification/blob/v1.0.0rc1/configs/inception_v3/metafile.yml + Version: v1.0.0rc1 Models: - Name: inception-v3_3rdparty_8xb32_in1k diff --git a/configs/mlp_mixer/README.md b/configs/mlp_mixer/README.md index 5ec98871..dc2b616b 100644 --- a/configs/mlp_mixer/README.md +++ b/configs/mlp_mixer/README.md @@ -16,10 +16,10 @@ Convolutional Neural Networks (CNNs) are the go-to model for computer vision. 
Re ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------------------: | :----------------------------------------------------------------------------: | -| Mixer-B/16\* | 59.88 | 12.61 | 76.68 | 92.25 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth) | -| Mixer-L/16\* | 208.2 | 44.57 | 72.34 | 88.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-large-p16_3rdparty_64xb64_in1k_20211124-5a2519d2.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :----------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------: | :---------------------------------------------------------------------------------------------------------: | +| Mixer-B/16\* | 59.88 | 12.61 | 76.68 | 92.25 | [config](./mlp-mixer-base-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth) | +| Mixer-L/16\* | 208.2 | 44.57 | 72.34 | 88.02 | [config](./mlp-mixer-large-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-large-p16_3rdparty_64xb64_in1k_20211124-5a2519d2.pth) | *Models with * are converted from [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py). The config files of these models are only for validation. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/mobilenet_v2/README.md b/configs/mobilenet_v2/README.md index 675c8dd4..26cee8aa 100644 --- a/configs/mobilenet_v2/README.md +++ b/configs/mobilenet_v2/README.md @@ -18,9 +18,9 @@ The MobileNetV2 architecture is based on an inverted residual structure where th ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------------------: | :----------------------------------------------------------------------------: | -| MobileNet V2 | 3.5 | 0.319 | 71.86 | 90.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.log.json) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------: | :-----------------------------------------------------------------------------------------------------------------: | +| MobileNet V2 | 3.5 | 0.319 | 71.86 | 90.42 | [config](./mobilenet-v2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.log.json) | ## Citation diff --git a/configs/mobilenet_v3/README.md b/configs/mobilenet_v3/README.md index c93d540c..254f189e 100644 --- a/configs/mobilenet_v3/README.md +++ b/configs/mobilenet_v3/README.md @@ -16,10 +16,10 @@ We present the next generation of MobileNets based on a combination of complemen ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-----------------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------------: | :------------------------------------------------------------------------: | -| MobileNetV3-Small\* | 2.54 | 0.06 | 67.66 | 87.41 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mobilenet_v3/mobilenet-v3-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth) | -| MobileNetV3-Large\* | 5.48 | 0.23 | 74.04 | 91.34 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mobilenet_v3/mobilenet-v3-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-----------------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------: | :---------------------------------------------------------------------------------------------------: | +| MobileNetV3-Small\* | 2.54 | 0.06 | 67.66 | 87.41 | [config](./mobilenet-v3-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth) | +| MobileNetV3-Large\* | 5.48 | 0.23 | 
74.04 | 91.34 | [config](./mobilenet-v3-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth) | *Models with * are converted from [torchvision](https://pytorch.org/vision/stable/_modules/torchvision/models/mobilenetv3.html). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/mobileone/README.md b/configs/mobileone/README.md new file mode 100644 index 00000000..80fbb148 --- /dev/null +++ b/configs/mobileone/README.md @@ -0,0 +1,132 @@ +# MobileOne + +> [An Improved One millisecond Mobile Backbone](https://arxiv.org/abs/2206.04040) + + + +## Abstract + +Efficient neural network backbones for mobile devices are often optimized for metrics such as FLOPs or parameter count. However, these metrics may not correlate well with latency of the network when deployed on a mobile device. Therefore, we perform extensive analysis of different metrics by deploying several mobile-friendly networks on a mobile device. We identify and analyze architectural and optimization bottlenecks in recent efficient neural networks and provide ways to mitigate these bottlenecks. To this end, we design an efficient backbone MobileOne, with variants achieving an inference time under 1 ms on an iPhone12 with 75.9% top-1 accuracy on ImageNet. We show that MobileOne achieves state-of-the-art performance within the efficient architectures while being many times faster on mobile. Our best model obtains similar performance on ImageNet as MobileFormer while being 38x faster. Our model obtains 2.3% better top-1 accuracy on ImageNet than EfficientNet at similar latency. Furthermore, we show that our model generalizes to multiple tasks - image classification, object detection, and semantic segmentation with significant improvements in latency and accuracy as compared to existing efficient architectures when deployed on a mobile device. + +
+ +
+ +## Results and models + +### ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------: | :-----------------------------: | :----------------------------: | :-------: | :-------: | :--------------------------------------------------: | :-----------------------------------------------------: | +| MobileOne-s0\* | 5.29 (train) \| 2.08 (deploy) | 1.09 (train) \| 0.28 (deploy) | 71.36 | 89.87 | [config (train)](./mobileone-s0_8xb128_in1k.py) \| [config (deploy)](./deploy/mobileone-s0_deploy_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s0_3rdparty_in1k_20220915-007ae971.pth) | +| MobileOne-s1\* | 4.83 (train) \| 4.76 (deploy) | 0.86 (train) \| 0.84 (deploy) | 75.76 | 92.77 | [config (train)](./mobileone-s1_8xb128_in1k.py) \| [config (deploy)](./deploy/mobileone-s1_deploy_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s1_3rdparty_in1k_20220915-473c8469.pth) | +| MobileOne-s2\* | 7.88 (train) \| 7.88 (deploy) | 1.34 (train) \| 1.31 (deploy) | 77.39 | 93.63 | [config (train)](./mobileone-s2_8xb128_in1k.py) \| [config (deploy)](./deploy/mobileone-s2_deploy_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s2_3rdparty_in1k_20220915-ed2e4c30.pth) | +| MobileOne-s3\* | 10.17 (train) \| 10.08 (deploy) | 1.95 (train) \| 1.91 (deploy) | 77.93 | 93.89 | [config (train)](./mobileone-s3_8xb128_in1k.py) \| [config (deploy)](./deploy/mobileone-s3_deploy_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s3_3rdparty_in1k_20220915-84d6a02c.pth) | +| MobileOne-s4\* | 14.95 (train) \| 14.84 (deploy) | 3.05 (train) \| 3.00 (deploy) | 79.30 | 94.37 | [config (train)](./mobileone-s4_8xb128_in1k.py) \| [config (deploy)](./deploy/mobileone-s4_deploy_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s4_3rdparty_in1k_20220915-ce9509ee.pth) | + +*Models with * are converted from the [official repo](https://github.com/apple/ml-mobileone). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +*Because the [official repo](https://github.com/apple/ml-mobileone) does not provide its training and testing strategy, the test data pipeline of [RepVGG](https://github.com/open-mmlab/mmclassification/tree/master/configs/repvgg) is used here, and the results are about 0.1% lower than those in the paper. Refer to [this issue](https://github.com/apple/ml-mobileone/issues/2).* + +## How to use + +The checkpoints provided are all `training-time` models. Use the reparameterization tool to switch them to the more efficient `inference-time` architecture, which not only has fewer parameters but also requires less computation. + +### Use tool + +Use the provided tool to reparameterize the given model and save the checkpoint: + +```bash +python tools/convert_models/reparameterize_model.py ${CFG_PATH} ${SRC_CKPT_PATH} ${TARGET_CKPT_PATH} +``` + +`${CFG_PATH}` is the config file path, `${SRC_CKPT_PATH}` is the source checkpoint file path, and `${TARGET_CKPT_PATH}` is the target deploy weight file path.
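Under the hood, reparameterization folds each block's parallel training-time branches and their batch-norm layers into a single convolution. Below is a minimal sketch of the conv-BN folding identity this relies on; `fuse_conv_bn` is a hypothetical helper for illustration, not the actual code of `tools/convert_models/reparameterize_model.py` (which must also merge MobileOne's parallel branches):

```python
import torch
import torch.nn as nn

def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    """Fold a BatchNorm into the preceding conv (valid for inference only)."""
    scale = bn.weight / (bn.running_var + bn.eps).sqrt()  # per-channel gamma / sigma
    fused = nn.Conv2d(conv.in_channels, conv.out_channels, conv.kernel_size,
                      conv.stride, conv.padding, conv.dilation, conv.groups,
                      bias=True)
    fused.weight.data = conv.weight * scale.reshape(-1, 1, 1, 1)
    bias = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_mean)
    fused.bias.data = (bias - bn.running_mean) * scale + bn.bias
    return fused

# Sanity check: the fused conv matches conv + BN in eval mode.
conv, bn = nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8).eval()
x = torch.randn(1, 3, 32, 32)
assert torch.allclose(bn(conv(x)), fuse_conv_bn(conv, bn)(x), atol=1e-5)
```

This is also why the deploy models in the table above have fewer parameters and FLOPs: the extra branches exist only to aid optimization during training.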
+ +For example, to reparameterize MobileOne-s0 with the tool above: + +```shell +python ./tools/convert_models/reparameterize_model.py ./configs/mobileone/mobileone-s0_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s0_3rdparty_in1k_20220811-db5ce29b.pth ./mobileone_s0_deploy.pth +``` + +To use the reparameterized weights, you must switch to **the deploy config files**: + +```bash +python tools/test.py ${Deploy_CFG} ${Deploy_Checkpoint} --metrics accuracy +``` + +For example, to test with the reparameterized weights above: + +```shell +python ./tools/test.py ./configs/mobileone/deploy/mobileone-s0_deploy_8xb128_in1k.py mobileone_s0_deploy.pth --metrics accuracy +``` + +### In the code + +Use the `switch_to_deploy` API of the `MobileOne` backbone to switch to the deploy mode. It is usually called as `backbone.switch_to_deploy()` or `classifier.backbone.switch_to_deploy()`. + +For Backbones: + +```python +from mmcls.models import build_backbone +import torch + +x = torch.randn(1, 3, 224, 224) +backbone_cfg = dict(type='MobileOne', arch='s0') +backbone = build_backbone(backbone_cfg) +backbone.init_weights() +backbone.eval() +outs_ori = backbone(x) + +backbone.switch_to_deploy() +outs_dep = backbone(x) + +for out1, out2 in zip(outs_ori, outs_dep): + assert torch.allclose(out1, out2) +``` + +For ImageClassifiers: + +```python +from mmcls.models import build_classifier +import torch +import numpy as np + +cfg = dict( + type='ImageClassifier', + backbone=dict( + type='MobileOne', + arch='s0', + out_indices=(3, ), + ), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1024, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) + +x = torch.randn(1, 3, 224, 224) +classifier = build_classifier(cfg) +classifier.init_weights() +classifier.eval() +y_ori = classifier(x, return_loss=False) + +classifier.backbone.switch_to_deploy() +y_dep = classifier(x, return_loss=False) + +for y1, y2 in zip(y_ori, y_dep): + assert np.allclose(y1, y2) +``` + +## Citation + +```bibtex +@article{mobileone2022, + title={An Improved One millisecond Mobile Backbone}, + author={Vasu, Pavan Kumar Anasosalu and Gabriel, James and Zhu, Jeff and Tuzel, Oncel and Ranjan, Anurag}, + journal={arXiv preprint arXiv:2206.04040}, + year={2022} +} +``` diff --git a/configs/mobileone/deploy/mobileone-s0_deploy_8xb128_in1k.py b/configs/mobileone/deploy/mobileone-s0_deploy_8xb128_in1k.py new file mode 100644 index 00000000..8902483c --- /dev/null +++ b/configs/mobileone/deploy/mobileone-s0_deploy_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = ['../mobileone-s0_8xb128_in1k.py'] + +model = dict(backbone=dict(deploy=True)) diff --git a/configs/mobileone/deploy/mobileone-s1_deploy_8xb128_in1k.py b/configs/mobileone/deploy/mobileone-s1_deploy_8xb128_in1k.py new file mode 100644 index 00000000..7bcf3211 --- /dev/null +++ b/configs/mobileone/deploy/mobileone-s1_deploy_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = ['../mobileone-s1_8xb128_in1k.py'] + +model = dict(backbone=dict(deploy=True)) diff --git a/configs/mobileone/deploy/mobileone-s2_deploy_8xb128_in1k.py b/configs/mobileone/deploy/mobileone-s2_deploy_8xb128_in1k.py new file mode 100644 index 00000000..5d64d519 --- /dev/null +++ b/configs/mobileone/deploy/mobileone-s2_deploy_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = ['../mobileone-s2_8xb128_in1k.py'] + +model = dict(backbone=dict(deploy=True)) diff --git a/configs/mobileone/deploy/mobileone-s3_deploy_8xb128_in1k.py b/configs/mobileone/deploy/mobileone-s3_deploy_8xb128_in1k.py
new file mode 100644 index 00000000..8c710f78 --- /dev/null +++ b/configs/mobileone/deploy/mobileone-s3_deploy_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = ['../mobileone-s3_8xb128_in1k.py'] + +model = dict(backbone=dict(deploy=True)) diff --git a/configs/mobileone/deploy/mobileone-s4_deploy_8xb128_in1k.py b/configs/mobileone/deploy/mobileone-s4_deploy_8xb128_in1k.py new file mode 100644 index 00000000..6ca4d18e --- /dev/null +++ b/configs/mobileone/deploy/mobileone-s4_deploy_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = ['../mobileone-s4_8xb128_in1k.py'] + +model = dict(backbone=dict(deploy=True)) diff --git a/configs/mobileone/metafile.yml b/configs/mobileone/metafile.yml new file mode 100644 index 00000000..04eaceff --- /dev/null +++ b/configs/mobileone/metafile.yml @@ -0,0 +1,98 @@ +Collections: + - Name: MobileOne + Metadata: + Training Data: ImageNet-1k + Architecture: + - re-parameterization Convolution + - VGG-style Neural Network + - Depthwise Convolution + - Pointwise Convolution + Paper: + URL: https://arxiv.org/abs/2206.04040 + Title: 'An Improved One millisecond Mobile Backbone' + README: configs/mobileone/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/v1.0.0rc1/configs/mobileone/metafile.yml + Version: v1.0.0rc1 + +Models: + - Name: mobileone-s0_3rdparty_8xb128_in1k + In Collection: MobileOne + Config: configs/mobileone/mobileone-s0_8xb128_in1k.py + Metadata: + FLOPs: 1091227648 # 1.09G + Parameters: 5293272 # 5.29M + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 71.36 + Top 5 Accuracy: 89.87 + Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s0_3rdparty_in1k_20220915-007ae971.pth + Converted From: + Weights: https://docs-assets.developer.apple.com/ml-research/datasets/mobileone/mobileone_s0_unfused.pth.tar + Code: https://github.com/apple/ml-mobileone + - Name: mobileone-s1_3rdparty_8xb128_in1k + In Collection: MobileOne + Config: configs/mobileone/mobileone-s1_8xb128_in1k.py + Metadata: + FLOPs: 863491328 # 0.86G + Parameters: 4825192 # 4.82M + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 75.76 + Top 5 Accuracy: 92.77 + Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s1_3rdparty_in1k_20220915-473c8469.pth + Converted From: + Weights: https://docs-assets.developer.apple.com/ml-research/datasets/mobileone/mobileone_s1_unfused.pth.tar + Code: https://github.com/apple/ml-mobileone + - Name: mobileone-s2_3rdparty_8xb128_in1k + In Collection: MobileOne + Config: configs/mobileone/mobileone-s2_8xb128_in1k.py + Metadata: + FLOPs: 1344083328 + Parameters: 7884648 + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 77.39 + Top 5 Accuracy: 93.63 + Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s2_3rdparty_in1k_20220915-ed2e4c30.pth + Converted From: + Weights: https://docs-assets.developer.apple.com/ml-research/datasets/mobileone/mobileone_s2_unfused.pth.tar + Code: https://github.com/apple/ml-mobileone + - Name: mobileone-s3_3rdparty_8xb128_in1k + In Collection: MobileOne + Config: configs/mobileone/mobileone-s3_8xb128_in1k.py + Metadata: + FLOPs: 1951043584 + Parameters: 10170600 + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 77.93 + Top 5 Accuracy: 93.89 + Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s3_3rdparty_in1k_20220915-84d6a02c.pth + Converted 
From: + Weights: https://docs-assets.developer.apple.com/ml-research/datasets/mobileone/mobileone_s3_unfused.pth.tar + Code: https://github.com/apple/ml-mobileone + - Name: mobileone-s4_3rdparty_8xb128_in1k + In Collection: MobileOne + Config: configs/mobileone/mobileone-s4_8xb128_in1k.py + Metadata: + FLOPs: 3052580688 + Parameters: 14951248 + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 79.30 + Top 5 Accuracy: 94.37 + Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s4_3rdparty_in1k_20220915-ce9509ee.pth + Converted From: + Weights: https://docs-assets.developer.apple.com/ml-research/datasets/mobileone/mobileone_s4_unfused.pth.tar + Code: https://github.com/apple/ml-mobileone diff --git a/configs/mobileone/mobileone-s0_8xb128_in1k.py b/configs/mobileone/mobileone-s0_8xb128_in1k.py new file mode 100644 index 00000000..ceeb21b7 --- /dev/null +++ b/configs/mobileone/mobileone-s0_8xb128_in1k.py @@ -0,0 +1,56 @@ +_base_ = [ + '../_base_/models/mobileone/mobileone_s0.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=128) +val_dataloader = dict(batch_size=128) +test_dataloader = dict(batch_size=128) + +# schedule settings +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001), + paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.), +) + +# learning policy +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=0.001, + by_epoch=True, + begin=0, + end=5, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict( + type='CosineAnnealingLR', + T_max=295, + eta_min=1.0e-6, + by_epoch=True, + begin=5, + end=300), + dict( + type='CosineAnnealingParamScheduler', + param_name='weight_decay', + eta_min=0.00001, + by_epoch=True, + begin=0, + end=300) +] + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1) +val_cfg = dict() +test_cfg = dict() + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. +auto_scale_lr = dict(base_batch_size=1024) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=5e-4, priority='ABOVE_NORMAL')] diff --git a/configs/mobileone/mobileone-s1_8xb128_in1k.py b/configs/mobileone/mobileone-s1_8xb128_in1k.py new file mode 100644 index 00000000..b14c7c17 --- /dev/null +++ b/configs/mobileone/mobileone-s1_8xb128_in1k.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/mobileone/mobileone_s1.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=128) +val_dataloader = dict(batch_size=128) +test_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. 
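+# For instance (a sketch of the assumed linear scaling rule): with the
+# intended 8 GPUs x 128 images per GPU, the actual batch size is 1024, so
+# the factor is 1024 / base_batch_size = 1 and the LR is used as configured;
+# training on 4 GPUs instead would halve the LR.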
+auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/mobileone/mobileone-s2_8xb128_in1k.py b/configs/mobileone/mobileone-s2_8xb128_in1k.py new file mode 100644 index 00000000..dca0d4d3 --- /dev/null +++ b/configs/mobileone/mobileone-s2_8xb128_in1k.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/mobileone/mobileone_s2.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=128) +val_dataloader = dict(batch_size=128) +test_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/mobileone/mobileone-s3_8xb128_in1k.py b/configs/mobileone/mobileone-s3_8xb128_in1k.py new file mode 100644 index 00000000..89343d5d --- /dev/null +++ b/configs/mobileone/mobileone-s3_8xb128_in1k.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/mobileone/mobileone_s3.py', + '../_base_/datasets/imagenet_bs64_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=128) +val_dataloader = dict(batch_size=128) +test_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/mobileone/mobileone-s4_8xb128_in1k.py b/configs/mobileone/mobileone-s4_8xb128_in1k.py new file mode 100644 index 00000000..1984ef35 --- /dev/null +++ b/configs/mobileone/mobileone-s4_8xb128_in1k.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/mobileone/mobileone_s4.py', + '../_base_/datasets/imagenet_bs64_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=128) +val_dataloader = dict(batch_size=128) +test_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/mvit/README.md b/configs/mvit/README.md new file mode 100644 index 00000000..7aa46e4d --- /dev/null +++ b/configs/mvit/README.md @@ -0,0 +1,44 @@ +# MViT V2 + +> [MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf) + + + +## Abstract + +In this paper, we study Multiscale Vision Transformers (MViTv2) as a unified architecture for image and video +classification, as well as object detection. We present an improved version of MViT that incorporates +decomposed relative positional embeddings and residual pooling connections. We instantiate this architecture +in five sizes and evaluate it for ImageNet classification, COCO detection and Kinetics video recognition where +it outperforms prior work. We further compare MViTv2s' pooling attention to window attention mechanisms where +it outperforms the latter in accuracy/compute. Without bells-and-whistles, MViTv2 has state-of-the-art +performance in 3 domains: 88.8% accuracy on ImageNet classification, 58.7 boxAP on COCO object detection as +well as 86.1% on Kinetics-400 video classification. + +
+ +
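As a rough illustration of the pooling attention and residual pooling connection described in the abstract, here is a toy single-head sketch; `PooledAttentionSketch` is hypothetical, its average pooling stands in for MViT's learned pooling operators, and the decomposed relative positional embeddings are omitted:

```python
import torch
import torch.nn as nn

class PooledAttentionSketch(nn.Module):
    """Toy pooling attention: tokens are spatially pooled to shrink the
    attention cost, and the pooled query is added back to the output
    (the residual pooling connection)."""

    def __init__(self, dim: int, stride: int = 2):
        super().__init__()
        self.qkv = nn.Linear(dim, dim * 3)
        self.pool = nn.AvgPool2d(stride)  # stand-in for MViT's learned pooling
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, h, w = x.shape                   # (B, C, H, W) token grid
        tokens = x.flatten(2).transpose(1, 2)  # (B, H*W, C)
        q, k, v = self.qkv(tokens).chunk(3, dim=-1)

        def pool(t: torch.Tensor) -> torch.Tensor:
            grid = t.transpose(1, 2).reshape(b, c, h, w)
            return self.pool(grid).flatten(2).transpose(1, 2)

        q, k, v = pool(q), pool(k), pool(v)    # fewer tokens, cheaper attention
        attn = (q @ k.transpose(-2, -1)) * c ** -0.5
        out = attn.softmax(dim=-1) @ v
        return self.proj(out + q)              # residual pooling connection

x = torch.randn(2, 96, 14, 14)
print(PooledAttentionSketch(96)(x).shape)  # torch.Size([2, 49, 96])
```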
+ +## Results and models + +### ImageNet-1k + +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------: | :----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------: | :--------------------------------------------------------------------------------------------------: | +| MViTv2-tiny\* | From scratch | 24.17 | 4.70 | 82.33 | 96.15 | [config](./mvitv2-tiny_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth) | +| MViTv2-small\* | From scratch | 34.87 | 7.00 | 83.63 | 96.51 | [config](./mvitv2-small_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth) | +| MViTv2-base\* | From scratch | 51.47 | 10.20 | 84.34 | 96.86 | [config](./mvitv2-base_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth) | +| MViTv2-large\* | From scratch | 217.99 | 42.10 | 85.25 | 97.14 | [config](./mvitv2-large_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth) | + +*Models with * are converted from the [official repo](https://github.com/facebookresearch/mvit). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +```bibtex +@inproceedings{li2021improved, + title={MViTv2: Improved multiscale vision transformers for classification and detection}, + author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph}, + booktitle={CVPR}, + year={2022} +} +``` diff --git a/configs/mvit/metafile.yml b/configs/mvit/metafile.yml new file mode 100644 index 00000000..4dfae023 --- /dev/null +++ b/configs/mvit/metafile.yml @@ -0,0 +1,95 @@ +Collections: + - Name: MViT V2 + Metadata: + Architecture: + - Attention Dropout + - Convolution + - Dense Connections + - GELU + - Layer Normalization + - Scaled Dot-Product Attention + - Attention Pooling + Paper: + URL: http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf + Title: 'MViTv2: Improved Multiscale Vision Transformers for Classification and Detection' + README: configs/mvit/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/v0.24.0/mmcls/models/backbones/mvit.py + Version: v0.24.0 + +Models: + - Name: mvitv2-tiny_3rdparty_in1k + In Collection: MViT V2 + Metadata: + FLOPs: 4703510768 + Parameters: 24173320 + Training Data: + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 82.33 + Top 5 Accuracy: 96.15 + Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth + Converted From: + Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_T_in1k.pyth + Code: https://github.com/facebookresearch/mvit + Config: configs/mvit/mvitv2-tiny_8xb256_in1k.py + + - Name: mvitv2-small_3rdparty_in1k + In Collection: MViT V2 + Metadata: + FLOPs: 6997555136 + Parameters: 34870216 + Training Data: + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 83.63 + Top 5 Accuracy: 96.51 + Weights: 
https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth + Converted From: + Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_S_in1k.pyth + Code: https://github.com/facebookresearch/mvit + Config: configs/mvit/mvitv2-small_8xb256_in1k.py + + - Name: mvitv2-base_3rdparty_in1k + In Collection: MViT V2 + Metadata: + FLOPs: 10157964400 + Parameters: 51472744 + Training Data: + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 84.34 + Top 5 Accuracy: 96.86 + Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth + Converted From: + Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in1k.pyth + Code: https://github.com/facebookresearch/mvit + Config: configs/mvit/mvitv2-base_8xb256_in1k.py + + - Name: mvitv2-large_3rdparty_in1k + In Collection: MViT V2 + Metadata: + FLOPs: 43868151412 + Parameters: 217992952 + Training Data: + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 85.25 + Top 5 Accuracy: 97.14 + Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth + Converted From: + Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in1k.pyth + Code: https://github.com/facebookresearch/mvit + Config: configs/mvit/mvitv2-large_8xb256_in1k.py diff --git a/configs/mvit/mvitv2-base_8xb256_in1k.py b/configs/mvit/mvitv2-base_8xb256_in1k.py new file mode 100644 index 00000000..ee3ec11e --- /dev/null +++ b/configs/mvit/mvitv2-base_8xb256_in1k.py @@ -0,0 +1,43 @@ +_base_ = [ + '../_base_/models/mvit/mvitv2-base.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=256) +val_dataloader = dict(batch_size=256) +test_dataloader = dict(batch_size=256) + +# schedule settings +optim_wrapper = dict( + optimizer=dict(lr=2.5e-4), + paramwise_cfg=dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + custom_keys={ + '.pos_embed': dict(decay_mult=0.0), + '.rel_pos_h': dict(decay_mult=0.0), + '.rel_pos_w': dict(decay_mult=0.0) + }), + clip_grad=dict(max_norm=1.0), +) + +# learning policy +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=1e-3, + by_epoch=True, + end=70, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. 
+auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/mvit/mvitv2-large_8xb256_in1k.py b/configs/mvit/mvitv2-large_8xb256_in1k.py new file mode 100644 index 00000000..eacddf96 --- /dev/null +++ b/configs/mvit/mvitv2-large_8xb256_in1k.py @@ -0,0 +1,43 @@ +_base_ = [ + '../_base_/models/mvit/mvitv2-large.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs2048_AdamW.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=256) +val_dataloader = dict(batch_size=256) +test_dataloader = dict(batch_size=256) + +# schedule settings +optim_wrapper = dict( + optimizer=dict(lr=2.5e-4), + paramwise_cfg=dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + custom_keys={ + '.pos_embed': dict(decay_mult=0.0), + '.rel_pos_h': dict(decay_mult=0.0), + '.rel_pos_w': dict(decay_mult=0.0) + }), + clip_grad=dict(max_norm=1.0), +) + +# learning policy +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=1e-3, + by_epoch=True, + end=70, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/mvit/mvitv2-small_8xb256_in1k.py b/configs/mvit/mvitv2-small_8xb256_in1k.py new file mode 100644 index 00000000..74cfd0a3 --- /dev/null +++ b/configs/mvit/mvitv2-small_8xb256_in1k.py @@ -0,0 +1,43 @@ +_base_ = [ + '../_base_/models/mvit/mvitv2-small.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs2048_AdamW.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=256) +val_dataloader = dict(batch_size=256) +test_dataloader = dict(batch_size=256) + +# schedule settings +optim_wrapper = dict( + optimizer=dict(lr=2.5e-4), + paramwise_cfg=dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + custom_keys={ + '.pos_embed': dict(decay_mult=0.0), + '.rel_pos_h': dict(decay_mult=0.0), + '.rel_pos_w': dict(decay_mult=0.0) + }), + clip_grad=dict(max_norm=1.0), +) + +# learning policy +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=1e-3, + by_epoch=True, + end=70, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. 
+auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/mvit/mvitv2-tiny_8xb256_in1k.py b/configs/mvit/mvitv2-tiny_8xb256_in1k.py new file mode 100644 index 00000000..4e563a2c --- /dev/null +++ b/configs/mvit/mvitv2-tiny_8xb256_in1k.py @@ -0,0 +1,43 @@ +_base_ = [ + '../_base_/models/mvit/mvitv2-tiny.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs2048_AdamW.py', + '../_base_/default_runtime.py' +] + +# dataset settings +train_dataloader = dict(batch_size=256) +val_dataloader = dict(batch_size=256) +test_dataloader = dict(batch_size=256) + +# schedule settings +optim_wrapper = dict( + optimizer=dict(lr=2.5e-4), + paramwise_cfg=dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + custom_keys={ + '.pos_embed': dict(decay_mult=0.0), + '.rel_pos_h': dict(decay_mult=0.0), + '.rel_pos_w': dict(decay_mult=0.0) + }), + clip_grad=dict(max_norm=1.0), +) + +# learning policy +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=1e-3, + by_epoch=True, + end=70, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/poolformer/README.md b/configs/poolformer/README.md index cc557e10..65f4e1da 100644 --- a/configs/poolformer/README.md +++ b/configs/poolformer/README.md @@ -16,13 +16,13 @@ Transformers have shown great potential in computer vision tasks. A common belie ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :--------------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------------: | :--------------------------------------------------------------------------: | -| PoolFormer-S12\* | 11.92 | 1.87 | 77.24 | 93.51 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/poolformer/poolformer-s12_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth) | -| PoolFormer-S24\* | 21.39 | 3.51 | 80.33 | 95.05 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/poolformer/poolformer-s24_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s24_3rdparty_32xb128_in1k_20220414-d7055904.pth) | -| PoolFormer-S36\* | 30.86 | 5.15 | 81.43 | 95.45 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/poolformer/poolformer-s36_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s36_3rdparty_32xb128_in1k_20220414-d78ff3e8.pth) | -| PoolFormer-M36\* | 56.17 | 8.96 | 82.14 | 95.71 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/poolformer/poolformer-m36_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m36_3rdparty_32xb128_in1k_20220414-c55e0949.pth) | -| PoolFormer-M48\* | 73.47 | 11.80 | 82.51 | 95.95 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/poolformer/poolformer-m48_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m48_3rdparty_32xb128_in1k_20220414-9378f3eb.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | 
Config | Download | +| :--------------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------: | :---------------------------------------------------------------------------------------------------------: | +| PoolFormer-S12\* | 11.92 | 1.87 | 77.24 | 93.51 | [config](./poolformer-s12_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth) | +| PoolFormer-S24\* | 21.39 | 3.51 | 80.33 | 95.05 | [config](./poolformer-s24_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s24_3rdparty_32xb128_in1k_20220414-d7055904.pth) | +| PoolFormer-S36\* | 30.86 | 5.15 | 81.43 | 95.45 | [config](./poolformer-s36_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s36_3rdparty_32xb128_in1k_20220414-d78ff3e8.pth) | +| PoolFormer-M36\* | 56.17 | 8.96 | 82.14 | 95.71 | [config](./poolformer-m36_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m36_3rdparty_32xb128_in1k_20220414-c55e0949.pth) | +| PoolFormer-M48\* | 73.47 | 11.80 | 82.51 | 95.95 | [config](./poolformer-m48_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m48_3rdparty_32xb128_in1k_20220414-9378f3eb.pth) | *Models with * are converted from the [official repo](https://github.com/sail-sg/poolformer). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/poolformer/metafile.yml b/configs/poolformer/metafile.yml index d94219d1..d0e40d2e 100644 --- a/configs/poolformer/metafile.yml +++ b/configs/poolformer/metafile.yml @@ -19,7 +19,7 @@ Models: Metadata: FLOPs: 1871399424 Parameters: 11915176 - In Collections: PoolFormer + In Collection: PoolFormer Results: - Dataset: ImageNet-1k Metrics: @@ -36,7 +36,7 @@ Models: Training Data: ImageNet-1k FLOPs: 3510411008 Parameters: 21388968 - In Collections: PoolFormer + In Collection: PoolFormer Results: - Dataset: ImageNet-1k Metrics: @@ -52,7 +52,7 @@ Models: Metadata: FLOPs: 5149422592 Parameters: 30862760 - In Collections: PoolFormer + In Collection: PoolFormer Results: - Dataset: ImageNet-1k Metrics: @@ -69,7 +69,7 @@ Models: Training Data: ImageNet-1k FLOPs: 8960175744 Parameters: 56172520 - In Collections: PoolFormer + In Collection: PoolFormer Results: - Dataset: ImageNet-1k Metrics: @@ -85,7 +85,7 @@ Models: Metadata: FLOPs: 11801805696 Parameters: 73473448 - In Collections: PoolFormer + In Collection: PoolFormer Results: - Dataset: ImageNet-1k Metrics: diff --git a/configs/regnet/README.md b/configs/regnet/README.md index 1ae074d6..9e718f75 100644 --- a/configs/regnet/README.md +++ b/configs/regnet/README.md @@ -16,24 +16,24 @@ In this work, we present a new network design paradigm. 
Our goal is to help adva
### ImageNet-1k

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :-------------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------------------: | :--------------------------------------------------------------------------: |
-| RegNetX-400MF | 5.16 | 0.41 | 72.56 | 90.78 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-400mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211208_143316.log.json) |
-| RegNetX-800MF | 7.26 | 0.81 | 74.76 | 92.32 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-800mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211213-222b0f11.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211207_143037.log.json) |
-| RegNetX-1.6GF | 9.19 | 1.63 | 76.84 | 93.31 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-1.6gf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211213-d1b89758.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211208_143018.log.json) |
-| RegNetX-3.2GF | 15.3 | 3.21 | 78.09 | 94.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-3.2gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211213-1fdd82ae.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211208_142720.log.json) |
-| RegNetX-4.0GF | 22.12 | 4.0 | 78.60 | 94.17 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-4.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211213-efed675c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211207_150431.log.json) |
-| RegNetX-6.4GF | 26.21 | 6.51 | 79.38 | 94.65 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-6.4gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211215-5c6089da.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211213_172748.log.json) |
-| RegNetX-8.0GF | 39.57 | 8.03 | 79.12 | 94.51 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-8.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211213-9a9fcc76.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211208_103250.log.json) |
-| RegNetX-12GF | 46.11 | 12.15 | 79.67 | 95.03 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-12gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211213-5df8c2f8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211208_143713.log.json) |
-| RegNetX-400MF\* | 5.16 | 0.41 | 72.55 | 90.91 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-400mf_8xb128_in1k) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-400MF-0db9f35c.pth) |
-| RegNetX-800MF\* | 7.26 | 0.81 | 75.21 | 92.37 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-800mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-800MF-4f9d1e8a.pth) |
-| RegNetX-1.6GF\* | 9.19 | 1.63 | 77.04 | 93.51 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-1.6gf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-1.6GF-cfb32375.pth) |
-| RegNetX-3.2GF\* | 15.3 | 3.21 | 78.26 | 94.20 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-3.2gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-3.2GF-82c43fd5.pth) |
-| RegNetX-4.0GF\* | 22.12 | 4.0 | 78.72 | 94.22 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-4.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-4.0GF-ef8bb32c.pth) |
-| RegNetX-6.4GF\* | 26.21 | 6.51 | 79.22 | 94.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-6.4gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-6.4GF-6888c0ea.pth) |
-| RegNetX-8.0GF\* | 39.57 | 8.03 | 79.31 | 94.57 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-8.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-8.0GF-cb4c77ec.pth) |
-| RegNetX-12GF\* | 46.11 | 12.15 | 79.91 | 94.78 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-12gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-12GF-0574538f.pth) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :-------------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------: | :------------------------------------------------------------------------------------------------------------: |
+| RegNetX-400MF | 5.16 | 0.41 | 72.56 | 90.78 | [config](./regnetx-400mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211208_143316.log.json) |
+| RegNetX-800MF | 7.26 | 0.81 | 74.76 | 92.32 | [config](./regnetx-800mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211213-222b0f11.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211207_143037.log.json) |
+| RegNetX-1.6GF | 9.19 | 1.63 | 76.84 | 93.31 | [config](./regnetx-1.6gf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211213-d1b89758.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211208_143018.log.json) |
+| RegNetX-3.2GF | 15.3 | 3.21 | 78.09 | 94.08 | [config](./regnetx-3.2gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211213-1fdd82ae.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211208_142720.log.json) |
+| RegNetX-4.0GF | 22.12 | 4.0 | 78.60 | 94.17 | [config](./regnetx-4.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211213-efed675c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211207_150431.log.json) |
+| RegNetX-6.4GF | 26.21 | 6.51 | 79.38 | 94.65 | [config](./regnetx-6.4gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211215-5c6089da.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211213_172748.log.json) |
+| RegNetX-8.0GF | 39.57 | 8.03 | 79.12 | 94.51 | [config](./regnetx-8.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211213-9a9fcc76.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211208_103250.log.json) |
+| RegNetX-12GF | 46.11 | 12.15 | 79.67 | 95.03 | [config](./regnetx-12gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211213-5df8c2f8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211208_143713.log.json) |
+| RegNetX-400MF\* | 5.16 | 0.41 | 72.55 | 90.91 | [config](./regnetx-400mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-400MF-0db9f35c.pth) |
+| RegNetX-800MF\* | 7.26 | 0.81 | 75.21 | 92.37 | [config](./regnetx-800mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-800MF-4f9d1e8a.pth) |
+| RegNetX-1.6GF\* | 9.19 | 1.63 | 77.04 | 93.51 | [config](./regnetx-1.6gf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-1.6GF-cfb32375.pth) |
+| RegNetX-3.2GF\* | 15.3 | 3.21 | 78.26 | 94.20 | [config](./regnetx-3.2gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-3.2GF-82c43fd5.pth) |
+| RegNetX-4.0GF\* | 22.12 | 4.0 | 78.72 | 94.22 | [config](./regnetx-4.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-4.0GF-ef8bb32c.pth) |
+| RegNetX-6.4GF\* | 26.21 | 6.51 | 79.22 | 94.61 | [config](./regnetx-6.4gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-6.4GF-6888c0ea.pth) |
+| RegNetX-8.0GF\* | 39.57 | 8.03 | 79.31 | 94.57 | [config](./regnetx-8.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-8.0GF-cb4c77ec.pth) |
+| RegNetX-12GF\* | 46.11 | 12.15 | 79.91 | 94.78 | [config](./regnetx-12gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-12GF-0574538f.pth) |

*Models with * are converted from [pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md). The config files of these models are only for validation.*
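Since the starred models are conversions whose configs are meant for validation only, a quick sanity check is to run inference with the converted weights. A minimal sketch, assuming an mmcls installation that provides `init_model`/`inference_model` and the repo's bundled `demo/demo.JPEG`; the config/checkpoint pair comes from the table above:

```python
# Hedged sketch: validate a converted ("*") checkpoint by running inference.
from mmcls.apis import inference_model, init_model

config = 'configs/regnet/regnetx-400mf_8xb128_in1k.py'
checkpoint = ('https://download.openmmlab.com/mmclassification/v0/regnet/'
              'convert/RegNetX-400MF-0db9f35c.pth')

model = init_model(config, checkpoint, device='cpu')  # weights fetched by URL
result = inference_model(model, 'demo/demo.JPEG')     # demo image in the repo
print(result)  # predicted class and score
```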
diff --git a/configs/repmlp/README.md b/configs/repmlp/README.md
index 45334635..181a9e69 100644
--- a/configs/repmlp/README.md
+++ b/configs/repmlp/README.md
@@ -18,8 +18,8 @@ We propose RepMLP, a multi-layer-perceptron-style neural network building block
| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :-----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------------------: | :---------------------------------------------------------------------------: |
-| RepMLP-B224\* | 68.24 | 6.71 | 80.41 | 95.12 | [train_cfg](https://github.com/open-mmlab/mmclassification/blob/master/configs/repmlp/repmlp-base_8xb64_in1k.py) \| [deploy_cfg](https://github.com/open-mmlab/mmclassification/blob/master/configs/repmlp/repmlp-base_delopy_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repmlp/repmlp-base_3rdparty_8xb64_in1k_20220330-1cb1f11b.pth) |
-| RepMLP-B256\* | 96.45 | 9.69 | 81.11 | 95.5 | [train_cfg](https://github.com/open-mmlab/mmclassification/blob/master/configs/repmlp/repmlp-base_8xb64_in1k-256px.py) \| [deploy_cfg](https://github.com/open-mmlab/mmclassification/blob/master/configs/repmlp/repmlp-b256_deploy_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repmlp/repmlp-base_3rdparty_8xb64_in1k-256px_20220330-7c5a91ce.pth) |
+| RepMLP-B224\* | 68.24 | 6.71 | 80.41 | 95.12 | [train_cfg](./repmlp-base_8xb64_in1k.py) \| [deploy_cfg](./repmlp-base_delopy_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repmlp/repmlp-base_3rdparty_8xb64_in1k_20220330-1cb1f11b.pth) |
+| RepMLP-B256\* | 96.45 | 9.69 | 81.11 | 95.5 | [train_cfg](./repmlp-base_8xb64_in1k-256px.py) \| [deploy_cfg](./repmlp-base_deploy_8xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/repmlp/repmlp-base_3rdparty_8xb64_in1k-256px_20220330-7c5a91ce.pth) |

*Models with * are converted from [the official repo.](https://github.com/DingXiaoH/RepMLP). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*

diff --git a/configs/repvgg/README.md b/configs/repvgg/README.md
index b9341326..a1bded13 100644
--- a/configs/repvgg/README.md
+++ b/configs/repvgg/README.md
@@ -18,18 +18,18 @@ We present a simple but powerful architecture of convolutional neural network, w
| Model | Epochs | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :-----------: | :----: | :-------------------------------: | :-----------------------------: | :-------: | :-------: | :----------------------------------------------: | :-------------------------------------------------: |
-| RepVGG-A0\* | 120 | 9.11(train) \| 8.31 (deploy) | 1.52 (train) \| 1.36 (deploy) | 72.41 | 90.50 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-A0_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth) |
-| RepVGG-A1\* | 120 | 14.09 (train) \| 12.79 (deploy) | 2.64 (train) \| 2.37 (deploy) | 74.47 | 91.85 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-A1_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth) |
-| RepVGG-A2\* | 120 | 28.21 (train) \| 25.5 (deploy) | 5.7 (train) \| 5.12 (deploy) | 76.48 | 93.01 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-A2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth) |
-| RepVGG-B0\* | 120 | 15.82 (train) \| 14.34 (deploy) | 3.42 (train) \| 3.06 (deploy) | 75.14 | 92.42 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B0_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth) |
-| RepVGG-B1\* | 120 | 57.42 (train) \| 51.83 (deploy) | 13.16 (train) \| 11.82 (deploy) | 78.37 | 94.11 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B1_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth) |
-| RepVGG-B1g2\* | 120 | 45.78 (train) \| 41.36 (deploy) | 9.82 (train) \| 8.82 (deploy) | 77.79 | 93.88 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth) |
-| RepVGG-B1g4\* | 120 | 39.97 (train) \| 36.13 (deploy) | 8.15 (train) \| 7.32 (deploy) | 77.58 | 93.84 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth) |
-| RepVGG-B2\* | 120 | 89.02 (train) \| 80.32 (deploy) | 20.46 (train) \| 18.39 (deploy) | 78.78 | 94.42 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth) |
-| RepVGG-B2g4\* | 200 | 61.76 (train) \| 55.78 (deploy) | 12.63 (train) \| 11.34 (deploy) | 79.38 | 94.68 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B2g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth) |
-| RepVGG-B3\* | 200 | 123.09 (train) \| 110.96 (deploy) | 29.17 (train) \| 26.22 (deploy) | 80.52 | 95.26 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B3_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth) |
-| RepVGG-B3g4\* | 200 | 83.83 (train) \| 75.63 (deploy) | 17.9 (train) \| 16.08 (deploy) | 80.22 | 95.10 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B3g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth) |
-| RepVGG-D2se\* | 200 | 133.33 (train) \| 120.39 (deploy) | 36.56 (train) \| 32.85 (deploy) | 81.81 | 95.94 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-D2se_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth) |
+| RepVGG-A0\* | 120 | 9.11(train) \| 8.31 (deploy) | 1.52 (train) \| 1.36 (deploy) | 72.41 | 90.50 | [config (train)](./repvgg-A0_4xb64-coslr-120e_in1k.py) \| [config (deploy)](./deploy/repvgg-A0_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth) |
+| RepVGG-A1\* | 120 | 14.09 (train) \| 12.79 (deploy) | 2.64 (train) \| 2.37 (deploy) | 74.47 | 91.85 | [config (train)](./repvgg-A1_4xb64-coslr-120e_in1k.py) \| [config (deploy)](./deploy/repvgg-A1_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth) |
+| RepVGG-A2\* | 120 | 28.21 (train) \| 25.5 (deploy) | 5.7 (train) \| 5.12 (deploy) | 76.48 | 93.01 | [config (train)](./repvgg-A2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-A2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth) |
+| RepVGG-B0\* | 120 | 15.82 (train) \| 14.34 (deploy) | 3.42 (train) \| 3.06 (deploy) | 75.14 | 92.42 | [config (train)](./repvgg-B0_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B0_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth) |
+| RepVGG-B1\* | 120 | 57.42 (train) \| 51.83 (deploy) | 13.16 (train) \| 11.82 (deploy) | 78.37 | 94.11 | [config (train)](./repvgg-B1_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B1_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth) |
+| RepVGG-B1g2\* | 120 | 45.78 (train) \| 41.36 (deploy) | 9.82 (train) \| 8.82 (deploy) | 77.79 | 93.88 | [config (train)](./repvgg-B1g2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth) |
+| RepVGG-B1g4\* | 120 | 39.97 (train) \| 36.13 (deploy) | 8.15 (train) \| 7.32 (deploy) | 77.58 | 93.84 | [config (train)](./repvgg-B1g4_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth) |
+| RepVGG-B2\* | 120 | 89.02 (train) \| 80.32 (deploy) | 20.46 (train) \| 18.39 (deploy) | 78.78 | 94.42 | [config (train)](./repvgg-B2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth) |
+| RepVGG-B2g4\* | 200 | 61.76 (train) \| 55.78 (deploy) | 12.63 (train) \| 11.34 (deploy) | 79.38 | 94.68 | [config (train)](./repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](./deploy/repvgg-B2g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth) |
+| RepVGG-B3\* | 200 | 123.09 (train) \| 110.96 (deploy) | 29.17 (train) \| 26.22 (deploy) | 80.52 | 95.26 | [config (train)](./repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](./deploy/repvgg-B3_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth) |
+| RepVGG-B3g4\* | 200 | 83.83 (train) \| 75.63 (deploy) | 17.9 (train) \| 16.08 (deploy) | 80.22 | 95.10 | [config (train)](./repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](./deploy/repvgg-B3g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth) |
+| RepVGG-D2se\* | 200 | 133.33 (train) \| 120.39 (deploy) | 36.56 (train) \| 32.85 (deploy) | 81.81 | 95.94 | [config (train)](./repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](./deploy/repvgg-D2se_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth) |

*Models with * are converted from the [official repo](https://github.com/DingXiaoH/RepVGG). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*
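The paired train/deploy configs exist because RepVGG's multi-branch training blocks can be fused into single 3x3 convolutions for inference. A rough sketch of that equivalence, assuming the mmcls RepVGG backbone exposes a `switch_to_deploy()` method and that `build_backbone` is importable, as in recent releases; treat it as illustrative rather than the exact deployment path:

```python
# Sketch of RepVGG reparameterization: outputs before and after fusing the
# training-time branches should agree up to numerical tolerance.
import torch
from mmcls.models import build_backbone

backbone = build_backbone(dict(type='RepVGG', arch='A0'))
backbone.eval()

x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    y_train = backbone(x)[-1]
    backbone.switch_to_deploy()  # fuse branches into plain 3x3 convs
    y_deploy = backbone(x)[-1]

print(torch.allclose(y_train, y_deploy, atol=1e-5))  # expected: True
```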
diff --git a/configs/res2net/README.md b/configs/res2net/README.md
index 895d0c5b..05ca9191 100644
--- a/configs/res2net/README.md
+++ b/configs/res2net/README.md
@@ -16,11 +16,11 @@ Representing features at multiple scales is of great importance for numerous vis
### ImageNet-1k

-| Model | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :------------------: | :--------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------: | :-------------------------------------------------------------------: |
-| Res2Net-50-14w-8s\* | 224x224 | 25.06 | 4.22 | 78.14 | 93.85 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/res2net/res2net50-w14-s8_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w14-s8_3rdparty_8xb32_in1k_20210927-bc967bf1.pth) |
-| Res2Net-50-26w-8s\* | 224x224 | 48.40 | 8.39 | 79.20 | 94.36 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/res2net/res2net50-w26-s8_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w26-s8_3rdparty_8xb32_in1k_20210927-f547a94b.pth) |
-| Res2Net-101-26w-4s\* | 224x224 | 45.21 | 8.12 | 79.19 | 94.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/res2net/res2net101-w26-s4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net101-w26-s4_3rdparty_8xb32_in1k_20210927-870b6c36.pth) |
+| Model | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :------------------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------: | :------------------------------------------------------------------------------------------: |
+| Res2Net-50-14w-8s\* | 224x224 | 25.06 | 4.22 | 78.14 | 93.85 | [config](./res2net50-w14-s8_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w14-s8_3rdparty_8xb32_in1k_20210927-bc967bf1.pth) |
+| Res2Net-50-26w-8s\* | 224x224 | 48.40 | 8.39 | 79.20 | 94.36 | [config](./res2net50-w26-s8_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w26-s8_3rdparty_8xb32_in1k_20210927-f547a94b.pth) |
+| Res2Net-101-26w-4s\* | 224x224 | 45.21 | 8.12 | 79.19 | 94.44 | [config](./res2net101-w26-s4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net101-w26-s4_3rdparty_8xb32_in1k_20210927-870b6c36.pth) |

*Models with * are converted from the [official repo](https://github.com/Res2Net/Res2Net-PretrainedModels). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*

diff --git a/configs/resnet/README.md b/configs/resnet/README.md
index f1d32eff..69f128df 100644
--- a/configs/resnet/README.md
+++ b/configs/resnet/README.md
@@ -26,41 +26,41 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don'
### Cifar10

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :--------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------------------------------: | :-----------------------------------------------------------------------------: |
-| ResNet-18 | 11.17 | 0.56 | 94.82 | 99.87 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet18_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_b16x8_cifar10_20210528-bd6371c8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_b16x8_cifar10_20210528-bd6371c8.log.json) |
-| ResNet-34 | 21.28 | 1.16 | 95.34 | 99.87 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet34_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_b16x8_cifar10_20210528-a8aa36a6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_b16x8_cifar10_20210528-a8aa36a6.log.json) |
-| ResNet-50 | 23.52 | 1.31 | 95.55 | 99.91 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_b16x8_cifar10_20210528-f54bfad9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_b16x8_cifar10_20210528-f54bfad9.log.json) |
-| ResNet-101 | 42.51 | 2.52 | 95.58 | 99.87 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet101_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_b16x8_cifar10_20210528-2d29e936.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_b16x8_cifar10_20210528-2d29e936.log.json) |
-| ResNet-152 | 58.16 | 3.74 | 95.76 | 99.89 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet152_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_b16x8_cifar10_20210528-3e8e9178.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_b16x8_cifar10_20210528-3e8e9178.log.json) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :--------: | :-------: | :------: | :-------: | :-------: | :------------------------------------: | :-------------------------------------------------------------------------------------------------------------------: |
+| ResNet-18 | 11.17 | 0.56 | 94.82 | 99.87 | [config](./resnet18_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_b16x8_cifar10_20210528-bd6371c8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_b16x8_cifar10_20210528-bd6371c8.log.json) |
+| ResNet-34 | 21.28 | 1.16 | 95.34 | 99.87 | [config](./resnet34_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_b16x8_cifar10_20210528-a8aa36a6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_b16x8_cifar10_20210528-a8aa36a6.log.json) |
+| ResNet-50 | 23.52 | 1.31 | 95.55 | 99.91 | [config](./resnet50_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_b16x8_cifar10_20210528-f54bfad9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_b16x8_cifar10_20210528-f54bfad9.log.json) |
+| ResNet-101 | 42.51 | 2.52 | 95.58 | 99.87 | [config](./resnet101_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_b16x8_cifar10_20210528-2d29e936.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_b16x8_cifar10_20210528-2d29e936.log.json) |
+| ResNet-152 | 58.16 | 3.74 | 95.76 | 99.89 | [config](./resnet152_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_b16x8_cifar10_20210528-3e8e9178.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_b16x8_cifar10_20210528-3e8e9178.log.json) |

### Cifar100

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :-------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------------------: | :-----------------------------------------------------------------------------: |
-| ResNet-50 | 23.71 | 1.31 | 79.90 | 95.19 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb16_cifar100.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_b16x8_cifar100_20210528-67b58a1b.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_b16x8_cifar100_20210528-67b58a1b.log.json) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :-------: | :-------: | :------: | :-------: | :-------: | :------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: |
+| ResNet-50 | 23.71 | 1.31 | 79.90 | 95.19 | [config](./resnet50_8xb16_cifar100.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_b16x8_cifar100_20210528-67b58a1b.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_b16x8_cifar100_20210528-67b58a1b.log.json) |

### ImageNet-1k

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :----------------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------------: | :-------------------------------------------------------------------------: |
-| ResNet-18 | 11.69 | 1.82 | 69.90 | 89.43 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet18_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.log.json) |
-| ResNet-34 | 21.8 | 3.68 | 73.62 | 91.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet34_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_8xb32_in1k_20210831-f257d4e6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_8xb32_in1k_20210831-f257d4e6.log.json) |
-| ResNet-50 | 25.56 | 4.12 | 76.55 | 93.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.log.json) |
-| ResNet-101 | 44.55 | 7.85 | 77.97 | 94.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.log.json) |
-| ResNet-152 | 60.19 | 11.58 | 78.48 | 94.13 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet152_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_8xb32_in1k_20210901-4d7582fa.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_8xb32_in1k_20210901-4d7582fa.log.json) |
-| ResNetV1C-50 | 25.58 | 4.36 | 77.01 | 93.58 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1c50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c50_8xb32_in1k_20220214-3343eccd.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c50_8xb32_in1k_20220214-3343eccd.log.json) |
-| ResNetV1C-101 | 44.57 | 8.09 | 78.30 | 94.27 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1c101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c101_8xb32_in1k_20220214-434fe45f.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c101_8xb32_in1k_20220214-434fe45f.log.json) |
-| ResNetV1C-152 | 60.21 | 11.82 | 78.76 | 94.41 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1c152_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c152_8xb32_in1k_20220214-c013291f.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c152_8xb32_in1k_20220214-c013291f.log.json) |
-| ResNetV1D-50 | 25.58 | 4.36 | 77.54 | 93.57 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.log.json) |
-| ResNetV1D-101 | 44.57 | 8.09 | 78.93 | 94.48 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.log.json) |
-| ResNetV1D-152 | 60.21 | 11.82 | 79.41 | 94.70 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d152_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.log.json) |
-| ResNet-50 (fp16) | 25.56 | 4.12 | 76.30 | 93.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb32-fp16_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/fp16/resnet50_batch256_fp16_imagenet_20210320-b3964210.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/fp16/resnet50_batch256_fp16_imagenet_20210320-b3964210.log.json) |
-| Wide-ResNet-50\* | 68.88 | 11.44 | 78.48 | 94.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/wide-resnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/wide-resnet50_3rdparty_8xb32_in1k_20220304-66678344.pth) |
-| Wide-ResNet-101\* | 126.89 | 22.81 | 78.84 | 94.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/wide-resnet101_3rdparty_8xb32_in1k_20220304-8d5f9d61.pth) |
-| ResNet-50 (rsb-a1) | 25.56 | 4.12 | 80.12 | 94.78 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.log.json) |
-| ResNet-50 (rsb-a2) | 25.56 | 4.12 | 79.55 | 94.37 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb256-rsb-a2-300e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a2-300e_in1k_20211228-0fd8be6e.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a2-300e_in1k_20211228-0fd8be6e.log.json) |
-| ResNet-50 (rsb-a3) | 25.56 | 4.12 | 78.30 | 93.80 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb256-rsb-a3-100e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a3-100e_in1k_20211228-3493673c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a3-100e_in1k_20211228-3493673c.log.json) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :----------------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------: | :--------------------------------------------------------------------------------------------------: |
+| ResNet-18 | 11.69 | 1.82 | 69.90 | 89.43 | [config](./resnet18_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.log.json) |
+| ResNet-34 | 21.8 | 3.68 | 73.62 | 91.59 | [config](./resnet34_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_8xb32_in1k_20210831-f257d4e6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_8xb32_in1k_20210831-f257d4e6.log.json) |
+| ResNet-50 | 25.56 | 4.12 | 76.55 | 93.06 | [config](./resnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.log.json) |
+| ResNet-101 | 44.55 | 7.85 | 77.97 | 94.06 | [config](./resnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.log.json) |
+| ResNet-152 | 60.19 | 11.58 | 78.48 | 94.13 | [config](./resnet152_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_8xb32_in1k_20210901-4d7582fa.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_8xb32_in1k_20210901-4d7582fa.log.json) |
+| ResNetV1C-50 | 25.58 | 4.36 | 77.01 | 93.58 | [config](./resnetv1c50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c50_8xb32_in1k_20220214-3343eccd.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c50_8xb32_in1k_20220214-3343eccd.log.json) |
+| ResNetV1C-101 | 44.57 | 8.09 | 78.30 | 94.27 | [config](./resnetv1c101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c101_8xb32_in1k_20220214-434fe45f.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c101_8xb32_in1k_20220214-434fe45f.log.json) |
+| ResNetV1C-152 | 60.21 | 11.82 | 78.76 | 94.41 | [config](./resnetv1c152_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c152_8xb32_in1k_20220214-c013291f.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1c152_8xb32_in1k_20220214-c013291f.log.json) |
+| ResNetV1D-50 | 25.58 | 4.36 | 77.54 | 93.57 | [config](./resnetv1d50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.log.json) |
+| ResNetV1D-101 | 44.57 | 8.09 | 78.93 | 94.48 | [config](./resnetv1d101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.log.json) |
+| ResNetV1D-152 | 60.21 | 11.82 | 79.41 | 94.70 | [config](./resnetv1d152_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.log.json) |
+| ResNet-50 (fp16) | 25.56 | 4.12 | 76.30 | 93.07 | [config](./resnet50_8xb32-fp16_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/fp16/resnet50_batch256_fp16_imagenet_20210320-b3964210.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/fp16/resnet50_batch256_fp16_imagenet_20210320-b3964210.log.json) |
+| Wide-ResNet-50\* | 68.88 | 11.44 | 78.48 | 94.08 | [config](../wrn/wide-resnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/wide-resnet50_3rdparty_8xb32_in1k_20220304-66678344.pth) |
+| Wide-ResNet-101\* | 126.89 | 22.81 | 78.84 | 94.28 | [config](../wrn/wide-resnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/wide-resnet101_3rdparty_8xb32_in1k_20220304-8d5f9d61.pth) |
+| ResNet-50 (rsb-a1) | 25.56 | 4.12 | 80.12 | 94.78 | [config](./resnet50_8xb256-rsb-a1-600e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.log.json) |
+| ResNet-50 (rsb-a2) | 25.56 | 4.12 | 79.55 | 94.37 | [config](./resnet50_8xb256-rsb-a2-300e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a2-300e_in1k_20211228-0fd8be6e.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a2-300e_in1k_20211228-0fd8be6e.log.json) |
+| ResNet-50 (rsb-a3) | 25.56 | 4.12 | 78.30 | 93.80 | [config](./resnet50_8xb256-rsb-a3-100e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a3-100e_in1k_20211228-3493673c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a3-100e_in1k_20211228-3493673c.log.json) |

*The "rsb" means using the training settings from [ResNet strikes back: An improved training procedure in timm](https://arxiv.org/abs/2110.00476).*

@@ -68,9 +68,9 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don'
### CUB-200-2011

-| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Config | Download |
-| :-------: | :--------------------------------------------------: | :--------: | :-------: | :------: | :-------: | :------------------------------------------------: | :---------------------------------------------------: |
-| ResNet-50 | [ImageNet-21k-mill](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_3rdparty-mill_in21k_20220331-faac000b.pth) | 448x448 | 23.92 | 16.48 | 88.45 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb8_cub.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cub_20220307-57840e60.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cub_20220307-57840e60.log.json) |
+| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Config | Download |
+| :-------: | :-----------------------------------------------------------: | :--------: | :-------: | :------: | :-------: | :------------------------------: | :------------------------------------------------------------: |
+| ResNet-50 | [ImageNet-21k-mill](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_3rdparty-mill_in21k_20220331-faac000b.pth) | 448x448 | 23.92 | 16.48 | 88.45 | [config](./resnet50_8xb8_cub.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cub_20220307-57840e60.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cub_20220307-57840e60.log.json) |

## Citation
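For the CUB-200-2011 row, the ImageNet-21k-mill checkpoint is loaded into the backbone before fine-tuning. A trimmed, illustrative fragment in the spirit of `resnet50_8xb8_cub.py` (the pretrain URL comes from the table; the surrounding field values are assumptions about the config layout):

```python
# Hypothetical fine-tuning config fragment: initialize the backbone from the
# ImageNet-21k-mill weights, then train a 200-class head on CUB.
checkpoint = ('https://download.openmmlab.com/mmclassification/v0/resnet/'
              'resnet50_3rdparty-mill_in21k_20220331-faac000b.pth')

model = dict(
    backbone=dict(
        init_cfg=dict(type='Pretrained', checkpoint=checkpoint,
                      prefix='backbone')),  # load only backbone weights
    head=dict(num_classes=200),
)
```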
diff --git a/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py b/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py
index 3d2d5894..1c213127 100644
--- a/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py
+++ b/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py
@@ -16,6 +16,7 @@ model = dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
+           use_sigmoid=True,
        )),
    train_cfg=dict(augments=[
        dict(type='Mixup', alpha=0.2, num_classes=1000),
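The `use_sigmoid=True` added above makes `LabelSmoothLoss` score the smoothed targets with binary cross-entropy instead of softmax cross-entropy, matching the BCE-style loss the RSB recipes pair with Mixup. A self-contained sketch of the difference (illustrative only; mmcls' actual reduction details may differ):

```python
import torch
import torch.nn.functional as F

num_classes, smooth = 10, 0.1
logits = torch.randn(2, num_classes)
labels = torch.tensor([3, 7])

# mode='original' label smoothing: redistribute `smooth` mass uniformly.
one_hot = F.one_hot(labels, num_classes).float()
target = one_hot * (1 - smooth) + smooth / num_classes

# Without use_sigmoid: softmax cross-entropy against the smoothed targets.
softmax_ce = torch.sum(-target * F.log_softmax(logits, dim=1), dim=1).mean()
# With use_sigmoid=True: per-class binary cross-entropy on the same targets.
sigmoid_bce = F.binary_cross_entropy_with_logits(logits, target)
print(softmax_ce.item(), sigmoid_bce.item())
```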
diff --git a/configs/resnext/README.md b/configs/resnext/README.md
index 56df277e..7aec9d48 100644
--- a/configs/resnext/README.md
+++ b/configs/resnext/README.md
@@ -16,12 +16,12 @@ We present a simple, highly modularized network architecture for image classific
### ImageNet-1k

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :---------------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------------: | :-------------------------------------------------------------------------: |
-| ResNeXt-32x4d-50 | 25.03 | 4.27 | 77.90 | 93.66 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext50-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.log.json) |
-| ResNeXt-32x4d-101 | 44.18 | 8.03 | 78.61 | 94.17 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext101-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.log.json) |
-| ResNeXt-32x8d-101 | 88.79 | 16.5 | 79.27 | 94.58 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext101-32x8d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.log.json) |
-| ResNeXt-32x4d-152 | 59.95 | 11.8 | 78.88 | 94.33 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext152-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.log.json) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :---------------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------: | :--------------------------------------------------------------------------------------------------------: |
+| ResNeXt-32x4d-50 | 25.03 | 4.27 | 77.90 | 93.66 | [config](./resnext50-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.log.json) |
+| ResNeXt-32x4d-101 | 44.18 | 8.03 | 78.61 | 94.17 | [config](./resnext101-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.log.json) |
+| ResNeXt-32x8d-101 | 88.79 | 16.5 | 79.27 | 94.58 | [config](./resnext101-32x8d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.log.json) |
+| ResNeXt-32x4d-152 | 59.95 | 11.8 | 78.88 | 94.33 | [config](./resnext152-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.log.json) |

## Citation

diff --git a/configs/seresnet/README.md b/configs/seresnet/README.md
index ccfd1d15..43ba2444 100644
--- a/configs/seresnet/README.md
+++ b/configs/seresnet/README.md
@@ -16,10 +16,10 @@ The central building block of convolutional neural networks (CNNs) is the convol
### ImageNet-1k

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :-----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------------------: | :---------------------------------------------------------------------------: |
-| SE-ResNet-50 | 28.09 | 4.13 | 77.74 | 93.84 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/seresnet/seresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200708-657b3c36.log.json) |
-| SE-ResNet-101 | 49.33 | 7.86 | 78.26 | 94.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/seresnet/seresnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200708-038a4d04.log.json) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :-----------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------: | :-----------------------------------------------------------------------------------------------------------------: |
+| SE-ResNet-50 | 28.09 | 4.13 | 77.74 | 93.84 | [config](./seresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200708-657b3c36.log.json) |
+| SE-ResNet-101 | 49.33 | 7.86 | 78.26 | 94.07 | [config](./seresnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200708-038a4d04.log.json) |

## Citation

diff --git a/configs/shufflenet_v1/README.md b/configs/shufflenet_v1/README.md
index fd131279..2299862c 100644
--- a/configs/shufflenet_v1/README.md
+++ b/configs/shufflenet_v1/README.md
@@ -16,9 +16,9 @@ We introduce an extremely computation-efficient CNN architecture named ShuffleNe
### ImageNet-1k

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :-------------------------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------------: | :--------------------------------------------------------------------: |
-| ShuffleNetV1 1.0x (group=3) | 1.87 | 0.146 | 68.13 | 87.81 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.log.json) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :-------------------------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------: | :---------------------------------------------------------------------------------------------: |
+| ShuffleNetV1 1.0x (group=3) | 1.87 | 0.146 | 68.13 | 87.81 | [config](./shufflenet-v1-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.log.json) |

## Citation

diff --git a/configs/shufflenet_v2/README.md b/configs/shufflenet_v2/README.md
index 78271543..fd2b7bc2 100644
--- a/configs/shufflenet_v2/README.md
+++ b/configs/shufflenet_v2/README.md
@@ -16,9 +16,9 @@ Currently, the neural network architecture design is mostly guided by the *indir
### ImageNet-1k

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :---------------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------------: | :-------------------------------------------------------------------------: |
-| ShuffleNetV2 1.0x | 2.28 | 0.149 | 69.55 | 88.92 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200804-8860eec9.log.json) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :---------------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------: | :-------------------------------------------------------------------------------------------------------: |
+| ShuffleNetV2 1.0x | 2.28 | 0.149 | 69.55 | 88.92 | [config](./shufflenet-v2-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200804-8860eec9.log.json) |

## Citation
diff --git a/configs/swin_transformer/README.md b/configs/swin_transformer/README.md
index 86975ec8..ad859ff7 100644
--- a/configs/swin_transformer/README.md
+++ b/configs/swin_transformer/README.md
@@ -27,26 +27,26 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don'
### ImageNet-1k

-| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------: | :-------------------------------------------------------------------: |
-| Swin-T | From scratch | 224x224 | 28.29 | 4.36 | 81.18 | 95.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-tiny_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925.log.json) |
-| Swin-S | From scratch | 224x224 | 49.61 | 8.52 | 83.02 | 96.29 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-small_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219-7f9d988b.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219.log.json) |
-| Swin-B | From scratch | 224x224 | 87.77 | 15.14 | 83.36 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin_base_224_b16x64_300e_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_base_224_b16x64_300e_imagenet_20210616_190742-93230b0d.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_base_224_b16x64_300e_imagenet_20210616_190742.log.json) |
-| Swin-S\* | From scratch | 224x224 | 49.61 | 8.52 | 83.21 | 96.25 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-small_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_small_patch4_window7_224-cc7a01c9.pth) |
-| Swin-B\* | From scratch | 224x224 | 87.77 | 15.14 | 83.42 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-base_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window7_224-4670dd19.pth) |
-| Swin-B\* | From scratch | 384x384 | 87.90 | 44.49 | 84.49 | 96.95 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-base_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window12_384-02c598a4.pth) |
-| Swin-B\* | ImageNet-21k | 224x224 | 87.77 | 15.14 | 85.16 | 97.50 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-base_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window7_224_22kto1k-f967f799.pth) |
-| Swin-B\* | ImageNet-21k | 384x384 | 87.90 | 44.49 | 86.44 | 98.05 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-base_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window12_384_22kto1k-d59b0d1d.pth) |
-| Swin-L\* | ImageNet-21k | 224x224 | 196.53 | 34.04 | 86.24 | 97.88 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window7_224_22kto1k-5f0996db.pth) |
-| Swin-L\* | ImageNet-21k | 384x384 | 196.74 | 100.04 | 87.25 | 98.25 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-large_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window12_384_22kto1k-0a40944b.pth) |
+| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------: | :------------------------------------------------------------------------------------------: |
+| Swin-T | From scratch | 224x224 | 28.29 | 4.36 | 81.18 | 95.61 | [config](./swin-tiny_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925.log.json) |
+| Swin-S | From scratch | 224x224 | 49.61 | 8.52 | 83.02 | 96.29 | [config](./swin-small_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219-7f9d988b.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219.log.json) |
+| Swin-B | From scratch | 224x224 | 87.77 | 15.14 | 83.36 | 96.44 | [config](./swin-base_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_base_224_b16x64_300e_imagenet_20210616_190742-93230b0d.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_base_224_b16x64_300e_imagenet_20210616_190742.log.json) |
+| Swin-S\* | From scratch | 224x224 | 49.61 | 8.52 | 83.21 | 96.25 | [config](./swin-small_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_small_patch4_window7_224-cc7a01c9.pth) |
+| Swin-B\* | From scratch | 224x224 | 87.77 | 15.14 | 83.42 | 96.44 | [config](./swin-base_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window7_224-4670dd19.pth) |
+| Swin-B\* | From scratch | 384x384 | 87.90 | 44.49 | 84.49 | 96.95 | [config](./swin-base_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window12_384-02c598a4.pth) |
+| Swin-B\* | ImageNet-21k | 224x224 | 87.77 | 15.14 | 85.16 | 97.50 | [config](./swin-base_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window7_224_22kto1k-f967f799.pth) |
+| Swin-B\* | ImageNet-21k | 384x384 | 87.90 | 44.49 | 86.44 | 98.05 | [config](./swin-base_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window12_384_22kto1k-d59b0d1d.pth) |
+| Swin-L\* | ImageNet-21k | 224x224 | 196.53 | 34.04 | 86.24 | 97.88 | [config](./swin-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window7_224_22kto1k-5f0996db.pth) |
+| Swin-L\* | ImageNet-21k | 384x384 | 196.74 | 100.04 | 87.25 | 98.25 | [config](./swin-large_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window12_384_22kto1k-0a40944b.pth) |

*Models with * are converted from the [official repo](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*

### CUB-200-2011

-| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Config | Download |
-| :----: | :---------------------------------------------------: | :--------: | :-------: | :------: | :-------: | :-------------------------------------------------: | :----------------------------------------------------: |
-| Swin-L | [ImageNet-21k](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin-base_3rdparty_in21k-384px.pth) | 384x384 | 195.51 | 100.04 | 91.87 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-large_8xb8_cub_384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin-large_8xb8_cub_384px_20220307-1bbaee6a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin-large_8xb8_cub_384px_20220307-1bbaee6a.log.json) |
+| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Config | Download |
+| :----: | :---------------------------------------------------------: | :--------: | :-------: | :------: | :-------: | :--------------------------------------: | :---------------------------------------------------------: |
+| Swin-L | [ImageNet-21k](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin-base_3rdparty_in21k-384px.pth) | 384x384 | 195.51 | 100.04 | 91.87 | [config](./swin-large_8xb8_cub_384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin-large_8xb8_cub_384px_20220307-1bbaee6a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin-large_8xb8_cub_384px_20220307-1bbaee6a.log.json) |

## Citation

diff --git a/configs/swin_transformer_v2/README.md b/configs/swin_transformer_v2/README.md
new file mode 100644
index 00000000..52b8d365
--- /dev/null
+++ b/configs/swin_transformer_v2/README.md
@@ -0,0 +1,58 @@
+# Swin Transformer V2
+
+> [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883.pdf)
+
+
+
+## Abstract
+
+Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data.
Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time. + +
+ +
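The three techniques above map to fairly small code changes. As a rough PyTorch sketch (an illustration only, not the implementation this patch relies on, which lives in the `SwinTransformerV2` backbone in mmcls; both helper names are hypothetical), the residual-post-norm of technique 1 is a block-level reordering, while the cosine attention and the log-spaced position-bias coordinates look roughly like this:

```python
import torch
import torch.nn.functional as F


def cosine_attention(q, k, v, tau, bias):
    # Attention logits are cosine similarities divided by a learnable
    # temperature tau (clamped, per the paper, to be at least 0.01),
    # plus the relative position bias, instead of q @ k.T / sqrt(d).
    sim = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
    attn = (sim / tau.clamp(min=0.01) + bias).softmax(dim=-1)
    return attn @ v


def log_spaced_coords(window_size: int, pretrained_window_size: int):
    # Relative coordinates are rescaled to [-8, 8] and mapped through
    # sign(x) * log2(1 + |x|) / log2(8), so a bias MLP trained with a
    # small window extrapolates smoothly to a larger one.
    r = torch.arange(-(window_size - 1), window_size, dtype=torch.float32)
    coords = torch.stack(torch.meshgrid(r, r, indexing='ij'), dim=-1)
    coords = coords / (pretrained_window_size - 1) * 8
    return torch.sign(coords) * torch.log2(coords.abs() + 1.0) / 3.0
```

The `pretrained_window_sizes=[12, 12, 12, 6]` overrides in the fine-tuning configs added later in this patch drive this kind of coordinate remapping, which is what lets the 192x192 ImageNet-21k models move to 256x256 and 384x384 inputs.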
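Every checkpoint in the tables below is converted from third-party weights, so a quick smoke test is the fastest way to confirm a local setup before contributing reproduction results. A minimal sketch, assuming a checkout of this repo as the working directory, an mmcls 1.x rc environment, and the `mmcls.apis` entry points `init_model` / `inference_model`; the config/checkpoint pair is taken from the ImageNet-1k table below:

```python
from mmengine.config import Config
from mmcls.apis import inference_model, init_model

# `_base_` inheritance is resolved at load time, so the window_size override
# in this config is merged into the model settings shared via base_256.py.
cfg = Config.fromfile(
    'configs/swin_transformer_v2/swinv2-base-w16_16xb64_in1k-256px.py')
print(cfg.model.backbone.window_size)  # [16, 16, 16, 8]

ckpt = ('https://download.openmmlab.com/mmclassification/v0/swin-v2/'
        'swinv2-base-w16_3rdparty_in1k-256px_20220803-5a1886b7.pth')
model = init_model(cfg, ckpt, device='cpu')
result = inference_model(model, 'demo/demo.JPEG')  # demo image in the repo
print(result['pred_class'], result['pred_score'])
```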
+
+## Results and models
+
+### ImageNet-21k
+
+The pre-trained models on ImageNet-21k are only used for fine-tuning, and therefore don't have evaluation results.
+
+| Model | resolution | Params(M) | Flops(G) | Download |
+| :------: | :--------: | :-------: | :------: | :--------------------------------------------------------------------------------------------------------------------------------------: |
+| Swin-B\* | 192x192 | 87.92 | 8.51 | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/pretrain/swinv2-base-w12_3rdparty_in21k-192px_20220803-f7dc9763.pth) |
+| Swin-L\* | 192x192 | 196.74 | 19.04 | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/pretrain/swinv2-large-w12_3rdparty_in21k-192px_20220803-d9073fee.pth) |
+
+### ImageNet-1k
+
+| Model | Pretrain | resolution | window | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :------: | :----------: | :--------: | :----: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------: | :--------------------------------------------------------------------: |
+| Swin-T\* | From scratch | 256x256 | 8x8 | 28.35 | 4.35 | 81.76 | 95.87 | [config](./swinv2-tiny-w8_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-tiny-w8_3rdparty_in1k-256px_20220803-e318968f.pth) |
+| Swin-T\* | From scratch | 256x256 | 16x16 | 28.35 | 4.4 | 82.81 | 96.23 | [config](./swinv2-tiny-w16_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-tiny-w16_3rdparty_in1k-256px_20220803-9651cdd7.pth) |
+| Swin-S\* | From scratch | 256x256 | 8x8 | 49.73 | 8.45 | 83.74 | 96.6 | [config](./swinv2-small-w8_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-small-w8_3rdparty_in1k-256px_20220803-b01a4332.pth) |
+| Swin-S\* | From scratch | 256x256 | 16x16 | 49.73 | 8.57 | 84.13 | 96.83 | [config](./swinv2-small-w16_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-small-w16_3rdparty_in1k-256px_20220803-b707d206.pth) |
+| Swin-B\* | From scratch | 256x256 | 8x8 | 87.92 | 14.99 | 84.2 | 96.86 | [config](./swinv2-base-w8_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w8_3rdparty_in1k-256px_20220803-8ff28f2b.pth) |
+| Swin-B\* | From scratch | 256x256 | 16x16 | 87.92 | 15.14 | 84.6 | 97.05 | [config](./swinv2-base-w16_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w16_3rdparty_in1k-256px_20220803-5a1886b7.pth) |
+| Swin-B\* | ImageNet-21k | 256x256 | 16x16 | 87.92 | 15.14 | 86.17 | 97.88 | [config](./swinv2-base-w16_in21k-pre_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w16_in21k-pre_3rdparty_in1k-256px_20220803-8d7aa8ad.pth) |
+| Swin-B\* | ImageNet-21k | 384x384 | 24x24 | 87.92 | 34.07 | 87.14 | 98.23 | [config](./swinv2-base-w24_in21k-pre_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w24_in21k-pre_3rdparty_in1k-384px_20220803-44eb70f8.pth) |
+| Swin-L\* | ImageNet-21k | 256x256 | 16x16 | 196.75 | 33.86 | 86.93 | 98.06 | [config](./swinv2-large-w16_in21k-pre_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-large-w16_in21k-pre_3rdparty_in1k-256px_20220803-c40cbed7.pth) |
+| Swin-L\* | ImageNet-21k | 384x384 | 24x24 | 196.75 |
76.2 | 87.59 | 98.27 | [config](./swinv2-large-w24_in21k-pre_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-large-w24_in21k-pre_3rdparty_in1k-384px_20220803-3b36c165.pth) |
+
+*Models with * are converted from the [official repo](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*
+
+*The ImageNet-21k pre-trained models with input resolutions of 256x256 and 384x384 are both fine-tuned from the same pre-training model, which uses a smaller input resolution of 192x192.*
+
+## Citation
+
+```
+@article{https://doi.org/10.48550/arxiv.2111.09883,
+  doi = {10.48550/ARXIV.2111.09883},
+  url = {https://arxiv.org/abs/2111.09883},
+  author = {Liu, Ze and Hu, Han and Lin, Yutong and Yao, Zhuliang and Xie, Zhenda and Wei, Yixuan and Ning, Jia and Cao, Yue and Zhang, Zheng and Dong, Li and Wei, Furu and Guo, Baining},
+  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title = {Swin Transformer V2: Scaling Up Capacity and Resolution},
+  publisher = {arXiv},
+  year = {2021},
+  copyright = {Creative Commons Attribution 4.0 International}
+}
+```
diff --git a/configs/swin_transformer_v2/metafile.yml b/configs/swin_transformer_v2/metafile.yml
new file mode 100644
index 00000000..5d36e97a
--- /dev/null
+++ b/configs/swin_transformer_v2/metafile.yml
@@ -0,0 +1,204 @@
+Collections:
+  - Name: Swin-Transformer V2
+    Metadata:
+      Training Data: ImageNet-1k
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 16x V100 GPUs
+      Epochs: 300
+      Batch Size: 1024
+      Architecture:
+        - Shift Window Multihead Self Attention
+    Paper:
+      URL: https://arxiv.org/abs/2111.09883.pdf
+      Title: "Swin Transformer V2: Scaling Up Capacity and Resolution"
+    README: configs/swin_transformer_v2/README.md
+
+Models:
+  - Name: swinv2-tiny-w8_3rdparty_in1k-256px
+    Metadata:
+      FLOPs: 4350000000
+      Parameters: 28350000
+    In Collection: Swin-Transformer V2
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 81.76
+          Top 5 Accuracy: 95.87
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-tiny-w8_3rdparty_in1k-256px_20220803-e318968f.pth
+    Config: configs/swin_transformer_v2/swinv2-tiny-w8_16xb64_in1k-256px.py
+    Converted From:
+      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window8_256.pth
+      Code: https://github.com/microsoft/Swin-Transformer
+  - Name: swinv2-tiny-w16_3rdparty_in1k-256px
+    Metadata:
+      FLOPs: 4400000000
+      Parameters: 28350000
+    In Collection: Swin-Transformer V2
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 82.81
+          Top 5 Accuracy: 96.23
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-tiny-w16_3rdparty_in1k-256px_20220803-9651cdd7.pth
+    Config: configs/swin_transformer_v2/swinv2-tiny-w16_16xb64_in1k-256px.py
+    Converted From:
+      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window16_256.pth
+      Code: https://github.com/microsoft/Swin-Transformer
+  - Name: swinv2-small-w8_3rdparty_in1k-256px
+    Metadata:
+      FLOPs: 8450000000
+      Parameters: 49730000
+    In Collection: Swin-Transformer V2
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1
Accuracy: 83.74 + Top 5 Accuracy: 96.6 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-small-w8_3rdparty_in1k-256px_20220803-b01a4332.pth + Config: configs/swin_transformer_v2/swinv2-small-w8_16xb64_in1k-256px.py + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window8_256.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-small-w16_3rdparty_in1k-256px + Metadata: + FLOPs: 8570000000 + Parameters: 49730000 + In Collection: Swin-Transformer V2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.13 + Top 5 Accuracy: 96.83 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-small-w16_3rdparty_in1k-256px_20220803-b707d206.pth + Config: configs/swin_transformer_v2/swinv2-small-w16_16xb64_in1k-256px.py + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window16_256.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-base-w8_3rdparty_in1k-256px + Metadata: + FLOPs: 14990000000 + Parameters: 87920000 + In Collection: Swin-Transformer V2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.2 + Top 5 Accuracy: 96.86 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w8_3rdparty_in1k-256px_20220803-8ff28f2b.pth + Config: configs/swin_transformer_v2/swinv2-base-w8_16xb64_in1k-256px.py + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window8_256.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-base-w16_3rdparty_in1k-256px + Metadata: + FLOPs: 15140000000 + Parameters: 87920000 + In Collection: Swin-Transformer V2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.6 + Top 5 Accuracy: 97.05 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w16_3rdparty_in1k-256px_20220803-5a1886b7.pth + Config: configs/swin_transformer_v2/swinv2-base-w16_16xb64_in1k-256px.py + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window16_256.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-base-w16_in21k-pre_3rdparty_in1k-256px + Metadata: + Training Data: ImageNet-21k + FLOPs: 15140000000 + Parameters: 87920000 + In Collection: Swin-Transformer V2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.17 + Top 5 Accuracy: 97.88 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w16_in21k-pre_3rdparty_in1k-256px_20220803-8d7aa8ad.pth + Config: configs/swin_transformer_v2/swinv2-base-w16_in21k-pre_16xb64_in1k-256px.py + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to16_192to256_22kto1k_ft.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-base-w24_in21k-pre_3rdparty_in1k-384px + Metadata: + Training Data: ImageNet-21k + FLOPs: 34070000000 + Parameters: 87920000 + In Collection: Swin-Transformer V2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 87.14 + Top 5 Accuracy: 98.23 + Task: Image Classification + Weights: 
https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w24_in21k-pre_3rdparty_in1k-384px_20220803-44eb70f8.pth + Config: configs/swin_transformer_v2/swinv2-base-w24_in21k-pre_16xb64_in1k-384px.py + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to24_192to384_22kto1k_ft.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-large-w16_in21k-pre_3rdparty_in1k-256px + Metadata: + Training Data: ImageNet-21k + FLOPs: 33860000000 + Parameters: 196750000 + In Collection: Swin-Transformer V2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.93 + Top 5 Accuracy: 98.06 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-large-w16_in21k-pre_3rdparty_in1k-256px_20220803-c40cbed7.pth + Config: configs/swin_transformer_v2/swinv2-large-w16_in21k-pre_16xb64_in1k-256px.py + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to16_192to256_22kto1k_ft.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-large-w24_in21k-pre_3rdparty_in1k-384px + Metadata: + Training Data: ImageNet-21k + FLOPs: 76200000000 + Parameters: 196750000 + In Collection: Swin-Transformer V2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 87.59 + Top 5 Accuracy: 98.27 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-large-w24_in21k-pre_3rdparty_in1k-384px_20220803-3b36c165.pth + Config: configs/swin_transformer_v2/swinv2-large-w24_in21k-pre_16xb64_in1k-384px.py + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to24_192to384_22kto1k_ft.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-base-w12_3rdparty_in21k-192px + Metadata: + Training Data: ImageNet-21k + FLOPs: 8510000000 + Parameters: 87920000 + In Collection: Swin-Transformer V2 + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/pretrain/swinv2-base-w12_3rdparty_in21k-192px_20220803-f7dc9763.pth + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12_192_22k.pth + Code: https://github.com/microsoft/Swin-Transformer + - Name: swinv2-large-w12_3rdparty_in21k-192px + Metadata: + Training Data: ImageNet-21k + FLOPs: 19040000000 + Parameters: 196740000 + In Collection: Swin-Transformer V2 + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/pretrain/swinv2-large-w12_3rdparty_in21k-192px_20220803-d9073fee.pth + Converted From: + Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12_192_22k.pth + Code: https://github.com/microsoft/Swin-Transformer diff --git a/configs/swin_transformer_v2/swinv2-base-w16_16xb64_in1k-256px.py b/configs/swin_transformer_v2/swinv2-base-w16_16xb64_in1k-256px.py new file mode 100644 index 00000000..5f375ee1 --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-base-w16_16xb64_in1k-256px.py @@ -0,0 +1,8 @@ +_base_ = [ + '../_base_/models/swin_transformer_v2/base_256.py', + '../_base_/datasets/imagenet_bs64_swin_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +model = dict(backbone=dict(window_size=[16, 16, 16, 8])) diff --git 
a/configs/swin_transformer_v2/swinv2-base-w16_in21k-pre_16xb64_in1k-256px.py b/configs/swin_transformer_v2/swinv2-base-w16_in21k-pre_16xb64_in1k-256px.py new file mode 100644 index 00000000..0725f9e7 --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-base-w16_in21k-pre_16xb64_in1k-256px.py @@ -0,0 +1,13 @@ +_base_ = [ + '../_base_/models/swin_transformer_v2/base_256.py', + '../_base_/datasets/imagenet_bs64_swin_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +model = dict( + type='ImageClassifier', + backbone=dict( + window_size=[16, 16, 16, 8], + drop_path_rate=0.2, + pretrained_window_sizes=[12, 12, 12, 6])) diff --git a/configs/swin_transformer_v2/swinv2-base-w24_in21k-pre_16xb64_in1k-384px.py b/configs/swin_transformer_v2/swinv2-base-w24_in21k-pre_16xb64_in1k-384px.py new file mode 100644 index 00000000..3dd4e5fd --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-base-w24_in21k-pre_16xb64_in1k-384px.py @@ -0,0 +1,14 @@ +_base_ = [ + '../_base_/models/swin_transformer_v2/base_384.py', + '../_base_/datasets/imagenet_bs64_swin_384.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +model = dict( + type='ImageClassifier', + backbone=dict( + img_size=384, + window_size=[24, 24, 24, 12], + drop_path_rate=0.2, + pretrained_window_sizes=[12, 12, 12, 6])) diff --git a/configs/swin_transformer_v2/swinv2-base-w8_16xb64_in1k-256px.py b/configs/swin_transformer_v2/swinv2-base-w8_16xb64_in1k-256px.py new file mode 100644 index 00000000..23fc4070 --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-base-w8_16xb64_in1k-256px.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/swin_transformer_v2/base_256.py', + '../_base_/datasets/imagenet_bs64_swin_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] diff --git a/configs/swin_transformer_v2/swinv2-large-w16_in21k-pre_16xb64_in1k-256px.py b/configs/swin_transformer_v2/swinv2-large-w16_in21k-pre_16xb64_in1k-256px.py new file mode 100644 index 00000000..62a2a29b --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-large-w16_in21k-pre_16xb64_in1k-256px.py @@ -0,0 +1,13 @@ +# Only for evaluation +_base_ = [ + '../_base_/models/swin_transformer_v2/large_256.py', + '../_base_/datasets/imagenet_bs64_swin_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +model = dict( + type='ImageClassifier', + backbone=dict( + window_size=[16, 16, 16, 8], pretrained_window_sizes=[12, 12, 12, 6]), +) diff --git a/configs/swin_transformer_v2/swinv2-large-w24_in21k-pre_16xb64_in1k-384px.py b/configs/swin_transformer_v2/swinv2-large-w24_in21k-pre_16xb64_in1k-384px.py new file mode 100644 index 00000000..d97d9b2b --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-large-w24_in21k-pre_16xb64_in1k-384px.py @@ -0,0 +1,15 @@ +# Only for evaluation +_base_ = [ + '../_base_/models/swin_transformer_v2/large_384.py', + '../_base_/datasets/imagenet_bs64_swin_384.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +model = dict( + type='ImageClassifier', + backbone=dict( + img_size=384, + window_size=[24, 24, 24, 12], + pretrained_window_sizes=[12, 12, 12, 6]), +) diff --git a/configs/swin_transformer_v2/swinv2-small-w16_16xb64_in1k-256px.py b/configs/swin_transformer_v2/swinv2-small-w16_16xb64_in1k-256px.py new file mode 100644 index 00000000..f87265dd --- /dev/null +++ 
b/configs/swin_transformer_v2/swinv2-small-w16_16xb64_in1k-256px.py @@ -0,0 +1,8 @@ +_base_ = [ + '../_base_/models/swin_transformer_v2/small_256.py', + '../_base_/datasets/imagenet_bs64_swin_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +model = dict(backbone=dict(window_size=[16, 16, 16, 8])) diff --git a/configs/swin_transformer_v2/swinv2-small-w8_16xb64_in1k-256px.py b/configs/swin_transformer_v2/swinv2-small-w8_16xb64_in1k-256px.py new file mode 100644 index 00000000..f1001f1b --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-small-w8_16xb64_in1k-256px.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/swin_transformer_v2/small_256.py', + '../_base_/datasets/imagenet_bs64_swin_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] diff --git a/configs/swin_transformer_v2/swinv2-tiny-w16_16xb64_in1k-256px.py b/configs/swin_transformer_v2/swinv2-tiny-w16_16xb64_in1k-256px.py new file mode 100644 index 00000000..7e1f290f --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-tiny-w16_16xb64_in1k-256px.py @@ -0,0 +1,8 @@ +_base_ = [ + '../_base_/models/swin_transformer_v2/tiny_256.py', + '../_base_/datasets/imagenet_bs64_swin_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +model = dict(backbone=dict(window_size=[16, 16, 16, 8])) diff --git a/configs/swin_transformer_v2/swinv2-tiny-w8_16xb64_in1k-256px.py b/configs/swin_transformer_v2/swinv2-tiny-w8_16xb64_in1k-256px.py new file mode 100644 index 00000000..2cdc9a25 --- /dev/null +++ b/configs/swin_transformer_v2/swinv2-tiny-w8_16xb64_in1k-256px.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/swin_transformer_v2/tiny_256.py', + '../_base_/datasets/imagenet_bs64_swin_256.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] diff --git a/configs/t2t_vit/README.md b/configs/t2t_vit/README.md index 1e3a0827..ad108dc0 100644 --- a/configs/t2t_vit/README.md +++ b/configs/t2t_vit/README.md @@ -16,11 +16,11 @@ Transformers, which are popular for language modeling, have been explored for so ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------------------: | :----------------------------------------------------------------------------: | -| T2T-ViT_t-14 | 21.47 | 4.34 | 81.83 | 95.84 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.log.json) | -| T2T-ViT_t-19 | 39.08 | 7.80 | 82.63 | 96.18 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.log.json) | -| T2T-ViT_t-24 | 64.00 | 12.69 | 82.71 | 96.09 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.log.json) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------: | :-----------------------------------------------------------------------------------------------------------------: |
+| T2T-ViT_t-14 | 21.47 | 4.34 | 81.83 | 95.84 | [config](./t2t-vit-t-14_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.log.json) |
+| T2T-ViT_t-19 | 39.08 | 7.80 | 82.63 | 96.18 | [config](./t2t-vit-t-19_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.log.json) |
+| T2T-ViT_t-24 | 64.00 | 12.69 | 82.71 | 96.09 | [config](./t2t-vit-t-24_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.log.json) |

*Consistent with the [official repo](https://github.com/yitu-opensource/T2T-ViT), we adopt the best checkpoints during training.*

diff --git a/configs/tnt/README.md b/configs/tnt/README.md
index 948eef74..c90ec9db 100644
--- a/configs/tnt/README.md
+++ b/configs/tnt/README.md
@@ -16,9 +16,9 @@ Transformer is a new kind of neural architecture which encodes the input data as

### ImageNet-1k

-| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :---------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------------------------------: | :----------------------------------------------------------------------------: |
-| TNT-small\* | 23.76 | 3.36 | 81.52 | 95.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/tnt/tnt-s-p16_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth) |
+| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :---------: | :-------: | :------: | :-------: | :-------: | :----------------------------------: | :---------------------------------------------------------------------------------------------------------------: |
+| TNT-small\* | 23.76 | 3.36 | 81.52 | 95.73 | [config](./tnt-s-p16_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth) |

*Models with * are converted from [timm](https://github.com/rwightman/pytorch-image-models/). The config files of these models are only for validation.
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/twins/README.md b/configs/twins/README.md index 87e72941..7533d9f0 100644 --- a/configs/twins/README.md +++ b/configs/twins/README.md @@ -16,14 +16,14 @@ Very recently, a variety of vision transformer architectures for dense predictio ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------------------: | :---------------------------------------------------------------------------: | -| PCPVT-small\* | 24.11 | 3.67 | 81.14 | 95.69 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-small_3rdparty_8xb128_in1k_20220126-ef23c132.pth) | -| PCPVT-base\* | 43.83 | 6.45 | 82.66 | 96.26 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-base_3rdparty_8xb128_in1k_20220126-f8c4b0d5.pth) | -| PCPVT-large\* | 60.99 | 9.51 | 83.09 | 96.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-large_3rdparty_16xb64_in1k_20220126-c1ef8d80.pth) | -| SVT-small\* | 24.06 | 2.82 | 81.77 | 95.57 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-small_3rdparty_8xb128_in1k_20220126-8fe5205b.pth) | -| SVT-base\* | 56.07 | 8.35 | 83.13 | 96.29 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-base_3rdparty_8xb128_in1k_20220126-e31cc8e9.pth) | -| SVT-large\* | 99.27 | 14.82 | 83.60 | 96.50 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-large_3rdparty_16xb64_in1k_20220126-4817645f.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------: | :----------------------------------------------------------------------------------------------------------: | +| PCPVT-small\* | 24.11 | 3.67 | 81.14 | 95.69 | [config](./twins-pcpvt-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-small_3rdparty_8xb128_in1k_20220126-ef23c132.pth) | +| PCPVT-base\* | 43.83 | 6.45 | 82.66 | 96.26 | [config](./twins-pcpvt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-base_3rdparty_8xb128_in1k_20220126-f8c4b0d5.pth) | +| PCPVT-large\* | 60.99 | 9.51 | 83.09 | 96.59 | [config](./twins-pcpvt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-large_3rdparty_16xb64_in1k_20220126-c1ef8d80.pth) | +| SVT-small\* | 24.06 | 2.82 | 81.77 | 95.57 | [config](./twins-svt-small_8xb128_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-small_3rdparty_8xb128_in1k_20220126-8fe5205b.pth) |
+| SVT-base\* | 56.07 | 8.35 | 83.13 | 96.29 | [config](./twins-svt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-base_3rdparty_8xb128_in1k_20220126-e31cc8e9.pth) |
+| SVT-large\* | 99.27 | 14.82 | 83.60 | 96.50 | [config](./twins-svt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-large_3rdparty_16xb64_in1k_20220126-4817645f.pth) |

*Models with * are converted from [the official repo](https://github.com/Meituan-AutoML/Twins). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results. The validation accuracy differs slightly from the official paper because of the PyTorch version: this result was obtained with PyTorch 1.9, while the official result was obtained with PyTorch 1.7.*

diff --git a/configs/van/README.md b/configs/van/README.md
index e39dfc44..0e121454 100644
--- a/configs/van/README.md
+++ b/configs/van/README.md
@@ -16,12 +16,12 @@ While originally designed for natural language processing (NLP) tasks, the self-

### ImageNet-1k

-| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
-| :-----: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------: | :-------------------------------------------------------------------: |
-| VAN-T\* | From scratch | 224x224 | 4.11 | 0.88 | 75.41 | 93.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-tiny_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth) |
-| VAN-S\* | From scratch | 224x224 | 13.86 | 2.52 | 81.01 | 95.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth) |
-| VAN-B\* | From scratch | 224x224 | 26.58 | 5.03 | 82.80 | 96.21 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth) |
-| VAN-L\* | From scratch | 224x224 | 44.77 | 8.99 | 83.86 | 96.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth) |
+| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
+| :-----: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :----------------------------------: | :--------------------------------------------------------------------------------------------------: |
+| VAN-T\* | From scratch | 224x224 | 4.11 | 0.88 | 75.41 | 93.02 | [config](./van-tiny_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth) |
+| VAN-S\* | From scratch | 224x224 | 13.86 | 2.52 | 81.01 | 95.63 | [config](./van-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth) |
+| VAN-B\* | From scratch | 224x224
| 26.58 | 5.03 | 82.80 | 96.21 | [config](./van-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth) | +| VAN-L\* | From scratch | 224x224 | 44.77 | 8.99 | 83.86 | 96.73 | [config](./van-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth) | \*Models with * are converted from [the official repo](https://github.com/Visual-Attention-Network/VAN-Classification). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results. diff --git a/configs/van/metafile.yml b/configs/van/metafile.yml index 13e28c16..24a3fc9f 100644 --- a/configs/van/metafile.yml +++ b/configs/van/metafile.yml @@ -18,8 +18,8 @@ Collections: Models: - Name: van-tiny_8xb128_in1k Metadata: - FLOPs: 4110000 # 4.11M - Parameters: 880000000 # 0.88G + Parameters: 4110000 # 4.11M + FLOPs: 880000000 # 0.88G In Collection: Visual-Attention-Network Results: - Dataset: ImageNet-1k @@ -31,8 +31,8 @@ Models: Config: configs/van/van-tiny_8xb128_in1k.py - Name: van-small_8xb128_in1k Metadata: - FLOPs: 13860000 # 13.86M - Parameters: 2520000000 # 2.52G + Parameters: 13860000 # 13.86M + FLOPs: 2520000000 # 2.52G In Collection: Visual-Attention-Network Results: - Dataset: ImageNet-1k @@ -44,8 +44,8 @@ Models: Config: configs/van/van-small_8xb128_in1k.py - Name: van-base_8xb128_in1k Metadata: - FLOPs: 26580000 # 26.58M - Parameters: 5030000000 # 5.03G + Parameters: 26580000 # 26.58M + FLOPs: 5030000000 # 5.03G In Collection: Visual-Attention-Network Results: - Dataset: ImageNet-1k @@ -57,8 +57,8 @@ Models: Config: configs/van/van-base_8xb128_in1k.py - Name: van-large_8xb128_in1k Metadata: - FLOPs: 44770000 # 44.77 M - Parameters: 8990000000 # 8.99G + Parameters: 44770000 # 44.77 M + FLOPs: 8990000000 # 8.99G In Collection: Visual-Attention-Network Results: - Dataset: ImageNet-1k diff --git a/configs/vgg/README.md b/configs/vgg/README.md index 454489ff..0eccbf72 100644 --- a/configs/vgg/README.md +++ b/configs/vgg/README.md @@ -16,16 +16,16 @@ In this work we investigate the effect of the convolutional network depth on its ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------------------: | :-----------------------------------------------------------------------------: | -| VGG-11 | 132.86 | 7.63 | 68.75 | 88.87 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg11_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.log.json) | -| VGG-13 | 133.05 | 11.34 | 70.02 | 89.46 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg13_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.log.json) | -| VGG-16 | 138.36 | 15.5 | 71.62 | 90.49 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg16_8xb32_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.log.json) | -| VGG-19 | 143.67 | 19.67 | 72.41 | 90.80 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg19_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.log.json) | -| VGG-11-BN | 132.87 | 7.64 | 70.67 | 90.16 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg11bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.log.json) | -| VGG-13-BN | 133.05 | 11.36 | 72.12 | 90.66 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg13bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.log.json) | -| VGG-16-BN | 138.37 | 15.53 | 73.74 | 91.66 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg16_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.log.json) | -| VGG-19-BN | 143.68 | 19.7 | 74.68 | 92.27 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg19bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.log.json) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-------: | :-------: | :------: | :-------: | :-------: | :-------------------------------: | :-------------------------------------------------------------------------------------------------------------------------: | +| VGG-11 | 132.86 | 7.63 | 68.75 | 88.87 | [config](./vgg11_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.log.json) | +| VGG-13 | 133.05 | 11.34 | 70.02 | 89.46 | [config](./vgg13_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.log.json) | +| VGG-16 | 138.36 | 15.5 | 71.62 | 90.49 | [config](./vgg16_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.log.json) | +| VGG-19 | 143.67 | 19.67 | 72.41 | 90.80 | [config](./vgg19_8xb32_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.log.json) | +| VGG-11-BN | 132.87 | 7.64 | 70.67 | 90.16 | [config](./vgg11bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.log.json) | +| VGG-13-BN | 133.05 | 11.36 | 72.12 | 90.66 | [config](./vgg13bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.log.json) | +| VGG-16-BN | 138.37 | 15.53 | 73.74 | 91.66 | [config](./vgg16_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.log.json) | +| VGG-19-BN | 143.68 | 19.7 | 74.68 | 92.27 | [config](./vgg19bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.log.json) | ## Citation diff --git a/configs/vision_transformer/README.md b/configs/vision_transformer/README.md index 7dcd4036..45e21a36 100644 --- a/configs/vision_transformer/README.md +++ b/configs/vision_transformer/README.md @@ -34,13 +34,13 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don' ### ImageNet-1k -| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-----------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------------------: | :----------------------------------------------------------------: | -| ViT-B16 | From scratch | 224x224 | 86.86 | 33.03 | 82.37 | 96.15 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.log) | -| ViT-B16\* | ImageNet-21k | 384x384 | 86.86 | 33.03 | 85.43 | 97.77 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth) | -| ViT-B16 (IPU) | ImageNet-21k | 224x224 | 86.86 | 33.03 | 81.22 | 95.56 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-4xb544-ipu_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k_20220603-c215811a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k.log) | -| ViT-B32\* | ImageNet-21k | 384x384 | 88.30 | 8.56 | 84.01 | 97.08 | 
[config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth) | -| ViT-L16\* | ImageNet-21k | 384x384 | 304.72 | 116.68 | 85.63 | 97.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth) | +| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-----------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------: | :-----------------------------------------------------------------------------: | +| ViT-B16 | From scratch | 224x224 | 86.86 | 33.03 | 82.37 | 96.15 | [config](./vit-base-p16_pt-32xb128-mae_in1k-224.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.log) | +| ViT-B16\* | ImageNet-21k | 384x384 | 86.86 | 33.03 | 85.43 | 97.77 | [config](./vit-base-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth) | +| ViT-B16 (IPU) | ImageNet-21k | 224x224 | 86.86 | 33.03 | 81.22 | 95.56 | [config](./vit-base-p16_ft-4xb544-ipu_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k_20220603-c215811a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k.log) | +| ViT-B32\* | ImageNet-21k | 384x384 | 88.30 | 8.56 | 84.01 | 97.08 | [config](./vit-base-p32_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth) | +| ViT-L16\* | ImageNet-21k | 384x384 | 304.72 | 116.68 | 85.63 | 97.63 | [config](./vit-large-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth) | *Models with * are converted from the [official repo](https://github.com/google-research/vision_transformer#available-vit-models). The config files of these models are only for validation. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/wrn/README.md b/configs/wrn/README.md index b036caaf..2ac1f74a 100644 --- a/configs/wrn/README.md +++ b/configs/wrn/README.md @@ -16,11 +16,11 @@ Deep residual networks were shown to be able to scale up to thousands of layers ### ImageNet-1k -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-------------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------------------: | :--------------------------------------------------------------------------: | -| WRN-50\* | 68.88 | 11.44 | 78.48 | 94.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/wrn/wide-resnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty_8xb32_in1k_20220304-66678344.pth) | -| WRN-101\* | 126.89 | 22.81 | 78.84 | 94.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/wrn/wide-resnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet101_3rdparty_8xb32_in1k_20220304-8d5f9d61.pth) | -| WRN-50 (timm)\* | 68.88 | 11.44 | 81.45 | 95.53 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/wrn/wide-resnet50_timm_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty-timm_8xb32_in1k_20220304-83ae4399.pth) | +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-------------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------: | :--------------------------------------------------------------------------------------------------------: | +| WRN-50\* | 68.88 | 11.44 | 78.48 | 94.08 | [config](./wide-resnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty_8xb32_in1k_20220304-66678344.pth) | +| WRN-101\* | 126.89 | 22.81 | 78.84 | 94.28 | [config](./wide-resnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet101_3rdparty_8xb32_in1k_20220304-8d5f9d61.pth) | +| WRN-50 (timm)\* | 68.88 | 11.44 | 81.45 | 95.53 | [config](./wide-resnet50_timm_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty-timm_8xb32_in1k_20220304-83ae4399.pth) | *Models with * are converted from the [TorchVision](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) and [TIMM](https://github.com/rwightman/pytorch-image-models/blob/master). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*

diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile
index e74a5ffc..dbfba3ad 100644
--- a/docker/serve/Dockerfile
+++ b/docker/serve/Dockerfile
@@ -4,7 +4,7 @@ ARG CUDNN="7"
 FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

 ARG MMCV="2.0.0rc1"
-ARG MMCLS="1.0.0rc0"
+ARG MMCLS="1.0.0rc1"

 ENV PYTHONUNBUFFERED TRUE
diff --git a/docs/en/_static/image/tools/visualization/lr_schedule1.png b/docs/en/_static/image/tools/visualization/lr_schedule1.png
deleted file mode 100644
index 31fca35b..00000000
Binary files a/docs/en/_static/image/tools/visualization/lr_schedule1.png and /dev/null differ
diff --git a/docs/en/_static/image/tools/visualization/lr_schedule2.png b/docs/en/_static/image/tools/visualization/lr_schedule2.png
deleted file mode 100644
index 8c6231db..00000000
Binary files a/docs/en/_static/image/tools/visualization/lr_schedule2.png and /dev/null differ
diff --git a/docs/en/api/datasets.rst b/docs/en/api/datasets.rst
index 0a6cfa90..f72dca0c 100644
--- a/docs/en/api/datasets.rst
+++ b/docs/en/api/datasets.rst
@@ -59,16 +59,15 @@ Base classes
 Dataset Wrappers
 ----------------

-ConcatDataset
-
-TODO: add MMEngine Link
-
-RepeatDataset
-
-TODO: add MMEngine Link
-
-ClassBalancedDataset
-
-TODO: add MMEngine Link
-
 .. autoclass:: KFoldDataset
+
+The dataset wrappers in MMEngine can be used directly in MMClassification.
+
+.. list-table::
+
+   * - :class:`~mmengine.dataset.ConcatDataset`
+     - A wrapper that concatenates multiple datasets.
+   * - :class:`~mmengine.dataset.RepeatDataset`
+     - A wrapper that repeats a dataset.
+   * - :class:`~mmengine.dataset.ClassBalancedDataset`
+     - A wrapper that re-samples a dataset in a class-balanced way.
diff --git a/docs/en/api/models.rst b/docs/en/api/models.rst
index a01ed2dc..d188805b 100644
--- a/docs/en/api/models.rst
+++ b/docs/en/api/models.rst
@@ -65,12 +65,17 @@ Backbones
    ConvNeXt
    DenseNet
    DistilledVisionTransformer
+   EfficientFormer
    EfficientNet
+   EdgeNeXt
    HRNet
+   InceptionV3
    LeNet5
+   MViT
    MlpMixer
    MobileNetV2
    MobileNetV3
+   MobileOne
    PCPVT
    PoolFormer
    RegNet
@@ -89,6 +94,7 @@ Backbones
    ShuffleNetV1
    ShuffleNetV2
    SwinTransformer
+   SwinTransformerV2
    T2T_ViT
    TIMMBackbone
    TNT
@@ -124,6 +130,7 @@ Heads
    LinearClsHead
    StackedLinearClsHead
    VisionTransformerClsHead
+   EfficientFormerClsHead
    DeiTClsHead
    ConformerHead
    MultiLabelClsHead
@@ -164,12 +171,15 @@ Common Components

    InvertedResidual
    SELayer
+   WindowMSA
+   WindowMSAV2
    ShiftWindowMSA
    MultiheadAttention
    ConditionalPositionEncoding
    PatchEmbed
    PatchMerging
    HybridEmbed
+   LayerScale

 .. _helpers:
diff --git a/docs/en/index.rst b/docs/en/index.rst
index 8109ec7f..5eaf65c4 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -43,7 +43,6 @@ You can switch between Chinese and English documentation in the lower-left corne
    :glob:

    modelzoo_statistics.md
-   model_zoo.md
    papers/*

 .. toctree::
diff --git a/docs/en/migration.md b/docs/en/migration.md
index 5988ae3c..ff585d4c 100644
--- a/docs/en/migration.md
+++ b/docs/en/migration.md
@@ -298,7 +298,7 @@ Changes in **`lr_config`**:
 functionality.

 The new schedulers combination mechanism is very flexible, and you can use it to design many kinds of learning
-rate / momentum curves. See [the tutorial](TODO) for more details.
+rate / momentum curves. See {external+mmengine:doc}`the tutorial ` for more details.

@@ -370,7 +370,7 @@ test_cfg = dict()  # Use the default test loop.
In fact, in OpenMMLab 2.0, we introduced `Loop` to control the behaviors in training, validation and test. And -the functionalities of `Runner` are also changed. You can find more details in [the MMEngine tutorials](TODO). +the functionalities of `Runner` are also changed. You can find more details in {external+mmengine:doc}`the MMEngine tutorials `. ### Runtime settings @@ -468,7 +468,7 @@ Changes in **`workflow`**: `workflow` related functionalities are removed. New field **`visualizer`**: The visualizer is a new design in the OpenMMLab 2.0 architecture. We use a visualizer instance in the runner to handle results & log visualization and save them to different backends. -See the [MMEngine tutorial](TODO) for more details. +See the {external+mmengine:doc}`MMEngine tutorial ` for more details. ```python visualizer = dict( @@ -481,7 +481,7 @@ visualizer = dict( ) ``` -New field **`default_scope`**: The start point to search module for all registries. The `default_scope` in MMClassification is `mmcls`. See [the registry tutorial](TODO) for more details. +New field **`default_scope`**: The starting point for searching modules in all registries. The `default_scope` in MMClassification is `mmcls`. See {external+mmengine:doc}`the registry tutorial ` for more details. ## Packages diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md deleted file mode 100644 index 83e6ec5d..00000000 --- a/docs/en/model_zoo.md +++ /dev/null @@ -1,155 +0,0 @@ -# Model Zoo - -## ImageNet - -ImageNet has multiple versions, but the most commonly used one is [ILSVRC 2012](http://www.image-net.org/challenges/LSVRC/2012/). -The ResNet family models below are trained by standard data augmentations, i.e., RandomResizedCrop, RandomHorizontalFlip and Normalize. - -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :--------------------------------: | :-------------------------------: | :-----------------------------: | :-------: | :-------: | :---------------------------------------: | :-----------------------------------------: | -| VGG-11 | 132.86 | 7.63 | 68.75 | 88.87 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg11_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.log.json) | -| VGG-13 | 133.05 | 11.34 | 70.02 | 89.46 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg13_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.log.json) | -| VGG-16 | 138.36 | 15.5 | 71.62 | 90.49 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg16_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.log.json) | -| VGG-19 | 143.67 | 19.67 | 72.41 | 90.80 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg19_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.log.json) | -|
VGG-11-BN | 132.87 | 7.64 | 70.75 | 90.12 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg11bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.log.json) | -| VGG-13-BN | 133.05 | 11.36 | 72.15 | 90.71 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg13bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.log.json) | -| VGG-16-BN | 138.37 | 15.53 | 73.72 | 91.68 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg16_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.log.json) | -| VGG-19-BN | 143.68 | 19.7 | 74.70 | 92.24 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg19bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.log.json) | -| RepVGG-A0\* | 9.11(train) \| 8.31 (deploy) | 1.52 (train) \| 1.36 (deploy) | 72.41 | 90.50 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-A0_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth) | -| RepVGG-A1\* | 14.09 (train) \| 12.79 (deploy) | 2.64 (train) \| 2.37 (deploy) | 74.47 | 91.85 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-A1_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth) | -| RepVGG-A2\* | 28.21 (train) \| 25.5 (deploy) | 5.7 (train) \| 5.12 (deploy) | 76.48 | 93.01 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-A2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth) | -| RepVGG-B0\* | 15.82 (train) \| 14.34 (deploy) | 3.42 (train) \| 3.06 (deploy) | 75.14 | 92.42 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B0_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth) 
| -| RepVGG-B1\* | 57.42 (train) \| 51.83 (deploy) | 13.16 (train) \| 11.82 (deploy) | 78.37 | 94.11 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B1_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth) | -| RepVGG-B1g2\* | 45.78 (train) \| 41.36 (deploy) | 9.82 (train) \| 8.82 (deploy) | 77.79 | 93.88 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth) | -| RepVGG-B1g4\* | 39.97 (train) \| 36.13 (deploy) | 8.15 (train) \| 7.32 (deploy) | 77.58 | 93.84 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth) | -| RepVGG-B2\* | 89.02 (train) \| 80.32 (deploy) | 20.46 (train) \| 18.39 (deploy) | 78.78 | 94.42 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth) | -| RepVGG-B2g4\* | 61.76 (train) \| 55.78 (deploy) | 12.63 (train) \| 11.34 (deploy) | 79.38 | 94.68 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B2g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth) | -| RepVGG-B3\* | 123.09 (train) \| 110.96 (deploy) | 29.17 (train) \| 26.22 (deploy) | 80.52 | 95.26 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B3_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth) | -| RepVGG-B3g4\* | 83.83 (train) \| 75.63 (deploy) | 17.9 (train) \| 16.08 (deploy) | 80.22 | 95.10 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \| [config 
(deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-B3g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth) | -| RepVGG-D2se\* | 133.33 (train) \| 120.39 (deploy) | 36.56 (train) \| 32.85 (deploy) | 81.81 | 95.94 | [config (train)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \| [config (deploy)](https://github.com/open-mmlab/mmclassification/blob/master/configs/repvgg/deploy/repvgg-D2se_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth) | -| ResNet-18 | 11.69 | 1.82 | 70.07 | 89.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet18_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_batch256_imagenet_20200708-34ab8f90.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_batch256_imagenet_20200708-34ab8f90.log.json) | -| ResNet-34 | 21.8 | 3.68 | 73.85 | 91.53 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet34_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_batch256_imagenet_20200708-32ffb4f7.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_batch256_imagenet_20200708-32ffb4f7.log.json) | -| ResNet-50 (rsb-a1) | 25.56 | 4.12 | 80.12 | 94.78 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.log.json) | -| ResNet-101 | 44.55 | 7.85 | 78.18 | 94.03 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.log.json) | -| ResNet-152 | 60.19 | 11.58 | 78.63 | 94.16 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet152_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.log.json) | -| Res2Net-50-14w-8s\* | 25.06 | 4.22 | 78.14 | 93.85 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/res2net/res2net50-w14-s8_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w14-s8_3rdparty_8xb32_in1k_20210927-bc967bf1.pth) | -| Res2Net-50-26w-8s\* | 48.40 | 8.39 | 79.20 | 94.36 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/res2net/res2net50-w26-s8_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w26-s8_3rdparty_8xb32_in1k_20210927-f547a94b.pth) | -| Res2Net-101-26w-4s\* | 45.21 | 8.12 | 79.19 | 
94.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/res2net/res2net101-w26-s4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net101-w26-s4_3rdparty_8xb32_in1k_20210927-870b6c36.pth) | -| ResNeSt-50\* | 27.48 | 5.41 | 81.13 | 95.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnest/resnest50_32xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth) | -| ResNeSt-101\* | 48.28 | 10.27 | 82.32 | 96.24 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnest/resnest101_32xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth) | -| ResNeSt-200\* | 70.2 | 17.53 | 82.41 | 96.22 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnest/resnest200_64xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth) | -| ResNeSt-269\* | 110.93 | 22.58 | 82.70 | 96.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnest/resnest269_64xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth) | -| ResNetV1D-50 | 25.58 | 4.36 | 77.54 | 93.57 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.log.json) | -| ResNetV1D-101 | 44.57 | 8.09 | 78.93 | 94.48 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.log.json) | -| ResNetV1D-152 | 60.21 | 11.82 | 79.41 | 94.7 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnetv1d152_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.log.json) | -| ResNeXt-32x4d-50 | 25.03 | 4.27 | 77.90 | 93.66 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext50-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.log.json) | -| ResNeXt-32x4d-101 | 44.18 | 8.03 | 78.71 | 94.12 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext101-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.log.json) | -| ResNeXt-32x8d-101 | 88.79 | 16.5 | 79.23 | 94.58 | 
[config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext101-32x8d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.log.json) | -| ResNeXt-32x4d-152 | 59.95 | 11.8 | 78.93 | 94.41 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext152-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.log.json) | -| SE-ResNet-50 | 28.09 | 4.13 | 77.74 | 93.84 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/seresnet/seresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200708-657b3c36.log.json) | -| SE-ResNet-101 | 49.33 | 7.86 | 78.26 | 94.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/seresnet/seresnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200708-038a4d04.log.json) | -| RegNetX-400MF | 5.16 | 0.41 | 72.56 | 90.78 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-400mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211208_143316.log.json) | -| RegNetX-800MF | 7.26 | 0.81 | 74.76 | 92.32 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-800mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211213-222b0f11.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211207_143037.log.json) | -| RegNetX-1.6GF | 9.19 | 1.63 | 76.84 | 93.31 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-1.6gf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211213-d1b89758.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211208_143018.log.json) | -| RegNetX-3.2GF | 15.3 | 3.21 | 78.09 | 94.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-3.2gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211213-1fdd82ae.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211208_142720.log.json) | -| RegNetX-4.0GF | 22.12 | 4.0 | 78.60 | 94.17 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-4.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211213-efed675c.pth) \| 
[log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211207_150431.log.json) | -| RegNetX-6.4GF | 26.21 | 6.51 | 79.38 | 94.65 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-6.4gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211215-5c6089da.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211213_172748.log.json) | -| RegNetX-8.0GF | 39.57 | 8.03 | 79.12 | 94.51 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-8.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211213-9a9fcc76.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211208_103250.log.json) | -| RegNetX-12GF | 46.11 | 12.15 | 79.67 | 95.03 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-12gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211213-5df8c2f8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211208_143713.log.json) | -| ShuffleNetV1 1.0x (group=3) | 1.87 | 0.146 | 68.13 | 87.81 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.log.json) | -| ShuffleNetV2 1.0x | 2.28 | 0.149 | 69.55 | 88.92 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200804-8860eec9.log.json) | -| MobileNet V2 | 3.5 | 0.319 | 71.86 | 90.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.log.json) | -| ViT-B/16\* | 86.86 | 33.03 | 85.43 | 97.77 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth) | -| ViT-B/32\* | 88.3 | 8.56 | 84.01 | 97.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth) | -| ViT-L/16\* | 304.72 | 116.68 | 85.63 | 97.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth) | -| Swin-Transformer tiny | 28.29 | 4.36 | 81.18 | 95.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-tiny_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925.log.json) | -| Swin-Transformer small | 49.61 | 8.52 | 83.02 | 96.29 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-small_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219-7f9d988b.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219.log.json) | -| Swin-Transformer base | 87.77 | 15.14 | 83.36 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin_base_224_b16x64_300e_imagenet.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_base_224_b16x64_300e_imagenet_20210616_190742-93230b0d.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_base_224_b16x64_300e_imagenet_20210616_190742.log.json) | -| Transformer in Transformer small\* | 23.76 | 3.36 | 81.52 | 95.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/tnt/tnt-s-p16_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth) | -| T2T-ViT_t-14 | 21.47 | 4.34 | 81.83 | 95.84 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.log.json) | -| T2T-ViT_t-19 | 39.08 | 7.80 | 82.63 | 96.18 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.log.json) | -| T2T-ViT_t-24 | 64.00 | 12.69 | 82.71 | 96.09 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.log.json) | -| Mixer-B/16\* | 59.88 | 12.61 | 76.68 | 92.25 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth) | -| Mixer-L/16\* | 208.2 | 44.57 | 72.34 | 88.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-large-p16_3rdparty_64xb64_in1k_20211124-5a2519d2.pth) | -| DeiT-tiny | 5.72 | 1.08 | 74.50 | 92.24 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-tiny_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny_pt-4xb256_in1k_20220218-13b382a0.log.json) | -| DeiT-tiny distilled\* | 5.72 | 1.08 | 74.51 | 91.90 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-tiny-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-tiny-distilled_3rdparty_pt-4xb256_in1k_20211216-c429839a.pth) | -| DeiT-small | 22.05 | 4.24 | 80.69 | 95.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-small_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-small_pt-4xb256_in1k_20220218-9425b9bb.log.json) | -| DeiT-small distilled\* | 22.05 | 4.24 | 81.17 | 95.40 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-small-distilled_pt-4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-small-distilled_3rdparty_pt-4xb256_in1k_20211216-4de1d725.pth) | -| DeiT-base | 86.57 | 16.86 | 81.76 | 95.81 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-base_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_pt-16xb64_in1k_20220216-db63c16c.log.json) | -| DeiT-base distilled\* | 86.57 | 16.86 | 83.33 | 96.49 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-base-distilled_pt-16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_pt-16xb64_in1k_20211216-42891296.pth) | -| DeiT-base 384px\* | 86.86 | 49.37 | 83.04 | 96.31 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-base_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base_3rdparty_ft-16xb32_in1k-384px_20211124-822d02f2.pth) | -| DeiT-base distilled 384px\* | 86.86 | 49.37 | 85.55 | 97.35 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/deit/deit-base-distilled_ft-16xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/deit/deit-base-distilled_3rdparty_ft-16xb32_in1k-384px_20211216-e48d6000.pth) | -| Conformer-tiny-p16\* | 23.52 | 4.90 | 81.31 | 95.60 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-tiny-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth) | -| Conformer-small-p32\* | 38.85 | 7.09 | 81.96 | 96.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-small-p32_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth) | -| Conformer-small-p16\* | 37.67 | 10.31 | 83.32 | 
96.46 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-small-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth) | -| Conformer-base-p16\* | 83.29 | 22.89 | 83.82 | 96.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) | -| PCPVT-small\* | 24.11 | 3.67 | 81.14 | 95.69 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-small_3rdparty_8xb128_in1k_20220126-ef23c132.pth) | -| PCPVT-base\* | 43.83 | 6.45 | 82.66 | 96.26 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-base_3rdparty_8xb128_in1k_20220126-f8c4b0d5.pth) | -| PCPVT-large\* | 60.99 | 9.51 | 83.09 | 96.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-large_3rdparty_16xb64_in1k_20220126-c1ef8d80.pth) | -| SVT-small\* | 24.06 | 2.82 | 81.77 | 95.57 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-small_3rdparty_8xb128_in1k_20220126-8fe5205b.pth) | -| SVT-base\* | 56.07 | 8.35 | 83.13 | 96.29 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-base_3rdparty_8xb128_in1k_20220126-e31cc8e9.pth) | -| SVT-large\* | 99.27 | 14.82 | 83.60 | 96.50 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-large_3rdparty_16xb64_in1k_20220126-4817645f.pth) | -| EfficientNet-B0\* | 5.29 | 0.02 | 76.74 | 93.17 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32_in1k_20220119-a7e2a0b1.pth) | -| EfficientNet-B0 (AA)\* | 5.29 | 0.02 | 77.26 | 93.41 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa_in1k_20220119-8d939117.pth) | -| EfficientNet-B0 (AA + AdvProp)\* | 5.29 | 0.02 | 77.53 | 93.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth) | -| EfficientNet-B1\* | 7.79 | 0.03 | 78.68 | 94.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32_in1k_20220119-002556d9.pth) | -| EfficientNet-B1 (AA)\* | 7.79 | 0.03 | 79.20 | 94.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa_in1k_20220119-619d8ae3.pth) | -| EfficientNet-B1 (AA + AdvProp)\* | 7.79 | 0.03 | 79.52 | 94.43 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa-advprop_in1k_20220119-5715267d.pth) | -| EfficientNet-B2\* | 9.11 | 0.03 | 79.64 | 94.80 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32_in1k_20220119-ea374a30.pth) | -| EfficientNet-B2 (AA)\* | 9.11 | 0.03 | 80.21 | 94.96 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa_in1k_20220119-dd61e80b.pth) | -| EfficientNet-B2 (AA + AdvProp)\* | 9.11 | 0.03 | 80.45 | 95.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa-advprop_in1k_20220119-1655338a.pth) | -| EfficientNet-B3\* | 12.23 | 0.06 | 81.01 | 95.34 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32_in1k_20220119-4b4d7487.pth) | -| EfficientNet-B3 (AA)\* | 12.23 | 0.06 | 81.58 | 95.67 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth) | -| EfficientNet-B3 (AA + AdvProp)\* | 12.23 | 0.06 | 81.81 | 95.69 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth) | -| EfficientNet-B4\* | 19.34 | 0.12 | 82.57 | 96.09 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32_in1k_20220119-81fd4077.pth) | -| EfficientNet-B4 (AA)\* | 19.34 | 0.12 | 82.95 | 96.26 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa_in1k_20220119-45b8bd2b.pth) | -| EfficientNet-B4 (AA + AdvProp)\* | 19.34 | 0.12 | 83.25 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa-advprop_in1k_20220119-38c2238c.pth) | -| EfficientNet-B5\* | 30.39 | 0.24 | 83.18 | 96.47 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32_in1k_20220119-e9814430.pth) | -| EfficientNet-B5 (AA)\* | 30.39 | 0.24 | 83.82 | 96.76 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa_in1k_20220119-2cab8b78.pth) | -| EfficientNet-B5 (AA + AdvProp)\* | 30.39 | 0.24 | 84.21 | 96.98 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa-advprop_in1k_20220119-f57a895a.pth) | -| EfficientNet-B6 (AA)\* | 43.04 | 0.41 | 84.05 | 96.82 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b6_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa_in1k_20220119-45b03310.pth) | -| EfficientNet-B6 (AA + AdvProp)\* | 43.04 | 0.41 | 84.74 | 97.14 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa-advprop_in1k_20220119-bfe3485e.pth) | -| EfficientNet-B7 (AA)\* | 66.35 | 0.72 | 84.38 | 96.88 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b7_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa_in1k_20220119-bf03951c.pth) | -| EfficientNet-B7 (AA + AdvProp)\* | 66.35 | 0.72 | 85.14 | 97.23 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa-advprop_in1k_20220119-c6dbff10.pth) | -| EfficientNet-B8 (AA + AdvProp)\* | 87.41 | 1.09 | 85.38 | 97.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b8_3rdparty_8xb32-aa-advprop_in1k_20220119-297ce1b7.pth) | -| ConvNeXt-T\* | 28.59 | 4.46 | 82.05 | 95.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth) | -| ConvNeXt-S\* | 50.22 | 8.69 | 83.13 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth) | -| ConvNeXt-B\* | 88.59 | 15.36 | 83.85 | 96.74 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-base_32xb128_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | -| ConvNeXt-B\* | 88.59 | 15.36 | 85.81 | 97.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | -| ConvNeXt-L\* | 197.77 | 34.37 | 84.30 | 96.89 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | -| ConvNeXt-L\* | 197.77 | 34.37 | 86.61 | 98.04 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | -| ConvNeXt-XL\* | 350.20 | 60.93 | 86.97 | 98.20 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | -| HRNet-W18\* | 21.30 | 4.33 | 76.75 | 93.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w18_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32_in1k_20220120-0c10b180.pth) | -| HRNet-W30\* | 37.71 | 8.17 | 78.19 | 94.22 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w30_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w30_3rdparty_8xb32_in1k_20220120-8aa3832f.pth) | -| HRNet-W32\* | 41.23 | 8.99 | 78.44 | 94.19 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w32_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w32_3rdparty_8xb32_in1k_20220120-c394f1ab.pth) | -| HRNet-W40\* | 57.55 | 12.77 | 78.94 | 94.47 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w40_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w40_3rdparty_8xb32_in1k_20220120-9a2dbfc5.pth) | -| HRNet-W44\* | 67.06 | 14.96 | 78.88 | 94.37 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w44_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w44_3rdparty_8xb32_in1k_20220120-35d07f73.pth) | -| HRNet-W48\* | 77.47 | 17.36 | 79.32 | 94.52 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32_in1k_20220120-e555ef50.pth) | -| HRNet-W64\* | 128.06 | 29.00 | 79.46 | 94.65 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w64_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w64_3rdparty_8xb32_in1k_20220120-19126642.pth) | -| HRNet-W18 (ssld)\* | 21.30 | 4.33 | 81.06 | 95.70 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w18_4xb32_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32-ssld_in1k_20220120-455f69ea.pth) | -| HRNet-W48 (ssld)\* | 77.47 | 17.36 | 83.63 | 96.79 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32-ssld_in1k_20220120-d0459c38.pth) | -| WRN-50\* | 68.88 | 11.44 | 81.45 | 95.53 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/wrn/wide-resnet50_timm_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty-timm_8xb32_in1k_20220304-83ae4399.pth) | -| WRN-101\* | 126.89 | 22.81 | 78.84 | 94.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/wrn/wide-resnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet101_3rdparty_8xb32_in1k_20220304-8d5f9d61.pth) | -| CSPDarkNet50\* | 27.64 | 5.04 | 80.05 | 95.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspdarknet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth) | -| CSPResNet50\* | 21.62 | 3.48 | 79.55 | 94.68 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnet50_3rdparty_8xb32_in1k_20220329-dd6dddfb.pth) | -| CSPResNeXt50\* | 20.57 | 3.11 | 79.96 | 94.96 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/cspnet/cspresnext50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnext50_3rdparty_8xb32_in1k_20220329-2cc84d21.pth) | -| DenseNet121\* | 7.98 | 2.88 | 74.96 | 92.21 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/densenet/densenet121_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet121_4xb256_in1k_20220426-07450f99.pth) | -| DenseNet169\* | 14.15 | 3.42 | 76.08 | 93.11 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/densenet/densenet169_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet169_4xb256_in1k_20220426-a2889902.pth) | -| DenseNet201\* | 20.01 | 4.37 | 77.32 | 93.64 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/densenet/densenet201_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet201_4xb256_in1k_20220426-05cae4ef.pth) | -| DenseNet161\* | 28.68 | 7.82 | 77.61 | 93.83 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/densenet/densenet161_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/densenet/densenet161_4xb256_in1k_20220426-ee6a80a9.pth) | -| VAN-T\* | 4.11 | 0.88 | 75.41 | 93.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-tiny_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth) | -| VAN-S\* | 13.86 | 2.52 | 81.01 | 95.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth) | -| VAN-B\* | 26.58 | 5.03 | 82.80 | 96.21 | 
[config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth) | -| VAN-L\* | 44.77 | 8.99 | 83.86 | 96.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth) | - -*Models with * are converted from other repos, others are trained by ourselves.* - -## CIFAR10 - -| Model | Params(M) | Flops(G) | Top-1 (%) | Config | Download | -| :--------------: | :-------: | :------: | :-------: | :----: | :------------------------------------------------------------------------------------------------------------: | -| ResNet-18-b16x8 | 11.17 | 0.56 | 94.82 | | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet18_8xb16_cifar10.py) | -| ResNet-34-b16x8 | 21.28 | 1.16 | 95.34 | | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet34_8xb16_cifar10.py) | -| ResNet-50-b16x8 | 23.52 | 1.31 | 95.55 | | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb16_cifar10.py) | -| ResNet-101-b16x8 | 42.51 | 2.52 | 95.58 | | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet101_8xb16_cifar10.py) | -| ResNet-152-b16x8 | 58.16 | 3.74 | 95.76 | | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet152_8xb16_cifar10.py) | diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index 4778bb84..28e3608b 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,35 @@ # Changelog +## v1.0.0rc1(30/9/2022) + +### New Features + +- Support MViT for MMCLS 1.x ([#1023](https://github.com/open-mmlab/mmclassification/pull/1023)) +- Add ViT huge architecture. ([#1049](https://github.com/open-mmlab/mmclassification/pull/1049)) +- Support EdgeNeXt for dev-1.x. ([#1037](https://github.com/open-mmlab/mmclassification/pull/1037)) +- Support Swin Transformer V2 for MMCLS 1.x. ([#1029](https://github.com/open-mmlab/mmclassification/pull/1029)) +- Add efficientformer Backbone for MMCls 1.x. ([#1031](https://github.com/open-mmlab/mmclassification/pull/1031)) +- Add MobileOne Backbone For MMCls 1.x. ([#1030](https://github.com/open-mmlab/mmclassification/pull/1030)) +- Support BEiT Transformer layer. ([#919](https://github.com/open-mmlab/mmclassification/pull/919)) + +### Improvements + +- \[Refactor\] Fix visualization tools. ([#1045](https://github.com/open-mmlab/mmclassification/pull/1045)) +- \[Improve\] Update benchmark scripts ([#1028](https://github.com/open-mmlab/mmclassification/pull/1028)) +- \[Improve\] Update tools to enable `pin_memory` and `persistent_workers` by default. ([#1024](https://github.com/open-mmlab/mmclassification/pull/1024)) +- \[CI\] Update circle-ci and github workflow. ([#1018](https://github.com/open-mmlab/mmclassification/pull/1018)) + +### Bug Fixes + +- Fix verify dataset tool in 1.x. ([#1062](https://github.com/open-mmlab/mmclassification/pull/1062)) +- Fix `loss_weight` in `LabelSmoothLoss`. ([#1058](https://github.com/open-mmlab/mmclassification/pull/1058)) +- Fix the output position of Swin-Transformer. ([#947](https://github.com/open-mmlab/mmclassification/pull/947)) + +### Docs Update + +- Auto generate model summary table. 
([#1010](https://github.com/open-mmlab/mmclassification/pull/1010)) +- Refactor new modules tutorial. ([#998](https://github.com/open-mmlab/mmclassification/pull/998)) + ## v1.0.0rc0(31/8/2022) MMClassification 1.0.0rc0 is the first version of MMClassification 1.x, a part of the OpenMMLab 2.0 projects. diff --git a/docs/en/notes/faq.md b/docs/en/notes/faq.md index b6fde971..c8c16914 100644 --- a/docs/en/notes/faq.md +++ b/docs/en/notes/faq.md @@ -17,8 +17,9 @@ and make sure you fill in all required information in the template. | MMClassification version | MMCV version | | :----------------------: | :--------------------: | - | 1.0.0rc0 (1.x) | mmcv>=2.0.0rc1 | - | 0.23.1 (master) | mmcv>=1.4.2, \<1.6.0 | + | 1.0.0rc1 (1.x) | mmcv>=2.0.0rc1 | + | 0.24.0 (master) | mmcv>=1.4.2, \<1.7.0 | + | 0.23.1 | mmcv>=1.4.2, \<1.6.0 | | 0.22.1 | mmcv>=1.4.2, \<1.6.0 | | 0.21.0 | mmcv>=1.4.2, \<=1.5.0 | | 0.20.1 | mmcv>=1.4.2, \<=1.5.0 | diff --git a/docs/en/stat.py b/docs/en/stat.py index 8f1e5b2d..030f07b9 100755 --- a/docs/en/stat.py +++ b/docs/en/stat.py @@ -1,100 +1,145 @@ #!/usr/bin/env python -import functools as func -import glob -import os import re +from collections import defaultdict from pathlib import Path -import numpy as np +from modelindex.load_model_index import load +from tabulate import tabulate -MMCLS_ROOT = Path(__file__).absolute().parents[1] -url_prefix = 'https://github.com/open-mmlab/mmclassification/blob/master/' +MMCLS_ROOT = Path(__file__).absolute().parents[2] +PAPERS_ROOT = Path('papers') # Path to save generated paper pages. +GITHUB_PREFIX = 'https://github.com/open-mmlab/mmclassification/blob/1.x/' +MODELZOO_TEMPLATE = """ +# Model Zoo Summary -papers_root = Path('papers') -papers_root.mkdir(exist_ok=True) -files = [Path(f) for f in sorted(glob.glob('../../configs/*/README.md'))] +* Number of papers: {num_papers} +{type_msg} -stats = [] -titles = [] -num_ckpts = 0 -num_configs = 0 +* Number of checkpoints: {num_ckpts} +{paper_msg} +""" -for f in files: - with open(f, 'r') as content_file: - content = content_file.read() +model_index = load(str(MMCLS_ROOT / 'model-index.yml')) - # Extract checkpoints - ckpts = set(x.lower().strip() - for x in re.findall(r'\[model\]\((https?.*)\)', content)) - if len(ckpts) == 0: - continue - num_ckpts += len(ckpts) - # Extract paper title - match_res = list(re.finditer(r'> \[(.*)\]\((.*)\)', content)) - if len(match_res) > 0: - title, paperlink = match_res[0].groups() - else: - title = content.split('\n')[0].replace('# ', '').strip() - paperlink = None - titles.append(title) +def build_collections(model_index): + col_by_name = {} + for col in model_index.collections: + setattr(col, 'models', []) + col_by_name[col.name] = col - # Replace paper link to a button - if paperlink is not None: - start = match_res[0].start() - end = match_res[0].end() - # link_button = f'{title}' - link_button = f'[{title}]({paperlink})' - content = content[:start] + link_button + content[end:] + for model in model_index.models: + col = col_by_name[model.in_collection] + col.models.append(model) + setattr(model, 'collection', col) - # Extract paper type - _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)] - assert len(_papertype) > 0 - papertype = _papertype[0] - paper = set([(papertype, title)]) + +build_collections(model_index) + + +def count_papers(collections): + total_num_ckpts = 0 + type_count = defaultdict(int) + paper_msgs = [] + + for collection in collections: + with open(MMCLS_ROOT / collection.readme) as f: + readme = f.read() + ckpts = 
set(x.lower().strip() + for x in re.findall(r'\[model\]\((https?.*)\)', readme)) + total_num_ckpts += len(ckpts) + title = collection.paper['Title'] + papertype = collection.data.get('type', 'Algorithm') + type_count[papertype] += 1 + + readme = PAPERS_ROOT / Path( + collection.filepath).parent.with_suffix('.md').name + paper_msgs.append( + f'\t- [{papertype}] [{title}]({readme}) ({len(ckpts)} ckpts)') + + type_msg = '\n'.join( + [f'\t- {type_}: {count}' for type_, count in type_count.items()]) + paper_msg = '\n'.join(paper_msgs) + + modelzoo = MODELZOO_TEMPLATE.format( + num_papers=len(collections), + num_ckpts=total_num_ckpts, + type_msg=type_msg, + paper_msg=paper_msg, + ) + + with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +count_papers(model_index.collections) + + +def generate_paper_page(collection): + PAPERS_ROOT.mkdir(exist_ok=True) # Write a copy of README - copy = papers_root / (f.parent.name + '.md') - if copy.exists(): - os.remove(copy) + with open(MMCLS_ROOT / collection.readme) as f: + readme = f.read() + folder = Path(collection.filepath).parent + copy = PAPERS_ROOT / folder.with_suffix('.md').name def replace_link(matchobj): # Replace relative link to GitHub link. name = matchobj.group(1) link = matchobj.group(2) - if not link.startswith('http') and (f.parent / link).exists(): - rel_link = (f.parent / link).absolute().relative_to(MMCLS_ROOT) - link = url_prefix + str(rel_link) + if not link.startswith('http'): + assert (folder / link).exists(), \ + f'Link not found:\n{collection.readme}: {link}' + rel_link = (folder / link).absolute().relative_to(MMCLS_ROOT) + link = GITHUB_PREFIX + str(rel_link) return f'[{name}]({link})' - content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_link, content) + content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_link, readme) with open(copy, 'w') as copy_file: copy_file.write(content) - statsmsg = f""" -\t* [{papertype}] [{title}]({copy}) ({len(ckpts)} ckpts) -""" - stats.append(dict(paper=paper, ckpts=ckpts, statsmsg=statsmsg, copy=copy)) -allpapers = func.reduce(lambda a, b: a.union(b), - [stat['paper'] for stat in stats]) -msglist = '\n'.join(stat['statsmsg'] for stat in stats) +for collection in model_index.collections: + generate_paper_page(collection) -papertypes, papercounts = np.unique([t for t, _ in allpapers], - return_counts=True) -countstr = '\n'.join( - [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) -modelzoo = f""" -# Model Zoo Summary +def generate_summary_table(models): + dataset_rows = defaultdict(list) + for model in models: + if model.results is None: + continue + name = model.name + params = model.metadata.parameters / 1e6 + flops = model.metadata.flops / 1e9 + result = model.results[0] + top1 = result.metrics.get('Top 1 Accuracy') + top5 = result.metrics.get('Top 5 Accuracy') + readme = Path(model.collection.filepath).parent.with_suffix('.md').name + page = f'[link]({PAPERS_ROOT / readme})' + row = [name, params, flops, top1, top5, page] + dataset_rows[result.dataset].append(row) -* Number of papers: {len(set(titles))} -{countstr} + with open('modelzoo_statistics.md', 'a') as f: + for dataset, rows in dataset_rows.items(): + f.write(f'\n## {dataset}\n') + f.write("""```{table}\n:class: model-summary\n""") + header = [ + 'Model', + 'Params (M)', + 'Flops (G)', + 'Top-1 (%)', + 'Top-5 (%)', + 'Readme', + ] + table_cfg = dict( + tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(rows, header, **table_cfg)) + f.write('\n```\n') -* Number of 
checkpoints: {num_ckpts}
-{msglist}
-"""
-with open('modelzoo_statistics.md', 'w') as f:
-    f.write(modelzoo)
+generate_summary_table(model_index.models)
diff --git a/docs/en/user_guides/config.md b/docs/en/user_guides/config.md
index 195e7655..c4fe19fb 100644
--- a/docs/en/user_guides/config.md
+++ b/docs/en/user_guides/config.md
@@ -2,7 +2,7 @@

 To manage various configurations in a deep-learning experiment, we use a kind of config file to record all of these configurations. This config system has a modular and inheritance design, and more details can be found in
-[the tutorial in MMEngine](TODO).
+{external+mmengine:doc}`the tutorial in MMEngine `.

 Usually, we use python files as config file. All configuration files are placed under the [`configs`](https://github.com/open-mmlab/mmclassification/tree/1.x/configs) folder, and the directory structure is as follows:

@@ -64,7 +64,7 @@ This primitive config file includes a dict variable `model`, which mainly includ

 ```{note}
 Usually, we use the `type` field to specify the class of the component and use other fields to pass
-the initialization arguments of the class. The [registry tutorial](TODO) describes it in detail.
+the initialization arguments of the class. The {external+mmengine:doc}`registry tutorial ` describes it in detail.
 ```

 Following is the model primitive config of the ResNet50 config file in [`configs/_base_/models/resnet50.py`](https://github.com/open-mmlab/mmclassification/blob/1.x/configs/_base_/models/resnet50.py):

@@ -178,12 +178,12 @@ test loops:

 - `optim_wrapper`: The settings of the optimizer wrapper. We use the optimizer wrapper to customize the optimization process.
-  - `optimizer`: Supports all `pytorch` optimizers, refers to the relevant [MMEngine documentation](TODO).
-  - `paramwise_cfg`: To set different optimization arguments according to the parameters' type or name, refer to the relevant [learning policy documentation](TODO).
+  - `optimizer`: Supports all `pytorch` optimizers; refer to the relevant {external+mmengine:doc}`MMEngine documentation `.
+  - `paramwise_cfg`: To set different optimization arguments according to the parameters' type or name, refer to the relevant [learning policy documentation](../advanced_guides/schedule.md).
   - `accumulative_counts`: Optimize parameters after several backward steps instead of one backward step. You can use it to simulate large batch size by small batch size.
-- `param_scheduler`: Optimizer parameters policy. You can use it to specify learning rate and momentum curves during training. See the [documentation](TODO) in MMEngine for more details.
-- `train_cfg | val_cfg | test_cfg`: The settings of the training, validation and test loops, refer to the relevant [MMEngine documentation](TODO).
+- `param_scheduler`: Optimizer parameters policy. You can use it to specify learning rate and momentum curves during training. See the {external+mmengine:doc}`documentation ` in MMEngine for more details.
+- `train_cfg | val_cfg | test_cfg`: The settings of the training, validation and test loops, refer to the relevant {external+mmengine:doc}`MMEngine documentation `.
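For quick reference, the schedule fields described in the bullets above combine as in the following sketch. It is not part of the patch; the values are illustrative, in the spirit of the common ImageNet `bs256` SGD recipe, and may differ from the shipped `configs/_base_/schedules` files.

```python
# Illustrative 1.x-style schedule config; values are the standard ImageNet
# SGD recipe, not necessarily the exact shipped defaults.
optim_wrapper = dict(
    optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001))
param_scheduler = dict(
    type='MultiStepLR', by_epoch=True, milestones=[30, 60, 90], gamma=0.1)
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)  # training loop
val_cfg = dict()   # validation loop
test_cfg = dict()  # test loop
```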
Following is the schedule primitive config of the ResNet50 config in [`configs/_base_/schedules/imagenet_bs256.py`](https://github.com/open-mmlab/mmclassification/blob/1.x/configs/_base_/schedules/imagenet_bs256.py):

@@ -348,7 +348,7 @@ test_dataloader = dict(dataset=dict(pipeline=val_pipeline))

 ### Ignore some fields in the base configs

-Sometimes, you need to set `_delete_=True` to ignore some domain content in the basic configuration file. You can refer to the [documentation in MMEngine](TODO) for more instructions.
+Sometimes, you need to set `_delete_=True` to ignore some domain content in the basic configuration file. You can refer to the {external+mmengine:doc}`documentation in MMEngine ` for more instructions.

 The following is an example. If you want to use a cosine schedule in the above ResNet50 case, directly inheriting and modifying the config will report a `get unexpected keyword 'step'` error, because the `'step'` field of the basic config in the `param_scheduler` domain is reserved, and you need to add `_delete_=True` to ignore the `param_scheduler` related fields in the basic configuration file:

@@ -361,7 +361,7 @@ param_scheduler = dict(type='CosineAnnealingLR', by_epoch=True, _delete_=True)

 ### Use some fields in the base configs

-Sometimes, you may refer to some fields in the `_base_` config, to avoid duplication of definitions. You can refer to [MMEngine](TODO:) for some more instructions.
+Sometimes, you may refer to some fields in the `_base_` config to avoid duplication of definitions. You can refer to {external+mmengine:doc}`MMEngine ` for some more instructions.

 The following is an example of using auto augment in the training data preprocessing pipeline, refer to [`configs/resnest/resnest50_32xb64_in1k.py`](https://github.com/open-mmlab/mmclassification/blob/1.x/configs/resnest/resnest50_32xb64_in1k.py). When defining `train_pipeline`, just add the definition file name of auto augment to `_base_`, and then use `_base_.auto_increasing_policies` to reference the variables in the primitive config:

diff --git a/docs/en/user_guides/dataset_prepare.md b/docs/en/user_guides/dataset_prepare.md
index dfd521fc..096d0797 100644
--- a/docs/en/user_guides/dataset_prepare.md
+++ b/docs/en/user_guides/dataset_prepare.md
@@ -304,7 +304,7 @@ To find more datasets supported by MMClassification, and get more configurations

 ## Dataset Wrappers

-The following datawrappers are supported in MMEngine, you can refer to [MMEngine tutorial](TODO:) to learn how to use it.
+The following dataset wrappers are supported in MMEngine; you can refer to the {external+mmengine:doc}`MMEngine tutorial ` to learn how to use them (a short illustrative sketch appears below).

- [ConcatDataset](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/basedataset.md#concatdataset)
- [RepeatDataset](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/basedataset.md#repeatdataset)

diff --git a/docs/en/user_guides/finetune.md b/docs/en/user_guides/finetune.md
index 4f83f849..967f7938 100644
--- a/docs/en/user_guides/finetune.md
+++ b/docs/en/user_guides/finetune.md
@@ -3,7 +3,7 @@

 In most scenarios, we want to apply a model to new datasets without training from scratch, since training from scratch may introduce extra uncertainty about model convergence and is time-consuming. The common practice is to start from a model previously trained on a large dataset, which can provide better knowledge than a random initialization. Roughly speaking, this process is known as fine-tuning.
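As a concrete illustration of the dataset wrappers listed in the `dataset_prepare.md` hunk above, a wrapper is nested inside the dataloader's `dataset` field. This sketch assumes MMEngine's `RepeatDataset` keeps its documented `dataset`/`times` fields; the inner dataset values are hypothetical placeholders:

```python
# Sketch: iterate the training set 3 times per epoch via MMEngine's
# RepeatDataset. The paths and annotation file below are placeholders.
train_dataloader = dict(
    batch_size=32,
    num_workers=4,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='RepeatDataset',
        times=3,
        dataset=dict(
            type='ImageNet',
            data_root='data/imagenet',
            ann_file='meta/train.txt',
            data_prefix='train')))
```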
Classification models pre-trained on the ImageNet dataset have been demonstrated to be effective for other datasets and other downstream tasks.
-Hence, this tutorial provides instructions for users to use the models provided in the [Model Zoo](../model_zoo.md) for other datasets to obtain better performance.
+Hence, this tutorial provides instructions for users to use the models provided in the [Model Zoo](../modelzoo_statistics.md) for other datasets to obtain better performance.

 There are two steps to fine-tune a model on a new dataset.

diff --git a/docs/en/user_guides/inference.md b/docs/en/user_guides/inference.md
index 5221f0a1..a1c970c1 100644
--- a/docs/en/user_guides/inference.md
+++ b/docs/en/user_guides/inference.md
@@ -1,6 +1,6 @@
 # Inference with existing models

-MMClassification provides pre-trained models for classification in [Model Zoo](../model_zoo.md).
+MMClassification provides pre-trained models for classification in [Model Zoo](../modelzoo_statistics.md).
 This note will show **how to use existing models for inference on given images**.
 As for how to test existing models on standard datasets, please see this [guide](./train_test.md#test)

diff --git a/docs/en/user_guides/visualization.md b/docs/en/user_guides/visualization.md
index 1eeea642..dbf3e6f5 100644
--- a/docs/en/user_guides/visualization.md
+++ b/docs/en/user_guides/visualization.md
@@ -1,134 +1,138 @@
-# Visualization Tools (TODO)
+# Visualization Tools

-- [Pipeline Visualization](#pipeline-visualization)
-- [Learning Rate Schedule Visualization](#learning-rate-schedule-visualization)
+- [Browse Dataset](#browse-dataset)
+- [Parameter Schedule Visualization](#parameter-schedule-visualization)
 - [Class Activation Map Visualization](#class-activation-map-visualization)
 - [FAQs](#faqs)

-## Pipeline Visualization
+## Browse Dataset

 ```bash
-python tools/visualizations/vis_pipeline.py \
+python tools/visualizations/browse_dataset.py \
     ${CONFIG_FILE} \
-    [--output-dir ${OUTPUT_DIR}] \
-    [--phase ${DATASET_PHASE}] \
-    [--number ${BUNBER_IMAGES_DISPLAY}] \
-    [--skip-type ${SKIP_TRANSFORM_TYPE}] \
-    [--mode ${DISPLAY_MODE}] \
-    [--show] \
-    [--adaptive] \
-    [--min-edge-length ${MIN_EDGE_LENGTH}] \
-    [--max-edge-length ${MAX_EDGE_LENGTH}] \
-    [--bgr2rgb] \
-    [--window-size ${WINDOW_SIZE}] \
+    [-o, --output-dir ${OUTPUT_DIR}] \
+    [-p, --phase ${DATASET_PHASE}] \
+    [-n, --show-number ${NUMBER_IMAGES_DISPLAY}] \
+    [-i, --show-interval ${SHOW_INTERVAL}] \
+    [-m, --mode ${DISPLAY_MODE}] \
+    [-r, --rescale-factor ${RESCALE_FACTOR}] \
+    [-c, --channel-order ${CHANNEL_ORDER}] \
     [--cfg-options ${CFG_OPTIONS}]
 ```

 **Description of all arguments**:

 - `config` : The path of a model config file.
-- `--output-dir`: The output path for visualized images. If not specified, it will be set to `''`, which means not to save.
-- `--phase`: Phase of visualizing dataset,must be one of `[train, val, test]`. If not specified, it will be set to `train`.
-- `--number`: The number of samples to visualized. If not specified, display all images in the dataset.
-- `--skip-type`: The pipelines to be skipped. If not specified, it will be set to `['ToTensor', 'Normalize', 'ImageToTensor', 'Collect']`.
-- `--mode`: The display mode, can be one of `[original, pipeline, concat]`. If not specified, it will be set to `concat`.
-- `--show`: If set, display pictures in pop-up windows.
-- `--adaptive`: If set, adaptively resize images for better visualization.
-- `--min-edge-length`: The minimum edge length, used when `--adaptive` is set. 
When any side of the picture is smaller than `${MIN_EDGE_LENGTH}`, the picture will be enlarged while keeping the aspect ratio unchanged, and the short side will be aligned to `${MIN_EDGE_LENGTH}`. If not specified, it will be set to 200.
-- `--max-edge-length`: The maximum edge length, used when `--adaptive` is set. When any side of the picture is larger than `${MAX_EDGE_LENGTH}`, the picture will be reduced while keeping the aspect ratio unchanged, and the long side will be aligned to `${MAX_EDGE_LENGTH}`. If not specified, it will be set to 1000.
-- `--bgr2rgb`: If set, flip the color channel order of images.
-- `--window-size`: The shape of the display window. If not specified, it will be set to `12*7`. If used, it must be in the format `'W*H'`.
-- `--cfg-options` : Modifications to the configuration file, refer to [Tutorial 1: Learn about Configs](https://mmclassification.readthedocs.io/en/latest/tutorials/config.html).
+- `-o, --output-dir`: The output path for visualized images. If not specified, it will be set to `''`, which means not to save.
+- **`-p, --phase`**: The phase of the dataset to visualize; must be one of `['train', 'val', 'test']`. If not specified, it will be set to `'train'`.
+- **`-n, --show-number`**: The number of samples to visualize. If not specified, display all images in the dataset.
+- `--show-interval`: The display interval of each image, in seconds.
+- **`-m, --mode`**: The display mode; can be one of `['original', 'transformed', 'concat', 'pipeline']`. If not specified, it will be set to `'transformed'`.
+- **`-r, --rescale-factor`**: The image rescale factor, which is useful if the output is too large or too small.
+- `-c, --channel-order`: The channel order of the displayed images, either "BGR" or "RGB". If not specified, it will be set to 'BGR'.
+- `--cfg-options` : Modifications to the configuration file, refer to [Learn about Configs](./config.md).

 ```{note}
+1. The `-m, --mode` option selects the display mode, i.e. whether to show original pictures, transformed pictures, or comparison pictures:
+- "original" means to show the images loaded from disk;
+- "transformed" means to show the images after being transformed;
+- "concat" means to show images stitched from the "original" and "transformed" images;
+- "pipeline" means to show all the intermediate images throughout the pipeline.

-1. If the `--mode` is not specified, it will be set to `concat` as default, get the pictures stitched together by original pictures and transformed pictures; if the `--mode` is set to `original`, get the original pictures; if the `--mode` is set to `transformed`, get the transformed pictures; if the `--mode` is set to `pipeline`, get all the intermediate images through the pipeline.
-
-2. When `--adaptive` option is set, images that are too large or too small will be automatically adjusted, you can use `--min-edge-length` and `--max-edge-length` to set the adjust size.
+2. The `-r, --rescale-factor` option helps when the images are too large or too small relative to the label information. For example, when visualizing the CIFAR dataset, since the resolution of the images is very small, `--rescale-factor` can be set to 10.
 ```

**Examples**:

-1. In **'original'** mode, visualize 100 original pictures in the `CIFAR100` validation set, then display and save them in the `./tmp` folder:
+1. 
In **'original'** mode: ```shell -python ./tools/visualizations/vis_pipeline.py configs/resnet/resnet50_8xb16_cifar100.py --phase val --output-dir tmp --mode original --number 100 --show --adaptive --bgr2rgb +python ./tools/visualizations/browse_dataset.py ./configs/resnet/resnet101_8xb16_cifar10.py --phase val --output-dir tmp --mode original --show-number 100 --rescale-factor 10 --channel-order RGB ``` -
+- `--phase val`: Visualize the validation set, can be simplified to `-p val`;
+- `--output-dir tmp`: The visualization results are saved in the "tmp" folder, can be simplified to `-o tmp`;
+- `--mode original`: Visualize the original images, can be simplified to `-m original`;
+- `--show-number 100`: Visualize 100 images, can be simplified to `-n 100`;
+- `--rescale-factor 10`: Enlarge the images by a factor of 10, can be simplified to `-r 10`;
+- `--channel-order RGB`: The channel order of the visualized images is "RGB", can be simplified to `-c RGB`.

-2. In **'transformed'** mode, visualize all the transformed pictures of the `ImageNet` training set and display them in pop-up windows:

+
+ +2. In **'transformed'** mode: ```shell -python ./tools/visualizations/vis_pipeline.py ./configs/resnet/resnet50_8xb32_in1k.py --show --mode transformed +python ./tools/visualizations/browse_dataset.py ./configs/resnet/resnet50_8xb32_in1k.py -n 100 -r 2 ``` -
+
-3. In **'concat'** mode, visualize 10 pairs of origin and transformed images for comparison in the `ImageNet` train set and save them in the `./tmp` folder: +3. In **'concat'** mode: ```shell -python ./tools/visualizations/vis_pipeline.py configs/swin_transformer/swin_base_224_b16x64_300e_imagenet.py --phase train --output-dir tmp --number 10 --adaptive +python ./tools/visualizations/browse_dataset.py configs/swin_transformer/swin-small_16xb64_in1k.py -n 10 -m concat ``` -
+
-4. In **'pipeline'** mode, visualize all the intermediate pictures in the `ImageNet` train set through the pipeline: +4. In **'pipeline'** mode: ```shell -python ./tools/visualizations/vis_pipeline.py configs/swin_transformer/swin_base_224_b16x64_300e_imagenet.py --phase train --adaptive --mode pipeline --show +python ./tools/visualizations/browse_dataset.py configs/swin_transformer/swin-small_16xb64_in1k.py -m pipeline ``` -
+
-## Learning Rate Schedule Visualization
+## Parameter Schedule Visualization

 ```bash
-python tools/visualizations/vis_lr.py \
+python tools/visualizations/vis_scheduler.py \
     ${CONFIG_FILE} \
-    --dataset-size ${DATASET_SIZE} \
-    --ngpus ${NUM_GPUs}
-    --save-path ${SAVE_PATH} \
-    --title ${TITLE} \
-    --style ${STYLE} \
-    --window-size ${WINDOW_SIZE}
-    --cfg-options
+    [-p, --parameter ${PARAMETER_NAME}] \
+    [-d, --dataset-size ${DATASET_SIZE}] \
+    [-n, --ngpus ${NUM_GPUs}] \
+    [-s, --save-path ${SAVE_PATH}] \
+    [--title ${TITLE}] \
+    [--style ${STYLE}] \
+    [--window-size ${WINDOW_SIZE}] \
+    [--cfg-options]
 ```

 **Description of all arguments**:

-- `config` : The path of a model config file.
-- `dataset-size` : The size of the datasets. If set,`build_dataset` will be skipped and `${DATASET_SIZE}` will be used as the size. Default to use the function `build_dataset`.
-- `ngpus` : The number of GPUs used in training, default to be 1.
-- `save-path` : The learning rate curve plot save path, default not to save.
-- `title` : Title of figure. If not set, default to be config file name.
-- `style` : Style of plt. If not set, default to be `whitegrid`.
-- `window-size`: The shape of the display window. If not specified, it will be set to `12*7`. If used, it must be in the format `'W*H'`.
-- `cfg-options` : Modifications to the configuration file, refer to [Tutorial 1: Learn about Configs](https://mmclassification.readthedocs.io/en/latest/tutorials/config.html).
+- `config`: The path of a model config file.
+- **`-p, --parameter`**: The parameter whose change curve will be visualized; choose from "lr" and "momentum". Defaults to "lr".
+- **`-d, --dataset-size`**: The size of the dataset. If set, `build_dataset` will be skipped and `${DATASET_SIZE}` will be used as the size. Defaults to using the function `build_dataset`.
+- **`-n, --ngpus`**: The number of GPUs used in training. Defaults to 1.
+- **`-s, --save-path`**: The save path of the parameter curve plot. By default, the plot is not saved.
+- `--title`: Title of the figure. If not set, it defaults to the config file name.
+- `--style`: Style of plt. If not set, it defaults to `whitegrid`.
+- `--window-size`: The shape of the display window. If not specified, it will be set to `12*7`. If used, it must be in the format `'W*H'`.
+- `--cfg-options`: Modifications to the configuration file, refer to [Learn about Configs](./config.md).

 ```{note}
-Loading annotations maybe consume much time, you can directly specify the size of the dataset with `dataset-size` to save time.
+Loading annotations may consume much time; you can directly specify the size of the dataset with `-d, --dataset-size` to save time.
 ```

 **Examples**:

 ```bash
-python tools/visualizations/vis_lr.py configs/resnet/resnet50_b16x8_cifar100.py
+python tools/visualizations/vis_scheduler.py configs/resnet/resnet50_b16x8_cifar100.py
 ```

-
+
When using ImageNet, directly specify the size of ImageNet, as below: ```bash -python tools/visualizations/vis_lr.py configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py --dataset-size 1281167 --ngpus 4 --save-path ./repvgg-B3g4_4xb64-lr.jpg +python tools/visualizations/vis_scheduler.py configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py --dataset-size 1281167 --ngpus 4 --save-path ./repvgg-B3g4_4xb64-lr.jpg ``` -
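The curve that `vis_scheduler.py` draws comes entirely from the config's `param_scheduler` field. As a sketch (assuming MMEngine's 1.x `LinearLR` and `CosineAnnealingLR` schedulers), the following setting would plot as a 5-epoch linear warmup followed by cosine annealing:

```python
# Sketch of a param_scheduler whose learning-rate curve vis_scheduler.py
# would plot: linear warmup for 5 epochs, then cosine decay to epoch 100.
param_scheduler = [
    dict(type='LinearLR', start_factor=0.01, by_epoch=True, begin=0, end=5),
    dict(type='CosineAnnealingLR', T_max=95, by_epoch=True, begin=5, end=100),
]
```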
+
## Class Activation Map Visualization @@ -180,7 +184,7 @@ python tools/visualizations/vis_cam.py \ - `--aug_smooth` : Whether to use TTA(Test Time Augment) to get CAM. - `--eigen_smooth` : Whether to use the principal component to reduce noise. - `--device` : The computing device used. Default to 'cpu'. -- `--cfg-options` : Modifications to the configuration file, refer to [Tutorial 1: Learn about Configs](https://mmclassification.readthedocs.io/en/latest/tutorials/config.html). +- `--cfg-options` : Modifications to the configuration file, refer to [Learn about Configs](./config.md). ```{note} The argument `--preview-model` can view all network layers names in the given model. It will be helpful if you know nothing about the model layers when setting `--target-layers`. @@ -237,7 +241,7 @@ For example, the `backbone.layer4[-1]` is the same as `backbone.layer4.2` since ```shell python tools/visualizations/vis_cam.py \ demo/dog.jpg \ - configs/mobilenet_v3/mobilenet-v3-large_8xb32_in1k.py \ + configs/mobilenet_v3/mobilenet-v3-large_8xb128_in1k.py \ https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth \ --target-layers 'backbone.layer16' \ --method LayerCAM \ diff --git a/docs/zh_CN/_static/image/tools/visualization/lr_schedule1.png b/docs/zh_CN/_static/image/tools/visualization/lr_schedule1.png deleted file mode 100644 index 31fca35b..00000000 Binary files a/docs/zh_CN/_static/image/tools/visualization/lr_schedule1.png and /dev/null differ diff --git a/docs/zh_CN/_static/image/tools/visualization/lr_schedule2.png b/docs/zh_CN/_static/image/tools/visualization/lr_schedule2.png deleted file mode 100644 index 8c6231db..00000000 Binary files a/docs/zh_CN/_static/image/tools/visualization/lr_schedule2.png and /dev/null differ diff --git a/docs/zh_CN/index.rst b/docs/zh_CN/index.rst index c435968a..4abbdab1 100644 --- a/docs/zh_CN/index.rst +++ b/docs/zh_CN/index.rst @@ -43,7 +43,6 @@ You can switch between Chinese and English documentation in the lower-left corne :glob: modelzoo_statistics.md - model_zoo.md papers/* .. toctree:: diff --git a/docs/zh_CN/model_zoo.md b/docs/zh_CN/model_zoo.md deleted file mode 120000 index 013a9acc..00000000 --- a/docs/zh_CN/model_zoo.md +++ /dev/null @@ -1 +0,0 @@ -../en/model_zoo.md \ No newline at end of file diff --git a/docs/zh_CN/notes/faq.md b/docs/zh_CN/notes/faq.md index 61b1cdde..e4cfcce8 100644 --- a/docs/zh_CN/notes/faq.md +++ b/docs/zh_CN/notes/faq.md @@ -15,8 +15,9 @@ | MMClassification version | MMCV version | | :----------------------: | :--------------------: | - | 1.0.0rc0 (1.x) | mmcv>=2.0.0rc1 | - | 0.23.1 (master) | mmcv>=1.4.2, \<1.6.0 | + | 1.0.0rc1 (1.x) | mmcv>=2.0.0rc1 | + | 0.24.0 (master) | mmcv>=1.4.2, \<1.7.0 | + | 0.23.1 | mmcv>=1.4.2, \<1.6.0 | | 0.22.1 | mmcv>=1.4.2, \<1.6.0 | | 0.21.0 | mmcv>=1.4.2, \<=1.5.0 | | 0.20.1 | mmcv>=1.4.2, \<=1.5.0 | diff --git a/docs/zh_CN/stat.py b/docs/zh_CN/stat.py index f6d5b3ab..9b5a6a63 100755 --- a/docs/zh_CN/stat.py +++ b/docs/zh_CN/stat.py @@ -1,99 +1,145 @@ #!/usr/bin/env python -import functools as func -import glob -import os import re +from collections import defaultdict from pathlib import Path -import numpy as np +from modelindex.load_model_index import load +from tabulate import tabulate -MMCLS_ROOT = Path(__file__).absolute().parents[1] -url_prefix = 'https://github.com/open-mmlab/mmclassification/blob/master/' +MMCLS_ROOT = Path(__file__).absolute().parents[2] +PAPERS_ROOT = Path('papers') # Path to save generated paper pages. 
+GITHUB_PREFIX = 'https://github.com/open-mmlab/mmclassification/blob/1.x/' +MODELZOO_TEMPLATE = """ +# 模型库统计 -papers_root = Path('papers') -papers_root.mkdir(exist_ok=True) -files = [Path(f) for f in sorted(glob.glob('../../configs/*/README.md'))] +* 论文数量:{num_papers} +{type_msg} -stats = [] -titles = [] -num_ckpts = 0 -num_configs = 0 +* 模型权重文件数量:{num_ckpts} +{paper_msg} +""" -for f in files: - with open(f, 'r') as content_file: - content = content_file.read() +model_index = load(str(MMCLS_ROOT / 'model-index.yml')) - # Extract checkpoints - ckpts = set(x.lower().strip() - for x in re.findall(r'\[model\]\((https?.*)\)', content)) - if len(ckpts) == 0: - continue - num_ckpts += len(ckpts) - # Extract paper title - match_res = list(re.finditer(r'> \[(.*)\]\((.*)\)', content)) - if len(match_res) > 0: - title, paperlink = match_res[0].groups() - else: - title = content.split('\n')[0].replace('# ', '').strip() - paperlink = None - titles.append(title) +def build_collections(model_index): + col_by_name = {} + for col in model_index.collections: + setattr(col, 'models', []) + col_by_name[col.name] = col - # Replace paper link to a button - if paperlink is not None: - start = match_res[0].start() - end = match_res[0].end() - link_button = f'[{title}]({paperlink})' - content = content[:start] + link_button + content[end:] + for model in model_index.models: + col = col_by_name[model.in_collection] + col.models.append(model) + setattr(model, 'collection', col) - # Extract paper type - _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)] - assert len(_papertype) > 0 - papertype = _papertype[0] - paper = set([(papertype, title)]) + +build_collections(model_index) + + +def count_papers(collections): + total_num_ckpts = 0 + type_count = defaultdict(int) + paper_msgs = [] + + for collection in collections: + with open(MMCLS_ROOT / collection.readme) as f: + readme = f.read() + ckpts = set(x.lower().strip() + for x in re.findall(r'\[model\]\((https?.*)\)', readme)) + total_num_ckpts += len(ckpts) + title = collection.paper['Title'] + papertype = collection.data.get('type', 'Algorithm') + type_count[papertype] += 1 + + readme = PAPERS_ROOT / Path( + collection.filepath).parent.with_suffix('.md').name + paper_msgs.append( + f'\t- [{papertype}] [{title}]({readme}) ({len(ckpts)} ckpts)') + + type_msg = '\n'.join( + [f'\t- {type_}: {count}' for type_, count in type_count.items()]) + paper_msg = '\n'.join(paper_msgs) + + modelzoo = MODELZOO_TEMPLATE.format( + num_papers=len(collections), + num_ckpts=total_num_ckpts, + type_msg=type_msg, + paper_msg=paper_msg, + ) + + with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) + + +count_papers(model_index.collections) + + +def generate_paper_page(collection): + PAPERS_ROOT.mkdir(exist_ok=True) # Write a copy of README - copy = papers_root / (f.parent.name + '.md') - if copy.exists(): - os.remove(copy) + with open(MMCLS_ROOT / collection.readme) as f: + readme = f.read() + folder = Path(collection.filepath).parent + copy = PAPERS_ROOT / folder.with_suffix('.md').name def replace_link(matchobj): # Replace relative link to GitHub link. 
name = matchobj.group(1) link = matchobj.group(2) - if not link.startswith('http') and (f.parent / link).exists(): - rel_link = (f.parent / link).absolute().relative_to(MMCLS_ROOT) - link = url_prefix + str(rel_link) + if not link.startswith('http'): + assert (folder / link).exists(), \ + f'Link not found:\n{collection.readme}: {link}' + rel_link = (folder / link).absolute().relative_to(MMCLS_ROOT) + link = GITHUB_PREFIX + str(rel_link) return f'[{name}]({link})' - content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_link, content) + content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_link, readme) with open(copy, 'w') as copy_file: copy_file.write(content) - statsmsg = f""" -\t* [{papertype}] [{title}]({copy}) ({len(ckpts)} ckpts) -""" - stats.append(dict(paper=paper, ckpts=ckpts, statsmsg=statsmsg, copy=copy)) -allpapers = func.reduce(lambda a, b: a.union(b), - [stat['paper'] for stat in stats]) -msglist = '\n'.join(stat['statsmsg'] for stat in stats) +for collection in model_index.collections: + generate_paper_page(collection) -papertypes, papercounts = np.unique([t for t, _ in allpapers], - return_counts=True) -countstr = '\n'.join( - [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) -modelzoo = f""" -# 模型库统计 +def generate_summary_table(models): + dataset_rows = defaultdict(list) + for model in models: + if model.results is None: + continue + name = model.name + params = model.metadata.parameters / 1e6 + flops = model.metadata.flops / 1e9 + result = model.results[0] + top1 = result.metrics.get('Top 1 Accuracy') + top5 = result.metrics.get('Top 5 Accuracy') + readme = Path(model.collection.filepath).parent.with_suffix('.md').name + page = f'[链接]({PAPERS_ROOT / readme})' + row = [name, params, flops, top1, top5, page] + dataset_rows[result.dataset].append(row) -* 论文数量: {len(set(titles))} -{countstr} + with open('modelzoo_statistics.md', 'a') as f: + for dataset, rows in dataset_rows.items(): + f.write(f'\n## {dataset}\n') + f.write("""```{table}\n:class: model-summary\n""") + header = [ + '模型', + '参数量 (M)', + 'Flops (G)', + 'Top-1 (%)', + 'Top-5 (%)', + 'Readme', + ] + table_cfg = dict( + tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(rows, header, **table_cfg)) + f.write('\n```\n') -* 模型权重文件数量: {num_ckpts} -{msglist} -""" -with open('modelzoo_statistics.md', 'w') as f: - f.write(modelzoo) +generate_summary_table(model_index.models) diff --git a/docs/zh_CN/user_guides/config.md b/docs/zh_CN/user_guides/config.md index 2189ebb0..04917d67 100644 --- a/docs/zh_CN/user_guides/config.md +++ b/docs/zh_CN/user_guides/config.md @@ -1,6 +1,6 @@ # 学习配置文件 -为了管理深度学习实验的各种设置,我们使用配置文件来记录所有这些配置。这种配置文件系统具有模块化和继承特性,更多细节可以在[MMEngine 中的教程](TODO)。 +为了管理深度学习实验的各种设置,我们使用配置文件来记录所有这些配置。这种配置文件系统具有模块化和继承特性,更多细节可以在{external+mmengine:doc}`MMEngine 中的教程 `。 MMClassification 主要使用 python 文件作为配置文件,所有配置文件都放置在 [`configs`](https://github.com/open-mmlab/mmclassification/tree/1.x/configs) 文件夹下,目录结构如下所示: @@ -172,11 +172,11 @@ test_evaluator = val_evaluator # 测试集的评估配置,这里直接与 v 训练策略原始配置文件主要包括预优化器设置和训练、验证及测试的循环控制器(LOOP): - `optim_wrapper`: 优化器装饰器配置信息,我们使用优化器装饰配置优化进程。 - - `optimizer`: 支持 `pytorch` 所有的优化器,参考相关 [MMEngine](TODO:) 文档。 - - `paramwise_cfg`: 根据参数的类型或名称设置不同的优化参数,参考相关 [学习策略文档](TODO:) 文档。 + - `optimizer`: 支持 `pytorch` 所有的优化器,参考相关 {external+mmengine:doc}`MMEngine ` 文档。 + - `paramwise_cfg`: 根据参数的类型或名称设置不同的优化参数,参考相关 [学习策略文档](../advanced_guides/schedule.md) 文档。 - `accumulative_counts`: 积累几个反向传播后再优化参数,你可以用它通过小批量来模拟大批量。 -- `param_scheduler` : 
学习率策略,你可以指定训练期间的学习率和动量曲线。有关详细信息,请参阅 MMEngine 中的 [文档](TODO:)。
-- `train_cfg | val_cfg | test_cfg`: 训练、验证以及测试的循环执行器配置,请参考相关的[MMEngine 文档](TODO:)。
+- `param_scheduler` : 学习率策略,你可以指定训练期间的学习率和动量曲线。有关详细信息,请参阅 MMEngine 中的 {external+mmengine:doc}`文档 `。
+- `train_cfg | val_cfg | test_cfg`: 训练、验证以及测试的循环执行器配置,请参考相关的{external+mmengine:doc}`MMEngine 文档 `。

 以下是 ResNet50 的训练策略配置['configs/_base_/schedules/imagenet_bs256.py'](https://github.com/open-mmlab/mmclassification/blob/1.x/configs/_base_/schedules/imagenet_bs256.py):

@@ -337,7 +337,7 @@ test_dataloader = dict(dataset=dict(pipeline=val_pipeline))

 ### 忽略基础配置文件里的部分内容

-有时,您需要设置 `_delete_=True` 去忽略基础配置文件里的一些域内容。 可以参照 [MMEngine](TODO:) 来获得一些简单的指导。
+有时,您需要设置 `_delete_=True` 去忽略基础配置文件里的一些域内容。可以查看 {external+mmengine:doc}`MMEngine 文档 ` 进一步了解该设计。

 以下是一个简单应用案例。如果在上述 ResNet50 案例中使用余弦调度,使用继承并直接修改会报 `get unexpected keyword 'step'` 错,因为基础配置文件 `param_scheduler` 域信息的 `'step'` 字段被保留下来了,需要加入 `_delete_=True` 去忽略基础配置文件里的 `param_scheduler` 相关域内容:

@@ -350,7 +350,7 @@ param_scheduler = dict(type='CosineAnnealingLR', by_epoch=True, _delete_=True)

 ### 引用基础配置文件里的变量

-有时,您可以引用 `_base_` 配置信息的一些域内容,这样可以避免重复定义。 可以参照 [MMEngine](TODO:) 来获得一些简单的指导。
+有时,您可以引用 `_base_` 配置信息的一些域内容,这样可以避免重复定义。可以查看 {external+mmengine:doc}`MMEngine 文档 ` 进一步了解该设计。

 以下是一个简单应用案例,在训练数据预处理流水线中使用 `auto augment` 数据增强,参考配置文件 [`configs/resnest/resnest50_32xb64_in1k.py`](https://github.com/open-mmlab/mmclassification/blob/1.x/configs/resnest/resnest50_32xb64_in1k.py)。在定义 `train_pipeline` 时,可以直接在 `_base_` 中加入定义 auto augment 数据增强的文件名,再通过 `{{_base_.auto_increasing_policies}}` 引用变量:

diff --git a/docs/zh_CN/user_guides/dataset_prepare.md b/docs/zh_CN/user_guides/dataset_prepare.md
index 6c9a789a..d65fc3c1 100644
--- a/docs/zh_CN/user_guides/dataset_prepare.md
+++ b/docs/zh_CN/user_guides/dataset_prepare.md
@@ -282,10 +282,10 @@ MMClassification 还支持更多其他的数据集,可以通过查阅[数

 ## 数据集包装

-MMEngine 中支持以下数据包装器,您可以参考 [MMEngine 教程](TODO:) 了解如何使用它。
+MMEngine 中支持以下数据包装器,您可以参考 {external+mmengine:doc}`MMEngine 教程 ` 了解如何使用它们。

-- [ConcatDataset](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/basedataset.md#concatdataset)
-- [RepeatDataset](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/basedataset.md#repeatdataset)
-- [ClassBalanced](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/basedataset.md#classbalanceddataset)
+- {external:py:class}`~mmengine.dataset.ConcatDataset`
+- {external:py:class}`~mmengine.dataset.RepeatDataset`
+- {external:py:class}`~mmengine.dataset.ClassBalancedDataset`

 除上述之外,MMClassification 还支持 [KFoldDataset](mmcls.datasets.KFoldDataset),需要通过 `tools/kfold-cross-valid.py` 来使用它。

diff --git a/docs/zh_CN/user_guides/finetune.md b/docs/zh_CN/user_guides/finetune.md
index 06c7d80c..acb28d00 100644
--- a/docs/zh_CN/user_guides/finetune.md
+++ b/docs/zh_CN/user_guides/finetune.md
@@ -4,7 +4,7 @@

 通常,已有的、在大数据集上训练好的模型会比随机初始化提供更为有效的先验信息,粗略来讲,在此基础上的学习我们称之为模型微调。
 已经证明,在 ImageNet 数据集上预先训练的分类模型对于其他数据集和其他下游任务有很好的效果。
-因此,该教程提供了如何将 [Model Zoo](../model_zoo.md) 中提供的预训练模型用于其他数据集,已获得更好的效果。
+因此,该教程提供了如何将 [Model Zoo](../modelzoo_statistics.md) 中提供的预训练模型用于其他数据集,以获得更好的效果。

 在新数据集上微调模型分为两步:

diff --git a/docs/zh_CN/user_guides/inference.md b/docs/zh_CN/user_guides/inference.md
index b79eaefb..b1f0afca 100644
--- a/docs/zh_CN/user_guides/inference.md
+++ b/docs/zh_CN/user_guides/inference.md
@@ -1,6 +1,6 @@
 # 使用现有模型推理

-MMClassification 在 [Model Zoo](../model_zoo.md) 中提供了用于分类的预训练模型。
+MMClassification 在 [Model Zoo](../modelzoo_statistics.md) 中提供了用于分类的预训练模型。
本说明将展示**如何使用现有模型对给定图像进行推理**。
 至于如何在标准数据集上测试现有模型,请看这个[指南](./train_test.md#测试)

diff --git a/docs/zh_CN/user_guides/visualization.md b/docs/zh_CN/user_guides/visualization.md
index 78756525..1610f82a 100644
--- a/docs/zh_CN/user_guides/visualization.md
+++ b/docs/zh_CN/user_guides/visualization.md
@@ -1,136 +1,141 @@
-# 可视化工具(待更新)
+# 可视化工具

-- [数据流水线可视化](#数据流水线可视化)
-- [学习率策略可视化](#学习率策略可视化)
+- [浏览数据集](#浏览数据集)
+- [优化器参数策略可视化](#优化器参数策略可视化)
 - [类别激活图可视化](#类别激活图可视化)
 - [常见问题](#常见问题)

-## 数据流水线可视化
+## 浏览数据集

 ```bash
-python tools/visualizations/vis_pipeline.py \
+python tools/visualizations/browse_dataset.py \
     ${CONFIG_FILE} \
-    [--output-dir ${OUTPUT_DIR}] \
-    [--phase ${DATASET_PHASE}] \
-    [--number ${BUNBER_IMAGES_DISPLAY}] \
-    [--skip-type ${SKIP_TRANSFORM_TYPE}] \
-    [--mode ${DISPLAY_MODE}] \
-    [--show] \
-    [--adaptive] \
-    [--min-edge-length ${MIN_EDGE_LENGTH}] \
-    [--max-edge-length ${MAX_EDGE_LENGTH}] \
-    [--bgr2rgb] \
-    [--window-size ${WINDOW_SIZE}] \
+    [-o, --output-dir ${OUTPUT_DIR}] \
+    [-p, --phase ${DATASET_PHASE}] \
+    [-n, --show-number ${NUMBER_IMAGES_DISPLAY}] \
+    [-i, --show-interval ${SHOW_INTERVAL}] \
+    [-m, --mode ${DISPLAY_MODE}] \
+    [-r, --rescale-factor ${RESCALE_FACTOR}] \
+    [-c, --channel-order ${CHANNEL_ORDER}] \
     [--cfg-options ${CFG_OPTIONS}]
 ```

 **所有参数的说明**:

 - `config` : 模型配置文件的路径。
-- `--output-dir`: 保存图片文件夹,如果没有指定,默认为 `''`,表示不保存。
-- `--phase`: 可视化数据集的阶段,只能为 `[train, val, test]` 之一,默认为 `train`。
-- `--number`: 可视化样本数量。如果没有指定,默认展示数据集的所有图片。
-- `--skip-type`: 预设跳过的数据流水线过程。如果没有指定,默认为 `['ToTensor', 'Normalize', 'ImageToTensor', 'Collect']`。
-- `--mode`: 可视化的模式,只能为 `[original, transformed, concat, pipeline]` 之一,如果没有指定,默认为 `concat`。
-- `--show`: 将可视化图片以弹窗形式展示。
-- `--adaptive`: 自动调节可视化图片的大小。
-- `--min-edge-length`: 最短边长度,当使用了 `--adaptive` 时有效。 当图片任意边小于 `${MIN_EDGE_LENGTH}` 时,会保持长宽比不变放大图片,短边对齐至 `${MIN_EDGE_LENGTH}`,默认为200。
-- `--max-edge-length`: 最长边长度,当使用了 `--adaptive` 时有效。 当图片任意边大于 `${MAX_EDGE_LENGTH}` 时,会保持长宽比不变缩小图片,短边对齐至 `${MAX_EDGE_LENGTH}`,默认为1000。
-- `--bgr2rgb`: 将图片的颜色通道翻转。
-- `--window-size`: 可视化窗口大小,如果没有指定,默认为 `12*7`。如果需要指定,按照格式 `'W*H'`。
-- `--cfg-options` : 对配置文件的修改,参考[教程 1:如何编写配置文件](https://mmclassification.readthedocs.io/zh_CN/latest/tutorials/config.html)。
+- `-o, --output-dir`: 保存图片文件夹,如果没有指定,默认为 `''`,表示不保存。
+- **`-p, --phase`**: 可视化数据集的阶段,只能为 `['train', 'val', 'test']` 之一,默认为 `'train'`。
+- **`-n, --show-number`**: 可视化样本数量。如果没有指定,默认展示数据集的所有图片。
+- `-i, --show-interval`: 浏览时,每张图片的停留间隔,单位为秒。
+- **`-m, --mode`**: 可视化的模式,只能为 `['original', 'transformed', 'concat', 'pipeline']` 之一,默认为 `'transformed'`。
+- **`-r, --rescale-factor`**: 对可视化图片的放缩倍数,在图片过大或过小时设置。
+- `-c, --channel-order`: 图片的通道顺序,为 `['BGR', 'RGB']` 之一,默认为 `'BGR'`。
+- `--cfg-options` : 对配置文件的修改,参考[学习配置文件](./config.md)。

 ```{note}
+1. `-m, --mode` 用于设置可视化的模式,默认设置为 'transformed'。
+- 如果 `--mode` 设置为 'original',则获取原始图片;
+- 如果 `--mode` 设置为 'transformed',则获取预处理后的图片;
+- 如果 `--mode` 设置为 'concat',则获取原始图片和预处理后图片拼接的图片;
+- 如果 `--mode` 设置为 'pipeline',则获取数据流水线所有中间过程图片。

-1. 如果不指定 `--mode`,默认设置为 `concat`,获取原始图片和预处理后图片拼接的图片;如果 `--mode` 设置为 `original`,则获取原始图片;如果 `--mode` 设置为 `transformed`,则获取预处理后的图片;如果 `--mode` 设置为 `pipeline`,则获得数据流水线所有中间过程图片。

-2. 当指定了 `--adaptive` 选项时,会自动的调整尺寸过大和过小的图片,你可以通过设定 `--min-edge-length` 与 `--max-edge-length` 来指定自动调整的图片尺寸。
+2. `-r, --rescale-factor` 在数据集中图片的分辨率过大或者过小时设置。比如在可视化 CIFAR 数据集时,由于图片的分辨率非常小,可将 `-r, --rescale-factor` 设置为 10。
 ```

 **示例**:

-1. **'original'** 模式,可视化 `CIFAR100` 验证集中的100张原始图片,显示并保存在 `./tmp` 文件夹下:
+1. 
**'original'** 模式 : ```shell -python ./tools/visualizations/vis_pipeline.py configs/resnet/resnet50_8xb16_cifar100.py --phase val --output-dir tmp --mode original --number 100 --show --adaptive --bgr2rgb +python ./tools/visualizations/browse_dataset.py ./configs/resnet/resnet101_8xb16_cifar10.py --phase val --output-dir tmp --mode original --show-number 100 --rescale-factor 10 --channel-order RGB ``` -
+- `--phase val`: 可视化验证集, 可简化为 `-p val`; +- `--output-dir tmp`: 可视化结果保存在 "tmp" 文件夹, 可简化为 `-o tmp`; +- `--mode original`: 可视化原图, 可简化为 `-m original`; +- `--show-number 100`: 可视化100张图,可简化为 `-n 100`; +- `--rescale-factor`: 图像放大10倍,可简化为 `-r 10`; +- `--channel-order RGB`: 可视化图像的通道顺序为 "RGB", 可简化为 `-c RGB`。 -2. **'transformed'** 模式,可视化 `ImageNet` 训练集的所有经过预处理的图片,并以弹窗形式显示: +
+ +2. **'transformed'** 模式 : ```shell -python ./tools/visualizations/vis_pipeline.py ./configs/resnet/resnet50_8xb32_in1k.py --show --mode transformed +python ./tools/visualizations/browse_dataset.py ./configs/resnet/resnet50_8xb32_in1k.py -n 100 -r 2 ``` -
+
-3. **'concat'** 模式,可视化 `ImageNet` 训练集的10张原始图片与预处理后图片对比图,保存在 `./tmp` 文件夹下: +3. **'concat'** 模式 : ```shell -python ./tools/visualizations/vis_pipeline.py configs/swin_transformer/swin_base_224_b16x64_300e_imagenet.py --phase train --output-dir tmp --number 10 --adaptive +python ./tools/visualizations/browse_dataset.py configs/swin_transformer/swin-small_16xb64_in1k.py -n 10 -m concat ``` -
+
-4. **'pipeline'** 模式,可视化 `ImageNet` 训练集经过数据流水线的过程图像: +4. **'pipeline'** 模式 : ```shell -python ./tools/visualizations/vis_pipeline.py configs/swin_transformer/swin_base_224_b16x64_300e_imagenet.py --phase train --adaptive --mode pipeline --show +python ./tools/visualizations/browse_dataset.py configs/swin_transformer/swin-small_16xb64_in1k.py -m pipeline ``` -
+
-## 学习率策略可视化
+## 优化器参数策略可视化

 ```bash
-python tools/visualizations/vis_lr.py \
+python tools/visualizations/vis_scheduler.py \
     ${CONFIG_FILE} \
-    [--dataset-size ${Dataset_Size}] \
-    [--ngpus ${NUM_GPUs}] \
-    [--save-path ${SAVE_PATH}] \
+    [-p, --parameter ${PARAMETER_NAME}] \
+    [-d, --dataset-size ${DATASET_SIZE}] \
+    [-n, --ngpus ${NUM_GPUs}] \
+    [-s, --save-path ${SAVE_PATH}] \
     [--title ${TITLE}] \
     [--style ${STYLE}] \
     [--window-size ${WINDOW_SIZE}] \
-    [--cfg-options ${CFG_OPTIONS}] \
+    [--cfg-options]
 ```

 **所有参数的说明**:

 - `config` : 模型配置文件的路径。
-- `--dataset-size` : 数据集的大小。如果指定,`build_dataset` 将被跳过并使用这个大小作为数据集大小,默认使用 `build_dataset` 所得数据集的大小。
-- `--ngpus` : 使用 GPU 的数量。
-- `--save-path` : 保存的可视化图片的路径,默认不保存。
-- `--title` : 可视化图片的标题,默认为配置文件名。
-- `--style` : 可视化图片的风格,默认为 `whitegrid`。
+- **`-p, --parameter`**: 可视化参数名,只能为 `["lr", "momentum"]` 之一,默认为 `"lr"`。
+- **`-d, --dataset-size`**: 数据集的大小。如果指定,`build_dataset` 将被跳过并使用这个大小作为数据集大小,默认使用 `build_dataset` 所得数据集的大小。
+- **`-n, --ngpus`**: 使用 GPU 的数量,默认为 1。
+- **`-s, --save-path`**: 保存的可视化图片的路径,默认不保存。
+- `--title`: 可视化图片的标题,默认为配置文件名。
+- `--style`: 可视化图片的风格,默认为 `whitegrid`。
 - `--window-size`: 可视化窗口大小,如果没有指定,默认为 `12*7`。如果需要指定,按照格式 `'W*H'`。
-- `--cfg-options` : 对配置文件的修改,参考[教程 1:如何编写配置文件](https://mmclassification.readthedocs.io/zh_CN/latest/tutorials/config.html)。
+- `--cfg-options`: 对配置文件的修改,参考[学习配置文件](./config.md)。

 ```{note}
-部分数据集在解析标注阶段比较耗时,可直接将 `dataset-size` 指定数据集的大小,以节约时间。
+部分数据集在解析标注阶段比较耗时,可直接用 `-d, --dataset-size` 指定数据集的大小,以节约时间。
 ```

 **示例**:

 ```bash
-python tools/visualizations/vis_lr.py configs/resnet/resnet50_b16x8_cifar100.py
+python tools/visualizations/vis_scheduler.py configs/resnet/resnet50_b16x8_cifar100.py
 ```

-
+
当数据集为 ImageNet 时,通过直接指定数据集大小来节约时间,并保存图片: ```bash -python tools/visualizations/vis_lr.py configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py --dataset-size 1281167 --ngpus 4 --save-path ./repvgg-B3g4_4xb64-lr.jpg +python tools/visualizations/vis_scheduler.py configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py --dataset-size 1281167 --ngpus 4 --save-path ./repvgg-B3g4_4xb64-lr.jpg ``` -
+
## 类别激活图可视化 @@ -182,7 +187,7 @@ python tools/visualizations/vis_cam.py \ - `--num-extra-tokens`: `ViT` 类网络的额外的 tokens 通道数,默认使用主干网络的 `num_extra_tokens`。 - `--aug-smooth`:是否使用测试时增强 - `--device`:使用的计算设备,如果不设置,默认为'cpu'。 -- `--cfg-options`:对配置文件的修改,参考[教程 1:如何编写配置文件](https://mmclassification.readthedocs.io/zh_CN/latest/tutorials/config.html)。 +- `--cfg-options`:对配置文件的修改,参考[学习配置文件](./config.md)。 ```{note} 在指定 `--target-layers` 时,如果不知道模型有哪些网络层,可使用命令行添加 `--preview-model` 查看所有网络层名称; @@ -237,7 +242,7 @@ python tools/visualizations/vis_cam.py \ ```shell python tools/visualizations/vis_cam.py \ demo/dog.jpg \ - configs/mobilenet_v3/mobilenet-v3-large_8xb32_in1k.py \ + configs/mobilenet_v3/mobilenet-v3-large_8xb128_in1k.py \ https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth \ --target-layers 'backbone.layer16' \ --method LayerCAM \ diff --git a/mmcls/apis/inference.py b/mmcls/apis/inference.py index 970b9eab..c2a29429 100644 --- a/mmcls/apis/inference.py +++ b/mmcls/apis/inference.py @@ -40,11 +40,11 @@ def init_model(config, checkpoint=None, device='cuda:0', options=None): if 'CLASSES' in checkpoint.get('meta', {}): model.CLASSES = checkpoint['meta']['CLASSES'] else: - from mmcls.datasets import ImageNet + from mmcls.datasets.categories import IMAGENET_CATEGORIES warnings.simplefilter('once') warnings.warn('Class names are not saved in the checkpoint\'s ' 'meta data, use imagenet by default.') - model.CLASSES = ImageNet.CLASSES + model.CLASSES = IMAGENET_CATEGORIES model.cfg = config # save the config in the model for convenience model.to(device) model.eval() diff --git a/mmcls/datasets/transforms/processing.py b/mmcls/datasets/transforms/processing.py index dc1db182..20b7b0b4 100644 --- a/mmcls/datasets/transforms/processing.py +++ b/mmcls/datasets/transforms/processing.py @@ -504,7 +504,8 @@ class RandomErasing(BaseTransform): 'aspect_range should be positive.' assert aspect_range[0] <= aspect_range[1], \ 'In aspect_range (min, max), min should be smaller than max.' - assert mode in ['const', 'rand'] + assert mode in ['const', 'rand'], \ + 'Please select `mode` from ["const", "rand"].' 
if isinstance(fill_color, Number): fill_color = [fill_color] * 3 assert isinstance(fill_color, Sequence) and len(fill_color) == 3 \ diff --git a/mmcls/models/backbones/__init__.py b/mmcls/models/backbones/__init__.py index ebd04e59..8ed11e85 100644 --- a/mmcls/models/backbones/__init__.py +++ b/mmcls/models/backbones/__init__.py @@ -6,6 +6,8 @@ from .convnext import ConvNeXt from .cspnet import CSPDarkNet, CSPNet, CSPResNet, CSPResNeXt from .deit import DistilledVisionTransformer from .densenet import DenseNet +from .edgenext import EdgeNeXt +from .efficientformer import EfficientFormer from .efficientnet import EfficientNet from .hrnet import HRNet from .inception_v3 import InceptionV3 @@ -13,6 +15,8 @@ from .lenet import LeNet5 from .mlp_mixer import MlpMixer from .mobilenet_v2 import MobileNetV2 from .mobilenet_v3 import MobileNetV3 +from .mobileone import MobileOne +from .mvit import MViT from .poolformer import PoolFormer from .regnet import RegNet from .repmlp import RepMLPNet @@ -27,6 +31,7 @@ from .seresnext import SEResNeXt from .shufflenet_v1 import ShuffleNetV1 from .shufflenet_v2 import ShuffleNetV2 from .swin_transformer import SwinTransformer +from .swin_transformer_v2 import SwinTransformerV2 from .t2t_vit import T2T_ViT from .timm_backbone import TIMMBackbone from .tnt import TNT @@ -36,12 +41,50 @@ from .vgg import VGG from .vision_transformer import VisionTransformer __all__ = [ - 'LeNet5', 'AlexNet', 'VGG', 'RegNet', 'ResNet', 'ResNeXt', 'ResNetV1d', - 'ResNeSt', 'ResNet_CIFAR', 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', - 'ShuffleNetV2', 'MobileNetV2', 'MobileNetV3', 'VisionTransformer', - 'SwinTransformer', 'TNT', 'TIMMBackbone', 'T2T_ViT', 'Res2Net', 'RepVGG', - 'Conformer', 'MlpMixer', 'DistilledVisionTransformer', 'PCPVT', 'SVT', - 'EfficientNet', 'ConvNeXt', 'HRNet', 'ResNetV1c', 'ConvMixer', - 'CSPDarkNet', 'CSPResNet', 'CSPResNeXt', 'CSPNet', 'RepMLPNet', - 'PoolFormer', 'DenseNet', 'VAN', 'InceptionV3' + 'LeNet5', + 'AlexNet', + 'VGG', + 'RegNet', + 'ResNet', + 'ResNeXt', + 'ResNetV1d', + 'ResNeSt', + 'ResNet_CIFAR', + 'SEResNet', + 'SEResNeXt', + 'ShuffleNetV1', + 'ShuffleNetV2', + 'MobileNetV2', + 'MobileNetV3', + 'VisionTransformer', + 'SwinTransformer', + 'TNT', + 'TIMMBackbone', + 'T2T_ViT', + 'Res2Net', + 'RepVGG', + 'Conformer', + 'MlpMixer', + 'DistilledVisionTransformer', + 'PCPVT', + 'SVT', + 'EfficientNet', + 'ConvNeXt', + 'HRNet', + 'ResNetV1c', + 'ConvMixer', + 'EdgeNeXt', + 'CSPDarkNet', + 'CSPResNet', + 'CSPResNeXt', + 'CSPNet', + 'RepMLPNet', + 'PoolFormer', + 'DenseNet', + 'VAN', + 'InceptionV3', + 'MobileOne', + 'EfficientFormer', + 'SwinTransformerV2', + 'MViT', ] diff --git a/mmcls/models/backbones/convnext.py b/mmcls/models/backbones/convnext.py index 76ba1eb8..81de5ef2 100644 --- a/mmcls/models/backbones/convnext.py +++ b/mmcls/models/backbones/convnext.py @@ -43,6 +43,8 @@ class ConvNeXtBlock(BaseModule): Args: in_channels (int): The number of input channels. + dw_conv_cfg (dict): Config of depthwise convolution. + Defaults to ``dict(kernel_size=7, padding=3)``. norm_cfg (dict): The config dict for norm layers. Defaults to ``dict(type='LN2d', eps=1e-6)``. 
        act_cfg (dict): The config dict for activation between pointwise
@@ -70,6 +72,7 @@ class ConvNeXtBlock(BaseModule):

     def __init__(self,
                  in_channels,
+                 dw_conv_cfg=dict(kernel_size=7, padding=3),
                  norm_cfg=dict(type='LN2d', eps=1e-6),
                  act_cfg=dict(type='GELU'),
                  mlp_ratio=4.,
@@ -78,11 +81,7 @@ class ConvNeXtBlock(BaseModule):
                  layer_scale_init_value=1e-6):
         super().__init__()
         self.depthwise_conv = nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=7,
-            padding=3,
-            groups=in_channels)
+            in_channels, in_channels, groups=in_channels, **dw_conv_cfg)

         self.linear_pw_conv = linear_pw_conv
         self.norm = build_norm_layer(norm_cfg, in_channels)[1]
diff --git a/mmcls/models/backbones/edgenext.py b/mmcls/models/backbones/edgenext.py
new file mode 100644
index 00000000..06fc56ce
--- /dev/null
+++ b/mmcls/models/backbones/edgenext.py
@@ -0,0 +1,397 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from itertools import chain
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer
+from mmengine.model import BaseModule, ModuleList, Sequential
+from mmengine.registry import MODELS
+
+from ..utils import ChannelMultiheadAttention, PositionEncodingFourier
+from .base_backbone import BaseBackbone
+from .convnext import ConvNeXtBlock
+
+
+class SDTAEncoder(BaseModule):
+    """A PyTorch implementation of split depth-wise transpose attention (SDTA)
+    encoder.
+
+    Inspiration from https://github.com/mmaaz60/EdgeNeXt
+
+    Args:
+        in_channel (int): Number of input channels.
+        drop_path_rate (float): Stochastic depth dropout rate.
+            Defaults to 0.
+        layer_scale_init_value (float): Initial value of layer scale.
+            Defaults to 1e-6.
+        mlp_ratio (int): Number of channels ratio in the MLP.
+            Defaults to 4.
+        use_pos_emb (bool): Whether to use position encoding.
+            Defaults to True.
+        num_heads (int): Number of heads in the multihead attention.
+            Defaults to 8.
+        qkv_bias (bool): Whether to use bias in the multihead attention.
+            Defaults to True.
+        attn_drop (float): Dropout rate of the attention.
+            Defaults to 0.
+        proj_drop (float): Dropout rate of the projection.
+            Defaults to 0.
+        norm_cfg (dict): Dictionary to construct normalization layer.
+            Defaults to ``dict(type='LN')``.
+        act_cfg (dict): Dictionary to construct activation layer.
+            Defaults to ``dict(type='GELU')``.
+        scales (int): Number of scales. Defaults to 1.
+        init_cfg (dict, optional): Config for initialization.
+            Defaults to None.
+ """ + + def __init__(self, + in_channel, + drop_path_rate=0., + layer_scale_init_value=1e-6, + mlp_ratio=4, + use_pos_emb=True, + num_heads=8, + qkv_bias=True, + attn_drop=0., + proj_drop=0., + norm_cfg=dict(type='LN'), + act_cfg=dict(type='GELU'), + scales=1, + init_cfg=None): + super(SDTAEncoder, self).__init__(init_cfg=init_cfg) + conv_channels = max( + int(math.ceil(in_channel / scales)), + int(math.floor(in_channel // scales))) + self.conv_channels = conv_channels + self.num_convs = scales if scales == 1 else scales - 1 + + self.conv_modules = ModuleList() + for i in range(self.num_convs): + self.conv_modules.append( + nn.Conv2d( + conv_channels, + conv_channels, + kernel_size=3, + padding=1, + groups=conv_channels)) + + self.pos_embed = PositionEncodingFourier( + embed_dims=in_channel) if use_pos_emb else None + + self.norm_csa = build_norm_layer(norm_cfg, in_channel)[1] + self.gamma_csa = nn.Parameter( + layer_scale_init_value * torch.ones(in_channel), + requires_grad=True) if layer_scale_init_value > 0 else None + self.csa = ChannelMultiheadAttention( + embed_dims=in_channel, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=proj_drop) + + self.norm = build_norm_layer(norm_cfg, in_channel)[1] + self.pointwise_conv1 = nn.Linear(in_channel, mlp_ratio * in_channel) + self.act = build_activation_layer(act_cfg) + self.pointwise_conv2 = nn.Linear(mlp_ratio * in_channel, in_channel) + self.gamma = nn.Parameter( + layer_scale_init_value * torch.ones(in_channel), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0. else nn.Identity() + + def forward(self, x): + shortcut = x + spx = torch.split(x, self.conv_channels, dim=1) + for i in range(self.num_convs): + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.conv_modules[i](sp) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + + x = torch.cat((out, spx[self.num_convs]), 1) + + # Channel Self-attention + B, C, H, W = x.shape + x = x.reshape(B, C, H * W).permute(0, 2, 1) + if self.pos_embed: + pos_encoding = self.pos_embed((B, H, W)) + pos_encoding = pos_encoding.reshape(B, -1, + x.shape[1]).permute(0, 2, 1) + x += pos_encoding + + x = x + self.drop_path(self.gamma_csa * self.csa(self.norm_csa(x))) + x = x.reshape(B, H, W, C) + + # Inverted Bottleneck + x = self.norm(x) + x = self.pointwise_conv1(x) + x = self.act(x) + x = self.pointwise_conv2(x) + + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (B, H, W, C) -> (B, C, H, W) + + x = shortcut + self.drop_path(x) + + return x + + +@MODELS.register_module() +class EdgeNeXt(BaseBackbone): + """EdgeNeXt. + + A PyTorch implementation of: `EdgeNeXt: Efficiently Amalgamated + CNN-Transformer Architecture for Mobile Vision Applications + `_ + + Inspiration from + https://github.com/mmaaz60/EdgeNeXt + + Args: + arch (str | dict): The model's architecture. If string, it should be + one of architectures in ``EdgeNeXt.arch_settings``. + And if dict, it should include the following keys: + + - channels (list[int]): The number of channels at each stage. + - depths (list[int]): The number of blocks at each stage. + - num_heads (list[int]): The number of heads at each stage. + + Defaults to 'xxsmall'. + in_channels (int): The number of input channels. + Defaults to 3. + global_blocks (list[int]): The number of global blocks. + Defaults to [0, 1, 1, 1]. + global_block_type (list[str]): The type of global blocks. 
+            Defaults to ['None', 'SDTA', 'SDTA', 'SDTA'].
+        drop_path_rate (float): Stochastic depth dropout rate.
+            Defaults to 0.
+        layer_scale_init_value (float): Initial value of layer scale.
+            Defaults to 1e-6.
+        linear_pw_conv (bool): Whether to use linear layer to do pointwise
+            convolution. Defaults to True.
+        mlp_ratio (int): The number of channel ratio in MLP layers.
+            Defaults to 4.
+        conv_kernel_sizes (list[int]): The kernel size of convolutional layers
+            at each stage. Defaults to [3, 5, 7, 9].
+        use_pos_embd_csa (list[bool]): Whether to use positional embedding in
+            Channel Self-Attention. Defaults to [False, True, False, False].
+        use_pos_embd_global (bool): Whether to use positional embedding for
+            the whole network. Defaults to False.
+        d2_scales (list[int]): The number of channel groups used for SDTA at
+            each stage. Defaults to [2, 2, 3, 4].
+        norm_cfg (dict): The config of normalization layer.
+            Defaults to ``dict(type='LN2d', eps=1e-6)``.
+        out_indices (Sequence | int): Output from which stages.
+            Defaults to -1, which means the last stage.
+        frozen_stages (int): Stages to be frozen (all parameters fixed).
+            Defaults to 0, which means not freezing any parameters.
+        gap_before_final_norm (bool): Whether to globally average the feature
+            map before the final norm layer. Defaults to True.
+        act_cfg (dict): The config of activation layer.
+            Defaults to ``dict(type='GELU')``.
+        init_cfg (dict, optional): Config for initialization.
+            Defaults to None.
+    """
+    arch_settings = {
+        'xxsmall': {  # parameters: 1.3M
+            'channels': [24, 48, 88, 168],
+            'depths': [2, 2, 6, 2],
+            'num_heads': [4, 4, 4, 4]
+        },
+        'xsmall': {  # parameters: 2.3M
+            'channels': [32, 64, 100, 192],
+            'depths': [3, 3, 9, 3],
+            'num_heads': [4, 4, 4, 4]
+        },
+        'small': {  # parameters: 5.6M
+            'channels': [48, 96, 160, 304],
+            'depths': [3, 3, 9, 3],
+            'num_heads': [8, 8, 8, 8]
+        },
+        'base': {  # parameters: 18.51M
+            'channels': [80, 160, 288, 584],
+            'depths': [3, 3, 9, 3],
+            'num_heads': [8, 8, 8, 8]
+        },
+    }
+
+    def __init__(self,
+                 arch='xxsmall',
+                 in_channels=3,
+                 global_blocks=[0, 1, 1, 1],
+                 global_block_type=['None', 'SDTA', 'SDTA', 'SDTA'],
+                 drop_path_rate=0.,
+                 layer_scale_init_value=1e-6,
+                 linear_pw_conv=True,
+                 mlp_ratio=4,
+                 conv_kernel_sizes=[3, 5, 7, 9],
+                 use_pos_embd_csa=[False, True, False, False],
+                 use_pos_embd_global=False,
+                 d2_scales=[2, 2, 3, 4],
+                 norm_cfg=dict(type='LN2d', eps=1e-6),
+                 out_indices=-1,
+                 frozen_stages=0,
+                 gap_before_final_norm=True,
+                 act_cfg=dict(type='GELU'),
+                 init_cfg=None):
+        super(EdgeNeXt, self).__init__(init_cfg=init_cfg)
+
+        if isinstance(arch, str):
+            arch = arch.lower()
+            assert arch in self.arch_settings, \
+                f'Arch {arch} is not in default archs ' \
+                f'{set(self.arch_settings)}'
+            self.arch_settings = self.arch_settings[arch]
+        elif isinstance(arch, dict):
+            essential_keys = {'channels', 'depths', 'num_heads'}
+            assert isinstance(arch, dict) and set(arch) == essential_keys, \
+                f'Custom arch needs a dict with keys {essential_keys}'
+            self.arch_settings = arch
+
+        self.channels = self.arch_settings['channels']
+        self.depths = self.arch_settings['depths']
+        self.num_heads = self.arch_settings['num_heads']
+        self.num_layers = len(self.depths)
+        self.use_pos_embd_global = use_pos_embd_global
+
+        for g in global_block_type:
+            assert g in ['None',
+                         'SDTA'], f'Global block type {g} is not supported'
+
+        self.num_stages = len(self.depths)
+
+        if isinstance(out_indices, int):
+            out_indices = [out_indices]
+        assert isinstance(out_indices, Sequence), \
+            f'"out_indices" must be a 
sequence or int, ' \ + f'get {type(out_indices)} instead.' + for i, index in enumerate(out_indices): + if index < 0: + out_indices[i] = 4 + index + assert out_indices[i] >= 0, f'Invalid out_indices {index}' + self.out_indices = out_indices + + self.frozen_stages = frozen_stages + self.gap_before_final_norm = gap_before_final_norm + + if self.use_pos_embd_global: + self.pos_embed = PositionEncodingFourier( + embed_dims=self.channels[0]) + else: + self.pos_embed = None + + # stochastic depth decay rule + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(self.depths)) + ] + + self.downsample_layers = ModuleList() + stem = nn.Sequential( + nn.Conv2d(in_channels, self.channels[0], kernel_size=4, stride=4), + build_norm_layer(norm_cfg, self.channels[0])[1], + ) + self.downsample_layers.append(stem) + + self.stages = ModuleList() + block_idx = 0 + for i in range(self.num_stages): + depth = self.depths[i] + channels = self.channels[i] + + if i >= 1: + downsample_layer = nn.Sequential( + build_norm_layer(norm_cfg, self.channels[i - 1])[1], + nn.Conv2d( + self.channels[i - 1], + channels, + kernel_size=2, + stride=2, + )) + self.downsample_layers.append(downsample_layer) + + stage_blocks = [] + for j in range(depth): + if j > depth - global_blocks[i] - 1: + stage_blocks.append( + SDTAEncoder( + in_channel=channels, + drop_path_rate=dpr[block_idx + j], + mlp_ratio=mlp_ratio, + scales=d2_scales[i], + use_pos_emb=use_pos_embd_csa[i], + num_heads=self.num_heads[i], + )) + else: + dw_conv_cfg = dict( + kernel_size=conv_kernel_sizes[i], + padding=conv_kernel_sizes[i] // 2, + ) + stage_blocks.append( + ConvNeXtBlock( + in_channels=channels, + dw_conv_cfg=dw_conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + linear_pw_conv=linear_pw_conv, + drop_path_rate=dpr[block_idx + j], + layer_scale_init_value=layer_scale_init_value, + )) + block_idx += depth + + stage_blocks = Sequential(*stage_blocks) + self.stages.append(stage_blocks) + + if i in self.out_indices: + out_norm_cfg = dict(type='LN') if self.gap_before_final_norm \ + else norm_cfg + norm_layer = build_norm_layer(out_norm_cfg, channels)[1] + self.add_module(f'norm{i}', norm_layer) + + def init_weights(self) -> None: + # TODO: need to be implemented in the future + return super().init_weights() + + def forward(self, x): + outs = [] + for i, stage in enumerate(self.stages): + x = self.downsample_layers[i](x) + x = stage(x) + if self.pos_embed and i == 0: + B, _, H, W = x.shape + x += self.pos_embed((B, H, W)) + + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + if self.gap_before_final_norm: + gap = x.mean([-2, -1], keepdim=True) + outs.append(norm_layer(gap.flatten(1))) + else: + # The output of LayerNorm2d may be discontiguous, which + # may cause some problem in the downstream tasks + outs.append(norm_layer(x).contiguous()) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + downsample_layer = self.downsample_layers[i] + stage = self.stages[i] + downsample_layer.eval() + stage.eval() + for param in chain(downsample_layer.parameters(), + stage.parameters()): + param.requires_grad = False + + def train(self, mode=True): + super(EdgeNeXt, self).train(mode) + self._freeze_stages() diff --git a/mmcls/models/backbones/efficientformer.py b/mmcls/models/backbones/efficientformer.py new file mode 100644 index 00000000..79757106 --- /dev/null +++ b/mmcls/models/backbones/efficientformer.py @@ -0,0 +1,606 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/mmcls/models/backbones/efficientformer.py b/mmcls/models/backbones/efficientformer.py
new file mode 100644
index 00000000..79757106
--- /dev/null
+++ b/mmcls/models/backbones/efficientformer.py
@@ -0,0 +1,606 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+from typing import Optional, Sequence
+
+import torch
+import torch.nn as nn
+from mmcv.cnn.bricks import (ConvModule, DropPath, build_activation_layer,
+                             build_norm_layer)
+from mmengine.model import BaseModule, ModuleList, Sequential
+
+from mmcls.registry import MODELS
+from ..utils import LayerScale
+from .base_backbone import BaseBackbone
+from .poolformer import Pooling
+
+
+class AttentionWithBias(BaseModule):
+    """Multi-head Attention Module with attention_bias.
+
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads. Defaults to 8.
+        key_dim (int): The dimension of q, k. Defaults to 32.
+        attn_ratio (float): The dimension of v equals to
+            ``key_dim * attn_ratio``. Defaults to 4.
+        resolution (int): The height and width of attention_bias.
+            Defaults to 7.
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads=8,
+                 key_dim=32,
+                 attn_ratio=4.,
+                 resolution=7,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        self.num_heads = num_heads
+        self.scale = key_dim**-0.5
+        self.attn_ratio = attn_ratio
+        self.key_dim = key_dim
+        self.nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * num_heads
+        h = self.dh + self.nh_kd * 2
+        self.qkv = nn.Linear(embed_dims, h)
+        self.proj = nn.Linear(self.dh, embed_dims)
+
+        points = list(itertools.product(range(resolution), range(resolution)))
+        N = len(points)
+        attention_offsets = {}
+        idxs = []
+        for p1 in points:
+            for p2 in points:
+                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+        self.attention_biases = nn.Parameter(
+            torch.zeros(num_heads, len(attention_offsets)))
+        self.register_buffer('attention_bias_idxs',
+                             torch.LongTensor(idxs).view(N, N))
+
+    @torch.no_grad()
+    def train(self, mode=True):
+        """Switch the mode of the model."""
+        super().train(mode)
+        if mode and hasattr(self, 'ab'):
+            del self.ab
+        else:
+            self.ab = self.attention_biases[:, self.attention_bias_idxs]
+
+    def forward(self, x):
+        """Forward function.
+
+        Args:
+            x (tensor): input features with shape of (B, N, C)
+        """
+        B, N, _ = x.shape
+        qkv = self.qkv(x)
+        qkv = qkv.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
+        q, k, v = qkv.split([self.key_dim, self.key_dim, self.d], dim=-1)
+
+        attn = ((q @ k.transpose(-2, -1)) * self.scale +
+                (self.attention_biases[:, self.attention_bias_idxs]
+                 if self.training else self.ab))
+        attn = attn.softmax(dim=-1)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
+        x = self.proj(x)
+        return x
+
+
+class Flat(nn.Module):
+    """Flatten the input from (B, C, H, W) to (B, H*W, C)."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor):
+        x = x.flatten(2).transpose(1, 2)
+        return x
+
+
+class LinearMlp(BaseModule):
+    """Mlp implemented with linear layers.
+
+    The shape of input and output tensor are (B, N, C).
+
+    Args:
+        in_features (int): Dimension of input features.
+        hidden_features (int): Dimension of hidden features.
+        out_features (int): Dimension of output features.
+        act_cfg (dict): The config dict for the activation layer.
+            Defaults to ``dict(type='GELU')``.
+        drop (float): Dropout rate. Defaults to 0.0.
+        init_cfg (dict, optional): The config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_features: int,
+                 hidden_features: Optional[int] = None,
+                 out_features: Optional[int] = None,
+                 act_cfg=dict(type='GELU'),
+                 drop=0.,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = build_activation_layer(act_cfg)
+        self.drop1 = nn.Dropout(drop)
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop2 = nn.Dropout(drop)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): input tensor with shape (B, N, C).
+
+        Returns:
+            torch.Tensor: output tensor with shape (B, N, C).
+        """
+        x = self.drop1(self.act(self.fc1(x)))
+        x = self.drop2(self.fc2(x))
+        return x
+
+
+class ConvMlp(BaseModule):
+    """Mlp implemented with 1*1 convolutions.
+
+    Args:
+        in_features (int): Dimension of input features.
+        hidden_features (int): Dimension of hidden features.
+        out_features (int): Dimension of output features.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to ``dict(type='BN')``.
+        act_cfg (dict): The config dict for activation between pointwise
+            convolution. Defaults to ``dict(type='GELU')``.
+        drop (float): Dropout rate. Defaults to 0.0.
+        init_cfg (dict, optional): The config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='GELU'),
+                 drop=0.,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
+        self.act = build_activation_layer(act_cfg)
+        self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
+        self.norm1 = build_norm_layer(norm_cfg, hidden_features)[1]
+        self.norm2 = build_norm_layer(norm_cfg, out_features)[1]
+
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): input tensor with shape (B, C, H, W).
+
+        Returns:
+            torch.Tensor: output tensor with shape (B, C, H, W).
+        """
+
+        x = self.act(self.norm1(self.fc1(x)))
+        x = self.drop(x)
+        x = self.norm2(self.fc2(x))
+        x = self.drop(x)
+        return x
+
+
+class Meta3D(BaseModule):
+    """MetaFormer block using 3-dimensional inputs, i.e. ``torch.Tensor`` with
+    shape (B, N, C)."""
+
+    def __init__(self,
+                 dim,
+                 mlp_ratio=4.,
+                 norm_cfg=dict(type='LN'),
+                 act_cfg=dict(type='GELU'),
+                 drop=0.,
+                 drop_path=0.,
+                 use_layer_scale=True,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        self.norm1 = build_norm_layer(norm_cfg, dim)[1]
+        self.token_mixer = AttentionWithBias(dim)
+        self.norm2 = build_norm_layer(norm_cfg, dim)[1]
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = LinearMlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_cfg=act_cfg,
+            drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. \
+            else nn.Identity()
+        if use_layer_scale:
+            self.ls1 = LayerScale(dim)
+            self.ls2 = LayerScale(dim)
+        else:
+            self.ls1, self.ls2 = nn.Identity(), nn.Identity()
+
+    def forward(self, x):
+        x = x + self.drop_path(self.ls1(self.token_mixer(self.norm1(x))))
+        x = x + self.drop_path(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+class Meta4D(BaseModule):
+    """MetaFormer block using 4-dimensional inputs, i.e. ``torch.Tensor`` with
+    shape (B, C, H, W)."""
+
+    def __init__(self,
+                 dim,
+                 pool_size=3,
+                 mlp_ratio=4.,
+                 act_cfg=dict(type='GELU'),
+                 drop=0.,
+                 drop_path=0.,
+                 use_layer_scale=True,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        self.token_mixer = Pooling(pool_size=pool_size)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = ConvMlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_cfg=act_cfg,
+            drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. \
+            else nn.Identity()
+        if use_layer_scale:
+            self.ls1 = LayerScale(dim, data_format='channels_first')
+            self.ls2 = LayerScale(dim, data_format='channels_first')
+        else:
+            self.ls1, self.ls2 = nn.Identity(), nn.Identity()
+
+    def forward(self, x):
+        x = x + self.drop_path(self.ls1(self.token_mixer(x)))
+        x = x + self.drop_path(self.ls2(self.mlp(x)))
+        return x
+
+
+def basic_blocks(in_channels,
+                 out_channels,
+                 index,
+                 layers,
+                 pool_size=3,
+                 mlp_ratio=4.,
+                 act_cfg=dict(type='GELU'),
+                 drop_rate=.0,
+                 drop_path_rate=0.,
+                 use_layer_scale=True,
+                 vit_num=1,
+                 has_downsampler=False):
+    """Generate EfficientFormer blocks for a stage."""
+    blocks = []
+    if has_downsampler:
+        blocks.append(
+            ConvModule(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=True,
+                norm_cfg=dict(type='BN'),
+                act_cfg=None))
+    if index == 3 and vit_num == layers[index]:
+        blocks.append(Flat())
+    for block_idx in range(layers[index]):
+        block_dpr = drop_path_rate * (block_idx + sum(layers[:index])) / (
+            sum(layers) - 1)
+        if index == 3 and layers[index] - block_idx <= vit_num:
+            blocks.append(
+                Meta3D(
+                    out_channels,
+                    mlp_ratio=mlp_ratio,
+                    act_cfg=act_cfg,
+                    drop=drop_rate,
+                    drop_path=block_dpr,
+                    use_layer_scale=use_layer_scale,
+                ))
+        else:
+            blocks.append(
+                Meta4D(
+                    out_channels,
+                    pool_size=pool_size,
+                    act_cfg=act_cfg,
+                    drop=drop_rate,
+                    drop_path=block_dpr,
+                    use_layer_scale=use_layer_scale))
+            if index == 3 and layers[index] - block_idx - 1 == vit_num:
+                blocks.append(Flat())
+    blocks = nn.Sequential(*blocks)
+    return blocks
+
+
+@MODELS.register_module()
+class EfficientFormer(BaseBackbone):
+    """EfficientFormer.
+
+    A PyTorch implementation of EfficientFormer introduced by:
+    `EfficientFormer: Vision Transformers at MobileNet Speed
+    <https://arxiv.org/abs/2206.01191>`_
+
+    Modified from the `official repo
+    <https://github.com/snap-research/EfficientFormer>`_.
+
+    Args:
+        arch (str | dict): The model's architecture. If a string, it should be
+            one of the architectures in ``EfficientFormer.arch_settings``. If a
+            dict, it should include the following 4 keys:
+
+            - layers (list[int]): Number of blocks at each stage.
+            - embed_dims (list[int]): The number of channels at each stage.
+            - downsamples (list[bool]): Whether to downsample in each of the
+              four stages.
+            - vit_num (int): The num of vit blocks in the last stage.
+
+            Defaults to 'l1'.
+
+        in_channels (int): The num of input channels. Defaults to 3.
+        pool_size (int): The pooling size of ``Meta4D`` blocks. Defaults to 3.
+        mlp_ratios (int): The expansion ratio of hidden channels in the MLP
+            layers of ``Meta3D`` and ``Meta4D`` blocks. Defaults to 4.
+        reshape_last_feat (bool): Whether to reshape the feature map from
+            (B, N, C) to (B, C, H, W) in the last stage, when the ``vit_num``
+            in ``arch`` is not 0. Defaults to False. Usually set to True
+            in downstream tasks.
+        out_indices (Sequence[int] | int): Output from which stages.
+            Defaults to -1.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters. Defaults to -1.
+        act_cfg (dict): The config dict for activation between pointwise
+            convolution. Defaults to ``dict(type='GELU')``.
+        drop_rate (float): Dropout rate. Defaults to 0.
+        drop_path_rate (float): Stochastic depth rate. Defaults to 0.
+        use_layer_scale (bool): Whether to use layer scale in MetaFormer
+            blocks. Defaults to True.
+        init_cfg (dict, optional): Initialization config dict.
+            Defaults to None.
+
+    Example:
+        >>> from mmcls.models import EfficientFormer
+        >>> import torch
+        >>> inputs = torch.rand((1, 3, 224, 224))
+        >>> # build EfficientFormer backbone for classification task
+        >>> model = EfficientFormer(arch="l1")
+        >>> model.eval()
+        >>> level_outputs = model(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 448, 49)
+        >>> # build EfficientFormer backbone for downstream task
+        >>> model = EfficientFormer(
+        >>>     arch="l3",
+        >>>     out_indices=(0, 1, 2, 3),
+        >>>     reshape_last_feat=True)
+        >>> model.eval()
+        >>> level_outputs = model(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 64, 56, 56)
+        (1, 128, 28, 28)
+        (1, 320, 14, 14)
+        (1, 512, 7, 7)
+    """  # noqa: E501
+
+    # --layers: [x,x,x,x], numbers of layers for the four stages
+    # --embed_dims: [x,x,x,x], embedding dims for the four stages
+    # --downsamples: [x,x,x,x], has downsample or not in the four stages
+    # --vit_num:(int), the num of vit blocks in the last stage
+    arch_settings = {
+        'l1': {
+            'layers': [3, 2, 6, 4],
+            'embed_dims': [48, 96, 224, 448],
+            'downsamples': [False, True, True, True],
+            'vit_num': 1,
+        },
+        'l3': {
+            'layers': [4, 4, 12, 6],
+            'embed_dims': [64, 128, 320, 512],
+            'downsamples': [False, True, True, True],
+            'vit_num': 4,
+        },
+        'l7': {
+            'layers': [6, 6, 18, 8],
+            'embed_dims': [96, 192, 384, 768],
+            'downsamples': [False, True, True, True],
+            'vit_num': 8,
+        },
+    }
+
+    def __init__(self,
+                 arch='l1',
+                 in_channels=3,
+                 pool_size=3,
+                 mlp_ratios=4,
+                 reshape_last_feat=False,
+                 out_indices=-1,
+                 frozen_stages=-1,
+                 act_cfg=dict(type='GELU'),
+                 drop_rate=0.,
+                 drop_path_rate=0.,
+                 use_layer_scale=True,
+                 init_cfg=None):
+
+        super().__init__(init_cfg=init_cfg)
+        self.num_extra_tokens = 0  # no cls_token, no dist_token
+
+        if isinstance(arch, str):
+            assert arch in self.arch_settings, \
+                f'Unavailable arch, please choose from ' \
+                f'({set(self.arch_settings)}) or pass a dict.'
+            arch = self.arch_settings[arch]
+        elif isinstance(arch, dict):
+            default_keys = set(self.arch_settings['l1'].keys())
+            assert set(arch.keys()) == default_keys, \
+                f'The arch dict must have {default_keys}, ' \
+                f'but got {list(arch.keys())}.'
+
+        self.layers = arch['layers']
+        self.embed_dims = arch['embed_dims']
+        self.downsamples = arch['downsamples']
+        assert isinstance(self.layers, list) and isinstance(
+            self.embed_dims, list) and isinstance(self.downsamples, list)
+        assert len(self.layers) == len(self.embed_dims) == len(
+            self.downsamples)
+
+        self.vit_num = arch['vit_num']
+        self.reshape_last_feat = reshape_last_feat
+
+        assert self.vit_num >= 0, "'vit_num' must be an integer " \
+            'greater than or equal to 0.'
+        assert self.vit_num <= self.layers[-1], (
+            "'vit_num' must not be larger than the number of layers "
+            'in the last stage.')
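The asserts above pin ``vit_num`` to ``0 <= vit_num <= layers[-1]``; ``basic_blocks`` then places the ``Meta3D`` attention blocks at the end of the last stage, inserting a ``Flat()`` to switch from (B, C, H, W) to (B, H*W, C) right before them. A small sketch (illustration only, not part of the patch) of the block plan it produces for the 'l1' arch:

    layers, vit_num, index = [3, 2, 6, 4], 1, 3   # arch 'l1', last stage
    plan = []
    for block_idx in range(layers[index]):
        if layers[index] - block_idx <= vit_num:
            plan.append('Meta3D')
        else:
            plan.append('Meta4D')
            if layers[index] - block_idx - 1 == vit_num:
                plan.append('Flat')
    print(plan)  # ['Meta4D', 'Meta4D', 'Meta4D', 'Flat', 'Meta3D']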
+
+        self._make_stem(in_channels, self.embed_dims[0])
+
+        # set the main block in network
+        network = []
+        for i in range(len(self.layers)):
+            if i != 0:
+                in_channels = self.embed_dims[i - 1]
+            else:
+                in_channels = self.embed_dims[i]
+            out_channels = self.embed_dims[i]
+            stage = basic_blocks(
+                in_channels,
+                out_channels,
+                i,
+                self.layers,
+                pool_size=pool_size,
+                mlp_ratio=mlp_ratios,
+                act_cfg=act_cfg,
+                drop_rate=drop_rate,
+                drop_path_rate=drop_path_rate,
+                vit_num=self.vit_num,
+                use_layer_scale=use_layer_scale,
+                has_downsampler=self.downsamples[i])
+            network.append(stage)
+
+        self.network = ModuleList(network)
+
+        if isinstance(out_indices, int):
+            out_indices = [out_indices]
+        assert isinstance(out_indices, Sequence), \
+            f'"out_indices" must be a sequence or int, ' \
+            f'got {type(out_indices)} instead.'
+        for i, index in enumerate(out_indices):
+            if index < 0:
+                out_indices[i] = 4 + index
+                assert out_indices[i] >= 0, f'Invalid out_indices {index}'
+
+        self.out_indices = out_indices
+        for i_layer in self.out_indices:
+            if not self.reshape_last_feat and \
+                    i_layer == 3 and self.vit_num > 0:
+                layer = build_norm_layer(
+                    dict(type='LN'), self.embed_dims[i_layer])[1]
+            else:
+                # use GN with 1 group as channel-first LN2D
+                layer = build_norm_layer(
+                    dict(type='GN', num_groups=1), self.embed_dims[i_layer])[1]
+
+            layer_name = f'norm{i_layer}'
+            self.add_module(layer_name, layer)
+
+        self.frozen_stages = frozen_stages
+        self._freeze_stages()
+
+    def _make_stem(self, in_channels: int, stem_channels: int):
+        """Make a 2-layer ConvBNReLU stem."""
+        self.patch_embed = Sequential(
+            ConvModule(
+                in_channels,
+                stem_channels // 2,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=True,
+                conv_cfg=None,
+                norm_cfg=dict(type='BN'),
+                inplace=True),
+            ConvModule(
+                stem_channels // 2,
+                stem_channels,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias=True,
+                conv_cfg=None,
+                norm_cfg=dict(type='BN'),
+                inplace=True))
+
+    def forward_tokens(self, x):
+        outs = []
+        for idx, block in enumerate(self.network):
+            if idx == len(self.network) - 1:
+                N, _, H, W = x.shape
+                if self.downsamples[idx]:
+                    H, W = H // 2, W // 2
+            x = block(x)
+            if idx in self.out_indices:
+                norm_layer = getattr(self, f'norm{idx}')
+
+                if idx == len(self.network) - 1 and x.dim() == 3:
+                    # when ``vit_num`` > 0 and in the last stage,
+                    # if ``self.reshape_last_feat`` is True, reshape the
+                    # features to `BCHW` format before the final normalization.
+                    # if ``self.reshape_last_feat`` is False, do
+                    # normalization directly and permute the features to `BCN`.
+                    if self.reshape_last_feat:
+                        x = x.permute((0, 2, 1)).reshape(N, -1, H, W)
+                        x_out = norm_layer(x)
+                    else:
+                        x_out = norm_layer(x).permute((0, 2, 1))
+                else:
+                    x_out = norm_layer(x)
+
+                outs.append(x_out.contiguous())
+        return tuple(outs)
+
+    def forward(self, x):
+        # input embedding
+        x = self.patch_embed(x)
+        # through stages
+        x = self.forward_tokens(x)
+        return x
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+            for i in range(self.frozen_stages):
+                # Include both block and downsample layer.
+                module = self.network[i]
+                module.eval()
+                for param in module.parameters():
+                    param.requires_grad = False
+                if i in self.out_indices:
+                    norm_layer = getattr(self, f'norm{i}')
+                    norm_layer.eval()
+                    for param in norm_layer.parameters():
+                        param.requires_grad = False
+
+    def train(self, mode=True):
+        super(EfficientFormer, self).train(mode)
+        self._freeze_stages()
diff --git a/mmcls/models/backbones/mobileone.py b/mmcls/models/backbones/mobileone.py
new file mode 100644
index 00000000..c82cb8b5
--- /dev/null
+++ b/mmcls/models/backbones/mobileone.py
@@ -0,0 +1,515 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from official impl https://github.com/apple/ml-mobileone/blob/main/mobileone.py  # noqa: E501
+from typing import Optional, Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_activation_layer, build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule, ModuleList, Sequential
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmcls.registry import MODELS
+from ..utils.se_layer import SELayer
+from .base_backbone import BaseBackbone
+
+
+class MobileOneBlock(BaseModule):
+    """MobileOne block for MobileOne backbone.
+
+    Args:
+        in_channels (int): The input channels of the block.
+        out_channels (int): The output channels of the block.
+        kernel_size (int): The kernel size of the convs in the block. If the
+            kernel size is larger than 1, there will be a ``branch_scale`` in
+            the block.
+        num_convs (int): Number of the convolution branches in the block.
+        stride (int): Stride of convolution layers. Defaults to 1.
+        padding (int): Padding of the convolution layers. Defaults to 1.
+        dilation (int): Dilation of the convolution layers. Defaults to 1.
+        groups (int): Groups of the convolution layers. Defaults to 1.
+        se_cfg (None or dict): The configuration of the se module.
+            Defaults to None.
+        conv_cfg (dict, optional): The config dict for convolution layers.
+            Defaults to None.
+        norm_cfg (dict): Configuration to construct and config norm layer.
+            Defaults to ``dict(type='BN')``.
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to ``dict(type='ReLU')``.
+        deploy (bool): Whether the model structure is in the deployment mode.
+            Defaults to False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + num_convs: int, + stride: int = 1, + padding: int = 1, + dilation: int = 1, + groups: int = 1, + se_cfg: Optional[dict] = None, + conv_cfg: Optional[dict] = None, + norm_cfg: Optional[dict] = dict(type='BN'), + act_cfg: Optional[dict] = dict(type='ReLU'), + deploy: bool = False, + init_cfg: Optional[dict] = None): + super(MobileOneBlock, self).__init__(init_cfg) + + assert se_cfg is None or isinstance(se_cfg, dict) + if se_cfg is not None: + self.se = SELayer(channels=out_channels, **se_cfg) + else: + self.se = nn.Identity() + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.num_conv_branches = num_convs + self.stride = stride + self.padding = padding + self.se_cfg = se_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.deploy = deploy + self.groups = groups + self.dilation = dilation + + if deploy: + self.branch_reparam = build_conv_layer( + conv_cfg, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + groups=self.groups, + stride=stride, + padding=padding, + dilation=dilation, + bias=True) + else: + # judge if input shape and output shape are the same. + # If true, add a normalized identity shortcut. + if out_channels == in_channels and stride == 1: + self.branch_norm = build_norm_layer(norm_cfg, in_channels)[1] + else: + self.branch_norm = None + + self.branch_scale = None + if kernel_size > 1: + self.branch_scale = self.create_conv_bn(kernel_size=1) + + self.branch_conv_list = ModuleList() + for _ in range(num_convs): + self.branch_conv_list.append( + self.create_conv_bn( + kernel_size=kernel_size, + padding=padding, + dilation=dilation)) + + self.act = build_activation_layer(act_cfg) + + def create_conv_bn(self, kernel_size, dilation=1, padding=0): + """cearte a (conv + bn) Sequential layer.""" + conv_bn = Sequential() + conv_bn.add_module( + 'conv', + build_conv_layer( + self.conv_cfg, + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=kernel_size, + groups=self.groups, + stride=self.stride, + dilation=dilation, + padding=padding, + bias=False)) + conv_bn.add_module( + 'norm', + build_norm_layer(self.norm_cfg, num_features=self.out_channels)[1]) + + return conv_bn + + def forward(self, x): + + def _inner_forward(inputs): + if self.deploy: + return self.branch_reparam(inputs) + + inner_out = 0 + if self.branch_norm is not None: + inner_out = self.branch_norm(inputs) + + if self.branch_scale is not None: + inner_out += self.branch_scale(inputs) + + for branch_conv in self.branch_conv_list: + inner_out += branch_conv(inputs) + + return inner_out + + return self.act(self.se(_inner_forward(x))) + + def switch_to_deploy(self): + """Switch the model structure from training mode to deployment mode.""" + if self.deploy: + return + assert self.norm_cfg['type'] == 'BN', \ + "Switch is not allowed when norm_cfg['type'] != 'BN'." 
+
+    def reparameterize(self):
+        """Fuse all the parameters of all branches.
+
+        Returns:
+            tuple[torch.Tensor, torch.Tensor]: Parameters after fusion of all
+                branches. The first element is the weight and the second is
+                the bias.
+        """
+        weight_conv, bias_conv = 0, 0
+        for branch_conv in self.branch_conv_list:
+            weight, bias = self._fuse_conv_bn(branch_conv)
+            weight_conv += weight
+            bias_conv += bias
+
+        weight_scale, bias_scale = 0, 0
+        if self.branch_scale is not None:
+            weight_scale, bias_scale = self._fuse_conv_bn(self.branch_scale)
+            # Pad scale branch kernel to match conv branch kernel size.
+            pad = self.kernel_size // 2
+            weight_scale = F.pad(weight_scale, [pad, pad, pad, pad])
+
+        weight_norm, bias_norm = 0, 0
+        if self.branch_norm:
+            tmp_conv_bn = self._norm_to_conv(self.branch_norm)
+            weight_norm, bias_norm = self._fuse_conv_bn(tmp_conv_bn)
+
+        return (weight_conv + weight_scale + weight_norm,
+                bias_conv + bias_scale + bias_norm)
+
+    def _fuse_conv_bn(self, branch):
+        """Fuse the parameters in a branch with a conv and bn.
+
+        Args:
+            branch (mmengine.model.Sequential): A branch with conv and bn.
+
+        Returns:
+            tuple[torch.Tensor, torch.Tensor]: The parameters obtained after
+                fusing the parameters of conv and bn in one branch.
+                The first element is the weight and the second is the bias.
+        """
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        running_mean = branch.norm.running_mean
+        running_var = branch.norm.running_var
+        gamma = branch.norm.weight
+        beta = branch.norm.bias
+        eps = branch.norm.eps
+
+        std = (running_var + eps).sqrt()
+        fused_weight = (gamma / std).reshape(-1, 1, 1, 1) * kernel
+        fused_bias = beta - running_mean * gamma / std
+
+        return fused_weight, fused_bias
+
+    def _norm_to_conv(self, branch_norm):
+        """Convert a norm layer to a conv-bn sequence towards
+        ``self.kernel_size``.
+
+        Args:
+            branch_norm (nn.BatchNorm2d): A branch only with bn in the block.
+
+        Returns:
+            (mmengine.model.Sequential): A sequential with conv and bn.
+        """
+        input_dim = self.in_channels // self.groups
+        conv_weight = torch.zeros(
+            (self.in_channels, input_dim, self.kernel_size, self.kernel_size),
+            dtype=branch_norm.weight.dtype)
+
+        for i in range(self.in_channels):
+            conv_weight[i, i % input_dim, self.kernel_size // 2,
+                        self.kernel_size // 2] = 1
+        conv_weight = conv_weight.to(branch_norm.weight.device)
+
+        tmp_conv = self.create_conv_bn(kernel_size=self.kernel_size)
+        tmp_conv.conv.weight.data = conv_weight
+        tmp_conv.norm = branch_norm
+        return tmp_conv
+
+
+@MODELS.register_module()
+class MobileOne(BaseBackbone):
+    """MobileOne backbone.
+
+    A PyTorch implementation of: `MobileOne: An Improved One millisecond
+    Mobile Backbone <https://arxiv.org/abs/2206.04040>`_
+
+    Args:
+        arch (str | dict): MobileOne architecture. If a string, choose
+            from 's0', 's1', 's2', 's3' and 's4'. If a dict, it should
+            have the below keys:
+
+            - num_blocks (Sequence[int]): Number of blocks in each stage.
+            - width_factor (Sequence[float]): Width factor in each stage.
+            - num_conv_branches (Sequence[int]): Number of conv branches
+              in each stage.
+            - num_se_blocks (Sequence[int]): Number of SE layers in each
+              stage; all the SE layers are placed at the end of each stage.
+
+            Defaults to 's0'.
+        in_channels (int): Number of input image channels. Defaults to 3.
+        out_indices (Sequence[int] | int): Output from which stages.
+            Defaults to ``(3, )``.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters. Defaults to -1.
+        conv_cfg (dict | None): The config dict for conv layers.
+            Defaults to None.
+        norm_cfg (dict): The config dict for norm layers.
+            Defaults to ``dict(type='BN')``.
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to ``dict(type='ReLU')``.
+        se_cfg (dict): Config dict for the SE layers.
+            Defaults to ``dict(ratio=16)``.
+        deploy (bool): Whether to switch the model structure to deployment
+            mode. Defaults to False.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Defaults to False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> from mmcls.models import MobileOne
+        >>> import torch
+        >>> x = torch.rand(1, 3, 224, 224)
+        >>> model = MobileOne("s0", out_indices=(0, 1, 2, 3))
+        >>> model.eval()
+        >>> outputs = model(x)
+        >>> for out in outputs:
+        ...     print(tuple(out.shape))
+        (1, 48, 56, 56)
+        (1, 128, 28, 28)
+        (1, 256, 14, 14)
+        (1, 1024, 7, 7)
+    """
+
+    arch_zoo = {
+        's0':
+        dict(
+            num_blocks=[2, 8, 10, 1],
+            width_factor=[0.75, 1.0, 1.0, 2.0],
+            num_conv_branches=[4, 4, 4, 4],
+            num_se_blocks=[0, 0, 0, 0]),
+        's1':
+        dict(
+            num_blocks=[2, 8, 10, 1],
+            width_factor=[1.5, 1.5, 2.0, 2.5],
+            num_conv_branches=[1, 1, 1, 1],
+            num_se_blocks=[0, 0, 0, 0]),
+        's2':
+        dict(
+            num_blocks=[2, 8, 10, 1],
+            width_factor=[1.5, 2.0, 2.5, 4.0],
+            num_conv_branches=[1, 1, 1, 1],
+            num_se_blocks=[0, 0, 0, 0]),
+        's3':
+        dict(
+            num_blocks=[2, 8, 10, 1],
+            width_factor=[2.0, 2.5, 3.0, 4.0],
+            num_conv_branches=[1, 1, 1, 1],
+            num_se_blocks=[0, 0, 0, 0]),
+        's4':
+        dict(
+            num_blocks=[2, 8, 10, 1],
+            width_factor=[3.0, 3.5, 3.5, 4.0],
+            num_conv_branches=[1, 1, 1, 1],
+            num_se_blocks=[0, 0, 5, 1])
+    }
+
+    def __init__(self,
+                 arch,
+                 in_channels=3,
+                 out_indices=(3, ),
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 se_cfg=dict(ratio=16),
+                 deploy=False,
+                 norm_eval=False,
+                 init_cfg=[
+                     dict(type='Kaiming', layer=['Conv2d']),
+                     dict(type='Constant', val=1, layer=['_BatchNorm'])
+                 ]):
+        super(MobileOne, self).__init__(init_cfg)
+
+        if isinstance(arch, str):
+            assert arch in self.arch_zoo, f'"arch": "{arch}"' \
+                f' is not one of the {list(self.arch_zoo.keys())}'
+            arch = self.arch_zoo[arch]
+        elif not isinstance(arch, dict):
+            raise TypeError('Expect "arch" to be either a string '
+                            f'or a dict, got {type(arch)}')
+
+        self.arch = arch
+        for k, value in self.arch.items():
+            assert isinstance(value, list) and len(value) == 4, \
+                f'the value of {k} in arch must be a list with 4 items.'
+
+        self.in_channels = in_channels
+        self.deploy = deploy
+        self.frozen_stages = frozen_stages
+        self.norm_eval = norm_eval
+
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.se_cfg = se_cfg
+        self.act_cfg = act_cfg
+
+        base_channels = [64, 128, 256, 512]
+        channels = min(64,
+                       int(base_channels[0] * self.arch['width_factor'][0]))
+        self.stage0 = MobileOneBlock(
+            self.in_channels,
+            channels,
+            stride=2,
+            kernel_size=3,
+            num_convs=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            deploy=deploy)
+
+        self.in_planes = channels
+        self.stages = []
+        for i, num_blocks in enumerate(self.arch['num_blocks']):
+            planes = int(base_channels[i] * self.arch['width_factor'][i])
+
+            stage = self._make_stage(planes, num_blocks,
+                                     arch['num_se_blocks'][i],
+                                     arch['num_conv_branches'][i])
+
+            stage_name = f'stage{i + 1}'
+            self.add_module(stage_name, stage)
+            self.stages.append(stage_name)
+
+        if isinstance(out_indices, int):
+            out_indices = [out_indices]
+        assert isinstance(out_indices, Sequence), \
+            f'"out_indices" must be a sequence or int, ' \
+            f'got {type(out_indices)} instead.'
+        out_indices = list(out_indices)
+        for i, index in enumerate(out_indices):
+            if index < 0:
+                out_indices[i] = len(self.stages) + index
+            assert 0 <= out_indices[i] < len(self.stages), \
+                f'Invalid out_indices {index}.'
+        self.out_indices = out_indices
+
+    def _make_stage(self, planes, num_blocks, num_se, num_conv_branches):
+        strides = [2] + [1] * (num_blocks - 1)
+        if num_se > num_blocks:
+            raise ValueError('Number of SE blocks cannot '
+                             'exceed number of layers.')
+        blocks = []
+        for i in range(num_blocks):
+            use_se = False
+            if i >= (num_blocks - num_se):
+                use_se = True
+
+            blocks.append(
+                # Depthwise conv
+                MobileOneBlock(
+                    in_channels=self.in_planes,
+                    out_channels=self.in_planes,
+                    kernel_size=3,
+                    num_convs=num_conv_branches,
+                    stride=strides[i],
+                    padding=1,
+                    groups=self.in_planes,
+                    se_cfg=self.se_cfg if use_se else None,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    deploy=self.deploy))
+
+            blocks.append(
+                # Pointwise conv
+                MobileOneBlock(
+                    in_channels=self.in_planes,
+                    out_channels=planes,
+                    kernel_size=1,
+                    num_convs=num_conv_branches,
+                    stride=1,
+                    padding=0,
+                    se_cfg=self.se_cfg if use_se else None,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    deploy=self.deploy))
+
+            self.in_planes = planes
+
+        return Sequential(*blocks)
+
+    def forward(self, x):
+        x = self.stage0(x)
+        outs = []
+        for i, stage_name in enumerate(self.stages):
+            stage = getattr(self, stage_name)
+            x = stage(x)
+            if i in self.out_indices:
+                outs.append(x)
+
+        return tuple(outs)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.stage0.eval()
+            for param in self.stage0.parameters():
+                param.requires_grad = False
+            for i in range(self.frozen_stages):
+                stage = getattr(self, f'stage{i+1}')
+                stage.eval()
+                for param in stage.parameters():
+                    param.requires_grad = False
+
+    def train(self, mode=True):
+        """Switch the model to train mode or not."""
+        super(MobileOne, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+    def switch_to_deploy(self):
+        """Switch the model to deploy mode, which has a smaller number of
+        parameters and computations."""
+        for m in self.modules():
+            if isinstance(m, MobileOneBlock):
+                m.switch_to_deploy()
+        self.deploy = True
diff --git a/mmcls/models/backbones/mvit.py b/mmcls/models/backbones/mvit.py
new file mode 100644
index 00000000..03043539
--- /dev/null
+++ b/mmcls/models/backbones/mvit.py
@@ -0,0 +1,700 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmcv.cnn.bricks import DropPath
+from mmcv.cnn.bricks.transformer import PatchEmbed
+from mmengine.model import BaseModule, ModuleList
+from mmengine.model.weight_init import trunc_normal_
+from mmengine.utils import to_2tuple
+
+from ..builder import BACKBONES
+from ..utils import resize_pos_embed
+from .base_backbone import BaseBackbone
+
+
+def resize_decomposed_rel_pos(rel_pos, q_size, k_size):
+    """Get relative positional embeddings according to the relative positions
+    of query and key sizes.
+
+    Args:
+        rel_pos (Tensor): Relative position embeddings (L, C).
+        q_size (int): Size of query q.
+        k_size (int): Size of key k.
+
+    Returns:
+        Extracted positional embeddings according to relative positions.
+    """
+    max_rel_dist = int(2 * max(q_size, k_size) - 1)
+    # Interpolate rel pos if needed.
+    if rel_pos.shape[0] != max_rel_dist:
+        # Interpolate rel pos.
+        resized = F.interpolate(
+            # (L, C) -> (1, C, L)
+            rel_pos.transpose(0, 1).unsqueeze(0),
+            size=max_rel_dist,
+            mode='linear',
+        )
+        # (1, C, L) -> (L, C)
+        resized = resized.squeeze(0).transpose(0, 1)
+    else:
+        resized = rel_pos
+
+    # Scale the coords with short length if shapes for q and k are different.
+    q_h_ratio = max(k_size / q_size, 1.0)
+    k_h_ratio = max(q_size / k_size, 1.0)
+    q_coords = torch.arange(q_size)[:, None] * q_h_ratio
+    k_coords = torch.arange(k_size)[None, :] * k_h_ratio
+    relative_coords = (q_coords - k_coords) + (k_size - 1) * k_h_ratio
+
+    return resized[relative_coords.long()]
+
+
+def add_decomposed_rel_pos(attn,
+                           q,
+                           q_shape,
+                           k_shape,
+                           rel_pos_h,
+                           rel_pos_w,
+                           has_cls_token=False):
+    """Spatial Relative Positional Embeddings."""
+    sp_idx = 1 if has_cls_token else 0
+    B, num_heads, _, C = q.shape
+    q_h, q_w = q_shape
+    k_h, k_w = k_shape
+
+    Rh = resize_decomposed_rel_pos(rel_pos_h, q_h, k_h)
+    Rw = resize_decomposed_rel_pos(rel_pos_w, q_w, k_w)
+
+    r_q = q[:, :, sp_idx:].reshape(B, num_heads, q_h, q_w, C)
+    rel_h = torch.einsum('byhwc,hkc->byhwk', r_q, Rh)
+    rel_w = torch.einsum('byhwc,wkc->byhwk', r_q, Rw)
+    rel_pos_embed = rel_h[:, :, :, :, :, None] + rel_w[:, :, :, :, None, :]
+
+    attn_map = attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_h, q_w, k_h, k_w)
+    attn_map += rel_pos_embed
+    attn[:, :, sp_idx:, sp_idx:] = attn_map.view(B, -1, q_h * q_w, k_h * k_w)
+
+    return attn
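``resize_decomposed_rel_pos`` above first resizes the embedding table to ``2 * max(q_size, k_size) - 1`` entries and then indexes it with scaled relative coordinates, so queries and keys of different resolutions can share one table. A quick sketch (illustration only, not part of the patch) of the index computation for ``q_size=2, k_size=4``:

    import torch
    q_size, k_size = 2, 4
    q_ratio = max(k_size / q_size, 1.0)   # 2.0
    k_ratio = max(q_size / k_size, 1.0)   # 1.0
    q = torch.arange(q_size)[:, None] * q_ratio
    k = torch.arange(k_size)[None, :] * k_ratio
    rel = (q - k) + (k_size - 1) * k_ratio
    # indices into the (2 * max(q_size, k_size) - 1, C) embedding table:
    print(rel.long())  # tensor([[3, 2, 1, 0], [5, 4, 3, 2]])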
+ """ + + def __init__(self, + in_channels, + hidden_channels=None, + out_channels=None, + act_cfg=dict(type='GELU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + out_channels = out_channels or in_channels + hidden_channels = hidden_channels or in_channels + self.fc1 = nn.Linear(in_channels, hidden_channels) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Linear(hidden_channels, out_channels) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +def attention_pool(x: torch.Tensor, + pool: nn.Module, + in_size: tuple, + norm: Optional[nn.Module] = None): + """Pooling the feature tokens. + + Args: + x (torch.Tensor): The input tensor, should be with shape + ``(B, num_heads, L, C)`` or ``(B, L, C)``. + pool (nn.Module): The pooling module. + in_size (Tuple[int]): The shape of the input feature map. + norm (nn.Module, optional): The normalization module. + Defaults to None. + """ + ndim = x.ndim + if ndim == 4: + B, num_heads, L, C = x.shape + elif ndim == 3: + num_heads = 1 + B, L, C = x.shape + else: + raise RuntimeError(f'Unsupported input dimension {x.shape}') + + H, W = in_size + assert L == H * W + + # (B, num_heads, H*W, C) -> (B*num_heads, C, H, W) + x = x.reshape(B * num_heads, H, W, C).permute(0, 3, 1, 2).contiguous() + x = pool(x) + out_size = x.shape[-2:] + + # (B*num_heads, C, H', W') -> (B, num_heads, H'*W', C) + x = x.reshape(B, num_heads, C, -1).transpose(2, 3) + + if norm is not None: + x = norm(x) + + if ndim == 3: + x = x.squeeze(1) + + return x, out_size + + +class MultiScaleAttention(BaseModule): + """Multiscale Multi-head Attention block. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3). + stride_q (int): stride size for q pooling layer. Defaults to 1. + stride_kv (int): stride size for kv pooling layer. Defaults to 1. + rel_pos_spatial (bool): Whether to enable the spatial relative + position embedding. Defaults to True. + residual_pooling (bool): Whether to enable the residual connection + after attention pooling. Defaults to True. + input_size (Tuple[int], optional): The input resolution, necessary + if enable the ``rel_pos_spatial``. Defaults to None. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. Defaults to False. + init_cfg (dict, optional): The config of weight initialization. + Defaults to None. 
+ """ + + def __init__(self, + in_dims, + out_dims, + num_heads, + qkv_bias=True, + norm_cfg=dict(type='LN'), + pool_kernel=(3, 3), + stride_q=1, + stride_kv=1, + rel_pos_spatial=False, + residual_pooling=True, + input_size=None, + rel_pos_zero_init=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_heads = num_heads + self.in_dims = in_dims + self.out_dims = out_dims + + head_dim = out_dims // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(in_dims, out_dims * 3, bias=qkv_bias) + self.proj = nn.Linear(out_dims, out_dims) + + # qkv pooling + pool_padding = [k // 2 for k in pool_kernel] + pool_dims = out_dims // num_heads + + def build_pooling(stride): + pool = nn.Conv2d( + pool_dims, + pool_dims, + pool_kernel, + stride=stride, + padding=pool_padding, + groups=pool_dims, + bias=False, + ) + norm = build_norm_layer(norm_cfg, pool_dims)[1] + return pool, norm + + self.pool_q, self.norm_q = build_pooling(stride_q) + self.pool_k, self.norm_k = build_pooling(stride_kv) + self.pool_v, self.norm_v = build_pooling(stride_kv) + + self.residual_pooling = residual_pooling + + self.rel_pos_spatial = rel_pos_spatial + self.rel_pos_zero_init = rel_pos_zero_init + if self.rel_pos_spatial: + # initialize relative positional embeddings + assert input_size[0] == input_size[1] + + size = input_size[0] + rel_dim = 2 * max(size // stride_q, size // stride_kv) - 1 + self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim)) + + def init_weights(self): + """Weight initialization.""" + super().init_weights() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress rel_pos_zero_init if use pretrained model. + return + + if not self.rel_pos_zero_init: + trunc_normal_(self.rel_pos_h, std=0.02) + trunc_normal_(self.rel_pos_w, std=0.02) + + def forward(self, x, in_size): + """Forward the MultiScaleAttention.""" + B, N, _ = x.shape # (B, H*W, C) + + # qkv: (B, H*W, 3, num_heads, C) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1) + # q, k, v: (B, num_heads, H*W, C) + q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0) + + q, q_shape = attention_pool(q, self.pool_q, in_size, norm=self.norm_q) + k, k_shape = attention_pool(k, self.pool_k, in_size, norm=self.norm_k) + v, v_shape = attention_pool(v, self.pool_v, in_size, norm=self.norm_v) + + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.rel_pos_spatial: + attn = add_decomposed_rel_pos(attn, q, q_shape, k_shape, + self.rel_pos_h, self.rel_pos_w) + + attn = attn.softmax(dim=-1) + x = attn @ v + + if self.residual_pooling: + x = x + q + + # (B, num_heads, H'*W', C'//num_heads) -> (B, H'*W', C') + x = x.transpose(1, 2).reshape(B, -1, self.out_dims) + x = self.proj(x) + + return x, q_shape + + +class MultiScaleBlock(BaseModule): + """Multiscale Transformer blocks. + + Args: + in_dims (int): Number of input channels. + out_dims (int): Number of output channels. + num_heads (int): Number of attention heads. + mlp_ratio (float): Ratio of hidden dimensions in MLP layers. + Defaults to 4.0. + qkv_bias (bool): If True, add a learnable bias to query, key and + value. Defaults to True. + drop_path (float): Stochastic depth rate. Defaults to 0. + norm_cfg (dict): The config of normalization layers. + Defaults to ``dict(type='LN')``. + act_cfg (dict): The config of activation function. + Defaults to ``dict(type='GELU')``. + qkv_pool_kernel (tuple): kernel size for qkv pooling layers. + Defaults to (3, 3). 
+        stride_q (int): stride size for q pooling layer. Defaults to 1.
+        stride_kv (int): stride size for kv pooling layer. Defaults to 1.
+        rel_pos_spatial (bool): Whether to enable the spatial relative
+            position embedding. Defaults to True.
+        residual_pooling (bool): Whether to enable the residual connection
+            after attention pooling. Defaults to True.
+        dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in
+            attention layers. If False, multiply it in MLP layers.
+            Defaults to True.
+        input_size (Tuple[int], optional): The input resolution, necessary
+            if ``rel_pos_spatial`` is enabled. Defaults to None.
+        rel_pos_zero_init (bool): If True, zero initialize relative
+            positional parameters. Defaults to False.
+        init_cfg (dict, optional): The config of weight initialization.
+            Defaults to None.
+    """
+
+    def __init__(
+        self,
+        in_dims,
+        out_dims,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_path=0.0,
+        norm_cfg=dict(type='LN'),
+        act_cfg=dict(type='GELU'),
+        qkv_pool_kernel=(3, 3),
+        stride_q=1,
+        stride_kv=1,
+        rel_pos_spatial=True,
+        residual_pooling=True,
+        dim_mul_in_attention=True,
+        input_size=None,
+        rel_pos_zero_init=False,
+        init_cfg=None,
+    ):
+        super().__init__(init_cfg=init_cfg)
+        self.in_dims = in_dims
+        self.out_dims = out_dims
+        self.norm1 = build_norm_layer(norm_cfg, in_dims)[1]
+        self.dim_mul_in_attention = dim_mul_in_attention
+
+        attn_dims = out_dims if dim_mul_in_attention else in_dims
+        self.attn = MultiScaleAttention(
+            in_dims,
+            attn_dims,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            norm_cfg=norm_cfg,
+            pool_kernel=qkv_pool_kernel,
+            stride_q=stride_q,
+            stride_kv=stride_kv,
+            rel_pos_spatial=rel_pos_spatial,
+            residual_pooling=residual_pooling,
+            input_size=input_size,
+            rel_pos_zero_init=rel_pos_zero_init)
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0.0 else nn.Identity()
+
+        self.norm2 = build_norm_layer(norm_cfg, attn_dims)[1]
+
+        self.mlp = MLP(
+            in_channels=attn_dims,
+            hidden_channels=int(attn_dims * mlp_ratio),
+            out_channels=out_dims,
+            act_cfg=act_cfg)
+
+        if in_dims != out_dims:
+            self.proj = nn.Linear(in_dims, out_dims)
+        else:
+            self.proj = None
+
+        if stride_q > 1:
+            kernel_skip = stride_q + 1
+            padding_skip = int(kernel_skip // 2)
+            self.pool_skip = nn.MaxPool2d(
+                kernel_skip, stride_q, padding_skip, ceil_mode=False)
+
+            if input_size is not None:
+                input_size = to_2tuple(input_size)
+                out_size = [size // stride_q for size in input_size]
+                self.init_out_size = out_size
+            else:
+                self.init_out_size = None
+        else:
+            self.pool_skip = None
+            self.init_out_size = input_size
+
+    def forward(self, x, in_size):
+        x_norm = self.norm1(x)
+        x_attn, out_size = self.attn(x_norm, in_size)
+
+        if self.dim_mul_in_attention and self.proj is not None:
+            skip = self.proj(x_norm)
+        else:
+            skip = x
+
+        if self.pool_skip is not None:
+            skip, _ = attention_pool(skip, self.pool_skip, in_size)
+
+        x = skip + self.drop_path(x_attn)
+        x_norm = self.norm2(x)
+        x_mlp = self.mlp(x_norm)
+
+        if not self.dim_mul_in_attention and self.proj is not None:
+            skip = self.proj(x_norm)
+        else:
+            skip = x
+
+        x = skip + self.drop_path(x_mlp)
+
+        return x, out_size
+
+
+@BACKBONES.register_module()
+class MViT(BaseBackbone):
+    """Multi-scale ViT v2.
+
+    A PyTorch implementation of: `MViTv2: Improved Multiscale Vision
+    Transformers for Classification and Detection
+    <https://arxiv.org/abs/2112.01526>`_
+
+    Inspiration from `the official implementation
+    <https://github.com/facebookresearch/SlowFast>`_ and `the detectron2
+    implementation <https://github.com/facebookresearch/detectron2>`_
+
+    Args:
+        arch (str | dict): MViT architecture.
+            If a string, choose from 'tiny', 'small', 'base' and 'large'.
+            If a dict, it should have the below keys:
+
+            - **embed_dims** (int): The dimensions of embedding.
+            - **num_layers** (int): The number of layers.
+            - **num_heads** (int): The number of heads in attention
+              modules of the initial layer.
+            - **downscale_indices** (List[int]): The layer indices to downscale
+              the feature map.
+
+            Defaults to 'base'.
+        img_size (int): The expected input image shape. Defaults to 224.
+        in_channels (int): The num of input channels. Defaults to 3.
+        out_scales (int | Sequence[int]): The output scale indices.
+            They should not exceed the length of ``downscale_indices``.
+            Defaults to -1, which means the last scale.
+        drop_path_rate (float): Stochastic depth rate. Defaults to 0.1.
+        use_abs_pos_embed (bool): If True, add absolute position embedding to
+            the patch embedding. Defaults to False.
+        interpolate_mode (str): Select the interpolate mode for absolute
+            position embedding vector resize. Defaults to "bicubic".
+        pool_kernel (tuple): kernel size for qkv pooling layers.
+            Defaults to (3, 3).
+        dim_mul (int): The magnification for ``embed_dims`` in the downscale
+            layers. Defaults to 2.
+        head_mul (int): The magnification for ``num_heads`` in the downscale
+            layers. Defaults to 2.
+        adaptive_kv_stride (int): The stride size for kv pooling in the initial
+            layer. Defaults to 4.
+        rel_pos_spatial (bool): Whether to enable the spatial relative position
+            embedding. Defaults to True.
+        residual_pooling (bool): Whether to enable the residual connection
+            after attention pooling. Defaults to True.
+        dim_mul_in_attention (bool): Whether to multiply the ``embed_dims`` in
+            attention layers. If False, multiply it in MLP layers.
+            Defaults to True.
+        rel_pos_zero_init (bool): If True, zero initialize relative
+            positional parameters. Defaults to False.
+        mlp_ratio (float): Ratio of hidden dimensions in MLP layers.
+            Defaults to 4.0.
+        qkv_bias (bool): Enable bias for qkv if True. Defaults to True.
+        norm_cfg (dict): Config dict for normalization layer for all output
+            features. Defaults to ``dict(type='LN', eps=1e-6)``.
+        patch_cfg (dict): Config dict for the patch embedding layer.
+            Defaults to ``dict(kernel_size=7, stride=4, padding=3)``.
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+
+    Examples:
+        >>> import torch
+        >>> from mmcls.models import build_backbone
+        >>>
+        >>> cfg = dict(type='MViT', arch='tiny', out_scales=[0, 1, 2, 3])
+        >>> model = build_backbone(cfg)
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> outputs = model(inputs)
+        >>> for i, output in enumerate(outputs):
+        >>>     print(f'scale{i}: {output.shape}')
+        scale0: torch.Size([1, 96, 56, 56])
+        scale1: torch.Size([1, 192, 28, 28])
+        scale2: torch.Size([1, 384, 14, 14])
+        scale3: torch.Size([1, 768, 7, 7])
+    """
+    arch_zoo = {
+        'tiny': {
+            'embed_dims': 96,
+            'num_layers': 10,
+            'num_heads': 1,
+            'downscale_indices': [1, 3, 8]
+        },
+        'small': {
+            'embed_dims': 96,
+            'num_layers': 16,
+            'num_heads': 1,
+            'downscale_indices': [1, 3, 14]
+        },
+        'base': {
+            'embed_dims': 96,
+            'num_layers': 24,
+            'num_heads': 1,
+            'downscale_indices': [2, 5, 21]
+        },
+        'large': {
+            'embed_dims': 144,
+            'num_layers': 48,
+            'num_heads': 2,
+            'downscale_indices': [2, 8, 44]
+        },
+    }
+    num_extra_tokens = 0
+
+    def __init__(self,
+                 arch='base',
+                 img_size=224,
+                 in_channels=3,
+                 out_scales=-1,
+                 drop_path_rate=0.,
+                 use_abs_pos_embed=False,
+                 interpolate_mode='bicubic',
+                 pool_kernel=(3, 3),
+                 dim_mul=2,
+                 head_mul=2,
+                 adaptive_kv_stride=4,
+                 rel_pos_spatial=True,
+                 residual_pooling=True,
+                 dim_mul_in_attention=True,
+                 rel_pos_zero_init=False,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 norm_cfg=dict(type='LN', eps=1e-6),
+                 patch_cfg=dict(kernel_size=7, stride=4, padding=3),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+
+        if isinstance(arch, str):
+            arch = arch.lower()
+            assert arch in set(self.arch_zoo), \
+                f'Arch {arch} is not in default archs {set(self.arch_zoo)}'
+            self.arch_settings = self.arch_zoo[arch]
+        else:
+            essential_keys = {
+                'embed_dims', 'num_layers', 'num_heads', 'downscale_indices'
+            }
+            assert isinstance(arch, dict) and essential_keys <= set(arch), \
+                f'Custom arch needs a dict with keys {essential_keys}'
+            self.arch_settings = arch
+
+        self.embed_dims = self.arch_settings['embed_dims']
+        self.num_layers = self.arch_settings['num_layers']
+        self.num_heads = self.arch_settings['num_heads']
+        self.downscale_indices = self.arch_settings['downscale_indices']
+        self.num_scales = len(self.downscale_indices) + 1
+        self.stage_indices = {
+            index - 1: i
+            for i, index in enumerate(self.downscale_indices)
+        }
+        self.stage_indices[self.num_layers - 1] = self.num_scales - 1
+        self.use_abs_pos_embed = use_abs_pos_embed
+        self.interpolate_mode = interpolate_mode
+
+        if isinstance(out_scales, int):
+            out_scales = [out_scales]
+        assert isinstance(out_scales, Sequence), \
+            f'"out_scales" must be a sequence or int, ' \
+            f'got {type(out_scales)} instead.'
+        for i, index in enumerate(out_scales):
+            if index < 0:
+                out_scales[i] = self.num_scales + index
+            assert 0 <= out_scales[i] < self.num_scales, \
+                f'Invalid out_scales {index}'
+        self.out_scales = sorted(list(out_scales))
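``downscale_indices`` drives everything in the construction loop below: heads and dims are multiplied at those layers, and ``stage_indices`` (built above) marks the last block of each scale so its output can be normalized and collected. A small sketch (illustration only, not part of the patch) for the 'tiny' arch:

    num_layers, downscale_indices = 10, [1, 3, 8]   # 'tiny' in arch_zoo above
    num_scales = len(downscale_indices) + 1         # 4 output scales
    stage_indices = {idx - 1: i for i, idx in enumerate(downscale_indices)}
    stage_indices[num_layers - 1] = num_scales - 1
    print(stage_indices)  # {0: 0, 2: 1, 7: 2, 9: 3}: last block of each scale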
+
+        # Set patch embedding
+        _patch_cfg = dict(
+            in_channels=in_channels,
+            input_size=img_size,
+            embed_dims=self.embed_dims,
+            conv_type='Conv2d',
+        )
+        _patch_cfg.update(patch_cfg)
+        self.patch_embed = PatchEmbed(**_patch_cfg)
+        self.patch_resolution = self.patch_embed.init_out_size
+
+        # Set absolute position embedding
+        if self.use_abs_pos_embed:
+            num_patches = self.patch_resolution[0] * self.patch_resolution[1]
+            self.pos_embed = nn.Parameter(
+                torch.zeros(1, num_patches, self.embed_dims))
+
+        # stochastic depth decay rule
+        dpr = np.linspace(0, drop_path_rate, self.num_layers)
+
+        self.blocks = ModuleList()
+        out_dims_list = [self.embed_dims]
+        num_heads = self.num_heads
+        stride_kv = adaptive_kv_stride
+        input_size = self.patch_resolution
+        for i in range(self.num_layers):
+            if i in self.downscale_indices:
+                num_heads *= head_mul
+                stride_q = 2
+                stride_kv = max(stride_kv // 2, 1)
+            else:
+                stride_q = 1
+
+            # Set output embed_dims
+            if dim_mul_in_attention and i in self.downscale_indices:
+                # multiply embed_dims in downscale layers.
+                out_dims = out_dims_list[-1] * dim_mul
+            elif not dim_mul_in_attention and i + 1 in self.downscale_indices:
+                # multiply embed_dims before downscale layers.
+                out_dims = out_dims_list[-1] * dim_mul
+            else:
+                out_dims = out_dims_list[-1]
+
+            attention_block = MultiScaleBlock(
+                in_dims=out_dims_list[-1],
+                out_dims=out_dims,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                drop_path=dpr[i],
+                norm_cfg=norm_cfg,
+                qkv_pool_kernel=pool_kernel,
+                stride_q=stride_q,
+                stride_kv=stride_kv,
+                rel_pos_spatial=rel_pos_spatial,
+                residual_pooling=residual_pooling,
+                dim_mul_in_attention=dim_mul_in_attention,
+                input_size=input_size,
+                rel_pos_zero_init=rel_pos_zero_init)
+            self.blocks.append(attention_block)
+
+            input_size = attention_block.init_out_size
+            out_dims_list.append(out_dims)
+
+            if i in self.stage_indices:
+                stage_index = self.stage_indices[i]
+                if stage_index in self.out_scales:
+                    norm_layer = build_norm_layer(norm_cfg, out_dims)[1]
+                    self.add_module(f'norm{stage_index}', norm_layer)
+
+    def init_weights(self):
+        super().init_weights()
+
+        if (isinstance(self.init_cfg, dict)
+                and self.init_cfg['type'] == 'Pretrained'):
+            # Suppress default init if use pretrained model.
+            return
+
+        if self.use_abs_pos_embed:
+            trunc_normal_(self.pos_embed, std=0.02)
+
+    def forward(self, x):
+        """Forward the MViT."""
+        B = x.shape[0]
+        x, patch_resolution = self.patch_embed(x)
+
+        if self.use_abs_pos_embed:
+            x = x + resize_pos_embed(
+                self.pos_embed,
+                self.patch_resolution,
+                patch_resolution,
+                mode=self.interpolate_mode,
+                num_extra_tokens=self.num_extra_tokens)
+
+        outs = []
+        for i, block in enumerate(self.blocks):
+            x, patch_resolution = block(x, patch_resolution)
+
+            if i in self.stage_indices:
+                stage_index = self.stage_indices[i]
+                if stage_index in self.out_scales:
+                    B, _, C = x.shape
+                    x = getattr(self, f'norm{stage_index}')(x)
+                    out = x.transpose(1, 2).reshape(B, C, *patch_resolution)
+                    outs.append(out.contiguous())
+
+        return tuple(outs)
diff --git a/mmcls/models/backbones/swin_transformer.py b/mmcls/models/backbones/swin_transformer.py
index 076ba961..baeafac7 100644
--- a/mmcls/models/backbones/swin_transformer.py
+++ b/mmcls/models/backbones/swin_transformer.py
@@ -183,11 +183,11 @@ class SwinBlockSequence(BaseModule):
         else:
             self.downsample = None
 
-    def forward(self, x, in_shape):
+    def forward(self, x, in_shape, do_downsample=True):
         for block in self.blocks:
             x = block(x, in_shape)
 
-        if self.downsample:
+        if self.downsample is not None and do_downsample:
             x, out_shape = self.downsample(x, in_shape)
         else:
             out_shape = in_shape
@@ -232,6 +232,8 @@ class SwinTransformer(BaseBackbone):
         window_size (int): The height and width of the window. Defaults to 7.
         drop_rate (float): Dropout rate after embedding. Defaults to 0.
         drop_path_rate (float): Stochastic depth rate. Defaults to 0.1.
+        out_after_downsample (bool): Whether to output the feature map of a
+            stage after the following downsample layer. Defaults to False.
         use_abs_pos_embed (bool): If True, add absolute position embedding to
             the patch embedding. Defaults to False.
         interpolate_mode (str): Select the interpolate mode for absolute
@@ -301,6 +303,7 @@ class SwinTransformer(BaseBackbone):
                  drop_rate=0.,
                  drop_path_rate=0.1,
                  out_indices=(3, ),
+                 out_after_downsample=False,
                  use_abs_pos_embed=False,
                  interpolate_mode='bicubic',
                  with_cp=False,
@@ -329,6 +332,7 @@ class SwinTransformer(BaseBackbone):
         self.num_heads = self.arch_settings['num_heads']
         self.num_layers = len(self.depths)
         self.out_indices = out_indices
+        self.out_after_downsample = out_after_downsample
         self.use_abs_pos_embed = use_abs_pos_embed
         self.interpolate_mode = interpolate_mode
         self.frozen_stages = frozen_stages
@@ -392,9 +396,15 @@ class SwinTransformer(BaseBackbone):
             dpr = dpr[depth:]
             embed_dims.append(stage.out_channels)
 
+        if self.out_after_downsample:
+            self.num_features = embed_dims[1:]
+        else:
+            self.num_features = embed_dims[:-1]
+
         for i in out_indices:
             if norm_cfg is not None:
-                norm_layer = build_norm_layer(norm_cfg, embed_dims[i + 1])[1]
+                norm_layer = build_norm_layer(norm_cfg,
+                                              self.num_features[i])[1]
             else:
                 norm_layer = nn.Identity()
 
@@ -421,14 +431,17 @@ class SwinTransformer(BaseBackbone):
 
         outs = []
         for i, stage in enumerate(self.stages):
-            x, hw_shape = stage(x, hw_shape)
+            x, hw_shape = stage(
+                x, hw_shape, do_downsample=self.out_after_downsample)
             if i in self.out_indices:
                 norm_layer = getattr(self, f'norm{i}')
                 out = norm_layer(x)
                 out = out.view(-1, *hw_shape,
-                               stage.out_channels).permute(0, 3, 1,
-                                                           2).contiguous()
+                               self.num_features[i]).permute(0, 3, 1,
+                                                             2).contiguous()
                 outs.append(out)
+            if stage.downsample is not None and not self.out_after_downsample:
+                x, hw_shape = stage.downsample(x, hw_shape)
 
         return tuple(outs)
diff --git a/mmcls/models/backbones/swin_transformer_v2.py b/mmcls/models/backbones/swin_transformer_v2.py
new file mode 100644
index 00000000..c6f722ec
--- /dev/null
+++ b/mmcls/models/backbones/swin_transformer_v2.py
@@ -0,0 +1,560 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from copy import deepcopy
+from typing import Sequence
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, PatchEmbed
+from mmengine.model import BaseModule, ModuleList
+from mmengine.model.weight_init import trunc_normal_
+from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
+
+from ..builder import MODELS
+from ..utils import (PatchMerging, ShiftWindowMSA, WindowMSAV2,
+                     resize_pos_embed, to_2tuple)
+from .base_backbone import BaseBackbone
+
+
+class SwinBlockV2(BaseModule):
+    """Swin Transformer V2 block. Use post normalization.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): The height and width of the window. Defaults to 8.
+        shift (bool): Shift the attention window or not. Defaults to False.
+        extra_norm (bool): Whether to add an extra norm at the end of the main
+            branch. Defaults to False.
+        ffn_ratio (float): The expansion ratio of feedforward network hidden
+            layer channels. Defaults to 4.
+        drop_path (float): The drop path rate after attention and ffn.
+            Defaults to 0.
+        pad_small_map (bool): If True, pad the small feature map to the window
+            size, which is commonly used in detection and segmentation. If
+            False, avoid shifting window and shrink the window size to the
+            size of feature map, which is commonly used in classification.
+            Defaults to False.
+        attn_cfgs (dict): The extra config of Shift Window-MSA.
+            Defaults to empty dict.
+        ffn_cfgs (dict): The extra config of FFN.
+        norm_cfg (dict): The config of norm layers.
+            Defaults to ``dict(type='LN')``.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+        pretrained_window_size (int): Window size in the pretrained model.
+            Defaults to 0.
+        init_cfg (dict, optional): The extra config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 window_size=8,
+                 shift=False,
+                 extra_norm=False,
+                 ffn_ratio=4.,
+                 drop_path=0.,
+                 pad_small_map=False,
+                 attn_cfgs=dict(),
+                 ffn_cfgs=dict(),
+                 norm_cfg=dict(type='LN'),
+                 with_cp=False,
+                 pretrained_window_size=0,
+                 init_cfg=None):
+
+        super(SwinBlockV2, self).__init__(init_cfg)
+        self.with_cp = with_cp
+        self.extra_norm = extra_norm
+
+        _attn_cfgs = {
+            'embed_dims': embed_dims,
+            'num_heads': num_heads,
+            'shift_size': window_size // 2 if shift else 0,
+            'window_size': window_size,
+            'dropout_layer': dict(type='DropPath', drop_prob=drop_path),
+            'pad_small_map': pad_small_map,
+            **attn_cfgs
+        }
+        # use the V2 attention implementation
+        _attn_cfgs.update(
+            window_msa=WindowMSAV2,
+            pretrained_window_size=to_2tuple(pretrained_window_size))
+        self.attn = ShiftWindowMSA(**_attn_cfgs)
+        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+        _ffn_cfgs = {
+            'embed_dims': embed_dims,
+            'feedforward_channels': int(embed_dims * ffn_ratio),
+            'num_fcs': 2,
+            'ffn_drop': 0,
+            'dropout_layer': dict(type='DropPath', drop_prob=drop_path),
+            'act_cfg': dict(type='GELU'),
+            'add_identity': False,
+            **ffn_cfgs
+        }
+        self.ffn = FFN(**_ffn_cfgs)
+        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+        # add an extra norm for every n blocks in the huge and giant models
+        if self.extra_norm:
+            self.norm3 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+    def forward(self, x, hw_shape):
+
+        def _inner_forward(x):
+            # Use post normalization
+            identity = x
+            x = self.attn(x, hw_shape)
+            x = self.norm1(x)
+            x = x + identity
+
+            identity = x
+            x = self.ffn(x)
+            x = self.norm2(x)
+            x = x + identity
+
+            if self.extra_norm:
+                x = self.norm3(x)
+
+            return x
+
+        if self.with_cp and x.requires_grad:
+            x = cp.checkpoint(_inner_forward, x)
+        else:
+            x = _inner_forward(x)
+
+        return x
+
+
+class SwinBlockV2Sequence(BaseModule):
+    """Module with successive Swin Transformer V2 blocks and a downsample
+    layer.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        depth (int): Number of successive swin transformer blocks.
+        num_heads (int): Number of attention heads.
+        window_size (int): The height and width of the window. Defaults to 8.
+        downsample (bool): Downsample the output of blocks by patch merging.
+            Defaults to False.
+        downsample_cfg (dict): The extra config of the patch merging layer.
+            Defaults to empty dict.
+        drop_paths (Sequence[float] | float): The drop path rate in each
+            block. Defaults to 0.
+        block_cfgs (Sequence[dict] | dict): The extra config of each block.
+            Defaults to empty dicts.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+        pad_small_map (bool): If True, pad the small feature map to the window
+            size, which is commonly used in detection and segmentation. If
+            False, avoid shifting window and shrink the window size to the
+            size of feature map, which is commonly used in classification.
+            Defaults to False.
+        extra_norm_every_n_blocks (int): Add an extra norm at the end of the
+            main branch every n blocks. Defaults to 0, which means no extra
+            norm layer is needed.
+        pretrained_window_size (int): Window size in the pretrained model.
+            Defaults to 0.
+        init_cfg (dict, optional): The extra config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 depth,
+                 num_heads,
+                 window_size=8,
+                 downsample=False,
+                 downsample_cfg=dict(),
+                 drop_paths=0.,
+                 block_cfgs=dict(),
+                 with_cp=False,
+                 pad_small_map=False,
+                 extra_norm_every_n_blocks=0,
+                 pretrained_window_size=0,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+
+        if not isinstance(drop_paths, Sequence):
+            drop_paths = [drop_paths] * depth
+
+        if not isinstance(block_cfgs, Sequence):
+            block_cfgs = [deepcopy(block_cfgs) for _ in range(depth)]
+
+        if downsample:
+            self.out_channels = 2 * embed_dims
+            _downsample_cfg = {
+                'in_channels': embed_dims,
+                'out_channels': self.out_channels,
+                'norm_cfg': dict(type='LN'),
+                **downsample_cfg
+            }
+            self.downsample = PatchMerging(**_downsample_cfg)
+        else:
+            self.out_channels = embed_dims
+            self.downsample = None
+
+        self.blocks = ModuleList()
+        for i in range(depth):
+            extra_norm = True if extra_norm_every_n_blocks and \
+                (i + 1) % extra_norm_every_n_blocks == 0 else False
+            _block_cfg = {
+                'embed_dims': self.out_channels,
+                'num_heads': num_heads,
+                'window_size': window_size,
+                'shift': False if i % 2 == 0 else True,
+                'extra_norm': extra_norm,
+                'drop_path': drop_paths[i],
+                'with_cp': with_cp,
+                'pad_small_map': pad_small_map,
+                'pretrained_window_size': pretrained_window_size,
+                **block_cfgs[i]
+            }
+            block = SwinBlockV2(**_block_cfg)
+            self.blocks.append(block)
+
+    def forward(self, x, in_shape):
+        if self.downsample:
+            x, out_shape = self.downsample(x, in_shape)
+        else:
+            out_shape = in_shape
+
+        for block in self.blocks:
+            x = block(x, out_shape)
+
+        return x, out_shape
+
+
+@MODELS.register_module()
+class SwinTransformerV2(BaseBackbone):
+    """Swin Transformer V2.
+
+    A PyTorch implementation of: `Swin Transformer V2:
+    Scaling Up Capacity and Resolution
+    `_
+
+    Inspired by the official implementation:
+    https://github.com/microsoft/Swin-Transformer
+
+    Args:
+        arch (str | dict): Swin Transformer V2 architecture. If a string,
+            choose from 'tiny', 'small', 'base', 'large', 'huge' and 'giant'.
+            If a dict, it should have the below keys:
+
+            - **embed_dims** (int): The dimensions of embedding.
+            - **depths** (List[int]): The number of blocks in each stage.
+            - **num_heads** (List[int]): The number of heads in attention
+              modules of each stage.
+            - **extra_norm_every_n_blocks** (int): Add an extra norm at the
+              end of the main branch every n blocks.
+
+            Defaults to 'tiny'.
+        img_size (int | tuple): The expected input image shape. Because we
+            support dynamic input shape, just set the argument to the most
+            common input image shape. Defaults to 256.
+        patch_size (int | tuple): The patch size in patch embedding.
+            Defaults to 4.
+        in_channels (int): The number of input channels. Defaults to 3.
+        window_size (int | Sequence): The height and width of the window.
+            Defaults to 8.
+        drop_rate (float): Dropout rate after embedding. Defaults to 0.
+        drop_path_rate (float): Stochastic depth rate. Defaults to 0.1.
+        use_abs_pos_embed (bool): If True, add absolute position embedding to
+            the patch embedding. Defaults to False.
+        interpolate_mode (str): Select the interpolate mode for absolute
+            position embedding vector resize. Defaults to "bicubic".
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval
+            mode). -1 means not freezing any parameters. Defaults to -1.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Defaults to False.
+        pad_small_map (bool): If True, pad the small feature map to the window
+            size, which is commonly used in detection and segmentation. If
+            False, avoid shifting window and shrink the window size to the
+            size of feature map, which is commonly used in classification.
+            Defaults to False.
+        norm_cfg (dict): Config dict for normalization layer for all output
+            features. Defaults to ``dict(type='LN')``.
+        stage_cfgs (Sequence[dict] | dict): Extra config dict for each
+            stage. Defaults to an empty dict.
+        patch_cfg (dict): Extra config dict for patch embedding.
+            Defaults to an empty dict.
+        pretrained_window_sizes (tuple(int)): Pretrained window sizes of
+            each layer. Defaults to [0, 0, 0, 0].
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+
+    Examples:
+        >>> from mmcls.models import SwinTransformerV2
+        >>> import torch
+        >>> extra_config = dict(
+        >>>     arch='tiny',
+        >>>     stage_cfgs=dict(downsample_cfg={'kernel_size': 3,
+        >>>                                     'padding': 'same'}))
+        >>> self = SwinTransformerV2(**extra_config)
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> output = self.forward(inputs)
+        >>> print(output[-1].shape)
+        torch.Size([1, 768, 3, 3])
+    """
+    arch_zoo = {
+        **dict.fromkeys(['t', 'tiny'],
+                        {'embed_dims': 96,
+                         'depths': [2, 2, 6, 2],
+                         'num_heads': [3, 6, 12, 24],
+                         'extra_norm_every_n_blocks': 0}),
+        **dict.fromkeys(['s', 'small'],
+                        {'embed_dims': 96,
+                         'depths': [2, 2, 18, 2],
+                         'num_heads': [3, 6, 12, 24],
+                         'extra_norm_every_n_blocks': 0}),
+        **dict.fromkeys(['b', 'base'],
+                        {'embed_dims': 128,
+                         'depths': [2, 2, 18, 2],
+                         'num_heads': [4, 8, 16, 32],
+                         'extra_norm_every_n_blocks': 0}),
+        **dict.fromkeys(['l', 'large'],
+                        {'embed_dims': 192,
+                         'depths': [2, 2, 18, 2],
+                         'num_heads': [6, 12, 24, 48],
+                         'extra_norm_every_n_blocks': 0}),
+        # The head count of the 'huge' model is not certain, and the model
+        # is employed in another parallel study about self-supervised
+        # learning.
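+        # An extra norm layer every 6 blocks follows the Swin V2 paper's
+        # recipe for stabilizing the training of large-capacity models.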
+ **dict.fromkeys(['h', 'huge'], + {'embed_dims': 352, + 'depths': [2, 2, 18, 2], + 'num_heads': [8, 16, 32, 64], + 'extra_norm_every_n_blocks': 6}), + **dict.fromkeys(['g', 'giant'], + {'embed_dims': 512, + 'depths': [2, 2, 42, 4], + 'num_heads': [16, 32, 64, 128], + 'extra_norm_every_n_blocks': 6}), + } # yapf: disable + + _version = 1 + num_extra_tokens = 0 + + def __init__(self, + arch='tiny', + img_size=256, + patch_size=4, + in_channels=3, + window_size=8, + drop_rate=0., + drop_path_rate=0.1, + out_indices=(3, ), + use_abs_pos_embed=False, + interpolate_mode='bicubic', + with_cp=False, + frozen_stages=-1, + norm_eval=False, + pad_small_map=False, + norm_cfg=dict(type='LN'), + stage_cfgs=dict(), + patch_cfg=dict(), + pretrained_window_sizes=[0, 0, 0, 0], + init_cfg=None): + super(SwinTransformerV2, self).__init__(init_cfg=init_cfg) + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = { + 'embed_dims', 'depths', 'num_heads', + 'extra_norm_every_n_blocks' + } + assert isinstance(arch, dict) and set(arch) == essential_keys, \ + f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.depths = self.arch_settings['depths'] + self.num_heads = self.arch_settings['num_heads'] + self.extra_norm_every_n_blocks = self.arch_settings[ + 'extra_norm_every_n_blocks'] + self.num_layers = len(self.depths) + self.out_indices = out_indices + self.use_abs_pos_embed = use_abs_pos_embed + self.interpolate_mode = interpolate_mode + self.frozen_stages = frozen_stages + + if isinstance(window_size, int): + self.window_sizes = [window_size for _ in range(self.num_layers)] + elif isinstance(window_size, Sequence): + assert len(window_size) == self.num_layers, \ + f'Length of window_sizes {len(window_size)} is not equal to '\ + f'length of stages {self.num_layers}.' 
+ self.window_sizes = window_size + else: + raise TypeError('window_size should be a Sequence or int.') + + _patch_cfg = dict( + in_channels=in_channels, + input_size=img_size, + embed_dims=self.embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=patch_size, + norm_cfg=dict(type='LN'), + ) + _patch_cfg.update(patch_cfg) + self.patch_embed = PatchEmbed(**_patch_cfg) + self.patch_resolution = self.patch_embed.init_out_size + + if self.use_abs_pos_embed: + num_patches = self.patch_resolution[0] * self.patch_resolution[1] + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, num_patches, self.embed_dims)) + self._register_load_state_dict_pre_hook( + self._prepare_abs_pos_embed) + + self._register_load_state_dict_pre_hook(self._delete_reinit_params) + + self.drop_after_pos = nn.Dropout(p=drop_rate) + self.norm_eval = norm_eval + + # stochastic depth + total_depth = sum(self.depths) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, total_depth) + ] # stochastic depth decay rule + + self.stages = ModuleList() + embed_dims = [self.embed_dims] + for i, (depth, + num_heads) in enumerate(zip(self.depths, self.num_heads)): + if isinstance(stage_cfgs, Sequence): + stage_cfg = stage_cfgs[i] + else: + stage_cfg = deepcopy(stage_cfgs) + downsample = True if i > 0 else False + _stage_cfg = { + 'embed_dims': embed_dims[-1], + 'depth': depth, + 'num_heads': num_heads, + 'window_size': self.window_sizes[i], + 'downsample': downsample, + 'drop_paths': dpr[:depth], + 'with_cp': with_cp, + 'pad_small_map': pad_small_map, + 'extra_norm_every_n_blocks': self.extra_norm_every_n_blocks, + 'pretrained_window_size': pretrained_window_sizes[i], + 'downsample_cfg': dict(use_post_norm=True), + **stage_cfg + } + + stage = SwinBlockV2Sequence(**_stage_cfg) + self.stages.append(stage) + + dpr = dpr[depth:] + embed_dims.append(stage.out_channels) + + for i in out_indices: + if norm_cfg is not None: + norm_layer = build_norm_layer(norm_cfg, embed_dims[i + 1])[1] + else: + norm_layer = nn.Identity() + + self.add_module(f'norm{i}', norm_layer) + + def init_weights(self): + super(SwinTransformerV2, self).init_weights() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress default init if use pretrained model. 
+            return
+
+        if self.use_abs_pos_embed:
+            trunc_normal_(self.absolute_pos_embed, std=0.02)
+
+    def forward(self, x):
+        x, hw_shape = self.patch_embed(x)
+
+        if self.use_abs_pos_embed:
+            x = x + resize_pos_embed(
+                self.absolute_pos_embed, self.patch_resolution, hw_shape,
+                self.interpolate_mode, self.num_extra_tokens)
+        x = self.drop_after_pos(x)
+
+        outs = []
+        for i, stage in enumerate(self.stages):
+            x, hw_shape = stage(x, hw_shape)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                out = norm_layer(x)
+                out = out.view(-1, *hw_shape,
+                               stage.out_channels).permute(0, 3, 1,
+                                                           2).contiguous()
+                outs.append(out)
+
+        return tuple(outs)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        for i in range(0, self.frozen_stages + 1):
+            m = self.stages[i]
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+        for i in self.out_indices:
+            if i <= self.frozen_stages:
+                for param in getattr(self, f'norm{i}').parameters():
+                    param.requires_grad = False
+
+    def train(self, mode=True):
+        super(SwinTransformerV2, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval() has effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+    def _prepare_abs_pos_embed(self, state_dict, prefix, *args, **kwargs):
+        name = prefix + 'absolute_pos_embed'
+        if name not in state_dict.keys():
+            return
+
+        ckpt_pos_embed_shape = state_dict[name].shape
+        if self.absolute_pos_embed.shape != ckpt_pos_embed_shape:
+            from mmengine.logging import MMLogger
+            logger = MMLogger.get_current_instance()
+            logger.info(
+                'Resize the absolute_pos_embed shape from '
+                f'{ckpt_pos_embed_shape} to {self.absolute_pos_embed.shape}.')
+
+            ckpt_pos_embed_shape = to_2tuple(
+                int(np.sqrt(ckpt_pos_embed_shape[1] - self.num_extra_tokens)))
+            pos_embed_shape = self.patch_embed.init_out_size
+
+            state_dict[name] = resize_pos_embed(state_dict[name],
+                                                ckpt_pos_embed_shape,
+                                                pos_embed_shape,
+                                                self.interpolate_mode,
+                                                self.num_extra_tokens)
+
+    def _delete_reinit_params(self, state_dict, prefix, *args, **kwargs):
+        # delete relative_position_index since we always re-init it
+        relative_position_index_keys = [
+            k for k in state_dict.keys() if 'relative_position_index' in k
+        ]
+        for k in relative_position_index_keys:
+            del state_dict[k]
+
+        # delete relative_coords_table since we always re-init it
+        relative_coords_table_keys = [
+            k for k in state_dict.keys() if 'relative_coords_table' in k
+        ]
+        for k in relative_coords_table_keys:
+            del state_dict[k]
diff --git a/mmcls/models/backbones/vision_transformer.py b/mmcls/models/backbones/vision_transformer.py
index 0a1878a2..6dc14eb8 100644
--- a/mmcls/models/backbones/vision_transformer.py
+++ b/mmcls/models/backbones/vision_transformer.py
@@ -288,6 +288,16 @@ class VisionTransformer(BaseBackbone):
             'num_heads': 16,
             'feedforward_channels': 4096
         }),
+    **dict.fromkeys(
+        ['h', 'huge'],
+        {
+            # The same as the implementation in MAE
+            # 
+            'embed_dims': 1280,
+            'num_layers': 32,
+            'num_heads': 16,
+            'feedforward_channels': 5120
+        }),
     **dict.fromkeys(
         ['deit-t', 'deit-tiny'], {
             'embed_dims': 192,
diff --git a/mmcls/models/heads/__init__.py b/mmcls/models/heads/__init__.py
index 13dbba98..02be12cf 100644
--- a/mmcls/models/heads/__init__.py
+++ b/mmcls/models/heads/__init__.py
@@ -2,6 +2,7 @@ from .cls_head import ClsHead
 from .conformer_head import ConformerHead
 from .deit_head import DeiTClsHead
+from .efficientformer_head import EfficientFormerClsHead
 from .linear_head import LinearClsHead
 from .multi_label_cls_head import MultiLabelClsHead
 from .multi_label_linear_head import MultiLabelLinearClsHead
@@ -11,5 +12,5 @@ from .vision_transformer_head import VisionTransformerClsHead
 __all__ = [
     'ClsHead', 'LinearClsHead', 'StackedLinearClsHead', 'MultiLabelClsHead',
     'MultiLabelLinearClsHead', 'VisionTransformerClsHead', 'DeiTClsHead',
-    'ConformerHead'
+    'ConformerHead', 'EfficientFormerClsHead'
 ]
diff --git a/mmcls/models/heads/efficientformer_head.py b/mmcls/models/heads/efficientformer_head.py
new file mode 100644
index 00000000..1b67c1b3
--- /dev/null
+++ b/mmcls/models/heads/efficientformer_head.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+
+from mmcls.registry import MODELS
+from mmcls.structures import ClsDataSample
+from .cls_head import ClsHead
+
+
+@MODELS.register_module()
+class EfficientFormerClsHead(ClsHead):
+    """EfficientFormer classifier head.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        distillation (bool): Whether to use an additional distillation head.
+            Defaults to True.
+        init_cfg (dict): The extra initialization configs. Defaults to
+            ``dict(type='Normal', layer='Linear', std=0.01)``.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 distillation=True,
+                 init_cfg=dict(type='Normal', layer='Linear', std=0.01),
+                 *args,
+                 **kwargs):
+        super(EfficientFormerClsHead, self).__init__(
+            init_cfg=init_cfg, *args, **kwargs)
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.dist = distillation
+
+        if self.num_classes <= 0:
+            raise ValueError(
+                f'num_classes={num_classes} must be a positive integer')
+
+        self.head = nn.Linear(self.in_channels, self.num_classes)
+        if self.dist:
+            self.dist_head = nn.Linear(self.in_channels, self.num_classes)
+
+    def forward(self, feats: Tuple[torch.Tensor]) -> torch.Tensor:
+        """The forward process."""
+        pre_logits = self.pre_logits(feats)
+        # The final classification head.
+        cls_score = self.head(pre_logits)
+
+        if self.dist:
+            cls_score = (cls_score + self.dist_head(pre_logits)) / 2
+        return cls_score
+
+    def pre_logits(self, feats: Tuple[torch.Tensor]) -> torch.Tensor:
+        """The process before the final classification head.
+
+        The input ``feats`` is a tuple of tensors, and each tensor is the
+        feature of a backbone stage. In :obj:`EfficientFormerClsHead`, we
+        just obtain the feature of the last stage.
+        """
+        # The EfficientFormerClsHead doesn't have any other module,
+        # just return the feature of the last stage after unpacking.
+        return feats[-1]
+
+    def loss(self, feats: Tuple[torch.Tensor],
+             data_samples: List[ClsDataSample], **kwargs) -> dict:
+        """Calculate losses from the classification score.
+
+        Args:
+            feats (tuple[Tensor]): The features extracted from the backbone.
+                Multiple stage inputs are acceptable but only the last stage
+                will be used to classify. The shape of every item should be
+                ``(num_samples, in_channels)``.
+            data_samples (List[ClsDataSample]): The annotation data of
+                every sample.
+            **kwargs: Other keyword arguments to forward the loss module.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        if self.dist:
+            raise NotImplementedError(
+                "MMClassification doesn't support training"
+                ' the distilled version of EfficientFormer.')
+        else:
+            return super().loss(feats, data_samples, **kwargs)
diff --git a/mmcls/models/heads/linear_head.py b/mmcls/models/heads/linear_head.py
index 2a236b6d..0bd746dd 100644
--- a/mmcls/models/heads/linear_head.py
+++ b/mmcls/models/heads/linear_head.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-# Copyrigforward_trainht (c) OpenMMLab. All rights reserved.
 from typing import Optional, Tuple
 
 import torch
diff --git a/mmcls/models/losses/label_smooth_loss.py b/mmcls/models/losses/label_smooth_loss.py
index 99e50a77..ae8adc7a 100644
--- a/mmcls/models/losses/label_smooth_loss.py
+++ b/mmcls/models/losses/label_smooth_loss.py
@@ -24,37 +24,42 @@ class LabelSmoothLoss(nn.Module):
         label_smooth_val (float): The degree of label smoothing.
         num_classes (int, optional): Number of classes. Defaults to None.
         mode (str): Refers to notes, Options are 'original', 'classy_vision',
-            'multi_label'. Defaults to 'original'
+            'multi_label'. Defaults to 'original'.
+        use_sigmoid (bool, optional): Whether the prediction uses sigmoid or
+            softmax. Defaults to None, which means to use sigmoid in
+            "multi_label" mode and not to use it in other modes.
         reduction (str): The method used to reduce the loss.
             Options are "none", "mean" and "sum". Defaults to 'mean'.
         loss_weight (float): Weight of the loss. Defaults to 1.0.
 
     Notes:
-        if the mode is "original", this will use the same label smooth method
-        as the original paper as:
+        - if the mode is **"original"**, this will use the same label smooth
+          method as the original paper as:
 
-    .. math::
-        (1-\epsilon)\delta_{k, y} + \frac{\epsilon}{K}
+          .. math::
+              (1-\epsilon)\delta_{k, y} + \frac{\epsilon}{K}
 
-        where epsilon is the `label_smooth_val`, K is the num_classes and
-        delta(k,y) is Dirac delta, which equals 1 for k=y and 0 otherwise.
+          where :math:`\epsilon` is the ``label_smooth_val``, :math:`K` is the
+          ``num_classes`` and :math:`\delta_{k, y}` is Dirac delta, which
+          equals 1 for :math:`k=y` and 0 otherwise.
 
-        if the mode is "classy_vision", this will use the same label smooth
-        method as the facebookresearch/ClassyVision repo as:
+        - if the mode is **"classy_vision"**, this will use the same label
+          smooth method as the facebookresearch/ClassyVision repo as:
 
-    .. math::
-        \frac{\delta_{k, y} + \epsilon/K}{1+\epsilon}
+          .. math::
+              \frac{\delta_{k, y} + \epsilon/K}{1+\epsilon}
 
-        if the mode is "multi_label", this will accept labels from multi-label
-        task and smoothing them as:
+        - if the mode is **"multi_label"**, this will accept labels from
+          multi-label tasks and smooth them as:
 
-    .. math::
-        (1-2\epsilon)\delta_{k, y} + \epsilon
+          .. math::
+              (1-2\epsilon)\delta_{k, y} + \epsilon
     """
 
     def __init__(self,
                  label_smooth_val,
                  num_classes=None,
+                 use_sigmoid=None,
                  mode='original',
                  reduction='mean',
                  loss_weight=1.0):
@@ -82,12 +87,21 @@ class LabelSmoothLoss(nn.Module):
         self._eps = label_smooth_val
         if mode == 'classy_vision':
             self._eps = label_smooth_val / (1 + label_smooth_val)
+
         if mode == 'multi_label':
-            self.ce = CrossEntropyLoss(use_sigmoid=True)
+            if not use_sigmoid:
+                from mmengine.logging import MMLogger
+                MMLogger.get_current_instance().warning(
+                    'For multi-label tasks, please set `use_sigmoid=True` '
+                    'to use binary cross entropy.')
             self.smooth_label = self.multilabel_smooth_label
+            use_sigmoid = True if use_sigmoid is None else use_sigmoid
         else:
-            self.ce = CrossEntropyLoss(use_soft=True)
             self.smooth_label = self.original_smooth_label
+            use_sigmoid = False if use_sigmoid is None else use_sigmoid
+
+        self.ce = CrossEntropyLoss(
+            use_sigmoid=use_sigmoid, use_soft=not use_sigmoid)
 
     def generate_one_hot_like_label(self, label):
         """This function takes one-hot or index label vectors and computes one-
@@ -148,7 +162,7 @@ class LabelSmoothLoss(nn.Module):
             f'and target.shape: {one_hot_like_label.shape}'
 
         smoothed_label = self.smooth_label(one_hot_like_label)
-        return self.ce.forward(
+        return self.loss_weight * self.ce.forward(
             cls_score,
             smoothed_label,
             weight=weight,
diff --git a/mmcls/models/utils/__init__.py b/mmcls/models/utils/__init__.py
index e1553bc8..f094fd80 100644
--- a/mmcls/models/utils/__init__.py
+++ b/mmcls/models/utils/__init__.py
@@ -1,5 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from .attention import BEiTAttention, MultiheadAttention, ShiftWindowMSA
+from .attention import (BEiTAttention, ChannelMultiheadAttention,
+                        MultiheadAttention, ShiftWindowMSA, WindowMSA,
+                        WindowMSAV2)
 from .batch_augments import CutMix, Mixup, RandomBatchAugment, ResizeMix
 from .channel_shuffle import channel_shuffle
 from .data_preprocessor import ClsDataPreprocessor
@@ -7,15 +9,39 @@ from .embed import (HybridEmbed, PatchEmbed, PatchMerging, resize_pos_embed,
                     resize_relative_position_bias_table)
 from .helpers import is_tracing, to_2tuple, to_3tuple, to_4tuple, to_ntuple
 from .inverted_residual import InvertedResidual
+from .layer_scale import LayerScale
 from .make_divisible import make_divisible
-from .position_encoding import ConditionalPositionEncoding
+from .position_encoding import (ConditionalPositionEncoding,
+                                PositionEncodingFourier)
 from .se_layer import SELayer
 
 __all__ = [
-    'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer',
-    'to_ntuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', 'PatchEmbed',
-    'PatchMerging', 'HybridEmbed', 'RandomBatchAugment', 'ShiftWindowMSA',
-    'is_tracing', 'MultiheadAttention', 'ConditionalPositionEncoding',
-    'resize_pos_embed', 'resize_relative_position_bias_table',
-    'ClsDataPreprocessor', 'Mixup', 'CutMix', 'ResizeMix', 'BEiTAttention'
+    'channel_shuffle',
+    'make_divisible',
+    'InvertedResidual',
+    'SELayer',
+    'to_ntuple',
+    'to_2tuple',
+    'to_3tuple',
+    'to_4tuple',
+    'PatchEmbed',
+    'PatchMerging',
+    'HybridEmbed',
+    'RandomBatchAugment',
+    'ShiftWindowMSA',
+    'is_tracing',
+    'MultiheadAttention',
+    'ConditionalPositionEncoding',
+    'resize_pos_embed',
+    'resize_relative_position_bias_table',
+    'ClsDataPreprocessor',
+    'Mixup',
+    'CutMix',
+    'ResizeMix',
+    'BEiTAttention',
+    'LayerScale',
+    'WindowMSA',
+    'WindowMSAV2',
+    'ChannelMultiheadAttention',
+    'PositionEncodingFourier',
 ]
diff --git a/mmcls/models/utils/attention.py b/mmcls/models/utils/attention.py
index b718e475..064ec388 100644
--- a/mmcls/models/utils/attention.py
+++ b/mmcls/models/utils/attention.py
@@ -1,16 +1,25 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-import warnings
-
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from mmcv.cnn.bricks.drop import build_dropout
 from mmengine.model import BaseModule
 from mmengine.model.weight_init import trunc_normal_
+from mmengine.utils import digit_version
 
 from mmcls.registry import MODELS
 from .helpers import to_2tuple
 
+# Since PyTorch v1.10.0, calling torch.meshgrid without the indexing
+# argument raises an extra warning. For more details,
+# refer to https://github.com/pytorch/pytorch/issues/50276
+if digit_version(torch.__version__) >= digit_version('1.10.0'):
+    from functools import partial
+    torch_meshgrid = partial(torch.meshgrid, indexing='ij')
+else:
+    torch_meshgrid = torch.meshgrid
+
 
 class WindowMSA(BaseModule):
     """Window based multi-head self-attention (W-MSA) module with relative
@@ -121,6 +130,178 @@ class WindowMSA(BaseModule):
         return (seq1[:, None] + seq2[None, :]).reshape(1, -1)
 
 
+class WindowMSAV2(BaseModule):
+    """Window based multi-head self-attention (W-MSA) module with relative
+    position bias.
+
+    Based on the implementation in the Swin Transformer V2 original repo.
+    Refer to
+    https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer_v2.py
+    for more details.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool): If True, add a learnable bias to q, k, v.
+            Defaults to True.
+        attn_drop (float): Dropout ratio of attention weight.
+            Defaults to 0.
+        proj_drop (float): Dropout ratio of output. Defaults to 0.
+        cpb_mlp_hidden_dims (int): The hidden dimensions of the continuous
+            relative position bias network. Defaults to 512.
+        pretrained_window_size (tuple(int)): The height and width of the
+            window in pre-training. Defaults to (0, 0), which means we don't
+            load a pretrained model.
+        init_cfg (dict, optional): The extra config for initialization.
+            Defaults to None.
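+
+    Examples:
+        >>> # A minimal usage sketch. The sizes below are illustrative
+        >>> # assumptions (a single 7x7 window), not values from this PR.
+        >>> import torch
+        >>> attn = WindowMSAV2(embed_dims=96, window_size=(7, 7), num_heads=3)
+        >>> x = torch.rand(4, 49, 96)  # (num_windows*B, Wh*Ww, C)
+        >>> attn(x).shape
+        torch.Size([4, 49, 96])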
+ """ + + def __init__(self, + embed_dims, + window_size, + num_heads, + qkv_bias=True, + attn_drop=0., + proj_drop=0., + cpb_mlp_hidden_dims=512, + pretrained_window_size=(0, 0), + init_cfg=None): + + super().__init__(init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + + # Use small network for continuous relative position bias + self.cpb_mlp = nn.Sequential( + nn.Linear( + in_features=2, out_features=cpb_mlp_hidden_dims, bias=True), + nn.ReLU(inplace=True), + nn.Linear( + in_features=cpb_mlp_hidden_dims, + out_features=num_heads, + bias=False)) + + # Add learnable scalar for cosine attention + self.logit_scale = nn.Parameter( + torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True) + + # get relative_coords_table + relative_coords_h = torch.arange( + -(self.window_size[0] - 1), + self.window_size[0], + dtype=torch.float32) + relative_coords_w = torch.arange( + -(self.window_size[1] - 1), + self.window_size[1], + dtype=torch.float32) + relative_coords_table = torch.stack( + torch_meshgrid([relative_coords_h, relative_coords_w])).permute( + 1, 2, 0).contiguous().unsqueeze(0) # 1, 2*Wh-1, 2*Ww-1, 2 + if pretrained_window_size[0] > 0: + relative_coords_table[:, :, :, 0] /= ( + pretrained_window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= ( + pretrained_window_size[1] - 1) + else: + relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1) + relative_coords_table *= 8 # normalize to -8, 8 + relative_coords_table = torch.sign(relative_coords_table) * torch.log2( + torch.abs(relative_coords_table) + 1.0) / np.log2(8) + self.register_buffer('relative_coords_table', relative_coords_table) + + # get pair-wise relative position index + # for each token inside the window + indexes_h = torch.arange(self.window_size[0]) + indexes_w = torch.arange(self.window_size[1]) + coordinates = torch.stack( + torch_meshgrid([indexes_h, indexes_w]), dim=0) # 2, Wh, Ww + coordinates = torch.flatten(coordinates, start_dim=1) # 2, Wh*Ww + # 2, Wh*Ww, Wh*Ww + relative_coordinates = coordinates[:, :, None] - coordinates[:, + None, :] + relative_coordinates = relative_coordinates.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + + relative_coordinates[:, :, 0] += self.window_size[ + 0] - 1 # shift to start from 0 + relative_coordinates[:, :, 1] += self.window_size[1] - 1 + relative_coordinates[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coordinates.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(embed_dims)) + self.v_bias = nn.Parameter(torch.zeros(embed_dims)) + else: + self.q_bias = None + self.v_bias = None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop) + + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (num_windows*B, N, C) + mask (tensor, Optional): mask with shape of (num_windows, Wh*Ww, + Wh*Ww), value should be between (-inf, 0]. 
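+
+        Returns:
+            torch.Tensor: The attended features, with the same shape
+                (num_windows*B, N, C) as the input.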
+ """ + B_, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + (self.q_bias, + torch.zeros_like(self.v_bias, + requires_grad=False), self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + # cosine attention + attn = ( + F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)) + logit_scale = torch.clamp( + self.logit_scale, max=np.log(1. / 0.01)).exp() + attn = attn * logit_scale + + relative_position_bias_table = self.cpb_mlp( + self.relative_coords_table).view(-1, self.num_heads) + relative_position_bias = relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + relative_position_bias = 16 * torch.sigmoid(relative_position_bias) + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @MODELS.register_module() class ShiftWindowMSA(BaseModule): """Shift Window Multihead Self-Attention Module. @@ -131,13 +312,6 @@ class ShiftWindowMSA(BaseModule): window_size (int): The height and width of the window. shift_size (int, optional): The shift step of each window towards right-bottom. If zero, act as regular window-msa. Defaults to 0. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Defaults to True - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Defaults to None. - attn_drop (float, optional): Dropout ratio of attention weight. - Defaults to 0.0. - proj_drop (float, optional): Dropout ratio of output. Defaults to 0. dropout_layer (dict, optional): The dropout_layer used before output. Defaults to dict(type='DropPath', drop_prob=0.). pad_small_map (bool): If True, pad the small feature map to the window @@ -145,8 +319,12 @@ class ShiftWindowMSA(BaseModule): avoid shifting window and shrink the window size to the size of feature map, which is common used in classification. Defaults to False. + window_msa (Callable): To build a window multi-head attention module. + Defaults to :class:`WindowMSA`. init_cfg (dict, optional): The extra config for initialization. Defaults to None. + **kwargs: Other keyword arguments to build the window multi-head + attention module. """ def __init__(self, @@ -154,36 +332,22 @@ class ShiftWindowMSA(BaseModule): num_heads, window_size, shift_size=0, - qkv_bias=True, - qk_scale=None, - attn_drop=0, - proj_drop=0, dropout_layer=dict(type='DropPath', drop_prob=0.), pad_small_map=False, - input_resolution=None, - auto_pad=None, - init_cfg=None): + window_msa=WindowMSA, + init_cfg=None, + **kwargs): super().__init__(init_cfg) - if input_resolution is not None or auto_pad is not None: - warnings.warn( - 'The ShiftWindowMSA in new version has supported auto padding ' - 'and dynamic input shape in all condition. 
And the argument '
-                '`auto_pad` and `input_resolution` have been deprecated.',
-                DeprecationWarning)
-
         self.shift_size = shift_size
         self.window_size = window_size
         assert 0 <= self.shift_size < self.window_size
 
-        self.w_msa = WindowMSA(
+        self.w_msa = window_msa(
             embed_dims=embed_dims,
-            window_size=to_2tuple(self.window_size),
             num_heads=num_heads,
-            qkv_bias=qkv_bias,
-            qk_scale=qk_scale,
-            attn_drop=attn_drop,
-            proj_drop=proj_drop,
+            window_size=to_2tuple(self.window_size),
+            **kwargs,
         )
 
         self.drop = build_dropout(dropout_layer)
@@ -454,7 +618,7 @@ class BEiTAttention(BaseModule):
         coords_h = torch.arange(Wh)
         coords_w = torch.arange(Ww)
         # coords shape is (2, Wh, Ww)
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
+        coords = torch.stack(torch_meshgrid([coords_h, coords_w]))
         # coords_flatten shape is (2, Wh*Ww)
         coords_flatten = torch.flatten(coords, 1)
         relative_coords = (
@@ -513,3 +677,91 @@ class BEiTAttention(BaseModule):
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
+
+
+class ChannelMultiheadAttention(BaseModule):
+    """Channel Multihead Self-attention Module.
+
+    This module implements channel multi-head attention that supports
+    different input dims and embed dims.
+
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads.
+        input_dims (int, optional): The input dimension, and if None,
+            use ``embed_dims``. Defaults to None.
+        attn_drop (float): Dropout rate of the dropout layer after the
+            attention calculation of query and key. Defaults to 0.
+        proj_drop (float): Dropout rate of the dropout layer after the
+            output projection. Defaults to 0.
+        dropout_layer (dict): The dropout config before adding the shortcut.
+            Defaults to ``dict(type='Dropout', drop_prob=0.)``.
+        qkv_bias (bool): If True, add a learnable bias to q, k, v.
+            Defaults to False.
+        proj_bias (bool): If True, add a learnable bias to the output
+            projection. Defaults to True.
+        qk_scale_type (str): The scale type of qk scale. It can be
+            'learnable', 'fixed' or 'none'. Defaults to 'learnable'.
+        qk_scale (float, optional): If ``qk_scale_type`` is 'none', this
+            should be specified with a valid float number. Defaults to None.
+        v_shortcut (bool): Add a shortcut from value to output. It's usually
+            used if ``input_dims`` is different from ``embed_dims``.
+            Defaults to False.
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
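+
+    Examples:
+        >>> # A minimal usage sketch. The sizes are illustrative assumptions,
+        >>> # not values from this PR. Note the attention map is computed
+        >>> # across channels, i.e. it has shape (C/num_heads, C/num_heads).
+        >>> import torch
+        >>> attn = ChannelMultiheadAttention(embed_dims=64, num_heads=4)
+        >>> x = torch.rand(2, 196, 64)  # (B, N, C)
+        >>> attn(x).shape
+        torch.Size([2, 196, 64])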
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads=8,
+                 input_dims=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 dropout_layer=dict(type='Dropout', drop_prob=0.),
+                 qkv_bias=False,
+                 proj_bias=True,
+                 qk_scale_type='learnable',
+                 qk_scale=None,
+                 v_shortcut=False,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+
+        self.input_dims = input_dims or embed_dims
+        self.embed_dims = embed_dims
+        self.num_heads = num_heads
+        self.v_shortcut = v_shortcut
+
+        self.head_dims = embed_dims // num_heads
+        if qk_scale_type == 'learnable':
+            self.scale = nn.Parameter(torch.ones(num_heads, 1, 1))
+        elif qk_scale_type == 'fixed':
+            self.scale = self.head_dims**-0.5
+        elif qk_scale_type == 'none':
+            assert qk_scale is not None
+            self.scale = qk_scale
+        else:
+            raise ValueError("'qk_scale_type' should be 'learnable', "
+                             "'fixed' or 'none'.")
+
+        self.qkv = nn.Linear(self.input_dims, embed_dims * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(embed_dims, embed_dims, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        self.out_drop = build_dropout(dropout_layer)
+
+    def forward(self, x):
+        B, N, _ = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                  self.head_dims).permute(2, 0, 3, 1, 4)
+
+        q, k, v = [item.transpose(-2, -1) for item in [qkv[0], qkv[1], qkv[2]]]
+
+        q, k = F.normalize(q, dim=-1), F.normalize(k, dim=-1)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+
+        x = (attn @ v).permute(0, 3, 1, 2).reshape(B, N, self.embed_dims)
+        x = self.proj(x)
+        x = self.out_drop(self.proj_drop(x))
+
+        if self.v_shortcut:
+            x = qkv[2].squeeze(1) + x
+        return x
diff --git a/mmcls/models/utils/embed.py b/mmcls/models/utils/embed.py
index bad563a6..1a1a8369 100644
--- a/mmcls/models/utils/embed.py
+++ b/mmcls/models/utils/embed.py
@@ -1,11 +1,13 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import warnings
+from typing import Sequence
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.cnn.bricks.transformer import AdaptivePadding
 from mmengine.model import BaseModule
 
 from .helpers import to_2tuple
@@ -274,91 +276,146 @@ class HybridEmbed(BaseModule):
 class PatchMerging(BaseModule):
     """Merge patch feature map.
 
-    This layer use nn.Unfold to group feature map by kernel_size, and use norm
-    and linear layer to embed grouped feature map.
+    Modified from mmcv, and this module supports specifying whether to use
+    post-norm.
+
+    This layer groups the feature map by kernel_size, and applies norm and
+    linear layers to the grouped feature map (used in Swin Transformer). Our
+    implementation uses :class:`torch.nn.Unfold` to merge patches, which is
+    about 25% faster than the original implementation. However, we need to
+    modify pretrained models for compatibility.
 
     Args:
-        input_resolution (tuple): The size of input patch resolution.
-        in_channels (int): The num of input channels.
-        expansion_ratio (Number): Expansion ratio of output channels. The num
-            of output channels is equal to int(expansion_ratio * in_channels).
+        in_channels (int): The num of input channels. It should be fully
+            covered by the filter and stride you specified.
+        out_channels (int): The num of output channels.
         kernel_size (int | tuple, optional): the kernel size in the unfold
             layer. Defaults to 2.
         stride (int | tuple, optional): the stride of the sliding blocks in the
-            unfold layer. Defaults to be equal with kernel_size.
-        padding (int | tuple, optional): zero padding width in the unfold
-            layer. Defaults to 0.
+            unfold layer. Defaults to None, which means to be set as
+            ``kernel_size``.
+        padding (int | tuple | str): The padding length of the embedding
+            conv. When it is a string, it specifies the mode of adaptive
+            padding; "same" and "corner" are supported now.
+            Defaults to "corner".
         dilation (int | tuple, optional): dilation parameter in the unfold
             layer. Defaults to 1.
         bias (bool, optional): Whether to add bias in linear layer or not.
             Defaults to False.
         norm_cfg (dict, optional): Config dict for normalization layer.
-            Defaults to dict(type='LN').
+            Defaults to ``dict(type='LN')``.
+        use_post_norm (bool): Whether to use post normalization here.
+            Defaults to False.
         init_cfg (dict, optional): The extra config for initialization.
             Defaults to None.
     """
 
     def __init__(self,
-                 input_resolution,
                  in_channels,
-                 expansion_ratio,
+                 out_channels,
                  kernel_size=2,
                  stride=None,
-                 padding=0,
+                 padding='corner',
                  dilation=1,
                  bias=False,
                  norm_cfg=dict(type='LN'),
+                 use_post_norm=False,
                  init_cfg=None):
-        super().__init__(init_cfg)
-        warnings.warn('The `PatchMerging` in mmcls will be deprecated. '
-                      'Please use `mmcv.cnn.bricks.transformer.PatchMerging`. '
-                      "It's more general and supports dynamic input shape")
-
-        H, W = input_resolution
-        self.input_resolution = input_resolution
+        super().__init__(init_cfg=init_cfg)
         self.in_channels = in_channels
-        self.out_channels = int(expansion_ratio * in_channels)
+        self.out_channels = out_channels
+        self.use_post_norm = use_post_norm
 
-        if stride is None:
+        if stride:
+            stride = stride
+        else:
             stride = kernel_size
+
         kernel_size = to_2tuple(kernel_size)
         stride = to_2tuple(stride)
-        padding = to_2tuple(padding)
         dilation = to_2tuple(dilation)
-        self.sampler = nn.Unfold(kernel_size, dilation, padding, stride)
+
+        if isinstance(padding, str):
+            self.adaptive_padding = AdaptivePadding(
+                kernel_size=kernel_size,
+                stride=stride,
+                dilation=dilation,
+                padding=padding)
+            # disable the padding of unfold
+            padding = 0
+        else:
+            self.adaptive_padding = None
+
+        padding = to_2tuple(padding)
+        self.sampler = nn.Unfold(
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=padding,
+            stride=stride)
 
         sample_dim = kernel_size[0] * kernel_size[1] * in_channels
 
+        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
         if norm_cfg is not None:
-            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+            # build a pre- or post-norm layer based on different channels
+            if self.use_post_norm:
+                self.norm = build_norm_layer(norm_cfg, out_channels)[1]
+            else:
+                self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
         else:
             self.norm = None
 
-        self.reduction = nn.Linear(sample_dim, self.out_channels, bias=bias)
-
-        # See https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
-        H_out = (H + 2 * padding[0] - dilation[0] *
-                 (kernel_size[0] - 1) - 1) // stride[0] + 1
-        W_out = (W + 2 * padding[1] - dilation[1] *
-                 (kernel_size[1] - 1) - 1) // stride[1] + 1
-        self.output_resolution = (H_out, W_out)
-
-    def forward(self, x):
+    def forward(self, x, input_size):
         """
-        x: B, H*W, C
+        Args:
+            x (Tensor): Has shape (B, H*W, C_in).
+            input_size (tuple[int]): The spatial shape of x, arranged as
+                (H, W).
+
+        Returns:
+            tuple: Contains merged results and its spatial shape.
+
+            - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+            - out_size (tuple[int]): Spatial shape of x, arranged as
+              (Merged_H, Merged_W).
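+
+        Examples:
+            >>> # A minimal usage sketch. The sizes are illustrative
+            >>> # assumptions: merge a 4x4 map of 16-channel tokens into a
+            >>> # 2x2 map of 32-channel tokens.
+            >>> import torch
+            >>> merge = PatchMerging(in_channels=16, out_channels=32)
+            >>> x = torch.rand(1, 16, 16)  # (B, H*W, C_in)
+            >>> out, out_size = merge(x, (4, 4))
+            >>> out.shape, out_size
+            (torch.Size([1, 4, 32]), (2, 2))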
""" - H, W = self.input_resolution B, L, C = x.shape + assert isinstance(input_size, Sequence), f'Expect ' \ + f'input_size is ' \ + f'`Sequence` ' \ + f'but get {input_size}' + + H, W = input_size assert L == H * W, 'input feature has wrong size' x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + if self.adaptive_padding: + x = self.adaptive_padding(x) + H, W = x.shape[-2:] + # Use nn.Unfold to merge patch. About 25% faster than original method, # but need to modify pretrained model for compatibility - x = self.sampler(x) # B, 4*C, H/2*W/2 + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + x = self.sampler(x) + + out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * + (self.sampler.kernel_size[0] - 1) - + 1) // self.sampler.stride[0] + 1 + out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * + (self.sampler.kernel_size[1] - 1) - + 1) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) x = x.transpose(1, 2) # B, H/2*W/2, 4*C - x = self.norm(x) if self.norm else x - x = self.reduction(x) + if self.use_post_norm: + # use post-norm here + x = self.reduction(x) + x = self.norm(x) if self.norm else x + else: + x = self.norm(x) if self.norm else x + x = self.reduction(x) - return x + return x, output_size diff --git a/mmcls/models/utils/layer_scale.py b/mmcls/models/utils/layer_scale.py new file mode 100644 index 00000000..12d92db7 --- /dev/null +++ b/mmcls/models/utils/layer_scale.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + + +class LayerScale(nn.Module): + """LayerScale layer. + + Args: + dim (int): Dimension of input features. + inplace (bool): inplace: can optionally do the + operation in-place. Defaults to False. + data_format (str): The input data format, could be 'channels_last' + or 'channels_first', representing (B, C, H, W) and + (B, N, C) format data respectively. Defaults to 'channels_last'. + """ + + def __init__(self, + dim: int, + inplace: bool = False, + data_format: str = 'channels_last'): + super().__init__() + assert data_format in ('channels_last', 'channels_first'), \ + "'data_format' could only be channels_last or channels_first." + self.inplace = inplace + self.data_format = data_format + self.weight = nn.Parameter(torch.ones(dim) * 1e-5) + + def forward(self, x): + if self.data_format == 'channels_first': + if self.inplace: + return x.mul_(self.weight.view(-1, 1, 1)) + else: + return x * self.weight.view(-1, 1, 1) + return x.mul_(self.weight) if self.inplace else x * self.weight diff --git a/mmcls/models/utils/position_encoding.py b/mmcls/models/utils/position_encoding.py index da22df77..2490587b 100644 --- a/mmcls/models/utils/position_encoding.py +++ b/mmcls/models/utils/position_encoding.py @@ -1,6 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +import math +from functools import partial + +import torch import torch.nn as nn from mmengine.model import BaseModule +from mmengine.utils import digit_version class ConditionalPositionEncoding(BaseModule): @@ -39,3 +44,66 @@ class ConditionalPositionEncoding(BaseModule): x = self.proj(cnn_feat) x = x.flatten(2).transpose(1, 2) return x + + +class PositionEncodingFourier(BaseModule): + """The Position Encoding Fourier (PEF) module. + + The PEF is adopted from EdgeNeXt '_. + Args: + in_channels (int): Number of input channels. + Default: 32 + embed_dims (int): The feature dimension. + Default: 768. + temperature (int): Temperature. + Default: 10000. 
+ dtype (torch.dtype): The data type. + Default: torch.float32. + init_cfg (dict): The config dict for initializing the module. + Default: None. + """ + + def __init__(self, + in_channels=32, + embed_dims=768, + temperature=10000, + dtype=torch.float32, + init_cfg=None): + super(PositionEncodingFourier, self).__init__(init_cfg=init_cfg) + self.proj = nn.Conv2d(in_channels * 2, embed_dims, kernel_size=1) + self.scale = 2 * math.pi + self.in_channels = in_channels + self.embed_dims = embed_dims + self.dtype = dtype + + if digit_version(torch.__version__) < digit_version('1.8.0'): + floor_div = torch.floor_divide + else: + floor_div = partial(torch.div, rounding_mode='floor') + dim_t = torch.arange(in_channels, dtype=self.dtype) + self.dim_t = temperature**(2 * floor_div(dim_t, 2) / in_channels) + + def forward(self, bhw_shape): + B, H, W = bhw_shape + mask = torch.zeros(B, H, W).bool().to(self.proj.weight.device) + not_mask = ~mask + eps = 1e-6 + y_embed = not_mask.cumsum(1, dtype=self.dtype) + x_embed = not_mask.cumsum(2, dtype=self.dtype) + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = self.dim_t.to(mask.device) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).flatten(3) + + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + pos = self.proj(pos) + + return pos diff --git a/mmcls/version.py b/mmcls/version.py index 7ac2c40a..e962ecca 100644 --- a/mmcls/version.py +++ b/mmcls/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved -__version__ = '1.0.0rc0' +__version__ = '1.0.0rc1' def parse_version_info(version_str): diff --git a/model-index.yml b/model-index.yml index 2a45c791..b1fa357f 100644 --- a/model-index.yml +++ b/model-index.yml @@ -28,3 +28,8 @@ Import: - configs/densenet/metafile.yml - configs/poolformer/metafile.yml - configs/inception_v3/metafile.yml + - configs/mvit/metafile.yml + - configs/edgenext/metafile.yml + - configs/mobileone/metafile.yml + - configs/efficientformer/metafile.yml + - configs/swin_transformer_v2/metafile.yml diff --git a/requirements/docs.txt b/requirements/docs.txt index 57f5492c..9bed8d91 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,5 +1,7 @@ docutils==0.17.1 +modelindex myst-parser --e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinx==4.5.0 sphinx-copybutton +tabulate diff --git a/tests/test_datasets/test_datasets.py b/tests/test_datasets/test_datasets.py index 18f1cb76..f637fb95 100644 --- a/tests/test_datasets/test_datasets.py +++ b/tests/test_datasets/test_datasets.py @@ -2,6 +2,7 @@ import os import os.path as osp import pickle +import sys import tempfile from unittest import TestCase from unittest.mock import MagicMock, call, patch @@ -141,12 +142,12 @@ class TestCustomDataset(TestBaseDataset): self.assertEqual(dataset.CLASSES, ('a', 'b')) # auto infer classes self.assertGreaterEqual( dataset.get_data_info(0).items(), { - 'img_path': osp.join(ASSETS_ROOT, 'a/1.JPG'), + 'img_path': osp.join(ASSETS_ROOT, 'a', '1.JPG'), 'gt_label': 0 }.items()) self.assertGreaterEqual( dataset.get_data_info(2).items(), { - 'img_path': osp.join(ASSETS_ROOT, 'b/subb/3.jpg'), + 
'img_path': osp.join(ASSETS_ROOT, 'b', 'subb', '3.jpg'), 'gt_label': 1 }.items()) @@ -225,7 +226,7 @@ class TestCustomDataset(TestBaseDataset): self.assertEqual(len(dataset), 1) self.assertGreaterEqual( dataset.get_data_info(0).items(), { - 'img_path': osp.join(ASSETS_ROOT, 'b/2.jpeg'), + 'img_path': osp.join(ASSETS_ROOT, 'b', '2.jpeg'), 'gt_label': 1 }.items()) @@ -631,12 +632,12 @@ class TestVOC(TestBaseDataset): # Test different backend cfg = { **self.DEFAULT_ARGS, 'lazy_init': True, - 'data_root': 's3:/openmmlab/voc' + 'data_root': 's3://openmmlab/voc' } + petrel_mock = MagicMock() + sys.modules['petrel_client'] = petrel_mock dataset = dataset_class(**cfg) - dataset._check_integrity = MagicMock(return_value=False) - with self.assertRaisesRegex(FileNotFoundError, 's3:/openmmlab/voc'): - dataset.full_init() + petrel_mock.client.Client.assert_called() def test_extra_repr(self): dataset_class = DATASETS.get(self.DATASET_TYPE) diff --git a/tests/test_engine/test_hooks/test_precise_bn_hook.py b/tests/test_engine/test_hooks/test_precise_bn_hook.py index e79df9d9..661714cc 100644 --- a/tests/test_engine/test_hooks/test_precise_bn_hook.py +++ b/tests/test_engine/test_hooks/test_precise_bn_hook.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy -import shutil +import logging import tempfile from unittest import TestCase from unittest.mock import MagicMock, patch @@ -8,6 +8,7 @@ from unittest.mock import MagicMock, patch import pytest import torch import torch.nn as nn +from mmengine.logging import MMLogger from mmengine.model import BaseDataPreprocessor, BaseModel from mmengine.runner import Runner from torch.utils.data import DataLoader, Dataset @@ -115,7 +116,7 @@ class TestPreciseBNHookHook(TestCase): ) self.epoch_train_cfg = dict(by_epoch=True, max_epochs=1) self.iter_train_cfg = dict(by_epoch=False, max_iters=5) - self.tmpdir = tempfile.mkdtemp() + self.tmpdir = tempfile.TemporaryDirectory() self.preciseBN_cfg = copy.deepcopy(self.DEFAULT_ARGS) test_dataset = ExampleDataset() @@ -125,7 +126,7 @@ class TestPreciseBNHookHook(TestCase): def test_construct(self): self.runner = Runner( model=self.model, - work_dir=self.tmpdir, + work_dir=self.tmpdir.name, train_dataloader=self.loader, train_cfg=self.epoch_train_cfg, log_level='WARNING', @@ -160,7 +161,7 @@ class TestPreciseBNHookHook(TestCase): self.preciseBN_cfg['priority'] = 'ABOVE_NORMAL' self.runner = Runner( model=self.model, - work_dir=self.tmpdir, + work_dir=self.tmpdir.name, train_dataloader=self.loader, train_cfg=self.epoch_train_cfg, log_level='WARNING', @@ -176,7 +177,7 @@ class TestPreciseBNHookHook(TestCase): self.preciseBN_cfg['priority'] = 'ABOVE_NORMAL' self.runner = Runner( model=self.model, - work_dir=self.tmpdir, + work_dir=self.tmpdir.name, train_dataloader=self.loader, train_cfg=self.epoch_train_cfg, log_level='WARNING', @@ -213,7 +214,7 @@ class TestPreciseBNHookHook(TestCase): self.loader = DataLoader(test_dataset, batch_size=2) self.runner = Runner( model=self.model, - work_dir=self.tmpdir, + work_dir=self.tmpdir.name, train_dataloader=self.loader, train_cfg=self.iter_train_cfg, log_level='WARNING', @@ -226,4 +227,8 @@ class TestPreciseBNHookHook(TestCase): self.runner.train() def tearDown(self) -> None: - shutil.rmtree(self.tmpdir) + # `FileHandler` should be closed in Windows, otherwise we cannot + # delete the temporary directory. 
+        logging.shutdown()
+        MMLogger._instance_dict.clear()
+        self.tmpdir.cleanup()
diff --git a/tests/test_models/test_backbones/test_conformer.py b/tests/test_models/test_backbones/test_conformer.py
index 96a5a2cc..0b1958c5 100644
--- a/tests/test_models/test_backbones/test_conformer.py
+++ b/tests/test_models/test_backbones/test_conformer.py
@@ -25,6 +25,7 @@ def check_norm_state(modules, train_state):
     return True
 
 
+@torch.no_grad()  # To save memory
 def test_conformer_backbone():
 
     cfg_ori = dict(
diff --git a/tests/test_models/test_backbones/test_convmixer.py b/tests/test_models/test_backbones/test_convmixer.py
index 7d2219e2..26296615 100644
--- a/tests/test_models/test_backbones/test_convmixer.py
+++ b/tests/test_models/test_backbones/test_convmixer.py
@@ -18,6 +18,7 @@ def test_assertion():
         ConvMixer(out_indices=-100)
 
 
+@torch.no_grad()  # To save memory
 def test_convmixer():
 
     # Test forward
diff --git a/tests/test_models/test_backbones/test_edgenext.py b/tests/test_models/test_backbones/test_edgenext.py
new file mode 100644
index 00000000..4b39e3d8
--- /dev/null
+++ b/tests/test_models/test_backbones/test_edgenext.py
@@ -0,0 +1,84 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmcls.models.backbones import EdgeNeXt
+
+
+def test_assertion():
+    with pytest.raises(AssertionError):
+        EdgeNeXt(arch='unknown')
+
+    with pytest.raises(AssertionError):
+        # The EdgeNeXt arch dict should include 'depths', 'channels'
+        # and 'num_heads'.
+        EdgeNeXt(arch=dict(channels=[24, 48, 88, 168]))
+
+    with pytest.raises(AssertionError):
+        # The EdgeNeXt arch dict should also include 'num_heads'.
+        EdgeNeXt(arch=dict(depths=[2, 2, 6, 2], channels=[24, 48, 88, 168]))
+
+
+def test_edgenext():
+
+    # Test forward
+    model = EdgeNeXt(arch='xxsmall', out_indices=-1)
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 1
+    assert feat[0].shape == torch.Size([1, 168])
+
+    # Test forward with multiple outputs
+    model = EdgeNeXt(arch='xxsmall', out_indices=(0, 1, 2, 3))
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 24])
+    assert feat[1].shape == torch.Size([1, 48])
+    assert feat[2].shape == torch.Size([1, 88])
+    assert feat[3].shape == torch.Size([1, 168])
+
+    # Test with custom arch
+    model = EdgeNeXt(
+        arch={
+            'depths': [2, 3, 4, 5],
+            'channels': [20, 40, 80, 160],
+            'num_heads': [4, 4, 4, 4]
+        },
+        out_indices=(0, 1, 2, 3))
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 20])
+    assert feat[1].shape == torch.Size([1, 40])
+    assert feat[2].shape == torch.Size([1, 80])
+    assert feat[3].shape == torch.Size([1, 160])
+
+    # Test without gap before final norm
+    model = EdgeNeXt(
+        arch='small', out_indices=(0, 1, 2, 3), gap_before_final_norm=False)
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 48, 56, 56])
+    assert feat[1].shape == torch.Size([1, 96, 28, 28])
+    assert feat[2].shape == torch.Size([1, 160, 14, 14])
+    assert feat[3].shape == torch.Size([1, 304, 7, 7])
+
+    # Test frozen_stages
+    model = EdgeNeXt(arch='small', out_indices=(0, 1, 2, 3), frozen_stages=2)
+    model.init_weights()
+    model.train()
+
+    for i in range(2):
+        assert not model.downsample_layers[i].training
+        assert not model.stages[i].training
+
+    for i in range(2, 4):
+        assert model.downsample_layers[i].training
+        assert model.stages[i].training
diff --git
a/tests/test_models/test_backbones/test_efficientformer.py b/tests/test_models/test_backbones/test_efficientformer.py new file mode 100644 index 00000000..88aad529 --- /dev/null +++ b/tests/test_models/test_backbones/test_efficientformer.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from unittest import TestCase + +import torch +from mmcv.cnn import ConvModule +from torch import nn + +from mmcls.models.backbones import EfficientFormer +from mmcls.models.backbones.efficientformer import (AttentionWithBias, Flat, + Meta3D, Meta4D) +from mmcls.models.backbones.poolformer import Pooling + + +class TestEfficientFormer(TestCase): + + def setUp(self): + self.cfg = dict(arch='l1', drop_path_rate=0.1) + self.arch = EfficientFormer.arch_settings['l1'] + self.custom_arch = { + 'layers': [1, 1, 1, 4], + 'embed_dims': [48, 96, 224, 448], + 'downsamples': [False, True, True, True], + 'vit_num': 2, + } + self.custom_cfg = dict(arch=self.custom_arch) + + def test_arch(self): + # Test invalid default arch + with self.assertRaisesRegex(AssertionError, 'Unavailable arch'): + cfg = deepcopy(self.cfg) + cfg['arch'] = 'unknown' + EfficientFormer(**cfg) + + # Test invalid custom arch + with self.assertRaisesRegex(AssertionError, 'must have'): + cfg = deepcopy(self.custom_cfg) + cfg['arch'].pop('layers') + EfficientFormer(**cfg) + + # Test vit_num < 0 + with self.assertRaisesRegex(AssertionError, "'vit_num' must"): + cfg = deepcopy(self.custom_cfg) + cfg['arch']['vit_num'] = -1 + EfficientFormer(**cfg) + + # Test vit_num > last stage layers + with self.assertRaisesRegex(AssertionError, "'vit_num' must"): + cfg = deepcopy(self.custom_cfg) + cfg['arch']['vit_num'] = 10 + EfficientFormer(**cfg) + + # Test out_indices + with self.assertRaisesRegex(AssertionError, '"out_indices" must'): + cfg = deepcopy(self.custom_cfg) + cfg['out_indices'] = dict + EfficientFormer(**cfg) + + # Test custom arch + cfg = deepcopy(self.custom_cfg) + model = EfficientFormer(**cfg) + self.assertEqual(len(model.patch_embed), 2) + layers = self.custom_arch['layers'] + downsamples = self.custom_arch['downsamples'] + vit_num = self.custom_arch['vit_num'] + + for i, stage in enumerate(model.network): + if downsamples[i]: + self.assertIsInstance(stage[0], ConvModule) + self.assertEqual(stage[0].conv.stride, (2, 2)) + self.assertTrue(hasattr(stage[0].conv, 'bias')) + self.assertTrue(isinstance(stage[0].bn, nn.BatchNorm2d)) + + if i < len(model.network) - 1: + self.assertIsInstance(stage[-1], Meta4D) + self.assertIsInstance(stage[-1].token_mixer, Pooling) + self.assertEqual(len(stage) - downsamples[i], layers[i]) + elif vit_num > 0: + self.assertIsInstance(stage[-1], Meta3D) + self.assertIsInstance(stage[-1].token_mixer, AttentionWithBias) + self.assertEqual(len(stage) - downsamples[i] - 1, layers[i]) + flat_layer_idx = len(stage) - vit_num - downsamples[i] + self.assertIsInstance(stage[flat_layer_idx], Flat) + count = 0 + for layer in stage: + if isinstance(layer, Meta3D): + count += 1 + self.assertEqual(count, vit_num) + + def test_init_weights(self): + # test weight init cfg + cfg = deepcopy(self.cfg) + cfg['init_cfg'] = [ + dict( + type='Kaiming', + layer='Conv2d', + mode='fan_in', + nonlinearity='linear'), + dict(type='Constant', layer=['LayerScale'], val=1e-4) + ] + model = EfficientFormer(**cfg) + ori_weight = model.patch_embed[0].conv.weight.clone().detach() + ori_ls_weight = model.network[0][-1].ls1.weight.clone().detach() + + model.init_weights() + initialized_weight =
model.patch_embed[0].conv.weight + initialized_ls_weight = model.network[0][-1].ls1.weight + self.assertFalse(torch.allclose(ori_weight, initialized_weight)) + self.assertFalse(torch.allclose(ori_ls_weight, initialized_ls_weight)) + + def test_forward(self): + imgs = torch.randn(1, 3, 224, 224) + + # test last stage output + cfg = deepcopy(self.cfg) + model = EfficientFormer(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + feat = outs[-1] + self.assertEqual(feat.shape, (1, 448, 49)) + assert hasattr(model, 'norm3') + assert isinstance(getattr(model, 'norm3'), nn.LayerNorm) + + # test multiple output indices + cfg = deepcopy(self.cfg) + cfg['out_indices'] = (0, 1, 2, 3) + cfg['reshape_last_feat'] = True + model = EfficientFormer(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 4) + # Test out features shape + for dim, stride, out in zip(self.arch['embed_dims'], [1, 2, 4, 8], + outs): + self.assertEqual(out.shape, (1, dim, 56 // stride, 56 // stride)) + + # Test norm layer + for i in range(4): + assert hasattr(model, f'norm{i}') + stage_norm = getattr(model, f'norm{i}') + assert isinstance(stage_norm, nn.GroupNorm) + assert stage_norm.num_groups == 1 + + # Test vit_num == 0 + cfg = deepcopy(self.custom_cfg) + cfg['arch']['vit_num'] = 0 + cfg['out_indices'] = (0, 1, 2, 3) + model = EfficientFormer(**cfg) + for i in range(4): + assert hasattr(model, f'norm{i}') + stage_norm = getattr(model, f'norm{i}') + assert isinstance(stage_norm, nn.GroupNorm) + assert stage_norm.num_groups == 1 + + def test_structure(self): + # test drop_path_rate decay + cfg = deepcopy(self.cfg) + cfg['drop_path_rate'] = 0.2 + model = EfficientFormer(**cfg) + layers = self.arch['layers'] + for i, block in enumerate(model.network): + expect_prob = 0.2 / (sum(layers) - 1) * i + if hasattr(block, 'drop_path'): + if expect_prob == 0: + self.assertIsInstance(block.drop_path, torch.nn.Identity) + else: + self.assertAlmostEqual(block.drop_path.drop_prob, + expect_prob) + + # test with first stage frozen. + cfg = deepcopy(self.cfg) + frozen_stages = 1 + cfg['frozen_stages'] = frozen_stages + cfg['out_indices'] = (0, 1, 2, 3) + model = EfficientFormer(**cfg) + model.init_weights() + model.train() + + # the patch_embed and first stage should not require grad. + self.assertFalse(model.patch_embed.training) + for param in model.patch_embed.parameters(): + self.assertFalse(param.requires_grad) + for i in range(frozen_stages): + module = model.network[i] + for param in module.parameters(): + self.assertFalse(param.requires_grad) + for param in model.norm0.parameters(): + self.assertFalse(param.requires_grad) + + # the remaining stages should require grad. + for i in range(frozen_stages + 1, 4): + module = model.network[i] + for param in module.parameters(): + self.assertTrue(param.requires_grad) + if hasattr(model, f'norm{i}'): + norm = getattr(model, f'norm{i}') + for param in norm.parameters(): + self.assertTrue(param.requires_grad) diff --git a/tests/test_models/test_backbones/test_mobileone.py b/tests/test_models/test_backbones/test_mobileone.py new file mode 100644 index 00000000..32e6751f --- /dev/null +++ b/tests/test_models/test_backbones/test_mobileone.py @@ -0,0 +1,337 @@ +# Copyright (c) OpenMMLab. All rights reserved.
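+# The tests below focus on MobileOne's structural re-parameterization: +# `switch_to_deploy()` merges the multi-branch training-time block into a +# single conv branch, and the outputs before and after the merge are +# compared with `torch.allclose`.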
+import os +import tempfile + +import pytest +import torch +from mmengine.runner import load_checkpoint, save_checkpoint +from torch import nn +from torch.nn.modules import GroupNorm +from torch.nn.modules.batchnorm import _BatchNorm + +from mmcls.models.backbones import MobileOne +from mmcls.models.backbones.mobileone import MobileOneBlock +from mmcls.models.utils import SELayer + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True + + +def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + +def is_mobileone_block(modules): + if isinstance(modules, MobileOneBlock): + return True + return False + + +def test_mobileoneblock(): + # Test MobileOneBlock with kernel_size 3 + block = MobileOneBlock(5, 10, 3, 1, stride=1, groups=5) + block.eval() + x = torch.randn(1, 5, 16, 16) + y = block(x) + assert block.branch_norm is None + assert not hasattr(block, 'branch_reparam') + assert hasattr(block, 'branch_scale') + assert hasattr(block, 'branch_conv_list') + assert hasattr(block, 'branch_norm') + assert block.branch_conv_list[0].conv.kernel_size == (3, 3) + assert block.branch_conv_list[0].conv.groups == 5 + assert block.se_cfg is None + assert y.shape == torch.Size((1, 10, 16, 16)) + block.switch_to_deploy() + assert hasattr(block, 'branch_reparam') + assert block.branch_reparam.kernel_size == (3, 3) + assert block.branch_reparam.groups == 5 + assert block.deploy is True + y_deploy = block(x) + assert y_deploy.shape == torch.Size((1, 10, 16, 16)) + assert torch.allclose(y, y_deploy, atol=1e-5, rtol=1e-4) + + # Test MobileOneBlock with 4 conv branches + block = MobileOneBlock(5, 10, 3, 4, stride=1, groups=5) + block.eval() + x = torch.randn(1, 5, 16, 16) + y = block(x) + assert block.branch_norm is None + assert not hasattr(block, 'branch_reparam') + assert hasattr(block, 'branch_scale') + assert hasattr(block, 'branch_conv_list') + assert hasattr(block, 'branch_norm') + assert block.branch_conv_list[0].conv.kernel_size == (3, 3) + assert block.branch_conv_list[0].conv.groups == 5 + assert len(block.branch_conv_list) == 4 + assert block.se_cfg is None + assert y.shape == torch.Size((1, 10, 16, 16)) + block.switch_to_deploy() + assert hasattr(block, 'branch_reparam') + assert block.branch_reparam.kernel_size == (3, 3) + assert block.branch_reparam.groups == 5 + assert block.deploy is True + y_deploy = block(x) + assert y_deploy.shape == torch.Size((1, 10, 16, 16)) + assert torch.allclose(y, y_deploy, atol=1e-5, rtol=1e-4) + + # Test MobileOneBlock with kernel_size 1 + block = MobileOneBlock(5, 10, 1, 1, stride=1, padding=0) + block.eval() + x = torch.randn(1, 5, 16, 16) + y = block(x) + assert block.branch_norm is None + assert not hasattr(block, 'branch_reparam') + assert hasattr(block, 'branch_scale') + assert hasattr(block, 'branch_conv_list') + assert hasattr(block, 'branch_norm') + assert block.branch_conv_list[0].conv.kernel_size == (1, 1) + assert block.branch_conv_list[0].conv.groups == 1 + assert len(block.branch_conv_list) == 1 + assert block.se_cfg is None + assert y.shape == torch.Size((1, 10, 16, 16)) + block.switch_to_deploy() + assert hasattr(block, 'branch_reparam') + assert block.branch_reparam.kernel_size == (1, 1) + assert block.branch_reparam.groups == 1 + assert block.deploy is True + y_deploy = block(x) + assert y_deploy.shape
== torch.Size((1, 10, 16, 16)) + assert torch.allclose(y, y_deploy, atol=1e-5, rtol=1e-4) + + # Test MobileOneBlock with stride = 2 + block = MobileOneBlock(10, 10, 3, 4, stride=2, groups=10) + x = torch.randn(1, 10, 16, 16) + block.eval() + y = block(x) + assert block.branch_norm is None + assert not hasattr(block, 'branch_reparam') + assert hasattr(block, 'branch_scale') + assert hasattr(block, 'branch_conv_list') + assert hasattr(block, 'branch_norm') + assert block.branch_conv_list[0].conv.kernel_size == (3, 3) + assert block.branch_conv_list[0].conv.groups == 10 + assert len(block.branch_conv_list) == 4 + assert block.se_cfg is None + assert y.shape == torch.Size((1, 10, 8, 8)) + block.switch_to_deploy() + assert hasattr(block, 'branch_reparam') + assert block.branch_reparam.kernel_size == (3, 3) + assert block.branch_reparam.groups == 10 + assert block.deploy is True + y_deploy = block(x) + assert y_deploy.shape == torch.Size((1, 10, 8, 8)) + assert torch.allclose(y, y_deploy, atol=1e-5, rtol=1e-4) + + # Test MobileOneBlock with padding == dilation == 2 + block = MobileOneBlock( + 10, 10, 3, 4, stride=1, groups=10, padding=2, dilation=2) + x = torch.randn(1, 10, 16, 16) + block.eval() + y = block(x) + assert not hasattr(block, 'branch_reparam') + assert hasattr(block, 'branch_scale') + assert hasattr(block, 'branch_conv_list') + assert hasattr(block, 'branch_norm') + assert block.branch_conv_list[0].conv.kernel_size == (3, 3) + assert block.branch_conv_list[0].conv.groups == 10 + assert len(block.branch_conv_list) == 4 + assert block.se_cfg is None + assert y.shape == torch.Size((1, 10, 16, 16)) + block.switch_to_deploy() + assert hasattr(block, 'branch_reparam') + assert block.branch_reparam.kernel_size == (3, 3) + assert block.branch_reparam.groups == 10 + assert block.deploy is True + y_deploy = block(x) + assert y_deploy.shape == torch.Size((1, 10, 16, 16)) + assert torch.allclose(y, y_deploy, atol=1e-5, rtol=1e-4) + + # Test MobileOneBlock with se + se_cfg = dict(ratio=4, divisor=1) + block = MobileOneBlock(32, 32, 3, 4, stride=1, se_cfg=se_cfg, groups=32) + x = torch.randn(1, 32, 16, 16) + block.eval() + y = block(x) + assert not hasattr(block, 'branch_reparam') + assert hasattr(block, 'branch_scale') + assert hasattr(block, 'branch_conv_list') + assert hasattr(block, 'branch_norm') + assert block.branch_conv_list[0].conv.kernel_size == (3, 3) + assert block.branch_conv_list[0].conv.groups == 32 + assert len(block.branch_conv_list) == 4 + assert isinstance(block.se, SELayer) + assert y.shape == torch.Size((1, 32, 16, 16)) + block.switch_to_deploy() + assert hasattr(block, 'branch_reparam') + assert block.branch_reparam.kernel_size == (3, 3) + assert block.branch_reparam.groups == 32 + assert block.deploy is True + y_deploy = block(x) + assert y_deploy.shape == torch.Size((1, 32, 16, 16)) + assert torch.allclose(y, y_deploy, atol=1e-5, rtol=1e-4) + + # Test MobileOneBlock with deploy == True + se_cfg = dict(ratio=4, divisor=1) + block = MobileOneBlock( + 32, 32, 3, 4, stride=1, se_cfg=se_cfg, groups=32, deploy=True) + x = torch.randn(1, 32, 16, 16) + block.eval() + assert hasattr(block, 'branch_reparam') + assert block.branch_reparam.kernel_size == (3, 3) + assert block.branch_reparam.groups == 32 + assert isinstance(block.se, SELayer) + assert block.deploy is True + y = block(x) + assert y.shape == torch.Size((1, 32, 16, 16)) + + +def test_mobileone_backbone(): + with pytest.raises(TypeError): + # arch must be str or dict + MobileOne(arch=[4, 6, 16, 1]) + + with
pytest.raises(AssertionError): + # arch must be in arch_settings + MobileOne(arch='S3') + + with pytest.raises(KeyError): + arch = dict(num_blocks=[2, 4, 14, 1]) + MobileOne(arch=arch) + + # Test len(arch['num_blocks']) == len(arch['width_factor']) + with pytest.raises(AssertionError): + arch = dict( + num_blocks=[2, 4, 14, 1], + width_factor=[0.75, 0.75, 0.75], + num_conv_branches=[1, 1, 1, 1], + num_se_blocks=[0, 0, 5, 1]) + MobileOne(arch=arch) + + # Test out_indices not type of int or Sequence + with pytest.raises(AssertionError): + MobileOne('s0', out_indices=dict()) + + # Test max(out_indices) < len(arch['num_blocks']) + with pytest.raises(AssertionError): + MobileOne('s0', out_indices=(5, )) + + # Test MobileOne norm state + model = MobileOne('s0') + model.train() + assert check_norm_state(model.modules(), True) + + # Test MobileOne with first stage frozen + frozen_stages = 1 + model = MobileOne('s0', frozen_stages=frozen_stages) + model.train() + for param in model.stage0.parameters(): + assert param.requires_grad is False + for i in range(0, frozen_stages): + stage_name = model.stages[i] + stage = model.__getattr__(stage_name) + for mod in stage: + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in stage.parameters(): + assert param.requires_grad is False + + # Test MobileOne with norm_eval + model = MobileOne('s0', norm_eval=True) + model.train() + assert check_norm_state(model.modules(), False) + + # Test MobileOne forward with layer 3 output + model = MobileOne('s0', out_indices=(3, )) + model.init_weights() + model.train() + + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert isinstance(feat, tuple) + assert len(feat) == 1 + assert isinstance(feat[0], torch.Tensor) + assert feat[0].shape == torch.Size((1, 1024, 7, 7)) + + # Test MobileOne forward + arch_settings = { + 's0': dict(out_channels=[48, 128, 256, 1024], ), + 's1': dict(out_channels=[96, 192, 512, 1280]), + 's2': dict(out_channels=[96, 256, 640, 2048]), + 's3': dict(out_channels=[128, 320, 768, 2048], ), + 's4': dict(out_channels=[192, 448, 896, 2048], ) + } + + choose_models = ['s0', 's1', 's4'] + # Test MobileOne model forward + for model_name, model_arch in arch_settings.items(): + if model_name not in choose_models: + continue + model = MobileOne(model_name, out_indices=(0, 1, 2, 3)) + model.init_weights() + + # Test Norm + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + + model.train() + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat[0].shape == torch.Size( + (1, model_arch['out_channels'][0], 56, 56)) + assert feat[1].shape == torch.Size( + (1, model_arch['out_channels'][1], 28, 28)) + assert feat[2].shape == torch.Size( + (1, model_arch['out_channels'][2], 14, 14)) + assert feat[3].shape == torch.Size( + (1, model_arch['out_channels'][3], 7, 7)) + + # Test eval of "train" mode and "deploy" mode + gap = nn.AdaptiveAvgPool2d(output_size=(1)) + fc = nn.Linear(model_arch['out_channels'][3], 10) + model.eval() + feat = model(imgs) + pred = fc(gap(feat[3]).flatten(1)) + model.switch_to_deploy() + for m in model.modules(): + if isinstance(m, MobileOneBlock): + assert m.deploy is True + feat_deploy = model(imgs) + pred_deploy = fc(gap(feat_deploy[3]).flatten(1)) + for i in range(4): + assert torch.allclose(feat[i], feat_deploy[i]) + assert torch.allclose(pred, pred_deploy) + + +def test_load_deploy_mobileone(): + # Test output before and after loading from a deploy
checkpoint + model = MobileOne('s0', out_indices=(0, 1, 2, 3)) + inputs = torch.randn((1, 3, 224, 224)) + tmpdir = tempfile.gettempdir() + ckpt_path = os.path.join(tmpdir, 'ckpt.pth') + model.switch_to_deploy() + model.eval() + outputs = model(inputs) + + model_deploy = MobileOne('s0', out_indices=(0, 1, 2, 3), deploy=True) + save_checkpoint(model.state_dict(), ckpt_path) + load_checkpoint(model_deploy, ckpt_path) + + outputs_load = model_deploy(inputs) + for feat, feat_load in zip(outputs, outputs_load): + assert torch.allclose(feat, feat_load) + os.remove(ckpt_path) diff --git a/tests/test_models/test_backbones/test_mvit.py b/tests/test_models/test_backbones/test_mvit.py new file mode 100644 index 00000000..7757ab81 --- /dev/null +++ b/tests/test_models/test_backbones/test_mvit.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from copy import deepcopy +from unittest import TestCase + +import torch + +from mmcls.models import MViT + + +class TestMViT(TestCase): + + def setUp(self): + self.cfg = dict(arch='tiny', drop_path_rate=0.1) + + def test_structure(self): + # Test invalid default arch + with self.assertRaisesRegex(AssertionError, 'not in default archs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = 'unknown' + MViT(**cfg) + + # Test invalid custom arch + with self.assertRaisesRegex(AssertionError, 'Custom arch needs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'num_layers': 24, + 'num_heads': 16, + 'feedforward_channels': 4096 + } + MViT(**cfg) + + # Test custom arch + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'embed_dims': 96, + 'num_layers': 10, + 'num_heads': 1, + 'downscale_indices': [2, 5, 8] + } + stage_indices = [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] + model = MViT(**cfg) + self.assertEqual(model.embed_dims, 96) + self.assertEqual(model.num_layers, 10) + for i, block in enumerate(model.blocks): + stage = stage_indices[i] + self.assertEqual(block.out_dims, 96 * 2**(stage)) + + # Test out_scales + cfg = deepcopy(self.cfg) + cfg['out_scales'] = {1: 1} + with self.assertRaisesRegex(AssertionError, "get "): + MViT(**cfg) + cfg['out_scales'] = [0, 13] + with self.assertRaisesRegex(AssertionError, 'Invalid out_scales 13'): + MViT(**cfg) + + # Test model structure + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + stage_indices = [0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3] + self.assertEqual(len(model.blocks), 10) + dpr_inc = 0.1 / (10 - 1) + dpr = 0 + for i, block in enumerate(model.blocks): + stage = stage_indices[i] + self.assertEqual(block.attn.num_heads, 2**stage) + if dpr > 0: + self.assertAlmostEqual(block.drop_path.drop_prob, dpr) + dpr += dpr_inc + + def test_init_weights(self): + # test weight init cfg + cfg = deepcopy(self.cfg) + cfg['init_cfg'] = [ + dict( + type='Kaiming', + layer='Conv2d', + mode='fan_in', + nonlinearity='linear') + ] + cfg['use_abs_pos_embed'] = True + model = MViT(**cfg) + ori_weight = model.patch_embed.projection.weight.clone().detach() + # The pos_embed is all zero before initialize + self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.))) + + model.init_weights() + initialized_weight = model.patch_embed.projection.weight + self.assertFalse(torch.allclose(ori_weight, initialized_weight)) + self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.))) + + def test_forward(self): + imgs = torch.randn(1, 3, 224, 224) + + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token = outs[-1] +
self.assertEqual(patch_token.shape, (1, 768, 7, 7)) + + # Test forward with multi out scales + cfg = deepcopy(self.cfg) + cfg['out_scales'] = (0, 1, 2, 3) + model = MViT(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 4) + for stage, out in enumerate(outs): + stride = 2**stage + self.assertEqual(out.shape, + (1, 96 * stride, 56 // stride, 56 // stride)) + + # Test forward with dynamic input size + imgs1 = torch.randn(1, 3, 224, 224) + imgs2 = torch.randn(1, 3, 256, 256) + imgs3 = torch.randn(1, 3, 256, 309) + cfg = deepcopy(self.cfg) + model = MViT(**cfg) + for imgs in [imgs1, imgs2, imgs3]: + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token = outs[-1] + expect_feat_shape = (math.ceil(imgs.shape[2] / 32), + math.ceil(imgs.shape[3] / 32)) + self.assertEqual(patch_token.shape, (1, 768, *expect_feat_shape)) diff --git a/tests/test_models/test_backbones/test_swin_transformer.py b/tests/test_models/test_backbones/test_swin_transformer.py index 02bb813f..613e9b54 100644 --- a/tests/test_models/test_backbones/test_swin_transformer.py +++ b/tests/test_models/test_backbones/test_swin_transformer.py @@ -167,7 +167,7 @@ class TestSwinTransformer(TestCase): outs = model(imgs) self.assertIsInstance(outs, tuple) self.assertEqual(len(outs), 4) - for stride, out in zip([2, 4, 8, 8], outs): + for stride, out in zip([1, 2, 4, 8], outs): self.assertEqual(out.shape, (1, 96 * stride, 56 // stride, 56 // stride)) diff --git a/tests/test_models/test_backbones/test_swin_transformer_v2.py b/tests/test_models/test_backbones/test_swin_transformer_v2.py new file mode 100644 index 00000000..03c93449 --- /dev/null +++ b/tests/test_models/test_backbones/test_swin_transformer_v2.py @@ -0,0 +1,243 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
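+# The suite below covers the V2-specific behaviours: custom arch dicts with +# extra norm layers, resizing the absolute position embedding when loading a +# checkpoint with a different `img_size`, window-size and `pad_small_map` +# handling, and frozen stages.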
+import math +import os +import tempfile +from copy import deepcopy +from itertools import chain +from unittest import TestCase + +import torch +from mmengine.runner import load_checkpoint, save_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmcls.models.backbones import SwinTransformerV2 +from mmcls.models.backbones.swin_transformer import SwinBlock +from .utils import timm_resize_pos_embed + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True + + +class TestSwinTransformerV2(TestCase): + + def setUp(self): + self.cfg = dict( + arch='b', img_size=256, patch_size=4, drop_path_rate=0.1) + + def test_arch(self): + # Test invalid default arch + with self.assertRaisesRegex(AssertionError, 'not in default archs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = 'unknown' + SwinTransformerV2(**cfg) + + # Test invalid custom arch + with self.assertRaisesRegex(AssertionError, 'Custom arch needs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'embed_dims': 96, + 'num_heads': [3, 6, 12, 16], + } + SwinTransformerV2(**cfg) + + # Test custom arch + cfg = deepcopy(self.cfg) + depths = [2, 2, 6, 2] + num_heads = [6, 12, 6, 12] + cfg['arch'] = { + 'embed_dims': 256, + 'depths': depths, + 'num_heads': num_heads, + 'extra_norm_every_n_blocks': 2 + } + model = SwinTransformerV2(**cfg) + for i, stage in enumerate(model.stages): + self.assertEqual(stage.out_channels, 256 * (2**i)) + self.assertEqual(len(stage.blocks), depths[i]) + self.assertEqual(stage.blocks[0].attn.w_msa.num_heads, + num_heads[i]) + self.assertIsInstance(model.stages[2].blocks[5], torch.nn.Module) + + def test_init_weights(self): + # test weight init cfg + cfg = deepcopy(self.cfg) + cfg['use_abs_pos_embed'] = True + cfg['init_cfg'] = [ + dict( + type='Kaiming', + layer='Conv2d', + mode='fan_in', + nonlinearity='linear') + ] + model = SwinTransformerV2(**cfg) + ori_weight = model.patch_embed.projection.weight.clone().detach() + # The pos_embed is all zero before initialize + self.assertTrue( + torch.allclose(model.absolute_pos_embed, torch.tensor(0.))) + + model.init_weights() + initialized_weight = model.patch_embed.projection.weight + self.assertFalse(torch.allclose(ori_weight, initialized_weight)) + self.assertFalse( + torch.allclose(model.absolute_pos_embed, torch.tensor(0.))) + + pretrain_pos_embed = model.absolute_pos_embed.clone().detach() + + tmpdir = tempfile.TemporaryDirectory() + # Save checkpoints + checkpoint = os.path.join(tmpdir.name, 'checkpoint.pth') + save_checkpoint(model.state_dict(), checkpoint) + + # test load checkpoint + cfg = deepcopy(self.cfg) + cfg['use_abs_pos_embed'] = True + model = SwinTransformerV2(**cfg) + load_checkpoint(model, checkpoint, strict=False) + + # test load checkpoint with different img_size + cfg = deepcopy(self.cfg) + cfg['img_size'] = 384 + cfg['use_abs_pos_embed'] = True + model = SwinTransformerV2(**cfg) + load_checkpoint(model, checkpoint, strict=False) + resized_pos_embed = timm_resize_pos_embed( + pretrain_pos_embed, model.absolute_pos_embed, num_tokens=0) + self.assertTrue( + torch.allclose(model.absolute_pos_embed, resized_pos_embed)) + + tmpdir.cleanup() + + def test_forward(self): + imgs = torch.randn(1, 3, 256, 256) + + cfg = deepcopy(self.cfg) + model = SwinTransformerV2(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + feat = 
outs[-1] + self.assertEqual(feat.shape, (1, 1024, 8, 8)) + + # test with window_size=12 + cfg = deepcopy(self.cfg) + cfg['window_size'] = 12 + model = SwinTransformerV2(**cfg) + outs = model(torch.randn(1, 3, 384, 384)) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + feat = outs[-1] + self.assertEqual(feat.shape, (1, 1024, 12, 12)) + with self.assertRaisesRegex(AssertionError, r'the window size \(12\)'): + model(torch.randn(1, 3, 256, 256)) + + # test with pad_small_map=True + cfg = deepcopy(self.cfg) + cfg['window_size'] = 12 + cfg['pad_small_map'] = True + model = SwinTransformerV2(**cfg) + outs = model(torch.randn(1, 3, 256, 256)) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + feat = outs[-1] + self.assertEqual(feat.shape, (1, 1024, 8, 8)) + + # test multiple output indices + cfg = deepcopy(self.cfg) + cfg['out_indices'] = (0, 1, 2, 3) + model = SwinTransformerV2(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 4) + for stride, out in zip([1, 2, 4, 8], outs): + self.assertEqual(out.shape, + (1, 128 * stride, 64 // stride, 64 // stride)) + + # test with checkpoint forward + cfg = deepcopy(self.cfg) + cfg['with_cp'] = True + model = SwinTransformerV2(**cfg) + for m in model.modules(): + if isinstance(m, SwinBlock): + self.assertTrue(m.with_cp) + model.init_weights() + model.train() + + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + feat = outs[-1] + self.assertEqual(feat.shape, (1, 1024, 8, 8)) + + # test with dynamic input shape + imgs1 = torch.randn(1, 3, 224, 224) + imgs2 = torch.randn(1, 3, 256, 256) + imgs3 = torch.randn(1, 3, 256, 309) + cfg = deepcopy(self.cfg) + cfg['pad_small_map'] = True + model = SwinTransformerV2(**cfg) + for imgs in [imgs1, imgs2, imgs3]: + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + feat = outs[-1] + expect_feat_shape = (math.ceil(imgs.shape[2] / 32), + math.ceil(imgs.shape[3] / 32)) + self.assertEqual(feat.shape, (1, 1024, *expect_feat_shape)) + + def test_structure(self): + # test drop_path_rate decay + cfg = deepcopy(self.cfg) + cfg['drop_path_rate'] = 0.2 + model = SwinTransformerV2(**cfg) + depths = model.arch_settings['depths'] + blocks = chain(*[stage.blocks for stage in model.stages]) + for i, block in enumerate(blocks): + expect_prob = 0.2 / (sum(depths) - 1) * i + self.assertAlmostEqual(block.ffn.dropout_layer.drop_prob, + expect_prob) + self.assertAlmostEqual(block.attn.drop.drop_prob, expect_prob) + + # test Swin-Transformer V2 with norm_eval=True + cfg = deepcopy(self.cfg) + cfg['norm_eval'] = True + cfg['norm_cfg'] = dict(type='BN') + cfg['stage_cfgs'] = dict(block_cfgs=dict(norm_cfg=dict(type='BN'))) + model = SwinTransformerV2(**cfg) + model.init_weights() + model.train() + self.assertTrue(check_norm_state(model.modules(), False)) + + # test Swin-Transformer V2 with first stage frozen. + cfg = deepcopy(self.cfg) + frozen_stages = 0 + cfg['frozen_stages'] = frozen_stages + cfg['out_indices'] = (0, 1, 2, 3) + model = SwinTransformerV2(**cfg) + model.init_weights() + model.train() + + # the patch_embed and first stage should not require grad. 
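+ # (with `frozen_stages=0`, `patch_embed`, `stages[0]` and `norm0` are + # expected to be frozen, i.e. all of their parameters should have + # `requires_grad` set to False)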
+ self.assertFalse(model.patch_embed.training) + for param in model.patch_embed.parameters(): + self.assertFalse(param.requires_grad) + for i in range(frozen_stages + 1): + stage = model.stages[i] + for param in stage.parameters(): + self.assertFalse(param.requires_grad) + for param in model.norm0.parameters(): + self.assertFalse(param.requires_grad) + + # the remaining stages should require grad. + for i in range(frozen_stages + 1, 4): + stage = model.stages[i] + for param in stage.parameters(): + self.assertTrue(param.requires_grad) + norm = getattr(model, f'norm{i}') + for param in norm.parameters(): + self.assertTrue(param.requires_grad) diff --git a/tests/test_models/test_heads.py b/tests/test_models/test_heads.py index 861a7497..42c7f03c 100644 --- a/tests/test_models/test_heads.py +++ b/tests/test_models/test_heads.py @@ -24,6 +24,7 @@ def setup_seed(seed): class TestClsHead(TestCase): DEFAULT_ARGS = dict(type='ClsHead') + FAKE_FEATS = (torch.rand(4, 10), ) def test_pre_logits(self): head = MODELS.build(self.DEFAULT_ARGS) @@ -42,7 +43,7 @@ class TestClsHead(TestCase): self.assertIs(outs, feats[-1]) def test_loss(self): - feats = (torch.rand(4, 10), ) + feats = self.FAKE_FEATS data_samples = [ClsDataSample().set_gt_label(1) for _ in range(4)] # with cal_acc = False @@ -96,6 +97,7 @@ class TestClsHead(TestCase): class TestLinearClsHead(TestCase): DEFAULT_ARGS = dict(type='LinearClsHead', in_channels=10, num_classes=5) + FAKE_FEATS = (torch.rand(4, 10), ) def test_initialize(self): with self.assertRaisesRegex(ValueError, 'num_classes=-5 must be'): @@ -425,6 +427,47 @@ class TestMultiLabelClsHead(TestCase): self.assertIn('score', pred.pred_label) +class TestEfficientFormerClsHead(TestClsHead): + DEFAULT_ARGS = dict( + type='EfficientFormerClsHead', + in_channels=10, + num_classes=10, + distillation=False) + FAKE_FEATS = (torch.rand(4, 10), ) + + def test_forward(self): + # test with distillation head + cfg = copy.deepcopy(self.DEFAULT_ARGS) + cfg['distillation'] = True + head = MODELS.build(cfg) + self.assertTrue(hasattr(head, 'dist_head')) + feats = (torch.rand(4, 10), torch.rand(4, 10)) + outs = head(feats) + self.assertEqual(outs.shape, (4, 10)) + + # test without distillation head + cfg = copy.deepcopy(self.DEFAULT_ARGS) + head = MODELS.build(cfg) + self.assertFalse(hasattr(head, 'dist_head')) + feats = (torch.rand(4, 10), torch.rand(4, 10)) + outs = head(feats) + self.assertEqual(outs.shape, (4, 10)) + + def test_loss(self): + feats = (torch.rand(4, 10), ) + data_samples = [ClsDataSample().set_gt_label(1) for _ in range(4)] + + # test with distillation head + cfg = copy.deepcopy(self.DEFAULT_ARGS) + cfg['distillation'] = True + head = MODELS.build(cfg) + with self.assertRaisesRegex(NotImplementedError, 'MMClassification '): + head.loss(feats, data_samples) + + # test without distillation head + super().test_loss() + + class TestMultiLabelLinearClsHead(TestMultiLabelClsHead): DEFAULT_ARGS = dict( type='MultiLabelLinearClsHead', num_classes=10, in_channels=10) diff --git a/tests/test_models/test_losses.py b/tests/test_models/test_losses.py index 74eec620..442da9df 100644 --- a/tests/test_models/test_losses.py +++ b/tests/test_models/test_losses.py @@ -247,6 +247,17 @@ def test_label_smooth_loss(): correct = 0.2269 # from timm assert loss(cls_score, label) - correct <= 0.0001 + loss_cfg = dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + use_sigmoid=True, + reduction='mean', + loss_weight=1.0) + loss = build_loss(loss_cfg) + correct = 0.3633 # from timm + assert
loss(cls_score, label) - correct <= 0.0001 + # test classy_vision mode label smooth loss loss_cfg = dict( type='LabelSmoothLoss', diff --git a/tests/test_models/test_utils/test_attention.py b/tests/test_models/test_utils/test_attention.py index 9626f66f..84941d1b 100644 --- a/tests/test_models/test_utils/test_attention.py +++ b/tests/test_models/test_utils/test_attention.py @@ -5,14 +5,15 @@ from unittest.mock import ANY, MagicMock import pytest import torch -from mmcls.models.utils.attention import ShiftWindowMSA, WindowMSA +from mmcls.models.utils.attention import (ShiftWindowMSA, WindowMSA, + torch_meshgrid) def get_relative_position_index(window_size): """Method from original code of Swin-Transformer.""" coords_h = torch.arange(window_size[0]) coords_w = torch.arange(window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords = torch.stack(torch_meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww # 2, Wh*Ww, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] @@ -186,16 +187,3 @@ class TestShiftWindowMSA(TestCase): # drop all attn output, output should be equal to proj.bias self.assertTrue( torch.allclose(attn(inputs, (14, 14)), torch.tensor(0.))) - - def test_deprecation(self): - # test deprecated arguments - with pytest.warns(DeprecationWarning): - ShiftWindowMSA( - embed_dims=96, - num_heads=4, - window_size=7, - input_resolution=(14, 14)) - - with pytest.warns(DeprecationWarning): - ShiftWindowMSA( - embed_dims=96, num_heads=4, window_size=7, auto_pad=True) diff --git a/tests/test_models/test_utils/test_embed.py b/tests/test_models/test_utils/test_embed.py index 8dba0606..eb7356b1 100644 --- a/tests/test_models/test_utils/test_embed.py +++ b/tests/test_models/test_utils/test_embed.py @@ -36,28 +36,26 @@ def test_hybrid_embed(): def test_patch_merging(): - settings = dict( - input_resolution=(56, 56), in_channels=16, expansion_ratio=2) + settings = dict(in_channels=16, out_channels=32, padding=0) downsample = PatchMerging(**settings) # test forward with wrong dims with pytest.raises(AssertionError): inputs = torch.rand((1, 16, 56 * 56)) - downsample(inputs) + downsample(inputs, input_size=(56, 56)) # test patch merging forward inputs = torch.rand((1, 56 * 56, 16)) - out = downsample(inputs) - assert downsample.output_resolution == (28, 28) + out, output_size = downsample(inputs, input_size=(56, 56)) + assert output_size == (28, 28) assert out.shape == (1, 28 * 28, 32) # test different kernel_size in each direction downsample = PatchMerging(kernel_size=(2, 3), **settings) - out = downsample(inputs) + out, output_size = downsample(inputs, input_size=(56, 56)) expected_dim = cal_unfold_dim(56, 2, 2) * cal_unfold_dim(56, 3, 3) assert downsample.sampler.kernel_size == (2, 3) - assert downsample.output_resolution == (cal_unfold_dim(56, 2, 2), - cal_unfold_dim(56, 3, 3)) + assert output_size == (cal_unfold_dim(56, 2, 2), cal_unfold_dim(56, 3, 3)) assert out.shape == (1, expected_dim, 32) # test default stride @@ -66,18 +64,25 @@ def test_patch_merging(): # test stride=3 downsample = PatchMerging(kernel_size=6, stride=3, **settings) - out = downsample(inputs) + out, output_size = downsample(inputs, input_size=(56, 56)) assert downsample.sampler.stride == (3, 3) assert out.shape == (1, cal_unfold_dim(56, 6, stride=3)**2, 32) # test padding - downsample = PatchMerging(kernel_size=6, padding=2, **settings) - out = downsample(inputs) + downsample = PatchMerging( + in_channels=16,
out_channels=32, kernel_size=6, padding=2) + out, output_size = downsample(inputs, input_size=(56, 56)) assert downsample.sampler.padding == (2, 2) assert out.shape == (1, cal_unfold_dim(56, 6, 6, padding=2)**2, 32) + # test str padding + downsample = PatchMerging(in_channels=16, out_channels=32, kernel_size=6) + out, output_size = downsample(inputs, input_size=(56, 56)) + assert downsample.sampler.padding == (0, 0) + assert out.shape == (1, cal_unfold_dim(56, 6, 6, padding=2)**2, 32) + # test dilation downsample = PatchMerging(kernel_size=6, dilation=2, **settings) - out = downsample(inputs) + out, output_size = downsample(inputs, input_size=(56, 56)) assert downsample.sampler.dilation == (2, 2) assert out.shape == (1, cal_unfold_dim(56, 6, 6, dilation=2)**2, 32) diff --git a/tests/test_models/test_utils/test_layer_scale.py b/tests/test_models/test_utils/test_layer_scale.py new file mode 100644 index 00000000..c33b529d --- /dev/null +++ b/tests/test_models/test_utils/test_layer_scale.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmcls.models.utils import LayerScale + + +class TestLayerScale(TestCase): + + def test_init(self): + with self.assertRaisesRegex(AssertionError, "'data_format' could"): + cfg = dict( + dim=10, + data_format='BNC', + ) + LayerScale(**cfg) + + cfg = dict(dim=10) + ls = LayerScale(**cfg) + assert torch.equal(ls.weight, + torch.ones(10, requires_grad=True) * 1e-5) + + def test_forward(self): + # Test channels_last + cfg = dict(dim=256, inplace=False, data_format='channels_last') + ls_channels_last = LayerScale(**cfg) + x = torch.randn((4, 49, 256)) + out = ls_channels_last(x) + self.assertEqual(tuple(out.size()), (4, 49, 256)) + assert torch.equal(x * 1e-5, out) + + # Test channels_first + cfg = dict(dim=256, inplace=False, data_format='channels_first') + ls_channels_first = LayerScale(**cfg) + x = torch.randn((4, 256, 7, 7)) + out = ls_channels_first(x) + self.assertEqual(tuple(out.size()), (4, 256, 7, 7)) + assert torch.equal(x * 1e-5, out) + + # Test inplace True + cfg = dict(dim=256, inplace=True, data_format='channels_first') + ls_channels_first = LayerScale(**cfg) + x = torch.randn((4, 256, 7, 7)) + out = ls_channels_first(x) + self.assertEqual(tuple(out.size()), (4, 256, 7, 7)) + self.assertIs(x, out) diff --git a/tools/kfold-cross-valid.py b/tools/kfold-cross-valid.py index 7b1ca3dc..ac13f6b9 100644 --- a/tools/kfold-cross-valid.py +++ b/tools/kfold-cross-valid.py @@ -4,11 +4,13 @@ import copy import os import os.path as osp -from mmengine.config import Config, DictAction +from mmengine.config import Config, ConfigDict, DictAction from mmengine.dist import sync_random_seed from mmengine.fileio import dump, load from mmengine.hooks import Hook from mmengine.runner import Runner, find_latest_checkpoint +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION from mmcls.utils import register_all_modules @@ -58,6 +60,15 @@ def parse_args(): action='store_true', help='whether to auto scale the learning rate according to the ' 'actual batch size and the original batch size.') + parser.add_argument( + '--no-pin-memory', + action='store_true', + help='whether to disable the pin_memory option in dataloaders.') + parser.add_argument( + '--no-persistent-workers', + action='store_true', + help='whether to disable the persistent_workers option in dataloaders.'
+ ) parser.add_argument( '--cfg-options', nargs='+', @@ -112,6 +123,30 @@ if args.auto_scale_lr: cfg.auto_scale_lr.enable = True + # set dataloader args + default_dataloader_cfg = ConfigDict( + pin_memory=True, + persistent_workers=True, + collate_fn=dict(type='default_collate'), + ) + if digit_version(TORCH_VERSION) < digit_version('1.8.0'): + default_dataloader_cfg.persistent_workers = False + + def set_default_dataloader_cfg(cfg, field): + if cfg.get(field, None) is None: + return + dataloader_cfg = copy.deepcopy(default_dataloader_cfg) + dataloader_cfg.update(cfg[field]) + cfg[field] = dataloader_cfg + if args.no_pin_memory: + cfg[field]['pin_memory'] = False + if args.no_persistent_workers: + cfg[field]['persistent_workers'] = False + + set_default_dataloader_cfg(cfg, 'train_dataloader') + set_default_dataloader_cfg(cfg, 'val_dataloader') + set_default_dataloader_cfg(cfg, 'test_dataloader') + if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) diff --git a/tools/misc/verify_dataset.py b/tools/misc/verify_dataset.py index 4b5eb7e0..05e7bb93 100644 --- a/tools/misc/verify_dataset.py +++ b/tools/misc/verify_dataset.py @@ -7,7 +7,8 @@ from pathlib import Path from mmengine import (Config, DictAction, track_parallel_progress, track_progress) -from mmcls.datasets import PIPELINES, build_dataset +from mmcls.datasets import build_dataset +from mmcls.registry import TRANSFORMS def parse_args(): @@ -46,15 +47,14 @@ class DatasetValidator(): """the dataset tool class to check if all files are broken.""" - def __init__(self, dataset_cfg, log_file_path, phase): + def __init__(self, dataset_cfg, log_file_path): super(DatasetValidator, self).__init__() # keep only LoadImageFromFile pipeline - assert dataset_cfg.data[phase].pipeline[0][ - 'type'] == 'LoadImageFromFile', 'This tool is only for dataset ' \ - 'that needs to load image from files.' - self.pipeline = PIPELINES.build(dataset_cfg.data[phase].pipeline[0]) - dataset_cfg.data[phase].pipeline = [] - dataset = build_dataset(dataset_cfg.data[phase]) + assert dataset_cfg.pipeline[0]['type'] == 'LoadImageFromFile', ( + 'This tool is only for datasets that need to load images from files.') + self.pipeline = TRANSFORMS.build(dataset_cfg.pipeline[0]) + dataset_cfg.pipeline = [] + dataset = build_dataset(dataset_cfg) self.dataset = dataset self.log_file_path = log_file_path @@ -102,13 +102,22 @@ def main(): # touch output file to save broken files list. output_path = Path(args.out_path) if not output_path.parent.exists(): - raise Exception('log_file parent directory not found.') + raise Exception("Path '--out-path' parent directory not found.") if output_path.exists(): os.remove(output_path) output_path.touch() - # do valid - validator = DatasetValidator(cfg, output_path, args.phase) + if args.phase == 'train': + dataset_cfg = cfg.train_dataloader.dataset + elif args.phase == 'val': + dataset_cfg = cfg.val_dataloader.dataset + elif args.phase == 'test': + dataset_cfg = cfg.test_dataloader.dataset + else: + raise ValueError("'--phase' only supports 'train', 'val' and 'test'.") + + # run validation + validator = DatasetValidator(dataset_cfg, output_path) if args.num_process > 1: # The default chunksize calculation method of Pool.map diff --git a/tools/model_converters/edgenext_to_mmcls.py b/tools/model_converters/edgenext_to_mmcls.py new file mode 100644 index 00000000..52224905 --- /dev/null +++ b/tools/model_converters/edgenext_to_mmcls.py @@ -0,0 +1,74 @@ +# Copyright (c) OpenMMLab.
All rights reserved. +import argparse +from pathlib import Path + +import torch + + +def convert_weights(weight): + """Weight Converter. + + Converts the weights from timm to mmcls + Args: + weight (dict): weight dict from timm + Returns: + Converted weight dict for mmcls + """ + result = dict() + result['meta'] = dict() + temp = dict() + mapping = { + 'dwconv': 'depthwise_conv', + 'pwconv1': 'pointwise_conv1', + 'pwconv2': 'pointwise_conv2', + 'xca': 'csa', + 'convs': 'conv_modules', + 'token_projection': 'proj', + 'pos_embd': 'pos_embed', + 'temperature': 'scale', + } + strict_mapping = { + 'norm.weight': 'norm3.weight', + 'norm.bias': 'norm3.bias', + } + + if 'model_ema' in weight: + weight = weight['model_ema'] + elif 'state_dict' in weight: + weight = weight['state_dict'] # for models trained with USI + else: + raise NotImplementedError + + for k, v in weight.items(): + # keyword mapping + for mk, mv in mapping.items(): + if mk in k: + k = k.replace(mk, mv) + # strict mapping + for mk, mv in strict_mapping.items(): + if mk == k: + k = mv + + if k.startswith('head.'): + temp['head.fc.' + k[5:]] = v + else: + temp['backbone.' + k] = v + + result['state_dict'] = temp + return result + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument('src', help='src edgenext model path') + parser.add_argument('dst', help='save path') + args = parser.parse_args() + dst = Path(args.dst) + if dst.suffix != '.pth': + print('The path should contain the name of the pth format file.') + exit(1) + dst.parent.mkdir(parents=True, exist_ok=True) + + original_model = torch.load(args.src, map_location='cpu') + converted_model = convert_weights(original_model) + torch.save(converted_model, args.dst) diff --git a/tools/model_converters/reparameterize_repvgg.py b/tools/model_converters/reparameterize_repvgg.py deleted file mode 100644 index e075d837..00000000 --- a/tools/model_converters/reparameterize_repvgg.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -import warnings -from pathlib import Path - -import torch - -from mmcls.apis import init_model - -bright_style, reset_style = '\x1b[1m', '\x1b[0m' -red_text, blue_text = '\x1b[31m', '\x1b[34m' -white_background = '\x1b[107m' - -msg = bright_style + red_text -msg += 'DeprecationWarning: This tool will be deprecated in future. ' -msg += red_text + 'Welcome to use the ' -msg += white_background -msg += '"tools/convert_models/reparameterize_model.py"' -msg += reset_style -warnings.warn(msg) - - -def convert_repvggblock_param(config_path, checkpoint_path, save_path): - model = init_model(config_path, checkpoint=checkpoint_path) - print('Converting...') - - model.backbone.switch_to_deploy() - torch.save(model.state_dict(), save_path) - - print('Done!
Save at path "{}"'.format(save_path)) - - -def main(): - parser = argparse.ArgumentParser( - description='Convert the parameters of the repvgg block ' - 'from training mode to deployment mode.') - parser.add_argument( - 'config_path', - help='The path to the configuration file of the network ' - 'containing the repvgg block.') - parser.add_argument( - 'checkpoint_path', - help='The path to the checkpoint file corresponding to the model.') - parser.add_argument( - 'save_path', - help='The path where the converted checkpoint file is stored.') - args = parser.parse_args() - - save_path = Path(args.save_path) - if save_path.suffix != '.pth': - print('The path should contain the name of the pth format file.') - exit(1) - save_path.parent.mkdir(parents=True, exist_ok=True) - - convert_repvggblock_param(args.config_path, args.checkpoint_path, - args.save_path) - - -if __name__ == '__main__': - main() diff --git a/tools/test.py b/tools/test.py index e5b688d3..0fd56510 100644 --- a/tools/test.py +++ b/tools/test.py @@ -2,9 +2,10 @@ import argparse import os import os.path as osp +from copy import deepcopy import mmengine -from mmengine.config import Config, DictAction +from mmengine.config import Config, ConfigDict, DictAction from mmengine.hooks import Hook from mmengine.runner import Runner @@ -51,6 +52,10 @@ def parse_args(): type=float, default=2, help='display time of every window. (second)') + parser.add_argument( + '--no-pin-memory', + action='store_true', + help='whether to disable the pin_memory option in dataloaders.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], @@ -65,6 +70,19 @@ def parse_args(): def merge_args(cfg, args): """Merge CLI arguments to config.""" + cfg.launcher = args.launcher + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + # -------------------- visualization -------------------- if args.show or (args.show_dir is not None): assert 'visualization' in cfg.default_hooks, \ @@ -87,6 +105,26 @@ def merge_args(cfg, args): else: cfg.test_evaluator = [cfg.test_evaluator, dump_metric] + # set dataloader args + default_dataloader_cfg = ConfigDict( + pin_memory=True, + collate_fn=dict(type='default_collate'), + ) + + def set_default_dataloader_cfg(cfg, field): + if cfg.get(field, None) is None: + return + dataloader_cfg = deepcopy(default_dataloader_cfg) + dataloader_cfg.update(cfg[field]) + cfg[field] = dataloader_cfg + if args.no_pin_memory: + cfg[field]['pin_memory'] = False + + set_default_dataloader_cfg(cfg, 'test_dataloader') + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + return cfg @@ -100,20 +138,6 @@ def main(): # load config cfg = Config.fromfile(args.config) cfg = merge_args(cfg, args) - cfg.launcher = args.launcher - if args.cfg_options is not None: - cfg.merge_from_dict(args.cfg_options) - - # work_dir is determined in this priority: CLI > segment in file > filename - if args.work_dir is not None: - # update configs according to CLI args if args.work_dir is not None - cfg.work_dir = args.work_dir - elif cfg.get('work_dir', None) is None: - # use config filename as default work_dir if cfg.work_dir is None - 
cfg.work_dir = osp.join('./work_dirs', - osp.splitext(osp.basename(args.config))[0]) - - cfg.load_from = args.checkpoint # build the runner from config runner = Runner.from_cfg(cfg) diff --git a/tools/train.py b/tools/train.py index 4b509601..257cbabb 100644 --- a/tools/train.py +++ b/tools/train.py @@ -2,9 +2,12 @@ import argparse import os import os.path as osp +from copy import deepcopy -from mmengine.config import Config, DictAction +from mmengine.config import Config, ConfigDict, DictAction from mmengine.runner import Runner +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION from mmcls.utils import register_all_modules @@ -34,6 +37,15 @@ def parse_args(): action='store_true', help='whether to auto scale the learning rate according to the ' 'actual batch size and the original batch size.') + parser.add_argument( + '--no-pin-memory', + action='store_true', + help='whether to disable the pin_memory option in dataloaders.') + parser.add_argument( + '--no-persistent-workers', + action='store_true', + help='whether to disable the persistent_workers option in dataloaders.' + ) parser.add_argument( '--cfg-options', nargs='+', @@ -96,6 +108,30 @@ def merge_args(cfg, args): if args.auto_scale_lr: cfg.auto_scale_lr.enable = True + # set dataloader args + default_dataloader_cfg = ConfigDict( + pin_memory=True, + persistent_workers=True, + collate_fn=dict(type='default_collate'), + ) + if digit_version(TORCH_VERSION) < digit_version('1.8.0'): + default_dataloader_cfg.persistent_workers = False + + def set_default_dataloader_cfg(cfg, field): + if cfg.get(field, None) is None: + return + dataloader_cfg = deepcopy(default_dataloader_cfg) + dataloader_cfg.update(cfg[field]) + cfg[field] = dataloader_cfg + if args.no_pin_memory: + cfg[field]['pin_memory'] = False + if args.no_persistent_workers: + cfg[field]['persistent_workers'] = False + + set_default_dataloader_cfg(cfg, 'train_dataloader') + set_default_dataloader_cfg(cfg, 'val_dataloader') + set_default_dataloader_cfg(cfg, 'test_dataloader') + if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) diff --git a/tools/visualizations/browse_dataset.py b/tools/visualizations/browse_dataset.py index 6af99368..99245e7a 100644 --- a/tools/visualizations/browse_dataset.py +++ b/tools/visualizations/browse_dataset.py @@ -8,10 +8,10 @@ import mmcv import numpy as np from mmengine.config import Config, DictAction from mmengine.dataset import Compose +from mmengine.utils import ProgressBar from mmengine.visualization import Visualizer from mmcls.datasets.builder import build_dataset -from mmcls.registry import VISUALIZERS from mmcls.utils import register_all_modules from mmcls.visualization import ClsVisualizer from mmcls.visualization.cls_visualizer import _get_adaptive_scale @@ -22,12 +22,14 @@ def parse_args(): parser.add_argument('config', help='train config file path') parser.add_argument( '--output-dir', + '-o', default=None, type=str, help='If there is no display interface, you can save it.') parser.add_argument('--not-show', default=False, action='store_true') parser.add_argument( '--phase', + '-p', default='train', type=str, choices=['train', 'test', 'val'], @@ -35,6 +37,7 @@ def parse_args(): ' Defaults to "train".') parser.add_argument( '--show-number', + '-n', type=int, default=sys.maxsize, help='number of images selected to visualize, must bigger than 0. 
if ' @@ -42,11 +45,13 @@ 'dataset; default "sys.maxsize", show all images in dataset') parser.add_argument( '--show-interval', + '-i', type=float, default=2, help='the interval of show (s)') parser.add_argument( '--mode', + '-m', default='transformed', type=str, choices=['original', 'transformed', 'concat', 'pipeline'], @@ -58,9 +63,17 @@ 'Defaults to "transformed".') parser.add_argument( '--rescale-factor', + '-r', type=float, help='image rescale factor, which is useful if the output is too ' 'large or too small.') + parser.add_argument( + '--channel-order', + '-c', + default='BGR', + choices=['BGR', 'RGB'], + help='The channel order of the displayed images, could be "BGR" ' + 'or "RGB". Defaults to "BGR".') parser.add_argument( '--cfg-options', nargs='+', @@ -168,12 +181,13 @@ intermediate_imgs) # init visualizer - visualizer: ClsVisualizer = VISUALIZERS.build(cfg.visualizer) + cfg.visualizer.pop('type') + visualizer = ClsVisualizer(**cfg.visualizer) visualizer.dataset_meta = dataset.metainfo # init visualization image number display_number = min(args.show_number, len(dataset)) - progress_bar = mmcv.ProgressBar(display_number) + progress_bar = ProgressBar(display_number) for i, item in zip(range(display_number), dataset): rescale_factor = args.rescale_factor @@ -195,11 +209,11 @@ intermediate_imgs.clear() - data_sample = item['data_sample'].numpy() + data_sample = item['data_samples'].numpy() # get filename from dataset or just use index as filename - if hasattr(item['data_sample'], 'img_path'): - filename = osp.basename(item['data_sample'].img_path) + if hasattr(item['data_samples'], 'img_path'): + filename = osp.basename(item['data_samples'].img_path) else: # some datasets have no image path filename = f'{i}.jpg' @@ -209,7 +223,7 @@ visualizer.add_datasample( filename, - image[..., ::-1], + image if args.channel_order == 'RGB' else image[..., ::-1], data_sample, rescale_factor=rescale_factor, show=not args.not_show, diff --git a/tools/visualizations/vis_cam.py b/tools/visualizations/vis_cam.py index a1fcadac..83241cae 100644 --- a/tools/visualizations/vis_cam.py +++ b/tools/visualizations/vis_cam.py @@ -8,13 +8,14 @@ from pathlib import Path import mmcv import numpy as np -from mmcv import Config, DictAction -from mmcv.utils import to_2tuple +from mmcv.transforms import Compose +from mmengine.config import Config, DictAction +from mmengine.utils import to_2tuple from torch.nn import BatchNorm1d, BatchNorm2d, GroupNorm, LayerNorm from mmcls import digit_version from mmcls.apis import init_model -from mmcls.datasets.pipelines import Compose +from mmcls.utils import register_all_modules try: from pytorch_grad_cam import (EigenCAM, EigenGradCAM, GradCAM, @@ -26,9 +27,6 @@ except ImportError: raise ImportError('Please run `pip install "grad-cam>=1.3.6"` to install ' '3rd party package pytorch_grad_cam.') -# set of transforms, which just change data format, not change the pictures -FORMAT_TRANSFORMS_SET = {'ToTensor', 'Normalize', 'ImageToTensor', 'Collect'} - # Supported grad-cam type map METHOD_MAP = { 'gradcam': GradCAM, @@ -159,56 +157,16 @@ def build_reshape_transform(model, args): return _reshape_transform -def apply_transforms(img_path, pipeline_cfg): - """Apply transforms pipeline and get both formatted data and the image - without formatting.""" - data = dict(img_info=dict(filename=img_path), img_prefix=None) - - def split_pipeline_cfg(pipeline_cfg): - """to split the transfoms into image_transforms and
diff --git a/tools/visualizations/vis_cam.py b/tools/visualizations/vis_cam.py
index a1fcadac..83241cae 100644
--- a/tools/visualizations/vis_cam.py
+++ b/tools/visualizations/vis_cam.py
@@ -8,13 +8,14 @@ from pathlib import Path
 
 import mmcv
 import numpy as np
-from mmcv import Config, DictAction
-from mmcv.utils import to_2tuple
+from mmcv.transforms import Compose
+from mmengine.config import Config, DictAction
+from mmengine.utils import to_2tuple
 from torch.nn import BatchNorm1d, BatchNorm2d, GroupNorm, LayerNorm
 
 from mmcls import digit_version
 from mmcls.apis import init_model
-from mmcls.datasets.pipelines import Compose
+from mmcls.utils import register_all_modules
 
 try:
     from pytorch_grad_cam import (EigenCAM, EigenGradCAM, GradCAM,
@@ -26,9 +27,6 @@ except ImportError:
     raise ImportError('Please run `pip install "grad-cam>=1.3.6"` to install '
                       '3rd party package pytorch_grad_cam.')
 
-# set of transforms, which just change data format, not change the pictures
-FORMAT_TRANSFORMS_SET = {'ToTensor', 'Normalize', 'ImageToTensor', 'Collect'}
-
 # Supported grad-cam type map
 METHOD_MAP = {
     'gradcam': GradCAM,
@@ -159,56 +157,16 @@ def build_reshape_transform(model, args):
     return _reshape_transform
 
 
-def apply_transforms(img_path, pipeline_cfg):
-    """Apply transforms pipeline and get both formatted data and the image
-    without formatting."""
-    data = dict(img_info=dict(filename=img_path), img_prefix=None)
-
-    def split_pipeline_cfg(pipeline_cfg):
-        """to split the transfoms into image_transforms and
-        format_transforms."""
-        image_transforms_cfg, format_transforms_cfg = [], []
-        if pipeline_cfg[0]['type'] != 'LoadImageFromFile':
-            pipeline_cfg.insert(0, dict(type='LoadImageFromFile'))
-        for transform in pipeline_cfg:
-            if transform['type'] in FORMAT_TRANSFORMS_SET:
-                format_transforms_cfg.append(transform)
-            else:
-                image_transforms_cfg.append(transform)
-        return image_transforms_cfg, format_transforms_cfg
-
-    image_transforms, format_transforms = split_pipeline_cfg(pipeline_cfg)
-    image_transforms = Compose(image_transforms)
-    format_transforms = Compose(format_transforms)
-
-    intermediate_data = image_transforms(data)
-    inference_img = copy.deepcopy(intermediate_data['img'])
-    format_data = format_transforms(intermediate_data)
-
-    return format_data, inference_img
-
-
-class MMActivationsAndGradients(ActivationsAndGradients):
-    """Activations and gradients manager for mmcls models."""
-
-    def __call__(self, x):
-        self.gradients = []
-        self.activations = []
-        return self.model(
-            x, return_loss=False, softmax=False, post_process=False)
-
-
 def init_cam(method, model, target_layers, use_cuda, reshape_transform):
     """Construct the CAM object once, In order to be compatible with mmcls,
     here we modify the ActivationsAndGradients object."""
-
     GradCAM_Class = METHOD_MAP[method.lower()]
     cam = GradCAM_Class(
         model=model, target_layers=target_layers, use_cuda=use_cuda)
     # Release the original hooks in ActivationsAndGradients to use
-    # MMActivationsAndGradients.
+    # ActivationsAndGradients.
     cam.activations_and_grads.release()
-    cam.activations_and_grads = MMActivationsAndGradients(
+    cam.activations_and_grads = ActivationsAndGradients(
         cam.model, cam.target_layers, reshape_transform)
 
     return cam
@@ -306,6 +264,7 @@ def main():
     if args.cfg_options is not None:
         cfg.merge_from_dict(args.cfg_options)
 
+    register_all_modules()
     # build the model from a config file and a checkpoint file
     model = init_model(cfg, args.checkpoint, device=args.device)
     if args.preview_model:
@@ -314,7 +273,10 @@ def main():
         return
 
     # apply transform and perpare data
-    data, src_img = apply_transforms(args.img, cfg.data.test.pipeline)
+    transforms = Compose(cfg.test_dataloader.dataset.pipeline)
+    data = transforms({'img_path': args.img})
+    src_img = copy.deepcopy(data['inputs']).numpy().transpose(1, 2, 0)
+    data = model.data_preprocessor(data, False)
 
     # build target layers
     if args.target_layers:
@@ -344,7 +306,7 @@ def main():
     # calculate cam grads and show|save the visualization image
     grayscale_cam = cam(
-        data['img'].unsqueeze(0),
+        data['inputs'].unsqueeze(0),
         targets,
         eigen_smooth=args.eigen_smooth,
         aug_smooth=args.aug_smooth)
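For orientation, the reworked data path above drops the old split-pipeline helper: the test pipeline now returns a dict whose `'inputs'` tensor is CHW, the script transposes a copy to HWC for the overlay image, and `data_preprocessor` handles normalization before the CAM forward pass. A short sketch of the tensor bookkeeping with a stand-in tensor (shapes and dtype are illustrative; only the transpose/unsqueeze calls come from the diff):

```python
import torch

# Stand-in for data['inputs'] as the pipeline produces it: CHW.
chw = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)

# HWC copy for drawing, as src_img is built above.
hwc = chw.clone().numpy().transpose(1, 2, 0)
assert hwc.shape == (224, 224, 3)

# The CAM call adds a batch dimension, matching data['inputs'].unsqueeze(0).
assert chw.unsqueeze(0).shape == (1, 3, 224, 224)
```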
diff --git a/tools/visualizations/vis_scheduler.py b/tools/visualizations/vis_scheduler.py
index 0375d616..87d076dc 100644
--- a/tools/visualizations/vis_scheduler.py
+++ b/tools/visualizations/vis_scheduler.py
@@ -76,22 +76,30 @@ def parse_args():
         description='Visualize a Dataset Pipeline')
     parser.add_argument('config', help='config file path')
     parser.add_argument(
-        '--param',
+        '-p',
+        '--parameter',
         type=str,
         default='lr',
         choices=['lr', 'momentum'],
-        help='The param to visualize its change curve, choose from'
+        help='The parameter to visualize its change curve, choose from '
         '"lr" and "momentum". Defaults to "lr".')
     parser.add_argument(
+        '-d',
         '--dataset-size',
         type=int,
         help='The size of the dataset. If specify, `build_dataset` will '
         'be skipped and use this size as the dataset size.')
     parser.add_argument(
+        '-n',
         '--ngpus',
         type=int,
         default=1,
         help='The number of GPUs used in training.')
+    parser.add_argument(
+        '-s',
+        '--save-path',
+        type=Path,
+        help='The path to save the parameter curve plot.')
     parser.add_argument(
         '--log-level',
         default='WARNING',
@@ -100,10 +108,6 @@ def parse_args():
     parser.add_argument('--title', type=str, help='title of figure')
     parser.add_argument(
         '--style', type=str, default='whitegrid', help='style of plt')
-    parser.add_argument(
-        '--save-path',
-        type=Path,
-        help='The learning rate curve plot save path')
     parser.add_argument('--not-show', default=False, action='store_true')
     parser.add_argument(
         '--window-size',
@@ -166,6 +170,7 @@ def simulate_train(data_loader, cfg, by_epoch):
     param_record_hook = ParamRecordHook(by_epoch=by_epoch)
     default_hooks = dict(
         param_scheduler=cfg.default_hooks['param_scheduler'],
+        runtime_info=None,
         timer=None,
         logger=None,
         checkpoint=None,
@@ -246,12 +251,12 @@ def main():
     # simulation training process
     lr_list, momentum_list = simulate_train(data_loader, cfg, by_epoch)
 
-    if args.param == 'lr':
+    if args.parameter == 'lr':
         param_list = lr_list
     else:
         param_list = momentum_list
 
-    param_name = 'Learning Rate' if args.param == 'lr' else 'Momentum'
+    param_name = 'Learning Rate' if args.parameter == 'lr' else 'Momentum'
     plot_curve(param_list, args, param_name, len(data_loader), by_epoch)
 
     if args.save_path:
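Since `--param` becomes `-p`/`--parameter` above, note that argparse derives the attribute name from the first long option string, which is why the code switches to `args.parameter`. A minimal toy parser demonstrating that behavior (not the script itself):

```python
import argparse

parser = argparse.ArgumentParser()
# dest is inferred from the first long option, so the value lands on
# args.parameter even when the short form -p is used.
parser.add_argument('-p', '--parameter', default='lr',
                    choices=['lr', 'momentum'])

args = parser.parse_args(['-p', 'momentum'])
print(args.parameter)  # momentum
```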