From 742c2d524726d426ea2745055a5b217c020ccc72 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Thu, 27 May 2021 18:03:29 -0700 Subject: [PATCH 01/12] Add Gather-Excite and Global Context attn modules. Refactor existing SE-like attn for consistency and refactor byob/byoanet for less redundancy. --- timm/models/__init__.py | 1 - timm/models/byoanet.py | 374 +++++++-------------------- timm/models/byobnet.py | 365 +++++++++++++++++++++----- timm/models/layers/__init__.py | 8 +- timm/models/layers/cbam.py | 71 ++--- timm/models/layers/create_attn.py | 11 +- timm/models/layers/eca.py | 6 + timm/models/layers/gather_excite.py | 90 +++++++ timm/models/layers/global_context.py | 67 +++++ timm/models/layers/involution.py | 6 +- timm/models/layers/mlp.py | 23 ++ timm/models/layers/norm.py | 9 + timm/models/layers/se.py | 50 ---- timm/models/layers/squeeze_excite.py | 74 ++++++ timm/models/nfnet.py | 15 +- timm/models/regnet.py | 2 +- timm/models/resnet.py | 14 +- timm/models/rexnet.py | 29 +-- timm/models/tresnet.py | 6 +- timm/models/visformer.py | 15 +- 20 files changed, 744 insertions(+), 492 deletions(-) create mode 100644 timm/models/layers/gather_excite.py create mode 100644 timm/models/layers/global_context.py delete mode 100644 timm/models/layers/se.py create mode 100644 timm/models/layers/squeeze_excite.py diff --git a/timm/models/__init__.py b/timm/models/__init__.py index 788b7518..06217e18 100644 --- a/timm/models/__init__.py +++ b/timm/models/__init__.py @@ -17,7 +17,6 @@ from .inception_resnet_v2 import * from .inception_v3 import * from .inception_v4 import * from .levit import * -#from .levit import * from .mlp_mixer import * from .mobilenetv3 import * from .nasnet import * diff --git a/timm/models/byoanet.py b/timm/models/byoanet.py index c179a01c..73c6811b 100644 --- a/timm/models/byoanet.py +++ b/timm/models/byoanet.py @@ -12,24 +12,12 @@ Consider all of the models definitions here as experimental WIP and likely to ch Hacked together by / copyright Ross Wightman, 2021. 
""" -import math -from dataclasses import dataclass, field -from collections import OrderedDict -from typing import Tuple, List, Optional, Union, Any, Callable -from functools import partial - -import torch -import torch.nn as nn - from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .byobnet import BlocksCfg, ByobCfg, create_byob_stem, create_byob_stages, create_downsample,\ - reduce_feat_size, register_block, num_groups, LayerFn, _init_weights +from .byobnet import ByoBlockCfg, ByoModelCfg, ByobNet, interleave_blocks from .helpers import build_model_with_cfg -from .layers import ClassifierHead, ConvBnAct, DropPath, get_act_layer, convert_norm_act, get_attn, get_self_attn,\ - make_divisible, to_2tuple from .registry import register_model -__all__ = ['ByoaNet'] +__all__ = [] def _cfg(url='', **kwargs): @@ -63,100 +51,68 @@ default_cfgs = { 'swinnet50ts_256': _cfg(url='', fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)), 'eca_swinnext26ts_256': _cfg(url='', fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)), - 'rednet26t': _cfg(url='', fixed_input_size=False, input_size=(3, 256, 256), pool_size=(8, 8)), - 'rednet50ts': _cfg(url='', fixed_input_size=False, input_size=(3, 256, 256), pool_size=(8, 8)), + 'rednet26t': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), + 'rednet50ts': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), } -@dataclass -class ByoaBlocksCfg(BlocksCfg): - # FIXME allow overriding self_attn layer or args per block/stage, - pass - - -@dataclass -class ByoaCfg(ByobCfg): - blocks: Tuple[Union[ByoaBlocksCfg, Tuple[ByoaBlocksCfg, ...]], ...] = None - self_attn_layer: Optional[str] = None - self_attn_fixed_size: bool = False - self_attn_kwargs: dict = field(default_factory=lambda: dict()) - - -def interleave_attn( - types : Tuple[str, str], every: Union[int, List[int]], d, first: bool = False, **kwargs -) -> Tuple[ByoaBlocksCfg]: - """ interleave attn blocks - """ - assert len(types) == 2 - if isinstance(every, int): - every = list(range(0 if first else every, d, every)) - if not every: - every = [d - 1] - set(every) - blocks = [] - for i in range(d): - block_type = types[1] if i in every else types[0] - blocks += [ByoaBlocksCfg(type=block_type, d=1, **kwargs)] - return tuple(blocks) - - model_cfgs = dict( - botnet26t=ByoaCfg( + botnet26t=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', stem_pool='maxpool', num_features=0, + fixed_input_size=True, self_attn_layer='bottleneck', - self_attn_fixed_size=True, self_attn_kwargs=dict() ), - botnet50ts=ByoaCfg( + botnet50ts=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=6, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=256, s=2, gs=0, br=0.25), + 
ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=6, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=3, c=2048, s=1, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', stem_pool='', num_features=0, + fixed_input_size=True, act_layer='silu', self_attn_layer='bottleneck', - self_attn_fixed_size=True, self_attn_kwargs=dict() ), - eca_botnext26ts=ByoaCfg( + eca_botnext26ts=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=1, gs=16, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=16, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=2, gs=16, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=16, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=16, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), + ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=16, br=0.25), ), stem_chs=64, stem_type='tiered', stem_pool='maxpool', num_features=0, + fixed_input_size=True, act_layer='silu', attn_layer='eca', self_attn_layer='bottleneck', - self_attn_fixed_size=True, self_attn_kwargs=dict() ), - halonet_h1=ByoaCfg( + halonet_h1=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='self_attn', d=3, c=64, s=1, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=3, c=128, s=2, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=3, c=64, s=1, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=3, c=128, s=2, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0), ), stem_chs=64, stem_type='7x7', @@ -165,12 +121,12 @@ model_cfgs = dict( self_attn_layer='halo', self_attn_kwargs=dict(block_size=8, halo_size=3), ), - halonet_h1_c4c5=ByoaCfg( + halonet_h1_c4c5=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=64, s=1, gs=0, br=1.0), - ByoaBlocksCfg(type='bottle', d=3, c=128, s=2, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0), + ByoBlockCfg(type='bottle', d=3, c=64, s=1, gs=0, br=1.0), + ByoBlockCfg(type='bottle', d=3, c=128, s=2, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0), ), stem_chs=64, stem_type='tiered', @@ -179,12 +135,12 @@ model_cfgs = dict( self_attn_layer='halo', self_attn_kwargs=dict(block_size=8, halo_size=3), ), - halonet26t=ByoaCfg( + halonet26t=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', @@ -193,12 +149,12 @@ model_cfgs = dict( self_attn_layer='halo', self_attn_kwargs=dict(block_size=8, halo_size=2) # intended for 256x256 res ), - 
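# A sketch of what one of the interleave_blocks(...) calls above expands to, per its
# definition in byobnet.py further down in this patch:
#   interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25)
#   == (ByoBlockCfg(type='bottle',    d=1, c=1024, s=2, gs=0, br=0.25),
#       ByoBlockCfg(type='self_attn', d=1, c=1024, s=2, gs=0, br=0.25))
# i.e. with every=1 the first block of the stage stays 'bottle' and the rest become 'self_attn'.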
halonet50ts=ByoaCfg( + halonet50ts=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=6, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=6, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', @@ -208,12 +164,12 @@ model_cfgs = dict( self_attn_layer='halo', self_attn_kwargs=dict(block_size=8, halo_size=2) ), - eca_halonext26ts=ByoaCfg( + eca_halonext26ts=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25), - ByoaBlocksCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), ), stem_chs=64, stem_type='tiered', @@ -225,12 +181,12 @@ model_cfgs = dict( self_attn_kwargs=dict(block_size=8, halo_size=2) # intended for 256x256 res ), - lambda_resnet26t=ByoaCfg( + lambda_resnet26t=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', @@ -239,12 +195,12 @@ model_cfgs = dict( self_attn_layer='lambda', self_attn_kwargs=dict() ), - lambda_resnet50t=ByoaCfg( + lambda_resnet50t=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=3, d=6, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=3, d=6, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', @@ -253,12 +209,12 @@ model_cfgs = dict( self_attn_layer='lambda', self_attn_kwargs=dict() ), - eca_lambda_resnext26ts=ByoaCfg( + eca_lambda_resnext26ts=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25), - ByoaBlocksCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, 
br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), ), stem_chs=64, stem_type='tiered', @@ -270,77 +226,76 @@ model_cfgs = dict( self_attn_kwargs=dict() ), - swinnet26t=ByoaCfg( + swinnet26t=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', stem_pool='maxpool', num_features=0, + fixed_input_size=True, self_attn_layer='swin', - self_attn_fixed_size=True, self_attn_kwargs=dict(win_size=8) ), - swinnet50ts=ByoaCfg( + swinnet50ts=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=4, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=4, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', stem_pool='maxpool', num_features=0, + fixed_input_size=True, act_layer='silu', self_attn_layer='swin', - self_attn_fixed_size=True, self_attn_kwargs=dict(win_size=8) ), - eca_swinnext26ts=ByoaCfg( + eca_swinnext26ts=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=512, s=2, gs=16, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=512, s=2, gs=16, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), ), stem_chs=64, stem_type='tiered', stem_pool='maxpool', num_features=0, + fixed_input_size=True, act_layer='silu', attn_layer='eca', self_attn_layer='swin', - self_attn_fixed_size=True, self_attn_kwargs=dict(win_size=8) ), - rednet26t=ByoaCfg( + rednet26t=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='self_attn', d=2, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=512, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=512, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=1024, s=2, gs=0, 
br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', # FIXME RedNet uses involution in middle of stem stem_pool='maxpool', num_features=0, self_attn_layer='involution', - self_attn_fixed_size=False, self_attn_kwargs=dict() ), - rednet50ts=ByoaCfg( + rednet50ts=ByoModelCfg( blocks=( - ByoaBlocksCfg(type='self_attn', d=3, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=4, c=512, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=3, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=4, c=512, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), ), stem_chs=64, stem_type='tiered', @@ -348,161 +303,14 @@ model_cfgs = dict( num_features=0, act_layer='silu', self_attn_layer='involution', - self_attn_fixed_size=False, self_attn_kwargs=dict() ), ) -@dataclass -class ByoaLayerFn(LayerFn): - self_attn: Optional[Callable] = None - - -class SelfAttnBlock(nn.Module): - """ ResNet-like Bottleneck Block - 1x1 - optional kxk - self attn - 1x1 - """ - - def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None, - downsample='avg', extra_conv=False, linear_out=False, post_attn_na=True, feat_size=None, - layers: ByoaLayerFn = None, drop_block=None, drop_path_rate=0.): - super(SelfAttnBlock, self).__init__() - assert layers is not None - mid_chs = make_divisible(out_chs * bottle_ratio) - groups = num_groups(group_size, mid_chs) - - if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: - self.shortcut = create_downsample( - downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0], - apply_act=False, layers=layers) - else: - self.shortcut = nn.Identity() - - self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) - if extra_conv: - self.conv2_kxk = layers.conv_norm_act( - mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], - groups=groups, drop_block=drop_block) - stride = 1 # striding done via conv if enabled - else: - self.conv2_kxk = nn.Identity() - opt_kwargs = {} if feat_size is None else dict(feat_size=feat_size) - # FIXME need to dilate self attn to have dilated network support, moop moop - self.self_attn = layers.self_attn(mid_chs, stride=stride, **opt_kwargs) - self.post_attn = layers.norm_act(mid_chs) if post_attn_na else nn.Identity() - self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() - self.act = nn.Identity() if linear_out else layers.act(inplace=True) - - def init_weights(self, zero_init_last_bn=False): - if zero_init_last_bn: - nn.init.zeros_(self.conv3_1x1.bn.weight) - if hasattr(self.self_attn, 'reset_parameters'): - self.self_attn.reset_parameters() - - def forward(self, x): - shortcut = self.shortcut(x) - - x = self.conv1_1x1(x) - x = self.conv2_kxk(x) - x = self.self_attn(x) - x = self.post_attn(x) - x = self.conv3_1x1(x) - x = self.drop_path(x) - - x = self.act(x + shortcut) - return x - -register_block('self_attn', SelfAttnBlock) - - -def _byoa_block_args(block_kwargs, block_cfg: ByoaBlocksCfg, model_cfg: ByoaCfg, feat_size=None): - if block_cfg.type == 'self_attn' and model_cfg.self_attn_fixed_size: - assert feat_size is not None - block_kwargs['feat_size'] = feat_size - return block_kwargs - - -def get_layer_fns(cfg: ByoaCfg): - act = get_act_layer(cfg.act_layer) - norm_act = convert_norm_act(norm_layer=cfg.norm_layer, act_layer=act) - conv_norm_act = partial(ConvBnAct, norm_layer=cfg.norm_layer, act_layer=act) - attn = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None - self_attn = partial(get_self_attn(cfg.self_attn_layer), **cfg.self_attn_kwargs) if cfg.self_attn_layer else None - layer_fn = ByoaLayerFn( - conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn, self_attn=self_attn) - return layer_fn - - -class ByoaNet(nn.Module): - """ 'Bring-your-own-attention' Net - - A ResNet inspired backbone that supports interleaving traditional residual blocks with - 'Self Attention' bottleneck blocks that replace the bottleneck kxk conv w/ a self-attention - or similar module. - - FIXME This class network definition is almost the same as ByobNet, I'd like to merge them but - torchscript limitations prevent sensible inheritance overrides. 
- """ - def __init__(self, cfg: ByoaCfg, num_classes=1000, in_chans=3, output_stride=32, global_pool='avg', - zero_init_last_bn=True, img_size=None, drop_rate=0., drop_path_rate=0.): - super().__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - layers = get_layer_fns(cfg) - feat_size = to_2tuple(img_size) if img_size is not None else None - - self.feature_info = [] - stem_chs = int(round((cfg.stem_chs or cfg.blocks[0].c) * cfg.width_factor)) - self.stem, stem_feat = create_byob_stem(in_chans, stem_chs, cfg.stem_type, cfg.stem_pool, layers=layers) - self.feature_info.extend(stem_feat[:-1]) - feat_size = reduce_feat_size(feat_size, stride=stem_feat[-1]['reduction']) - - self.stages, stage_feat = create_byob_stages( - cfg, drop_path_rate, output_stride, stem_feat[-1], - feat_size=feat_size, layers=layers, extra_args_fn=_byoa_block_args) - self.feature_info.extend(stage_feat[:-1]) - - prev_chs = stage_feat[-1]['num_chs'] - if cfg.num_features: - self.num_features = int(round(cfg.width_factor * cfg.num_features)) - self.final_conv = layers.conv_norm_act(prev_chs, self.num_features, 1) - else: - self.num_features = prev_chs - self.final_conv = nn.Identity() - self.feature_info += [ - dict(num_chs=self.num_features, reduction=stage_feat[-1]['reduction'], module='final_conv')] - - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - for n, m in self.named_modules(): - _init_weights(m, n) - for m in self.modules(): - # call each block's weight init for block-specific overrides to init above - if hasattr(m, 'init_weights'): - m.init_weights(zero_init_last_bn=zero_init_last_bn) - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - x = self.stem(x) - x = self.stages(x) - x = self.final_conv(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - def _create_byoanet(variant, cfg_variant=None, pretrained=False, **kwargs): return build_model_with_cfg( - ByoaNet, variant, pretrained, + ByobNet, variant, pretrained, default_cfg=default_cfgs[variant], model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], feature_cfg=dict(flatten_sequential=True), diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py index 8f4a2020..3f162c79 100644 --- a/timm/models/byobnet.py +++ b/timm/models/byobnet.py @@ -26,8 +26,7 @@ Hacked together by / copyright Ross Wightman, 2021. 
""" import math from dataclasses import dataclass, field, replace -from collections import OrderedDict -from typing import Tuple, List, Optional, Union, Any, Callable, Sequence +from typing import Tuple, List, Dict, Optional, Union, Any, Callable, Sequence from functools import partial import torch @@ -36,10 +35,10 @@ import torch.nn as nn from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .helpers import build_model_with_cfg from .layers import ClassifierHead, ConvBnAct, BatchNormAct2d, DropPath, AvgPool2dSame, \ - create_conv2d, get_act_layer, convert_norm_act, get_attn, make_divisible + create_conv2d, get_act_layer, convert_norm_act, get_attn, get_self_attn, make_divisible, to_2tuple from .registry import register_model -__all__ = ['ByobNet', 'ByobCfg', 'BlocksCfg', 'create_byob_stem', 'create_block'] +__all__ = ['ByobNet', 'ByoModelCfg', 'ByoBlockCfg', 'create_byob_stem', 'create_block'] def _cfg(url='', **kwargs): @@ -87,35 +86,52 @@ default_cfgs = { 'repvgg_b3g4': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3g4-73c370bf.pth', first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + + # experimental configs + 'resnet52qs': _cfg(first_conv='stem.conv1.conv'), + 'geresnet50t': _cfg(first_conv='stem.conv1.conv'), + 'gcresnet50t': _cfg(first_conv='stem.conv1.conv'), } @dataclass -class BlocksCfg: +class ByoBlockCfg: type: Union[str, nn.Module] d: int # block depth (number of block repeats in stage) c: int # number of output channels for each block in stage s: int = 2 # stride of stage (first block) gs: Optional[Union[int, Callable]] = None # group-size of blocks in stage, conv is depthwise if gs == 1 br: float = 1. # bottleneck-ratio of blocks in stage - no_attn: bool = False # disable channel attn (ie SE) when layer is set for model + + # NOTE: these config items override the model cfgs that are applied to all blocks by default + attn_layer: Optional[str] = None + attn_kwargs: Optional[Dict[str, Any]] = None + self_attn_layer: Optional[str] = None + self_attn_kwargs: Optional[Dict[str, Any]] = None + block_kwargs: Optional[Dict[str, Any]] = None @dataclass -class ByobCfg: - blocks: Tuple[Union[BlocksCfg, Tuple[BlocksCfg, ...]], ...] +class ByoModelCfg: + blocks: Tuple[Union[ByoBlockCfg, Tuple[ByoBlockCfg, ...]], ...] 
downsample: str = 'conv1x1' stem_type: str = '3x3' - stem_pool: str = '' + stem_pool: Optional[str] = 'maxpool' stem_chs: int = 32 width_factor: float = 1.0 num_features: int = 0 # num out_channels for final conv, no final 1x1 conv if 0 zero_init_last_bn: bool = True + fixed_input_size: bool = False # model constrained to a fixed-input size / img_size must be provided on creation act_layer: str = 'relu' norm_layer: str = 'batchnorm' + + # NOTE: these config items will be overridden by the block cfg (per-block) if they are set there attn_layer: Optional[str] = None attn_kwargs: dict = field(default_factory=lambda: dict()) + self_attn_layer: Optional[str] = None + self_attn_kwargs: dict = field(default_factory=lambda: dict()) + block_kwargs: Dict[str, Any] = field(default_factory=lambda: dict()) def _rep_vgg_bcfg(d=(4, 6, 16, 1), wf=(1., 1., 1., 1.), groups=0): @@ -123,103 +139,155 @@ def _rep_vgg_bcfg(d=(4, 6, 16, 1), wf=(1., 1., 1., 1.), groups=0): group_size = 0 if groups > 0: group_size = lambda chs, idx: chs // groups if (idx + 1) % 2 == 0 else 0 - bcfg = tuple([BlocksCfg(type='rep', d=d, c=c * wf, gs=group_size) for d, c, wf in zip(d, c, wf)]) + bcfg = tuple([ByoBlockCfg(type='rep', d=d, c=c * wf, gs=group_size) for d, c, wf in zip(d, c, wf)]) return bcfg -model_cfgs = dict( +def interleave_blocks( + types: Tuple[str, str], every: Union[int, List[int]], d, first: bool = False, **kwargs +) -> Tuple[ByoBlockCfg]: + """ interleave 2 block types in stack + """ + assert len(types) == 2 + if isinstance(every, int): + every = list(range(0 if first else every, d, every)) + if not every: + every = [d - 1] + set(every) + blocks = [] + for i in range(d): + block_type = types[1] if i in every else types[0] + blocks += [ByoBlockCfg(type=block_type, d=1, **kwargs)] + return tuple(blocks) - gernet_l=ByobCfg( + +model_cfgs = dict( + gernet_l=ByoModelCfg( blocks=( - BlocksCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.), - BlocksCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.), - BlocksCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4), - BlocksCfg(type='bottle', d=5, c=640, s=2, gs=1, br=3.), - BlocksCfg(type='bottle', d=4, c=640, s=1, gs=1, br=3.), + ByoBlockCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.), + ByoBlockCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.), + ByoBlockCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4), + ByoBlockCfg(type='bottle', d=5, c=640, s=2, gs=1, br=3.), + ByoBlockCfg(type='bottle', d=4, c=640, s=1, gs=1, br=3.), ), stem_chs=32, + stem_pool=None, num_features=2560, ), - gernet_m=ByobCfg( + gernet_m=ByoModelCfg( blocks=( - BlocksCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.), - BlocksCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.), - BlocksCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4), - BlocksCfg(type='bottle', d=4, c=640, s=2, gs=1, br=3.), - BlocksCfg(type='bottle', d=1, c=640, s=1, gs=1, br=3.), + ByoBlockCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.), + ByoBlockCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.), + ByoBlockCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4), + ByoBlockCfg(type='bottle', d=4, c=640, s=2, gs=1, br=3.), + ByoBlockCfg(type='bottle', d=1, c=640, s=1, gs=1, br=3.), ), stem_chs=32, + stem_pool=None, num_features=2560, ), - gernet_s=ByobCfg( + gernet_s=ByoModelCfg( blocks=( - BlocksCfg(type='basic', d=1, c=48, s=2, gs=0, br=1.), - BlocksCfg(type='basic', d=3, c=48, s=2, gs=0, br=1.), - BlocksCfg(type='bottle', d=7, c=384, s=2, gs=0, br=1 / 4), - BlocksCfg(type='bottle', d=2, c=560, s=2, gs=1, br=3.), - BlocksCfg(type='bottle', d=1, 
c=256, s=1, gs=1, br=3.), + ByoBlockCfg(type='basic', d=1, c=48, s=2, gs=0, br=1.), + ByoBlockCfg(type='basic', d=3, c=48, s=2, gs=0, br=1.), + ByoBlockCfg(type='bottle', d=7, c=384, s=2, gs=0, br=1 / 4), + ByoBlockCfg(type='bottle', d=2, c=560, s=2, gs=1, br=3.), + ByoBlockCfg(type='bottle', d=1, c=256, s=1, gs=1, br=3.), ), stem_chs=13, + stem_pool=None, num_features=1920, ), - repvgg_a2=ByobCfg( + repvgg_a2=ByoModelCfg( blocks=_rep_vgg_bcfg(d=(2, 4, 14, 1), wf=(1.5, 1.5, 1.5, 2.75)), stem_type='rep', stem_chs=64, ), - repvgg_b0=ByobCfg( + repvgg_b0=ByoModelCfg( blocks=_rep_vgg_bcfg(wf=(1., 1., 1., 2.5)), stem_type='rep', stem_chs=64, ), - repvgg_b1=ByobCfg( + repvgg_b1=ByoModelCfg( blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.)), stem_type='rep', stem_chs=64, ), - repvgg_b1g4=ByobCfg( + repvgg_b1g4=ByoModelCfg( blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.), groups=4), stem_type='rep', stem_chs=64, ), - repvgg_b2=ByobCfg( + repvgg_b2=ByoModelCfg( blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.)), stem_type='rep', stem_chs=64, ), - repvgg_b2g4=ByobCfg( + repvgg_b2g4=ByoModelCfg( blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.), groups=4), stem_type='rep', stem_chs=64, ), - repvgg_b3=ByobCfg( + repvgg_b3=ByoModelCfg( blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.)), stem_type='rep', stem_chs=64, ), - repvgg_b3g4=ByobCfg( + repvgg_b3g4=ByoModelCfg( blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.), groups=4), stem_type='rep', stem_chs=64, ), - resnet52q=ByobCfg( + # WARN: experimental, may vanish/change + resnet52q=ByoModelCfg( blocks=( - BlocksCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), - BlocksCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25), - BlocksCfg(type='bottle', d=6, c=1536, s=2, gs=32, br=0.25), - BlocksCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0), + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=6, c=1536, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0), ), stem_chs=128, stem_type='quad', num_features=2048, act_layer='silu', ), + + # WARN: experimental, may vanish/change + geresnet50t=ByoModelCfg( + blocks=( + ByoBlockCfg(type='edge', d=3, c=256, s=1, br=0.25), + ByoBlockCfg(type='edge', d=4, c=512, s=2, br=0.25), + ByoBlockCfg(type='bottle', d=6, c=1024, s=2, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=2048, s=2, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool=None, + attn_layer='ge', + attn_kwargs=dict(extent=8, extra_params=True), + #attn_kwargs=dict(extent=8), + #block_kwargs=dict(attn_last=True) + ), + + # WARN: experimental, may vanish/change + gcresnet50t=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=256, s=1, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, br=0.25), + ByoBlockCfg(type='bottle', d=6, c=1024, s=2, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=2048, s=2, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool=None, + attn_layer='gc' + ), ) -def expand_blocks_cfg(stage_blocks_cfg: Union[BlocksCfg, Sequence[BlocksCfg]]) -> List[BlocksCfg]: +def expand_blocks_cfg(stage_blocks_cfg: Union[ByoBlockCfg, Sequence[ByoBlockCfg]]) -> List[ByoBlockCfg]: if not isinstance(stage_blocks_cfg, Sequence): stage_blocks_cfg = (stage_blocks_cfg,) block_cfgs = [] @@ -243,6 +311,7 @@ class LayerFn: norm_act: Callable = BatchNormAct2d act: Callable = nn.ReLU attn: Optional[Callable] = None + self_attn: Optional[Callable] = None class DownsampleAvg(nn.Module): @@ -275,7 +344,8 @@ class BasicBlock(nn.Module): def 
__init__( self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), group_size=None, bottle_ratio=1.0, - downsample='avg', linear_out=False, layers: LayerFn = None, drop_block=None, drop_path_rate=0.): + downsample='avg', attn_last=True, linear_out=False, layers: LayerFn = None, drop_block=None, + drop_path_rate=0.): super(BasicBlock, self).__init__() layers = layers or LayerFn() mid_chs = make_divisible(out_chs * bottle_ratio) @@ -289,15 +359,19 @@ class BasicBlock(nn.Module): self.shortcut = nn.Identity() self.conv1_kxk = layers.conv_norm_act(in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0]) + self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) self.conv2_kxk = layers.conv_norm_act( mid_chs, out_chs, kernel_size, dilation=dilation[1], groups=groups, drop_block=drop_block, apply_act=False) - self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs) + self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs) self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() self.act = nn.Identity() if linear_out else layers.act(inplace=True) - def init_weights(self, zero_init_last_bn=False): + def init_weights(self, zero_init_last_bn: bool = False): if zero_init_last_bn: nn.init.zeros_(self.conv2_kxk.bn.weight) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() def forward(self, x): shortcut = self.shortcut(x) @@ -317,7 +391,8 @@ class BottleneckBlock(nn.Module): """ def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None, - downsample='avg', linear_out=False, layers : LayerFn = None, drop_block=None, drop_path_rate=0.): + downsample='avg', attn_last=False, linear_out=False, layers: LayerFn = None, drop_block=None, + drop_path_rate=0.): super(BottleneckBlock, self).__init__() layers = layers or LayerFn() mid_chs = make_divisible(out_chs * bottle_ratio) @@ -334,14 +409,18 @@ class BottleneckBlock(nn.Module): self.conv2_kxk = layers.conv_norm_act( mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_block=drop_block) - self.attn = nn.Identity() if layers.attn is None else layers.attn(mid_chs) + self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) + self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs) self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() self.act = nn.Identity() if linear_out else layers.act(inplace=True) - def init_weights(self, zero_init_last_bn=False): + def init_weights(self, zero_init_last_bn: bool = False): if zero_init_last_bn: nn.init.zeros_(self.conv3_1x1.bn.weight) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() def forward(self, x): shortcut = self.shortcut(x) @@ -350,6 +429,7 @@ class BottleneckBlock(nn.Module): x = self.conv2_kxk(x) x = self.attn(x) x = self.conv3_1x1(x) + x = self.attn_last(x) x = self.drop_path(x) x = self.act(x + shortcut) @@ -368,7 +448,8 @@ class DarkBlock(nn.Module): """ def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, - downsample='avg', linear_out=False, layers: LayerFn = None, drop_block=None, drop_path_rate=0.): + downsample='avg', attn_last=True, linear_out=False, layers: LayerFn = None, drop_block=None, + drop_path_rate=0.): super(DarkBlock, self).__init__() layers = layers or LayerFn() mid_chs = make_divisible(out_chs * bottle_ratio) @@ -382,23 +463,28 @@ class DarkBlock(nn.Module): self.shortcut = nn.Identity() self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) + self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) self.conv2_kxk = layers.conv_norm_act( mid_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_block=drop_block, apply_act=False) - self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs) + self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs) self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() self.act = nn.Identity() if linear_out else layers.act(inplace=True) - def init_weights(self, zero_init_last_bn=False): + def init_weights(self, zero_init_last_bn: bool = False): if zero_init_last_bn: nn.init.zeros_(self.conv2_kxk.bn.weight) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() def forward(self, x): shortcut = self.shortcut(x) x = self.conv1_1x1(x) - x = self.conv2_kxk(x) x = self.attn(x) + x = self.conv2_kxk(x) + x = self.attn_last(x) x = self.drop_path(x) x = self.act(x + shortcut) return x @@ -415,7 +501,8 @@ class EdgeBlock(nn.Module): """ def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, - downsample='avg', linear_out=False, layers: LayerFn = None, drop_block=None, drop_path_rate=0.): + downsample='avg', attn_last=False, linear_out=False, layers: LayerFn = None, + drop_block=None, drop_path_rate=0.): super(EdgeBlock, self).__init__() layers = layers or LayerFn() mid_chs = make_divisible(out_chs * bottle_ratio) @@ -431,14 +518,18 @@ class EdgeBlock(nn.Module): self.conv1_kxk = layers.conv_norm_act( in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_block=drop_block) - self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs) + self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) self.conv2_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) + self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs) self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() self.act = nn.Identity() if linear_out else layers.act(inplace=True) - def init_weights(self, zero_init_last_bn=False): + def init_weights(self, zero_init_last_bn: bool = False): if zero_init_last_bn: nn.init.zeros_(self.conv2_1x1.bn.weight) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() def forward(self, x): shortcut = self.shortcut(x) @@ -446,6 +537,7 @@ class EdgeBlock(nn.Module): x = self.conv1_kxk(x) x = self.attn(x) x = self.conv2_1x1(x) + x = self.attn_last(x) x = self.drop_path(x) x = self.act(x + shortcut) return x @@ -460,7 +552,7 @@ class RepVggBlock(nn.Module): """ def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, - downsample='', layers : LayerFn = None, drop_block=None, drop_path_rate=0.): + downsample='', layers: LayerFn = None, drop_block=None, drop_path_rate=0.): super(RepVggBlock, self).__init__() layers = layers or LayerFn() groups = num_groups(group_size, in_chs) @@ -475,12 +567,15 @@ class RepVggBlock(nn.Module): self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. and use_ident else nn.Identity() self.act = layers.act(inplace=True) - def init_weights(self, zero_init_last_bn=False): + def init_weights(self, zero_init_last_bn: bool = False): # NOTE this init overrides that base model init with specific changes for the block type for m in self.modules(): if isinstance(m, nn.BatchNorm2d): nn.init.normal_(m.weight, .1, .1) nn.init.normal_(m.bias, 0, .1) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() def forward(self, x): if self.identity is None: @@ -495,12 +590,68 @@ class RepVggBlock(nn.Module): return x +class SelfAttnBlock(nn.Module): + """ ResNet-like Bottleneck Block - 1x1 - optional kxk - self attn - 1x1 + """ + + def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None, + downsample='avg', extra_conv=False, linear_out=False, post_attn_na=True, feat_size=None, + layers: LayerFn = None, drop_block=None, drop_path_rate=0.): + super(SelfAttnBlock, self).__init__() + assert layers is not None + mid_chs = make_divisible(out_chs * bottle_ratio) + groups = num_groups(group_size, mid_chs) + + if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: + self.shortcut = create_downsample( + downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0], + apply_act=False, layers=layers) + else: + self.shortcut = nn.Identity() + + self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) + if extra_conv: + self.conv2_kxk = layers.conv_norm_act( + mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], + groups=groups, drop_block=drop_block) + stride = 1 # striding done via conv if enabled + else: + self.conv2_kxk = nn.Identity() + opt_kwargs = {} if feat_size is None else dict(feat_size=feat_size) + # FIXME need to dilate self attn to have dilated network support, moop moop + self.self_attn = layers.self_attn(mid_chs, stride=stride, **opt_kwargs) + self.post_attn = layers.norm_act(mid_chs) if post_attn_na else nn.Identity() + self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + self.act = nn.Identity() if linear_out else layers.act(inplace=True) + + def init_weights(self, zero_init_last_bn: bool = False): + if zero_init_last_bn: + nn.init.zeros_(self.conv3_1x1.bn.weight) + if hasattr(self.self_attn, 'reset_parameters'): + self.self_attn.reset_parameters() + + def forward(self, x): + shortcut = self.shortcut(x) + + x = self.conv1_1x1(x) + x = self.conv2_kxk(x) + x = self.self_attn(x) + x = self.post_attn(x) + x = self.conv3_1x1(x) + x = self.drop_path(x) + + x = self.act(x + shortcut) + return x + + _block_registry = dict( basic=BasicBlock, bottle=BottleneckBlock, dark=DarkBlock, edge=EdgeBlock, rep=RepVggBlock, + self_attn=SelfAttnBlock, ) @@ -552,7 +703,7 @@ class Stem(nn.Sequential): curr_stride *= s prev_feat = conv_name - if 'max' in pool.lower(): + if pool and 'max' in pool.lower(): self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat)) self.add_module('pool', nn.MaxPool2d(3, 2, 1)) curr_stride *= 2 @@ -601,9 +752,58 @@ def reduce_feat_size(feat_size, stride=2): return None if feat_size is None else tuple([s // stride for s in feat_size]) +def override_kwargs(block_kwargs, model_kwargs): + """ Override model level attn/self-attn/block kwargs w/ block level + + NOTE: kwargs are NOT merged across levels, block_kwargs will fully replace model_kwargs + for the block if set to anything that isn't None. + + i.e. an empty block_kwargs dict will remove kwargs set at model level for that block + """ + out_kwargs = block_kwargs if block_kwargs is not None else model_kwargs + return out_kwargs or {} # make sure None isn't returned + + +def update_block_kwargs(block_kwargs: Dict[str, Any], block_cfg: ByoBlockCfg, model_cfg: ByoModelCfg, ): + layer_fns = block_kwargs['layers'] + + # override attn layer / args with block local config + if block_cfg.attn_kwargs is not None or block_cfg.attn_layer is not None: + # override attn layer config + if not block_cfg.attn_layer: + # empty string for attn_layer type will disable attn for this block + attn_layer = None + else: + attn_kwargs = override_kwargs(block_cfg.attn_kwargs, model_cfg.attn_kwargs) + attn_layer = block_cfg.attn_layer or model_cfg.attn_layer + attn_layer = partial(get_attn(attn_layer), *attn_kwargs) if attn_layer is not None else None + layer_fns = replace(layer_fns, attn=attn_layer) + + # override self-attn layer / args with block local cfg + if block_cfg.self_attn_kwargs is not None or block_cfg.self_attn_layer is not None: + # override attn layer config + if not block_cfg.self_attn_layer: + # empty string for self_attn_layer type will disable attn for this block + self_attn_layer = None + else: + self_attn_kwargs = override_kwargs(block_cfg.self_attn_kwargs, model_cfg.self_attn_kwargs) + self_attn_layer = block_cfg.self_attn_layer or model_cfg.self_attn_layer + self_attn_layer = partial(get_self_attn(self_attn_layer), *self_attn_kwargs) \ + if self_attn_layer is not None else None + layer_fns = replace(layer_fns, self_attn=self_attn_layer) + + block_kwargs['layers'] = layer_fns + + # add additional block_kwargs specified in block_cfg or model_cfg, precedence to block if set + block_kwargs.update(override_kwargs(block_cfg.block_kwargs, model_cfg.block_kwargs)) + + def create_byob_stages( - cfg, drop_path_rate, output_stride, stem_feat, - feat_size=None, layers=None, extra_args_fn=None): + cfg: ByoModelCfg, drop_path_rate: float, output_stride: int, stem_feat: Dict[str, Any], + feat_size: Optional[int] = None, + layers: Optional[LayerFn] = None, + 
block_kwargs_fn: Optional[Callable] = update_block_kwargs): + layers = layers or LayerFn() feature_info = [] block_cfgs = [expand_blocks_cfg(s) for s in cfg.blocks] @@ -641,8 +841,10 @@ def create_byob_stages( drop_path_rate=dpr[stage_idx][block_idx], layers=layers, ) - if extra_args_fn is not None: - extra_args_fn(block_kwargs, block_cfg=block_cfg, model_cfg=cfg, feat_size=feat_size) + if block_cfg.type in ('self_attn',): + # add feat_size arg for blocks that support/need it + block_kwargs['feat_size'] = feat_size + block_kwargs_fn(block_kwargs, block_cfg=block_cfg, model_cfg=cfg) blocks += [create_block(block_cfg.type, **block_kwargs)] first_dilation = dilation prev_chs = out_chs @@ -656,12 +858,13 @@ def create_byob_stages( return nn.Sequential(*stages), feature_info -def get_layer_fns(cfg: ByobCfg): +def get_layer_fns(cfg: ByoModelCfg): act = get_act_layer(cfg.act_layer) norm_act = convert_norm_act(norm_layer=cfg.norm_layer, act_layer=act) conv_norm_act = partial(ConvBnAct, norm_layer=cfg.norm_layer, act_layer=act) attn = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None - layer_fn = LayerFn(conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn) + self_attn = partial(get_self_attn(cfg.self_attn_layer), **cfg.self_attn_kwargs) if cfg.self_attn_layer else None + layer_fn = LayerFn(conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn, self_attn=self_attn) return layer_fn @@ -673,19 +876,24 @@ class ByobNet(nn.Module): Current assumption is that both stem and blocks are in conv-bn-act order (w/ block ending in act). """ - def __init__(self, cfg: ByobCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32, - zero_init_last_bn=True, drop_rate=0., drop_path_rate=0.): + def __init__(self, cfg: ByoModelCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32, + zero_init_last_bn=True, img_size=None, drop_rate=0., drop_path_rate=0.): super().__init__() self.num_classes = num_classes self.drop_rate = drop_rate layers = get_layer_fns(cfg) + if cfg.fixed_input_size: + assert img_size is not None, 'img_size argument is required for fixed input size model' + feat_size = to_2tuple(img_size) if img_size is not None else None self.feature_info = [] stem_chs = int(round((cfg.stem_chs or cfg.blocks[0].c) * cfg.width_factor)) self.stem, stem_feat = create_byob_stem(in_chans, stem_chs, cfg.stem_type, cfg.stem_pool, layers=layers) self.feature_info.extend(stem_feat[:-1]) + feat_size = reduce_feat_size(feat_size, stride=stem_feat[-1]['reduction']) - self.stages, stage_feat = create_byob_stages(cfg, drop_path_rate, output_stride, stem_feat[-1], layers=layers) + self.stages, stage_feat = create_byob_stages( + cfg, drop_path_rate, output_stride, stem_feat[-1], layers=layers, feat_size=feat_size) self.feature_info.extend(stage_feat[:-1]) prev_chs = stage_feat[-1]['num_chs'] @@ -836,3 +1044,24 @@ def repvgg_b3g4(pretrained=False, **kwargs): `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 """ return _create_byobnet('repvgg_b3g4', pretrained=pretrained, **kwargs) + + +@register_model +def resnet52q(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('geresnet50t', pretrained=pretrained, **kwargs) + + +@register_model +def geresnet50t(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('geresnet50t', pretrained=pretrained, **kwargs) + + +@register_model +def gcresnet50t(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('gcresnet50t', pretrained=pretrained, 
**kwargs) diff --git a/timm/models/layers/__init__.py b/timm/models/layers/__init__.py index cd192281..30a1b40d 100644 --- a/timm/models/layers/__init__.py +++ b/timm/models/layers/__init__.py @@ -14,20 +14,22 @@ from .create_conv2d import create_conv2d from .create_norm_act import get_norm_act_layer, create_norm_act, convert_norm_act from .create_self_attn import get_self_attn, create_self_attn from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path -from .eca import EcaModule, CecaModule +from .eca import EcaModule, CecaModule, EfficientChannelAttn, CircularEfficientChannelAttn from .evo_norm import EvoNormBatch2d, EvoNormSample2d +from .gather_excite import GatherExcite +from .global_context import GlobalContext from .helpers import to_ntuple, to_2tuple, to_3tuple, to_4tuple, make_divisible from .inplace_abn import InplaceAbn from .involution import Involution from .linear import Linear from .mixed_conv2d import MixedConv2d from .mlp import Mlp, GluMlp, GatedMlp -from .norm import GroupNorm +from .norm import GroupNorm, LayerNorm2d from .norm_act import BatchNormAct2d, GroupNormAct from .padding import get_padding, get_same_padding, pad_same from .patch_embed import PatchEmbed from .pool2d_same import AvgPool2dSame, create_pool2d -from .se import SEModule +from .squeeze_excite import SEModule, SqueezeExcite, EffectiveSEModule, EffectiveSqueezeExcite from .selective_kernel import SelectiveKernelConv from .separable_conv import SeparableConv2d, SeparableConvBnAct from .space_to_depth import SpaceToDepthModule diff --git a/timm/models/layers/cbam.py b/timm/models/layers/cbam.py index 44e2fe6d..bacf5cf0 100644 --- a/timm/models/layers/cbam.py +++ b/timm/models/layers/cbam.py @@ -7,78 +7,87 @@ some tasks, especially fine-grained it seems. I may end up removing this impl. Hacked together by / Copyright 2020 Ross Wightman """ - import torch from torch import nn as nn import torch.nn.functional as F + from .conv_bn_act import ConvBnAct +from .create_act import create_act_layer, get_act_layer +from .helpers import make_divisible class ChannelAttn(nn.Module): """ Original CBAM channel attention module, currently avg + max pool variant only. """ - def __init__(self, channels, reduction=16, act_layer=nn.ReLU): + def __init__( + self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, + act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): super(ChannelAttn, self).__init__() - self.fc1 = nn.Conv2d(channels, channels // reduction, 1, bias=False) + if not rd_channels: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
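# e.g. channels=512 with the default rd_ratio=1/16 and rd_divisor=1 gives rd_channels=32,
# matching the previous `channels // reduction` behaviour, while an explicit rd_channels
# (or a different rd_divisor) can now be passed instead.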
+ self.fc1 = nn.Conv2d(channels, rd_channels, 1, bias=mlp_bias) self.act = act_layer(inplace=True) - self.fc2 = nn.Conv2d(channels // reduction, channels, 1, bias=False) + self.fc2 = nn.Conv2d(rd_channels, channels, 1, bias=mlp_bias) + self.gate = create_act_layer(gate_layer) def forward(self, x): - x_avg = x.mean((2, 3), keepdim=True) - x_max = F.adaptive_max_pool2d(x, 1) - x_avg = self.fc2(self.act(self.fc1(x_avg))) - x_max = self.fc2(self.act(self.fc1(x_max))) - x_attn = x_avg + x_max - return x * x_attn.sigmoid() + x_avg = self.fc2(self.act(self.fc1(x.mean((2, 3), keepdim=True)))) + x_max = self.fc2(self.act(self.fc1(x.amax((2, 3), keepdim=True)))) + return x * self.gate(x_avg + x_max) class LightChannelAttn(ChannelAttn): """An experimental 'lightweight' that sums avg + max pool first """ - def __init__(self, channels, reduction=16): - super(LightChannelAttn, self).__init__(channels, reduction) + def __init__( + self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, + act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): + super(LightChannelAttn, self).__init__( + channels, rd_ratio, rd_channels, rd_divisor, act_layer, gate_layer, mlp_bias) def forward(self, x): - x_pool = 0.5 * x.mean((2, 3), keepdim=True) + 0.5 * F.adaptive_max_pool2d(x, 1) + x_pool = 0.5 * x.mean((2, 3), keepdim=True) + 0.5 * x.amax((2, 3), keepdim=True) x_attn = self.fc2(self.act(self.fc1(x_pool))) - return x * x_attn.sigmoid() + return x * F.sigmoid(x_attn) class SpatialAttn(nn.Module): """ Original CBAM spatial attention module """ - def __init__(self, kernel_size=7): + def __init__(self, kernel_size=7, gate_layer='sigmoid'): super(SpatialAttn, self).__init__() self.conv = ConvBnAct(2, 1, kernel_size, act_layer=None) + self.gate = create_act_layer(gate_layer) def forward(self, x): - x_avg = torch.mean(x, dim=1, keepdim=True) - x_max = torch.max(x, dim=1, keepdim=True)[0] - x_attn = torch.cat([x_avg, x_max], dim=1) + x_attn = torch.cat([x.mean(dim=1, keepdim=True), x.amax(dim=1, keepdim=True)], dim=1) x_attn = self.conv(x_attn) - return x * x_attn.sigmoid() + return x * self.gate(x_attn) class LightSpatialAttn(nn.Module): """An experimental 'lightweight' variant that sums avg_pool and max_pool results. 
""" - def __init__(self, kernel_size=7): + def __init__(self, kernel_size=7, gate_layer='sigmoid'): super(LightSpatialAttn, self).__init__() self.conv = ConvBnAct(1, 1, kernel_size, act_layer=None) + self.gate = create_act_layer(gate_layer) def forward(self, x): - x_avg = torch.mean(x, dim=1, keepdim=True) - x_max = torch.max(x, dim=1, keepdim=True)[0] - x_attn = 0.5 * x_avg + 0.5 * x_max + x_attn = 0.5 * x.mean(dim=1, keepdim=True) + 0.5 * x.amax(dim=1, keepdim=True) x_attn = self.conv(x_attn) - return x * x_attn.sigmoid() + return x * self.gate(x_attn) class CbamModule(nn.Module): - def __init__(self, channels, spatial_kernel_size=7): + def __init__( + self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, + spatial_kernel_size=7, act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): super(CbamModule, self).__init__() - self.channel = ChannelAttn(channels) - self.spatial = SpatialAttn(spatial_kernel_size) + self.channel = ChannelAttn( + channels, rd_ratio=rd_ratio, rd_channels=rd_channels, + rd_divisor=rd_divisor, act_layer=act_layer, gate_layer=gate_layer, mlp_bias=mlp_bias) + self.spatial = SpatialAttn(spatial_kernel_size, gate_layer=gate_layer) def forward(self, x): x = self.channel(x) @@ -87,9 +96,13 @@ class CbamModule(nn.Module): class LightCbamModule(nn.Module): - def __init__(self, channels, spatial_kernel_size=7): + def __init__( + self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, + spatial_kernel_size=7, act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): super(LightCbamModule, self).__init__() - self.channel = LightChannelAttn(channels) + self.channel = LightChannelAttn( + channels, rd_ratio=rd_ratio, rd_channels=rd_channels, + rd_divisor=rd_divisor, act_layer=act_layer, gate_layer=gate_layer, mlp_bias=mlp_bias) self.spatial = LightSpatialAttn(spatial_kernel_size) def forward(self, x): diff --git a/timm/models/layers/create_attn.py b/timm/models/layers/create_attn.py index ff20e5df..de866eea 100644 --- a/timm/models/layers/create_attn.py +++ b/timm/models/layers/create_attn.py @@ -3,9 +3,12 @@ Hacked together by / Copyright 2020 Ross Wightman """ import torch -from .se import SEModule, EffectiveSEModule -from .eca import EcaModule, CecaModule + from .cbam import CbamModule, LightCbamModule +from .eca import EcaModule, CecaModule +from .gather_excite import GatherExcite +from .global_context import GlobalContext +from .squeeze_excite import SEModule, EffectiveSEModule def get_attn(attn_type): @@ -23,6 +26,10 @@ def get_attn(attn_type): module_cls = EcaModule elif attn_type == 'ceca': module_cls = CecaModule + elif attn_type == 'ge': + module_cls = GatherExcite + elif attn_type == 'gc': + module_cls = GlobalContext elif attn_type == 'cbam': module_cls = CbamModule elif attn_type == 'lcbam': diff --git a/timm/models/layers/eca.py b/timm/models/layers/eca.py index 3a7f8b82..d0d8f74a 100644 --- a/timm/models/layers/eca.py +++ b/timm/models/layers/eca.py @@ -65,6 +65,9 @@ class EcaModule(nn.Module): return x * y.expand_as(x) +EfficientChannelAttn = EcaModule # alias + + class CecaModule(nn.Module): """Constructs a circular ECA module. 
@@ -105,3 +108,6 @@ class CecaModule(nn.Module): y = self.conv(y) y = y.view(x.shape[0], -1, 1, 1).sigmoid() return x * y.expand_as(x) + + +CircularEfficientChannelAttn = CecaModule \ No newline at end of file diff --git a/timm/models/layers/gather_excite.py b/timm/models/layers/gather_excite.py new file mode 100644 index 00000000..2d60dc96 --- /dev/null +++ b/timm/models/layers/gather_excite.py @@ -0,0 +1,90 @@ +""" Gather-Excite Attention Block + +Paper: `Gather-Excite: Exploiting Feature Context in CNNs` - https://arxiv.org/abs/1810.12348 + +Official code here, but it's only partial impl in Caffe: https://github.com/hujie-frank/GENet + +I've tried to support all of the extent both w/ and w/o params. I don't believe I've seen another +impl that covers all of the cases. + +NOTE: extent=0 + extra_params=False is equivalent to Squeeze-and-Excitation + +Hacked together by / Copyright 2021 Ross Wightman +""" +import math + +from torch import nn as nn +import torch.nn.functional as F + +from .create_act import create_act_layer, get_act_layer +from .create_conv2d import create_conv2d +from .helpers import make_divisible +from .mlp import ConvMlp + + +class GatherExcite(nn.Module): + """ Gather-Excite Attention Module + """ + def __init__( + self, channels, feat_size=None, extra_params=False, extent=0, use_mlp=True, + rd_ratio=1./16, rd_channels=None, rd_divisor=1, add_maxpool=False, + act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, gate_layer='sigmoid'): + super(GatherExcite, self).__init__() + self.add_maxpool = add_maxpool + act_layer = get_act_layer(act_layer) + self.extent = extent + if extra_params: + self.gather = nn.Sequential() + if extent == 0: + assert feat_size is not None, 'spatial feature size must be specified for global extent w/ params' + self.gather.add_module( + 'conv1', create_conv2d(channels, channels, kernel_size=feat_size, stride=1, depthwise=True)) + if norm_layer: + self.gather.add_module(f'norm1', nn.BatchNorm2d(channels)) + else: + assert extent % 2 == 0 + num_conv = int(math.log2(extent)) + for i in range(num_conv): + self.gather.add_module( + f'conv{i + 1}', + create_conv2d(channels, channels, kernel_size=3, stride=2, depthwise=True)) + if norm_layer: + self.gather.add_module(f'norm{i + 1}', nn.BatchNorm2d(channels)) + if i != num_conv - 1: + self.gather.add_module(f'act{i + 1}', act_layer(inplace=True)) + else: + self.gather = None + if self.extent == 0: + self.gk = 0 + self.gs = 0 + else: + assert extent % 2 == 0 + self.gk = self.extent * 2 - 1 + self.gs = self.extent + + if not rd_channels: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
+ self.mlp = ConvMlp(channels, rd_channels, act_layer=act_layer) if use_mlp else nn.Identity() + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + size = x.shape[-2:] + if self.gather is not None: + x_ge = self.gather(x) + else: + if self.extent == 0: + # global extent + x_ge = x.mean(dim=(2, 3), keepdims=True) + if self.add_maxpool: + # experimental codepath, may remove or change + x_ge = 0.5 * x_ge + 0.5 * x.amax((2, 3), keepdim=True) + else: + x_ge = F.avg_pool2d( + x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2, count_include_pad=False) + if self.add_maxpool: + # experimental codepath, may remove or change + x_ge = 0.5 * x_ge + 0.5 * F.max_pool2d(x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2) + x_ge = self.mlp(x_ge) + if x_ge.shape[-1] != 1 or x_ge.shape[-2] != 1: + x_ge = F.interpolate(x_ge, size=size) + return x * self.gate(x_ge) diff --git a/timm/models/layers/global_context.py b/timm/models/layers/global_context.py new file mode 100644 index 00000000..4c2c82f3 --- /dev/null +++ b/timm/models/layers/global_context.py @@ -0,0 +1,67 @@ +""" Global Context Attention Block + +Paper: `GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond` + - https://arxiv.org/abs/1904.11492 + +Official code consulted as reference: https://github.com/xvjiarui/GCNet + +Hacked together by / Copyright 2021 Ross Wightman +""" +from torch import nn as nn +import torch.nn.functional as F + +from .create_act import create_act_layer, get_act_layer +from .helpers import make_divisible +from .mlp import ConvMlp +from .norm import LayerNorm2d + + +class GlobalContext(nn.Module): + + def __init__(self, channels, use_attn=True, fuse_add=True, fuse_scale=False, init_last_zero=False, + rd_ratio=1./8, rd_channels=None, rd_divisor=1, act_layer=nn.ReLU, gate_layer='sigmoid'): + super(GlobalContext, self).__init__() + act_layer = get_act_layer(act_layer) + + self.conv_attn = nn.Conv2d(channels, 1, kernel_size=1, bias=True) if use_attn else None + + if rd_channels is None: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
+ if fuse_add: + self.mlp_add = ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d) + else: + self.mlp_add = None + if fuse_scale: + self.mlp_scale = ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d) + else: + self.mlp_scale = None + + self.gate = create_act_layer(gate_layer) + self.init_last_zero = init_last_zero + self.reset_parameters() + + def reset_parameters(self): + if self.conv_attn is not None: + nn.init.kaiming_normal_(self.conv_attn.weight, mode='fan_in', nonlinearity='relu') + if self.mlp_add is not None: + nn.init.zeros_(self.mlp_add.fc2.weight) + + def forward(self, x): + B, C, H, W = x.shape + + if self.conv_attn is not None: + attn = self.conv_attn(x).reshape(B, 1, H * W) # (B, 1, H * W) + attn = F.softmax(attn, dim=-1).unsqueeze(3) # (B, 1, H * W, 1) + context = x.reshape(B, C, H * W).unsqueeze(1) @ attn + context = context.view(B, C, 1, 1) + else: + context = x.mean(dim=(2, 3), keepdim=True) + + if self.mlp_scale is not None: + mlp_x = self.mlp_scale(context) + x = x * self.gate(mlp_x) + if self.mlp_add is not None: + mlp_x = self.mlp_add(context) + x = x + mlp_x + + return x diff --git a/timm/models/layers/involution.py b/timm/models/layers/involution.py index 0dba9fae..ccdeefcb 100644 --- a/timm/models/layers/involution.py +++ b/timm/models/layers/involution.py @@ -16,7 +16,7 @@ class Involution(nn.Module): kernel_size=3, stride=1, group_size=16, - reduction_ratio=4, + rd_ratio=4, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, ): @@ -28,12 +28,12 @@ class Involution(nn.Module): self.groups = self.channels // self.group_size self.conv1 = ConvBnAct( in_channels=channels, - out_channels=channels // reduction_ratio, + out_channels=channels // rd_ratio, kernel_size=1, norm_layer=norm_layer, act_layer=act_layer) self.conv2 = self.conv = create_conv2d( - in_channels=channels // reduction_ratio, + in_channels=channels // rd_ratio, out_channels=kernel_size**2 * self.groups, kernel_size=1, stride=1) diff --git a/timm/models/layers/mlp.py b/timm/models/layers/mlp.py index b3f8de11..4739ba74 100644 --- a/timm/models/layers/mlp.py +++ b/timm/models/layers/mlp.py @@ -77,3 +77,26 @@ class GatedMlp(nn.Module): x = self.fc2(x) x = self.drop(x) return x + + +class ConvMlp(nn.Module): + """ MLP using 1x1 convs that keeps spatial dims + """ + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU, norm_layer=None, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, bias=True) + self.norm = norm_layer(hidden_features) if norm_layer else nn.Identity() + self.act = act_layer() + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, bias=True) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.norm(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + return x diff --git a/timm/models/layers/norm.py b/timm/models/layers/norm.py index 2925e5c7..433552b4 100644 --- a/timm/models/layers/norm.py +++ b/timm/models/layers/norm.py @@ -12,3 +12,12 @@ class GroupNorm(nn.GroupNorm): def forward(self, x): return F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + + +class LayerNorm2d(nn.LayerNorm): + """ Layernorm for channels of '2d' spatial BCHW tensors """ + def __init__(self, num_channels): + super().__init__([num_channels, 1, 1]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return 
F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) diff --git a/timm/models/layers/se.py b/timm/models/layers/se.py deleted file mode 100644 index 4354144d..00000000 --- a/timm/models/layers/se.py +++ /dev/null @@ -1,50 +0,0 @@ -from torch import nn as nn -import torch.nn.functional as F - -from .create_act import create_act_layer -from .helpers import make_divisible - - -class SEModule(nn.Module): - """ SE Module as defined in original SE-Nets with a few additions - Additions include: - * min_channels can be specified to keep reduced channel count at a minimum (default: 8) - * divisor can be specified to keep channels rounded to specified values (default: 1) - * reduction channels can be specified directly by arg (if reduction_channels is set) - * reduction channels can be specified by float ratio (if reduction_ratio is set) - """ - def __init__(self, channels, reduction=16, act_layer=nn.ReLU, gate_layer='sigmoid', - reduction_ratio=None, reduction_channels=None, min_channels=8, divisor=1): - super(SEModule, self).__init__() - if reduction_channels is not None: - reduction_channels = reduction_channels # direct specification highest priority, no rounding/min done - elif reduction_ratio is not None: - reduction_channels = make_divisible(channels * reduction_ratio, divisor, min_channels) - else: - reduction_channels = make_divisible(channels // reduction, divisor, min_channels) - self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, bias=True) - self.act = act_layer(inplace=True) - self.fc2 = nn.Conv2d(reduction_channels, channels, kernel_size=1, bias=True) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.fc1(x_se) - x_se = self.act(x_se) - x_se = self.fc2(x_se) - return x * self.gate(x_se) - - -class EffectiveSEModule(nn.Module): - """ 'Effective Squeeze-Excitation - From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 - """ - def __init__(self, channels, gate_layer='hard_sigmoid'): - super(EffectiveSEModule, self).__init__() - self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.fc(x_se) - return x * self.gate(x_se) diff --git a/timm/models/layers/squeeze_excite.py b/timm/models/layers/squeeze_excite.py new file mode 100644 index 00000000..3e8a05bb --- /dev/null +++ b/timm/models/layers/squeeze_excite.py @@ -0,0 +1,74 @@ +""" Squeeze-and-Excitation Channel Attention + +An SE implementation originally based on PyTorch SE-Net impl. +Has since evolved with additional functionality / configuration. + +Paper: `Squeeze-and-Excitation Networks` - https://arxiv.org/abs/1709.01507 + +Also included is Effective Squeeze-Excitation (ESE). 
+Paper: `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + +Hacked together by / Copyright 2021 Ross Wightman +""" +from torch import nn as nn + +from .create_act import create_act_layer +from .helpers import make_divisible + + +class SEModule(nn.Module): + """ SE Module as defined in original SE-Nets with a few additions + Additions include: + * divisor can be specified to keep channels % div == 0 (default: 8) + * reduction channels can be specified directly by arg (if rd_channels is set) + * reduction channels can be specified by float rd_ratio (default: 1/16) + * global max pooling can be added to the squeeze aggregation + * customizable activation, normalization, and gate layer + """ + def __init__( + self, channels, rd_ratio=1. / 16, rd_channels=None, rd_divisor=8, add_maxpool=False, + act_layer=nn.ReLU, norm_layer=None, gate_layer='sigmoid'): + super(SEModule, self).__init__() + self.add_maxpool = add_maxpool + if not rd_channels: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) + self.fc1 = nn.Conv2d(channels, rd_channels, kernel_size=1, bias=True) + self.bn = norm_layer(rd_channels) if norm_layer else nn.Identity() + self.act = create_act_layer(act_layer, inplace=True) + self.fc2 = nn.Conv2d(rd_channels, channels, kernel_size=1, bias=True) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + if self.add_maxpool: + # experimental codepath, may remove or change + x_se = 0.5 * x_se + 0.5 * x.amax((2, 3), keepdim=True) + x_se = self.fc1(x_se) + x_se = self.act(self.bn(x_se)) + x_se = self.fc2(x_se) + return x * self.gate(x_se) + + +SqueezeExcite = SEModule # alias + + +class EffectiveSEModule(nn.Module): + """ 'Effective Squeeze-Excitation + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + """ + def __init__(self, channels, add_maxpool=False, gate_layer='hard_sigmoid'): + super(EffectiveSEModule, self).__init__() + self.add_maxpool = add_maxpool + self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + if self.add_maxpool: + # experimental codepath, may remove or change + x_se = 0.5 * x_se + 0.5 * x.amax((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.gate(x_se) + + +EffectiveSqueezeExcite = EffectiveSEModule # alias diff --git a/timm/models/nfnet.py b/timm/models/nfnet.py index 1b67581e..593796a5 100644 --- a/timm/models/nfnet.py +++ b/timm/models/nfnet.py @@ -182,7 +182,7 @@ def _nfres_cfg( def _nfreg_cfg(depths, channels=(48, 104, 208, 440)): num_features = 1280 * channels[-1] // 440 - attn_kwargs = dict(reduction_ratio=0.5, divisor=8) + attn_kwargs = dict(rd_ratio=0.5) cfg = NfCfg( depths=depths, channels=channels, stem_type='3x3', group_size=8, width_factor=0.75, bottle_ratio=2.25, num_features=num_features, reg=True, attn_layer='se', attn_kwargs=attn_kwargs) @@ -193,7 +193,7 @@ def _nfnet_cfg( depths, channels=(256, 512, 1536, 1536), group_size=128, bottle_ratio=0.5, feat_mult=2., act_layer='gelu', attn_layer='se', attn_kwargs=None): num_features = int(channels[-1] * feat_mult) - attn_kwargs = attn_kwargs if attn_kwargs is not None else dict(reduction_ratio=0.5, divisor=8) + attn_kwargs = attn_kwargs if attn_kwargs is not None else dict(rd_ratio=0.5) cfg = NfCfg( depths=depths, channels=channels, stem_type='deep_quad', stem_chs=128, group_size=group_size, 
bottle_ratio=bottle_ratio, extra_conv=True, num_features=num_features, act_layer=act_layer, @@ -202,11 +202,10 @@ def _nfnet_cfg( def _dm_nfnet_cfg(depths, channels=(256, 512, 1536, 1536), act_layer='gelu', skipinit=True): - attn_kwargs = dict(reduction_ratio=0.5, divisor=8) cfg = NfCfg( depths=depths, channels=channels, stem_type='deep_quad', stem_chs=128, group_size=128, bottle_ratio=0.5, extra_conv=True, gamma_in_act=True, same_padding=True, skipinit=skipinit, - num_features=int(channels[-1] * 2.0), act_layer=act_layer, attn_layer='se', attn_kwargs=attn_kwargs) + num_features=int(channels[-1] * 2.0), act_layer=act_layer, attn_layer='se', attn_kwargs=dict(rd_ratio=0.5)) return cfg @@ -243,7 +242,7 @@ model_cfgs = dict( # Experimental 'light' versions of NFNet-F that are little leaner nfnet_l0=_nfnet_cfg( depths=(1, 2, 6, 3), feat_mult=1.5, group_size=64, bottle_ratio=0.25, - attn_kwargs=dict(reduction_ratio=0.25, divisor=8), act_layer='silu'), + attn_kwargs=dict(rd_ratio=0.25, rd_divisor=8), act_layer='silu'), eca_nfnet_l0=_nfnet_cfg( depths=(1, 2, 6, 3), feat_mult=1.5, group_size=64, bottle_ratio=0.25, attn_layer='eca', attn_kwargs=dict(), act_layer='silu'), @@ -272,9 +271,9 @@ model_cfgs = dict( nf_resnet50=_nfres_cfg(depths=(3, 4, 6, 3)), nf_resnet101=_nfres_cfg(depths=(3, 4, 23, 3)), - nf_seresnet26=_nfres_cfg(depths=(2, 2, 2, 2), attn_layer='se', attn_kwargs=dict(reduction_ratio=1/16)), - nf_seresnet50=_nfres_cfg(depths=(3, 4, 6, 3), attn_layer='se', attn_kwargs=dict(reduction_ratio=1/16)), - nf_seresnet101=_nfres_cfg(depths=(3, 4, 23, 3), attn_layer='se', attn_kwargs=dict(reduction_ratio=1/16)), + nf_seresnet26=_nfres_cfg(depths=(2, 2, 2, 2), attn_layer='se', attn_kwargs=dict(rd_ratio=1/16)), + nf_seresnet50=_nfres_cfg(depths=(3, 4, 6, 3), attn_layer='se', attn_kwargs=dict(rd_ratio=1/16)), + nf_seresnet101=_nfres_cfg(depths=(3, 4, 23, 3), attn_layer='se', attn_kwargs=dict(rd_ratio=1/16)), nf_ecaresnet26=_nfres_cfg(depths=(2, 2, 2, 2), attn_layer='eca', attn_kwargs=dict()), nf_ecaresnet50=_nfres_cfg(depths=(3, 4, 6, 3), attn_layer='eca', attn_kwargs=dict()), diff --git a/timm/models/regnet.py b/timm/models/regnet.py index 3b7dba52..6a381074 100644 --- a/timm/models/regnet.py +++ b/timm/models/regnet.py @@ -146,7 +146,7 @@ class Bottleneck(nn.Module): groups=groups, **cargs) if se_ratio: se_channels = int(round(in_chs * se_ratio)) - self.se = SEModule(bottleneck_chs, reduction_channels=se_channels) + self.se = SEModule(bottleneck_chs, rd_channels=se_channels) else: self.se = None cargs['act_layer'] = None diff --git a/timm/models/resnet.py b/timm/models/resnet.py index 2b0b0339..2f02f12a 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -1122,7 +1122,7 @@ def resnetrs50(pretrained=False, **kwargs): Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs """ - attn_layer = partial(get_attn('se'), reduction_ratio=0.25) + attn_layer = partial(get_attn('se'), rd_ratio=0.25) model_args = dict( block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', replace_stem_pool=True, avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) @@ -1135,7 +1135,7 @@ def resnetrs101(pretrained=False, **kwargs): Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs """ - attn_layer = partial(get_attn('se'), reduction_ratio=0.25) + attn_layer = 
partial(get_attn('se'), rd_ratio=0.25) model_args = dict( block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', replace_stem_pool=True, avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) @@ -1148,7 +1148,7 @@ def resnetrs152(pretrained=False, **kwargs): Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs """ - attn_layer = partial(get_attn('se'), reduction_ratio=0.25) + attn_layer = partial(get_attn('se'), rd_ratio=0.25) model_args = dict( block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', replace_stem_pool=True, avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) @@ -1161,7 +1161,7 @@ def resnetrs200(pretrained=False, **kwargs): Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs """ - attn_layer = partial(get_attn('se'), reduction_ratio=0.25) + attn_layer = partial(get_attn('se'), rd_ratio=0.25) model_args = dict( block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', replace_stem_pool=True, avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) @@ -1174,7 +1174,7 @@ def resnetrs270(pretrained=False, **kwargs): Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs """ - attn_layer = partial(get_attn('se'), reduction_ratio=0.25) + attn_layer = partial(get_attn('se'), rd_ratio=0.25) model_args = dict( block=Bottleneck, layers=[4, 29, 53, 4], stem_width=32, stem_type='deep', replace_stem_pool=True, avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) @@ -1188,7 +1188,7 @@ def resnetrs350(pretrained=False, **kwargs): Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs """ - attn_layer = partial(get_attn('se'), reduction_ratio=0.25) + attn_layer = partial(get_attn('se'), rd_ratio=0.25) model_args = dict( block=Bottleneck, layers=[4, 36, 72, 4], stem_width=32, stem_type='deep', replace_stem_pool=True, avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) @@ -1201,7 +1201,7 @@ def resnetrs420(pretrained=False, **kwargs): Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs """ - attn_layer = partial(get_attn('se'), reduction_ratio=0.25) + attn_layer = partial(get_attn('se'), rd_ratio=0.25) model_args = dict( block=Bottleneck, layers=[4, 44, 87, 4], stem_width=32, stem_type='deep', replace_stem_pool=True, avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) diff --git a/timm/models/rexnet.py b/timm/models/rexnet.py index 859b584e..7ab8d659 100644 --- a/timm/models/rexnet.py +++ b/timm/models/rexnet.py @@ -11,11 +11,12 @@ Copyright 2020 Ross Wightman """ import torch.nn as nn +from functools import partial from math import ceil from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .helpers import build_model_with_cfg -from .layers import ClassifierHead, create_act_layer, ConvBnAct, DropPath, make_divisible +from .layers import ClassifierHead, create_act_layer, ConvBnAct, DropPath, make_divisible, SEModule from .registry import register_model from .efficientnet_builder import 
efficientnet_init_weights @@ -48,26 +49,7 @@ default_cfgs = dict( url=''), ) - -class SEWithNorm(nn.Module): - - def __init__(self, channels, se_ratio=1 / 12., act_layer=nn.ReLU, divisor=1, reduction_channels=None, - gate_layer='sigmoid'): - super(SEWithNorm, self).__init__() - reduction_channels = reduction_channels or make_divisible(int(channels * se_ratio), divisor=divisor) - self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, bias=True) - self.bn = nn.BatchNorm2d(reduction_channels) - self.act = act_layer(inplace=True) - self.fc2 = nn.Conv2d(reduction_channels, channels, kernel_size=1, bias=True) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.fc1(x_se) - x_se = self.bn(x_se) - x_se = self.act(x_se) - x_se = self.fc2(x_se) - return x * self.gate(x_se) +SEWithNorm = partial(SEModule, norm_layer=nn.BatchNorm2d) class LinearBottleneck(nn.Module): @@ -86,7 +68,10 @@ class LinearBottleneck(nn.Module): self.conv_exp = None self.conv_dw = ConvBnAct(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs, apply_act=False) - self.se = SEWithNorm(dw_chs, se_ratio=se_ratio, divisor=ch_div) if se_ratio > 0. else None + if se_ratio > 0: + self.se = SEWithNorm(dw_chs, rd_channels=make_divisible(int(dw_chs * se_ratio), ch_div)) + else: + self.se = None self.act_dw = create_act_layer(dw_act_layer) self.conv_pwl = ConvBnAct(dw_chs, out_chs, 1, apply_act=False) diff --git a/timm/models/tresnet.py b/timm/models/tresnet.py index 9fb34c20..372bfb7b 100644 --- a/timm/models/tresnet.py +++ b/timm/models/tresnet.py @@ -84,8 +84,8 @@ class BasicBlock(nn.Module): self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride - reduction_chs = max(planes * self.expansion // 4, 64) - self.se = SEModule(planes * self.expansion, reduction_channels=reduction_chs) if use_se else None + rd_chs = max(planes * self.expansion // 4, 64) + self.se = SEModule(planes * self.expansion, rd_channels=rd_chs) if use_se else None def forward(self, x): if self.downsample is not None: @@ -125,7 +125,7 @@ class Bottleneck(nn.Module): aa_layer(channels=planes, filt_size=3, stride=2)) reduction_chs = max(planes * self.expansion // 8, 64) - self.se = SEModule(planes, reduction_channels=reduction_chs) if use_se else None + self.se = SEModule(planes, rd_channels=reduction_chs) if use_se else None self.conv3 = conv2d_iabn( planes, planes * self.expansion, kernel_size=1, stride=1, act_layer="identity") diff --git a/timm/models/visformer.py b/timm/models/visformer.py index 33a2fe87..5583ea3c 100644 --- a/timm/models/visformer.py +++ b/timm/models/visformer.py @@ -13,7 +13,7 @@ import torch.nn.functional as F from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .helpers import build_model_with_cfg, overlay_external_default_cfg -from .layers import to_2tuple, trunc_normal_, DropPath, PatchEmbed +from .layers import to_2tuple, trunc_normal_, DropPath, PatchEmbed, LayerNorm2d from .registry import register_model @@ -39,15 +39,6 @@ default_cfgs = dict( ) -class LayerNormBHWC(nn.LayerNorm): - def __init__(self, dim): - super().__init__(dim) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return F.layer_norm( - x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, self.bias, self.eps).permute(0, 3, 1, 2) - - class SpatialMlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., group=8, spatial_conv=False): @@ -119,7 +110,7 @@ class Attention(nn.Module): class 
Block(nn.Module): def __init__(self, dim, num_heads, head_dim_ratio=1., mlp_ratio=4., - drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=LayerNormBHWC, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=LayerNorm2d, group=8, attn_disabled=False, spatial_conv=False): super().__init__() self.spatial_conv = spatial_conv @@ -148,7 +139,7 @@ class Block(nn.Module): class Visformer(nn.Module): def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., drop_rate=0., attn_drop_rate=0., drop_path_rate=0., - norm_layer=LayerNormBHWC, attn_stage='111', pos_embed=True, spatial_conv='111', + norm_layer=LayerNorm2d, attn_stage='111', pos_embed=True, spatial_conv='111', vit_stem=False, group=8, pool=True, conv_init=False, embed_norm=None): super().__init__() self.num_classes = num_classes From f615474be317b1e015c082b7dabd391f461c10b7 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Thu, 27 May 2021 18:12:22 -0700 Subject: [PATCH 02/12] Fix broken test, repvgg block doesn't have attn_last attr. --- timm/models/byobnet.py | 5 ++--- timm/models/layers/eca.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py index 3f162c79..aab44365 100644 --- a/timm/models/byobnet.py +++ b/timm/models/byobnet.py @@ -573,9 +573,8 @@ class RepVggBlock(nn.Module): if isinstance(m, nn.BatchNorm2d): nn.init.normal_(m.weight, .1, .1) nn.init.normal_(m.bias, 0, .1) - for attn in (self.attn, self.attn_last): - if hasattr(attn, 'reset_parameters'): - attn.reset_parameters() + if hasattr(self.attn, 'reset_parameters'): + self.attn.reset_parameters() def forward(self, x): if self.identity is None: diff --git a/timm/models/layers/eca.py b/timm/models/layers/eca.py index d0d8f74a..f2980730 100644 --- a/timm/models/layers/eca.py +++ b/timm/models/layers/eca.py @@ -110,4 +110,4 @@ class CecaModule(nn.Module): return x * y.expand_as(x) -CircularEfficientChannelAttn = CecaModule \ No newline at end of file +CircularEfficientChannelAttn = CecaModule From 02f9d4bc34d8fda03903fe2d8e6f3599e3f1fd38 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Fri, 28 May 2021 09:53:16 -0700 Subject: [PATCH 03/12] Add weights for resnet51q model, add 61q def. 
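
For context, the variants this commit registers can be created through the usual timm factory. A minimal usage sketch (illustrative only, not part of the patch; it assumes timm is installed at this revision and that the resnet51q weight URL referenced in the config below is live):

    import timm
    import torch

    # resnet51q ships pretrained weights with this commit; resnet61q is a definition only for now
    model = timm.create_model('resnet51q', pretrained=True).eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 256, 256))  # default input_size from the config below
    print(out.shape)  # torch.Size([1, 1000]) with the default classifier
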
--- timm/models/byobnet.py | 273 +++++++++++++++++++++++------------------ 1 file changed, 156 insertions(+), 117 deletions(-) diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py index aab44365..8214b490 100644 --- a/timm/models/byobnet.py +++ b/timm/models/byobnet.py @@ -88,9 +88,16 @@ default_cfgs = { first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), # experimental configs - 'resnet52qs': _cfg(first_conv='stem.conv1.conv'), - 'geresnet50t': _cfg(first_conv='stem.conv1.conv'), - 'gcresnet50t': _cfg(first_conv='stem.conv1.conv'), + 'resnet51q': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet51q_ra2-d47dcc76.pth', + first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), + test_input_size=(3, 288, 288), crop_pct=1.0), + 'resnet61q': _cfg( + first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'), + 'geresnet50t': _cfg( + first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'), + 'gcresnet50t': _cfg( + first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'), } @@ -241,7 +248,7 @@ model_cfgs = dict( ), # WARN: experimental, may vanish/change - resnet52q=ByoModelCfg( + resnet51q=ByoModelCfg( blocks=( ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25), @@ -249,11 +256,27 @@ model_cfgs = dict( ByoBlockCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0), ), stem_chs=128, - stem_type='quad', + stem_type='quad2', + stem_pool=None, num_features=2048, act_layer='silu', ), + resnet61q=ByoModelCfg( + blocks=( + ByoBlockCfg(type='edge', d=1, c=256, s=1, gs=0, br=1.0, block_kwargs=dict()), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=6, c=1536, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0), + ), + stem_chs=128, + stem_type='quad', + stem_pool=None, + num_features=2048, + act_layer='silu', + block_kwargs=dict(extra_conv=True), + ), + # WARN: experimental, may vanish/change geresnet50t=ByoModelCfg( blocks=( @@ -287,6 +310,122 @@ model_cfgs = dict( ) +@register_model +def gernet_l(pretrained=False, **kwargs): + """ GEResNet-Large (GENet-Large from official impl) + `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 + """ + return _create_byobnet('gernet_l', pretrained=pretrained, **kwargs) + + +@register_model +def gernet_m(pretrained=False, **kwargs): + """ GEResNet-Medium (GENet-Normal from official impl) + `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 + """ + return _create_byobnet('gernet_m', pretrained=pretrained, **kwargs) + + +@register_model +def gernet_s(pretrained=False, **kwargs): + """ EResNet-Small (GENet-Small from official impl) + `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 + """ + return _create_byobnet('gernet_s', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_a2(pretrained=False, **kwargs): + """ RepVGG-A2 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_a2', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b0(pretrained=False, **kwargs): + """ RepVGG-B0 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b0', pretrained=pretrained, 
**kwargs) + + +@register_model +def repvgg_b1(pretrained=False, **kwargs): + """ RepVGG-B1 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b1', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b1g4(pretrained=False, **kwargs): + """ RepVGG-B1g4 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b1g4', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b2(pretrained=False, **kwargs): + """ RepVGG-B2 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b2', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b2g4(pretrained=False, **kwargs): + """ RepVGG-B2g4 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b2g4', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b3(pretrained=False, **kwargs): + """ RepVGG-B3 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b3', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b3g4(pretrained=False, **kwargs): + """ RepVGG-B3g4 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b3g4', pretrained=pretrained, **kwargs) + + +@register_model +def resnet51q(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('resnet51q', pretrained=pretrained, **kwargs) + + +@register_model +def resnet61q(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('resnet61q', pretrained=pretrained, **kwargs) + + +@register_model +def geresnet50t(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('geresnet50t', pretrained=pretrained, **kwargs) + + +@register_model +def gcresnet50t(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('gcresnet50t', pretrained=pretrained, **kwargs) + + def expand_blocks_cfg(stage_blocks_cfg: Union[ByoBlockCfg, Sequence[ByoBlockCfg]]) -> List[ByoBlockCfg]: if not isinstance(stage_blocks_cfg, Sequence): stage_blocks_cfg = (stage_blocks_cfg,) @@ -391,8 +530,8 @@ class BottleneckBlock(nn.Module): """ def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None, - downsample='avg', attn_last=False, linear_out=False, layers: LayerFn = None, drop_block=None, - drop_path_rate=0.): + downsample='avg', attn_last=False, linear_out=False, extra_conv=False, layers: LayerFn = None, + drop_block=None, drop_path_rate=0.): super(BottleneckBlock, self).__init__() layers = layers or LayerFn() mid_chs = make_divisible(out_chs * bottle_ratio) @@ -409,6 +548,14 @@ class BottleneckBlock(nn.Module): self.conv2_kxk = layers.conv_norm_act( mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_block=drop_block) + self.conv2_kxk = layers.conv_norm_act( + mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], + groups=groups, drop_block=drop_block) + if extra_conv: + self.conv2b_kxk = layers.conv_norm_act( + mid_chs, mid_chs, kernel_size, dilation=dilation[1], groups=groups, drop_block=drop_block) + else: + self.conv2b_kxk = nn.Identity() self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) self.attn_last = nn.Identity() if not attn_last or layers.attn is None 
else layers.attn(out_chs) @@ -427,6 +574,7 @@ class BottleneckBlock(nn.Module): x = self.conv1_1x1(x) x = self.conv2_kxk(x) + x = self.conv2b_kxk(x) x = self.attn(x) x = self.conv3_1x1(x) x = self.attn_last(x) @@ -714,7 +862,7 @@ class Stem(nn.Sequential): def create_byob_stem(in_chs, out_chs, stem_type='', pool_type='', feat_prefix='stem', layers: LayerFn = None): layers = layers or LayerFn() - assert stem_type in ('', 'quad', 'tiered', 'deep', 'rep', '7x7', '3x3') + assert stem_type in ('', 'quad', 'quad2', 'tiered', 'deep', 'rep', '7x7', '3x3') if 'quad' in stem_type: # based on NFNet stem, stack of 4 3x3 convs num_act = 2 if 'quad2' in stem_type else None @@ -955,112 +1103,3 @@ def _create_byobnet(variant, pretrained=False, **kwargs): model_cfg=model_cfgs[variant], feature_cfg=dict(flatten_sequential=True), **kwargs) - - -@register_model -def gernet_l(pretrained=False, **kwargs): - """ GEResNet-Large (GENet-Large from official impl) - `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 - """ - return _create_byobnet('gernet_l', pretrained=pretrained, **kwargs) - - -@register_model -def gernet_m(pretrained=False, **kwargs): - """ GEResNet-Medium (GENet-Normal from official impl) - `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 - """ - return _create_byobnet('gernet_m', pretrained=pretrained, **kwargs) - - -@register_model -def gernet_s(pretrained=False, **kwargs): - """ EResNet-Small (GENet-Small from official impl) - `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 - """ - return _create_byobnet('gernet_s', pretrained=pretrained, **kwargs) - - -@register_model -def repvgg_a2(pretrained=False, **kwargs): - """ RepVGG-A2 - `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 - """ - return _create_byobnet('repvgg_a2', pretrained=pretrained, **kwargs) - - -@register_model -def repvgg_b0(pretrained=False, **kwargs): - """ RepVGG-B0 - `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 - """ - return _create_byobnet('repvgg_b0', pretrained=pretrained, **kwargs) - - -@register_model -def repvgg_b1(pretrained=False, **kwargs): - """ RepVGG-B1 - `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 - """ - return _create_byobnet('repvgg_b1', pretrained=pretrained, **kwargs) - - -@register_model -def repvgg_b1g4(pretrained=False, **kwargs): - """ RepVGG-B1g4 - `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 - """ - return _create_byobnet('repvgg_b1g4', pretrained=pretrained, **kwargs) - - -@register_model -def repvgg_b2(pretrained=False, **kwargs): - """ RepVGG-B2 - `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 - """ - return _create_byobnet('repvgg_b2', pretrained=pretrained, **kwargs) - - -@register_model -def repvgg_b2g4(pretrained=False, **kwargs): - """ RepVGG-B2g4 - `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 - """ - return _create_byobnet('repvgg_b2g4', pretrained=pretrained, **kwargs) - - -@register_model -def repvgg_b3(pretrained=False, **kwargs): - """ RepVGG-B3 - `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 - """ - return _create_byobnet('repvgg_b3', pretrained=pretrained, **kwargs) - - -@register_model -def repvgg_b3g4(pretrained=False, **kwargs): - """ RepVGG-B3g4 - `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 - """ - return 
_create_byobnet('repvgg_b3g4', pretrained=pretrained, **kwargs) - - -@register_model -def resnet52q(pretrained=False, **kwargs): - """ - """ - return _create_byobnet('geresnet50t', pretrained=pretrained, **kwargs) - - -@register_model -def geresnet50t(pretrained=False, **kwargs): - """ - """ - return _create_byobnet('geresnet50t', pretrained=pretrained, **kwargs) - - -@register_model -def gcresnet50t(pretrained=False, **kwargs): - """ - """ - return _create_byobnet('gcresnet50t', pretrained=pretrained, **kwargs) From 9611458e199793a5a46c3fb5ce7031195e16bfbe Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Fri, 28 May 2021 20:47:24 -0700 Subject: [PATCH 04/12] Throw in some FBNetV3 code I had lying around, some refactoring of SE reduction channel calcs for all EffNet archs. --- timm/models/byobnet.py | 2 +- timm/models/efficientnet_blocks.py | 16 ++-- timm/models/efficientnet_builder.py | 21 ++--- timm/models/ghostnet.py | 2 +- timm/models/hardcorenas.py | 4 +- timm/models/layers/helpers.py | 2 +- timm/models/mobilenetv3.py | 119 ++++++++++++++++++++++++++-- 7 files changed, 135 insertions(+), 31 deletions(-) diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py index 8214b490..8ec8690a 100644 --- a/timm/models/byobnet.py +++ b/timm/models/byobnet.py @@ -90,7 +90,7 @@ default_cfgs = { # experimental configs 'resnet51q': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet51q_ra2-d47dcc76.pth', - first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), + first_conv='stem.conv1', input_size=(3, 256, 256), pool_size=(8, 8), test_input_size=(3, 288, 288), crop_pct=1.0), 'resnet61q': _cfg( first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'), diff --git a/timm/models/efficientnet_blocks.py b/timm/models/efficientnet_blocks.py index 7853db0e..ea0c791e 100644 --- a/timm/models/efficientnet_blocks.py +++ b/timm/models/efficientnet_blocks.py @@ -22,18 +22,16 @@ class SqueezeExcite(nn.Module): se_ratio (float): ratio of squeeze reduction act_layer (nn.Module): activation layer of containing block gate_fn (Callable): attention gate function - block_in_chs (int): input channels of containing block (for calculating reduction from) - reduce_from_block (bool): calculate reduction from block input channels if True force_act_layer (nn.Module): override block's activation fn if this is set/bound - divisor (int): make reduction channels divisible by this + round_chs_fn (Callable): specify a fn to calculate rounding of reduced chs """ def __init__( self, in_chs, se_ratio=0.25, act_layer=nn.ReLU, gate_fn=sigmoid, - block_in_chs=None, reduce_from_block=True, force_act_layer=None, divisor=1): + force_act_layer=None, round_chs_fn=None): super(SqueezeExcite, self).__init__() - reduced_chs = (block_in_chs or in_chs) if reduce_from_block else in_chs - reduced_chs = make_divisible(reduced_chs * se_ratio, divisor) + round_chs_fn = round_chs_fn or round + reduced_chs = round_chs_fn(in_chs * se_ratio) act_layer = force_act_layer or act_layer self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) self.act1 = create_act_layer(act_layer, inplace=True) @@ -168,8 +166,7 @@ class InvertedResidual(nn.Module): self.act2 = act_layer(inplace=True) # Squeeze-and-excitation - self.se = se_layer( - mid_chs, se_ratio=se_ratio, act_layer=act_layer, block_in_chs=in_chs) if has_se else nn.Identity() + self.se = se_layer(mid_chs, se_ratio=se_ratio, act_layer=act_layer) if has_se else nn.Identity() # Point-wise 
linear projection self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs) @@ -292,8 +289,7 @@ class EdgeResidual(nn.Module): self.act1 = act_layer(inplace=True) # Squeeze-and-excitation - self.se = SqueezeExcite( - mid_chs, se_ratio=se_ratio, act_layer=act_layer, block_in_chs=in_chs) if has_se else nn.Identity() + self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, act_layer=act_layer) if has_se else nn.Identity() # Point-wise linear projection self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type) diff --git a/timm/models/efficientnet_builder.py b/timm/models/efficientnet_builder.py index 57e2039b..35019747 100644 --- a/timm/models/efficientnet_builder.py +++ b/timm/models/efficientnet_builder.py @@ -265,11 +265,12 @@ class EfficientNetBuilder: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py """ - def __init__(self, output_stride=32, pad_type='', round_chs_fn=round_channels, + def __init__(self, output_stride=32, pad_type='', round_chs_fn=round_channels, se_from_exp=False, act_layer=None, norm_layer=None, se_layer=None, drop_path_rate=0., feature_location=''): self.output_stride = output_stride self.pad_type = pad_type self.round_chs_fn = round_chs_fn + self.se_from_exp = se_from_exp # calculate se channel reduction from expanded (mid) chs self.act_layer = act_layer self.norm_layer = norm_layer self.se_layer = se_layer @@ -301,6 +302,8 @@ class EfficientNetBuilder: ba['norm_layer'] = self.norm_layer if bt != 'cn': ba['se_layer'] = self.se_layer + if not self.se_from_exp and ba['se_ratio']: + ba['se_ratio'] /= ba.get('exp_ratio', 1.0) ba['drop_path_rate'] = drop_path_rate if bt == 'ir': @@ -418,28 +421,28 @@ def _init_weight_goog(m, n='', fix_group_fanout=True): if fix_group_fanout: fan_out //= m.groups init_weight_fn = get_condconv_initializer( - lambda w: w.data.normal_(0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) + lambda w: nn.init.normal_(w, 0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) init_weight_fn(m.weight) if m.bias is not None: - m.bias.data.zero_() + nn.init.zeros_(m.bias) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels if fix_group_fanout: fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + nn.init.normal_(m.weight, 0, math.sqrt(2.0 / fan_out)) if m.bias is not None: - m.bias.data.zero_() + nn.init.zeros_(m.bias) elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1.0) - m.bias.data.zero_() + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) elif isinstance(m, nn.Linear): fan_out = m.weight.size(0) # fan-out fan_in = 0 if 'routing_fn' in n: fan_in = m.weight.size(1) init_range = 1.0 / math.sqrt(fan_in + fan_out) - m.weight.data.uniform_(-init_range, init_range) - m.bias.data.zero_() + nn.init.uniform_(m.weight, -init_range, init_range) + nn.init.zeros_(m.bias) def efficientnet_init_weights(model: nn.Module, init_fn=None): diff --git a/timm/models/ghostnet.py b/timm/models/ghostnet.py index 1783ff7a..d82a91b4 100644 --- a/timm/models/ghostnet.py +++ b/timm/models/ghostnet.py @@ -40,7 +40,7 @@ default_cfgs = { } -_SE_LAYER = partial(SqueezeExcite, gate_fn='hard_sigmoid', divisor=4) +_SE_LAYER = partial(SqueezeExcite, gate_fn='hard_sigmoid', round_chs_fn=partial(make_divisible, divisor=4)) class GhostModule(nn.Module): diff --git a/timm/models/hardcorenas.py b/timm/models/hardcorenas.py index 231bb4b6..16b9c4bc 100644 --- 
a/timm/models/hardcorenas.py +++ b/timm/models/hardcorenas.py @@ -4,7 +4,7 @@ import torch.nn as nn from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .efficientnet_blocks import SqueezeExcite -from .efficientnet_builder import decode_arch_def, resolve_act_layer, resolve_bn_args +from .efficientnet_builder import decode_arch_def, resolve_act_layer, resolve_bn_args, round_channels from .helpers import build_model_with_cfg, default_cfg_for_features from .layers import get_act_fn from .mobilenetv3 import MobileNetV3, MobileNetV3Features @@ -40,7 +40,7 @@ def _gen_hardcorenas(pretrained, variant, arch_def, **kwargs): """ num_features = 1280 se_layer = partial( - SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), force_act_layer=nn.ReLU, reduce_from_block=False, divisor=8) + SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), force_act_layer=nn.ReLU, round_chs_fn=round_channels) model_kwargs = dict( block_args=decode_arch_def(arch_def), num_features=num_features, diff --git a/timm/models/layers/helpers.py b/timm/models/layers/helpers.py index 64573ef6..cc54ca7f 100644 --- a/timm/models/layers/helpers.py +++ b/timm/models/layers/helpers.py @@ -28,4 +28,4 @@ def make_divisible(v, divisor=8, min_value=None, round_limit=.9): # Make sure that round down does not go down by more than 10%. if new_v < round_limit * v: new_v += divisor - return new_v \ No newline at end of file + return new_v diff --git a/timm/models/mobilenetv3.py b/timm/models/mobilenetv3.py index 9afa3d75..fad88aa7 100644 --- a/timm/models/mobilenetv3.py +++ b/timm/models/mobilenetv3.py @@ -72,6 +72,10 @@ default_cfgs = { 'tf_mobilenetv3_small_minimal_100': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth', mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + + 'fbnetv3_b': _cfg(), + 'fbnetv3_d': _cfg(), + 'fbnetv3_g': _cfg(), } @@ -86,7 +90,7 @@ class MobileNetV3(nn.Module): """ def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True, - pad_type='', act_layer=None, norm_layer=None, se_layer=None, + pad_type='', act_layer=None, norm_layer=None, se_layer=None, se_from_exp=True, round_chs_fn=round_channels, drop_rate=0., drop_path_rate=0., global_pool='avg'): super(MobileNetV3, self).__init__() act_layer = act_layer or nn.ReLU @@ -104,7 +108,7 @@ class MobileNetV3(nn.Module): # Middle stages (IR/ER/DS Blocks) builder = EfficientNetBuilder( - output_stride=32, pad_type=pad_type, round_chs_fn=round_chs_fn, + output_stride=32, pad_type=pad_type, round_chs_fn=round_chs_fn, se_from_exp=se_from_exp, act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate) self.blocks = nn.Sequential(*builder(stem_size, block_args)) self.feature_info = builder.features @@ -161,8 +165,8 @@ class MobileNetV3Features(nn.Module): and object detection models. 
""" - def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', - in_chans=3, stem_size=16, output_stride=32, pad_type='', round_chs_fn=round_channels, + def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', in_chans=3, + stem_size=16, output_stride=32, pad_type='', round_chs_fn=round_channels, se_from_exp=True, act_layer=None, norm_layer=None, se_layer=None, drop_rate=0., drop_path_rate=0.): super(MobileNetV3Features, self).__init__() act_layer = act_layer or nn.ReLU @@ -178,7 +182,7 @@ class MobileNetV3Features(nn.Module): # Middle stages (IR/ER/DS Blocks) builder = EfficientNetBuilder( - output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn, + output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn, se_from_exp=se_from_exp, act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate, feature_location=feature_location) self.blocks = nn.Sequential(*builder(stem_size, block_args)) @@ -262,7 +266,7 @@ def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kw round_chs_fn=partial(round_channels, multiplier=channel_multiplier), norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), act_layer=resolve_act_layer(kwargs, 'hard_swish'), - se_layer=partial(SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), reduce_from_block=False), + se_layer=partial(SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid')), **kwargs, ) model = _create_mnv3(variant, pretrained, **model_kwargs) @@ -351,7 +355,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg ['cn_r1_k1_s1_c960'], # hard-swish ] se_layer = partial( - SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), force_act_layer=nn.ReLU, reduce_from_block=False, divisor=8) + SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), force_act_layer=nn.ReLU, round_chs_fn=round_channels) model_kwargs = dict( block_args=decode_arch_def(arch_def), num_features=num_features, @@ -366,6 +370,86 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg return model +def _gen_fbnetv3(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """ FBNetV3 + FIXME untested, this is a preliminary impl of some FBNet-V3 variants. 
+ """ + vl = variant.split('_')[-1] + if vl in ('a', 'b'): + stem_size = 16 + arch_def = [ + # stage 0, 112x112 in + ['ds_r2_k3_s1_e1_c16'], + # stage 1, 112x112 in + ['ir_r1_k5_s2_e4_c24', 'ir_r3_k5_s1_e2_c24'], + # stage 2, 56x56 in + ['ir_r1_k5_s2_e5_c40_se0.25', 'ir_r4_k5_s1_e3_c40_se0.25'], + # stage 3, 28x28 in + ['ir_r1_k5_s2_e5_c72', 'ir_r4_k3_s1_e3_c72'], + # stage 4, 14x14in + ['ir_r1_k3_s1_e5_c120_se0.25', 'ir_r5_k5_s1_e3_c120_se0.25'], + # stage 5, 14x14in + ['ir_r1_k3_s2_e6_c184_se0.25', 'ir_r5_k5_s1_e4_c184_se0.25', 'ir_r1_k5_s1_e6_c224_se0.25'], + # stage 6, 7x7 in + ['cn_r1_k1_s1_c1344'], + ] + elif vl == 'd': + stem_size = 24 + arch_def = [ + # stage 0, 112x112 in + ['ds_r2_k3_s1_e1_c16'], + # stage 1, 112x112 in + ['ir_r1_k3_s2_e5_c24', 'ir_r5_k3_s1_e2_c24'], + # stage 2, 56x56 in + ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r4_k3_s1_e3_c40_se0.25'], + # stage 3, 28x28 in + ['ir_r1_k3_s2_e5_c72', 'ir_r4_k3_s1_e3_c72'], + # stage 4, 14x14in + ['ir_r1_k3_s1_e5_c128_se0.25', 'ir_r6_k5_s1_e3_c128_se0.25'], + # stage 5, 14x14in + ['ir_r1_k3_s2_e6_c208_se0.25', 'ir_r5_k5_s1_e5_c208_se0.25', 'ir_r1_k5_s1_e6_c240_se0.25'], + # stage 6, 7x7 in + ['cn_r1_k1_s1_c1440'], + ] + elif vl == 'g': + stem_size = 32 + arch_def = [ + # stage 0, 112x112 in + ['ds_r3_k3_s1_e1_c24'], + # stage 1, 112x112 in + ['ir_r1_k5_s2_e4_c40', 'ir_r4_k5_s1_e2_c40'], + # stage 2, 56x56 in + ['ir_r1_k5_s2_e4_c56_se0.25', 'ir_r4_k5_s1_e3_c56_se0.25'], + # stage 3, 28x28 in + ['ir_r1_k5_s2_e5_c104', 'ir_r4_k3_s1_e3_c104'], + # stage 4, 14x14in + ['ir_r1_k3_s1_e5_c160_se0.25', 'ir_r8_k5_s1_e3_c160_se0.25'], + # stage 5, 14x14in + ['ir_r1_k3_s2_e6_c264_se0.25', 'ir_r6_k5_s1_e5_c264_se0.25', 'ir_r2_k5_s1_e6_c288_se0.25'], + # stage 6, 7x7 in + ['cn_r1_k1_s1_c1728'], # hard-swish + ] + else: + raise NotImplemented + round_chs_fn = partial(round_channels, multiplier=channel_multiplier, round_limit=0.95) + se_layer = partial(SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), round_chs_fn=round_chs_fn) + act_layer = resolve_act_layer(kwargs, 'hard_swish') + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + num_features=1984, + head_bias=False, + stem_size=stem_size, + round_chs_fn=round_chs_fn, + se_from_exp=False, + norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=act_layer, + se_layer=se_layer, + **kwargs, + ) + model = _create_mnv3(variant, pretrained, **model_kwargs) + return model + + @register_model def mobilenetv3_large_075(pretrained=False, **kwargs): """ MobileNet V3 """ @@ -474,3 +558,24 @@ def tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs): kwargs['pad_type'] = 'same' model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs) return model + + +@register_model +def fbnetv3_b(pretrained=False, **kwargs): + """ FBNetV3-B """ + model = _gen_fbnetv3('fbnetv3_b', pretrained=pretrained, **kwargs) + return model + + +@register_model +def fbnetv3_d(pretrained=False, **kwargs): + """ FBNetV3-D """ + model = _gen_fbnetv3('fbnetv3_d', pretrained=pretrained, **kwargs) + return model + + +@register_model +def fbnetv3_g(pretrained=False, **kwargs): + """ FBNetV3-G """ + model = _gen_fbnetv3('fbnetv3_g', pretrained=pretrained, **kwargs) + return model From bcec14d3b585d7b5c469705f99fb2d830bdcdb7d Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Sat, 29 May 2021 23:41:38 -0700 Subject: [PATCH 05/12] Bring EfficientNet SE layer in line with others, pull se_ratio outside of blocks. Allows swapping w/ other attn layers. 
--- timm/models/efficientnet_blocks.py | 53 ++++++++++++++--------------- timm/models/efficientnet_builder.py | 33 ++++++++++++------ timm/models/ghostnet.py | 4 +-- timm/models/hardcorenas.py | 3 +- timm/models/mobilenetv3.py | 32 ++++------------- 5 files changed, 57 insertions(+), 68 deletions(-) diff --git a/timm/models/efficientnet_blocks.py b/timm/models/efficientnet_blocks.py index ea0c791e..b43f38f5 100644 --- a/timm/models/efficientnet_blocks.py +++ b/timm/models/efficientnet_blocks.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn from torch.nn import functional as F -from .layers import create_conv2d, drop_path, make_divisible, get_act_fn, create_act_layer +from .layers import create_conv2d, drop_path, make_divisible, create_act_layer from .layers.activations import sigmoid __all__ = [ @@ -19,31 +19,32 @@ class SqueezeExcite(nn.Module): Args: in_chs (int): input channels to layer - se_ratio (float): ratio of squeeze reduction + rd_ratio (float): ratio of squeeze reduction act_layer (nn.Module): activation layer of containing block - gate_fn (Callable): attention gate function + gate_layer (Callable): attention gate function force_act_layer (nn.Module): override block's activation fn if this is set/bound - round_chs_fn (Callable): specify a fn to calculate rounding of reduced chs + rd_round_fn (Callable): specify a fn to calculate rounding of reduced chs """ def __init__( - self, in_chs, se_ratio=0.25, act_layer=nn.ReLU, gate_fn=sigmoid, - force_act_layer=None, round_chs_fn=None): + self, in_chs, rd_ratio=0.25, rd_channels=None, act_layer=nn.ReLU, + gate_layer=nn.Sigmoid, force_act_layer=None, rd_round_fn=None): super(SqueezeExcite, self).__init__() - round_chs_fn = round_chs_fn or round - reduced_chs = round_chs_fn(in_chs * se_ratio) + if rd_channels is None: + rd_round_fn = rd_round_fn or round + rd_channels = rd_round_fn(in_chs * rd_ratio) act_layer = force_act_layer or act_layer - self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) + self.conv_reduce = nn.Conv2d(in_chs, rd_channels, 1, bias=True) self.act1 = create_act_layer(act_layer, inplace=True) - self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True) - self.gate_fn = get_act_fn(gate_fn) + self.conv_expand = nn.Conv2d(rd_channels, in_chs, 1, bias=True) + self.gate = create_act_layer(gate_layer) def forward(self, x): x_se = x.mean((2, 3), keepdim=True) x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) - return x * self.gate_fn(x_se) + return x * self.gate(x_se) class ConvBnAct(nn.Module): @@ -85,10 +86,9 @@ class DepthwiseSeparableConv(nn.Module): """ def __init__( self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, pad_type='', - noskip=False, pw_kernel_size=1, pw_act=False, se_ratio=0., - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, se_layer=None, drop_path_rate=0.): + noskip=False, pw_kernel_size=1, pw_act=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + se_layer=None, drop_path_rate=0.): super(DepthwiseSeparableConv, self).__init__() - has_se = se_layer is not None and se_ratio > 0. 
self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip self.has_pw_act = pw_act # activation after point-wise conv self.drop_path_rate = drop_path_rate @@ -99,7 +99,7 @@ class DepthwiseSeparableConv(nn.Module): self.act1 = act_layer(inplace=True) # Squeeze-and-excitation - self.se = se_layer(in_chs, se_ratio=se_ratio, act_layer=act_layer) if has_se else nn.Identity() + self.se = se_layer(in_chs, act_layer=act_layer) if se_layer else nn.Identity() self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type) self.bn2 = norm_layer(out_chs) @@ -144,12 +144,11 @@ class InvertedResidual(nn.Module): def __init__( self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, pad_type='', - noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, se_ratio=0., - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, se_layer=None, conv_kwargs=None, drop_path_rate=0.): + noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, se_layer=None, conv_kwargs=None, drop_path_rate=0.): super(InvertedResidual, self).__init__() conv_kwargs = conv_kwargs or {} mid_chs = make_divisible(in_chs * exp_ratio) - has_se = se_layer is not None and se_ratio > 0. self.has_residual = (in_chs == out_chs and stride == 1) and not noskip self.drop_path_rate = drop_path_rate @@ -166,7 +165,7 @@ class InvertedResidual(nn.Module): self.act2 = act_layer(inplace=True) # Squeeze-and-excitation - self.se = se_layer(mid_chs, se_ratio=se_ratio, act_layer=act_layer) if has_se else nn.Identity() + self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() # Point-wise linear projection self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs) @@ -212,8 +211,8 @@ class CondConvResidual(InvertedResidual): def __init__( self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, pad_type='', - noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, se_ratio=0., - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, se_layer=None, num_experts=0, drop_path_rate=0.): + noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, se_layer=None, num_experts=0, drop_path_rate=0.): self.num_experts = num_experts conv_kwargs = dict(num_experts=self.num_experts) @@ -221,8 +220,8 @@ class CondConvResidual(InvertedResidual): super(CondConvResidual, self).__init__( in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation, pad_type=pad_type, act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size, - pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_layer=se_layer, - norm_layer=norm_layer, conv_kwargs=conv_kwargs, drop_path_rate=drop_path_rate) + pw_kernel_size=pw_kernel_size, se_layer=se_layer, norm_layer=norm_layer, conv_kwargs=conv_kwargs, + drop_path_rate=drop_path_rate) self.routing_fn = nn.Linear(in_chs, self.num_experts) @@ -271,8 +270,8 @@ class EdgeResidual(nn.Module): def __init__( self, in_chs, out_chs, exp_kernel_size=3, stride=1, dilation=1, pad_type='', - force_in_chs=0, noskip=False, exp_ratio=1.0, pw_kernel_size=1, se_ratio=0., - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, se_layer=None, drop_path_rate=0.): + force_in_chs=0, noskip=False, exp_ratio=1.0, pw_kernel_size=1, act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, se_layer=None, drop_path_rate=0.): super(EdgeResidual, self).__init__() if force_in_chs > 0: mid_chs = make_divisible(force_in_chs * exp_ratio) @@ 
-289,7 +288,7 @@ class EdgeResidual(nn.Module): self.act1 = act_layer(inplace=True) # Squeeze-and-excitation - self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, act_layer=act_layer) if has_se else nn.Identity() + self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() # Point-wise linear projection self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type) diff --git a/timm/models/efficientnet_builder.py b/timm/models/efficientnet_builder.py index 35019747..f44cf158 100644 --- a/timm/models/efficientnet_builder.py +++ b/timm/models/efficientnet_builder.py @@ -10,11 +10,12 @@ import logging import math import re from copy import deepcopy +from functools import partial import torch.nn as nn from .efficientnet_blocks import * -from .layers import CondConv2d, get_condconv_initializer, get_act_layer, make_divisible +from .layers import CondConv2d, get_condconv_initializer, get_act_layer, get_attn, make_divisible __all__ = ["EfficientNetBuilder", "decode_arch_def", "efficientnet_init_weights", 'resolve_bn_args', 'resolve_act_layer', 'round_channels', 'BN_MOMENTUM_TF_DEFAULT', 'BN_EPS_TF_DEFAULT'] @@ -120,7 +121,9 @@ def _decode_block_str(block_str): elif v == 'hs': value = get_act_layer('hard_swish') elif v == 'sw': - value = get_act_layer('swish') + value = get_act_layer('swish') # aka SiLU + elif v == 'mi': + value = get_act_layer('mish') else: continue options[key] = value @@ -273,7 +276,12 @@ class EfficientNetBuilder: self.se_from_exp = se_from_exp # calculate se channel reduction from expanded (mid) chs self.act_layer = act_layer self.norm_layer = norm_layer - self.se_layer = se_layer + self.se_layer = get_attn(se_layer) + try: + self.se_layer(8, rd_ratio=1.0) + self.se_has_ratio = True + except RuntimeError as e: + self.se_has_ratio = False self.drop_path_rate = drop_path_rate if feature_location == 'depthwise': # old 'depthwise' mode renamed 'expansion' to match TF impl, old expansion mode didn't make sense @@ -300,18 +308,21 @@ class EfficientNetBuilder: ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer assert ba['act_layer'] is not None ba['norm_layer'] = self.norm_layer + ba['drop_path_rate'] = drop_path_rate if bt != 'cn': - ba['se_layer'] = self.se_layer - if not self.se_from_exp and ba['se_ratio']: - ba['se_ratio'] /= ba.get('exp_ratio', 1.0) - ba['drop_path_rate'] = drop_path_rate + se_ratio = ba.pop('se_ratio') + if se_ratio and self.se_layer is not None: + if not self.se_from_exp: + # adjust se_ratio by expansion ratio if calculating se channels from block input + se_ratio /= ba.get('exp_ratio', 1.0) + if self.se_has_ratio: + ba['se_layer'] = partial(self.se_layer, rd_ratio=se_ratio) + else: + ba['se_layer'] = self.se_layer if bt == 'ir': _log_info_if(' InvertedResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose) - if ba.get('num_experts', 0) > 0: - block = CondConvResidual(**ba) - else: - block = InvertedResidual(**ba) + block = CondConvResidual(**ba) if ba.get('num_experts', 0) else InvertedResidual(**ba) elif bt == 'ds' or bt == 'dsa': _log_info_if(' DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)), self.verbose) block = DepthwiseSeparableConv(**ba) diff --git a/timm/models/ghostnet.py b/timm/models/ghostnet.py index d82a91b4..48dee6ec 100644 --- a/timm/models/ghostnet.py +++ b/timm/models/ghostnet.py @@ -40,7 +40,7 @@ default_cfgs = { } -_SE_LAYER = partial(SqueezeExcite, gate_fn='hard_sigmoid', round_chs_fn=partial(make_divisible, divisor=4)) +_SE_LAYER = 
partial(SqueezeExcite, gate_layer='hard_sigmoid', rd_round_fn=partial(make_divisible, divisor=4)) class GhostModule(nn.Module): @@ -92,7 +92,7 @@ class GhostBottleneck(nn.Module): self.bn_dw = None # Squeeze-and-excitation - self.se = _SE_LAYER(mid_chs, se_ratio=se_ratio) if has_se else None + self.se = _SE_LAYER(mid_chs, rd_ratio=se_ratio) if has_se else None # Point-wise linear projection self.ghost2 = GhostModule(mid_chs, out_chs, relu=False) diff --git a/timm/models/hardcorenas.py b/timm/models/hardcorenas.py index 16b9c4bc..9988a044 100644 --- a/timm/models/hardcorenas.py +++ b/timm/models/hardcorenas.py @@ -39,8 +39,7 @@ def _gen_hardcorenas(pretrained, variant, arch_def, **kwargs): """ num_features = 1280 - se_layer = partial( - SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), force_act_layer=nn.ReLU, round_chs_fn=round_channels) + se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU, rd_round_fn=round_channels) model_kwargs = dict( block_args=decode_arch_def(arch_def), num_features=num_features, diff --git a/timm/models/mobilenetv3.py b/timm/models/mobilenetv3.py index fad88aa7..e85112e6 100644 --- a/timm/models/mobilenetv3.py +++ b/timm/models/mobilenetv3.py @@ -266,7 +266,7 @@ def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kw round_chs_fn=partial(round_channels, multiplier=channel_multiplier), norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), act_layer=resolve_act_layer(kwargs, 'hard_swish'), - se_layer=partial(SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid')), + se_layer=partial(SqueezeExcite, gate_layer='hard_sigmoid'), **kwargs, ) model = _create_mnv3(variant, pretrained, **model_kwargs) @@ -354,8 +354,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg # stage 6, 7x7 in ['cn_r1_k1_s1_c960'], # hard-swish ] - se_layer = partial( - SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), force_act_layer=nn.ReLU, round_chs_fn=round_channels) + se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU, rd_round_fn=round_channels) model_kwargs = dict( block_args=decode_arch_def(arch_def), num_features=num_features, @@ -372,67 +371,48 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg def _gen_fbnetv3(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """ FBNetV3 + Paper: `FBNetV3: Joint Architecture-Recipe Search using Predictor Pretraining` + - https://arxiv.org/abs/2006.02049 FIXME untested, this is a preliminary impl of some FBNet-V3 variants. 
""" vl = variant.split('_')[-1] if vl in ('a', 'b'): stem_size = 16 arch_def = [ - # stage 0, 112x112 in ['ds_r2_k3_s1_e1_c16'], - # stage 1, 112x112 in ['ir_r1_k5_s2_e4_c24', 'ir_r3_k5_s1_e2_c24'], - # stage 2, 56x56 in ['ir_r1_k5_s2_e5_c40_se0.25', 'ir_r4_k5_s1_e3_c40_se0.25'], - # stage 3, 28x28 in ['ir_r1_k5_s2_e5_c72', 'ir_r4_k3_s1_e3_c72'], - # stage 4, 14x14in ['ir_r1_k3_s1_e5_c120_se0.25', 'ir_r5_k5_s1_e3_c120_se0.25'], - # stage 5, 14x14in ['ir_r1_k3_s2_e6_c184_se0.25', 'ir_r5_k5_s1_e4_c184_se0.25', 'ir_r1_k5_s1_e6_c224_se0.25'], - # stage 6, 7x7 in ['cn_r1_k1_s1_c1344'], ] elif vl == 'd': stem_size = 24 arch_def = [ - # stage 0, 112x112 in ['ds_r2_k3_s1_e1_c16'], - # stage 1, 112x112 in ['ir_r1_k3_s2_e5_c24', 'ir_r5_k3_s1_e2_c24'], - # stage 2, 56x56 in ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r4_k3_s1_e3_c40_se0.25'], - # stage 3, 28x28 in ['ir_r1_k3_s2_e5_c72', 'ir_r4_k3_s1_e3_c72'], - # stage 4, 14x14in ['ir_r1_k3_s1_e5_c128_se0.25', 'ir_r6_k5_s1_e3_c128_se0.25'], - # stage 5, 14x14in ['ir_r1_k3_s2_e6_c208_se0.25', 'ir_r5_k5_s1_e5_c208_se0.25', 'ir_r1_k5_s1_e6_c240_se0.25'], - # stage 6, 7x7 in ['cn_r1_k1_s1_c1440'], ] elif vl == 'g': stem_size = 32 arch_def = [ - # stage 0, 112x112 in ['ds_r3_k3_s1_e1_c24'], - # stage 1, 112x112 in ['ir_r1_k5_s2_e4_c40', 'ir_r4_k5_s1_e2_c40'], - # stage 2, 56x56 in ['ir_r1_k5_s2_e4_c56_se0.25', 'ir_r4_k5_s1_e3_c56_se0.25'], - # stage 3, 28x28 in ['ir_r1_k5_s2_e5_c104', 'ir_r4_k3_s1_e3_c104'], - # stage 4, 14x14in ['ir_r1_k3_s1_e5_c160_se0.25', 'ir_r8_k5_s1_e3_c160_se0.25'], - # stage 5, 14x14in ['ir_r1_k3_s2_e6_c264_se0.25', 'ir_r6_k5_s1_e5_c264_se0.25', 'ir_r2_k5_s1_e6_c288_se0.25'], - # stage 6, 7x7 in - ['cn_r1_k1_s1_c1728'], # hard-swish + ['cn_r1_k1_s1_c1728'], ] else: raise NotImplemented round_chs_fn = partial(round_channels, multiplier=channel_multiplier, round_limit=0.95) - se_layer = partial(SqueezeExcite, gate_fn=get_act_fn('hard_sigmoid'), round_chs_fn=round_chs_fn) + se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', rd_round_fn=round_chs_fn) act_layer = resolve_act_layer(kwargs, 'hard_swish') model_kwargs = dict( block_args=decode_arch_def(arch_def), From 8bf63b6c6cc2b4ba69030cb043bf33cd562b399c Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Sun, 30 May 2021 12:47:02 -0700 Subject: [PATCH 06/12] Able to use other attn layer in EfficientNet now. Create test ECA + GC B0 configs. Make ECA more configurable. 
--- tests/test_models.py | 2 +- timm/models/efficientnet.py | 24 +++++++++++++++++++ timm/models/efficientnet_builder.py | 4 ++-- timm/models/layers/eca.py | 36 +++++++++++++++++++++-------- 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 18298dff..1093e609 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -24,7 +24,7 @@ NUM_NON_STD = len(NON_STD_FILTERS) if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models EXCLUDE_FILTERS = [ - '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', '*101x3_bitm', + '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', '*101x3_bitm', '*50x3_bitm' '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*', '*resnetrs350*', '*resnetrs420*'] else: diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py index 8aa61ec5..09e47684 100644 --- a/timm/models/efficientnet.py +++ b/timm/models/efficientnet.py @@ -91,6 +91,12 @@ default_cfgs = { url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth', interpolation='bilinear'), + # NOTE experimenting with alternate attention + 'eca_efficientnet_b0': _cfg( + url=''), + 'gc_efficientnet_b0': _cfg( + url=''), + 'efficientnet_b0': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth'), 'efficientnet_b1': _cfg( @@ -1223,6 +1229,24 @@ def efficientnet_b0(pretrained=False, **kwargs): return model +@register_model +def eca_efficientnet_b0(pretrained=False, **kwargs): + """ EfficientNet-B0 w/ ECA attn """ + # NOTE experimental config + model = _gen_efficientnet( + 'eca_efficientnet_b0', se_layer='eca', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def gc_efficientnet_b0(pretrained=False, **kwargs): + """ EfficientNet-B0 w/ GlobalContext """ + # NOTE experminetal config + model = _gen_efficientnet( + 'gc_efficientnet_b0', se_layer='gc', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + @register_model def efficientnet_b1(pretrained=False, **kwargs): """ EfficientNet-B1 """ diff --git a/timm/models/efficientnet_builder.py b/timm/models/efficientnet_builder.py index f44cf158..a23e8273 100644 --- a/timm/models/efficientnet_builder.py +++ b/timm/models/efficientnet_builder.py @@ -278,9 +278,9 @@ class EfficientNetBuilder: self.norm_layer = norm_layer self.se_layer = get_attn(se_layer) try: - self.se_layer(8, rd_ratio=1.0) + self.se_layer(8, rd_ratio=1.0) # test if attn layer accepts rd_ratio arg self.se_has_ratio = True - except RuntimeError as e: + except TypeError: self.se_has_ratio = False self.drop_path_rate = drop_path_rate if feature_location == 'depthwise': diff --git a/timm/models/layers/eca.py b/timm/models/layers/eca.py index f2980730..5c024108 100644 --- a/timm/models/layers/eca.py +++ b/timm/models/layers/eca.py @@ -38,6 +38,9 @@ from torch import nn import torch.nn.functional as F +from .create_act import create_act_layer + + class EcaModule(nn.Module): """Constructs an ECA module. @@ -48,20 +51,27 @@ class EcaModule(nn.Module): refer to original paper https://arxiv.org/pdf/1910.03151.pdf (default=None. if channel size not given, use k_size given for kernel size.) 
kernel_size: Adaptive selection of kernel size (default=3) + gamm: used in kernel_size calc, see above + beta: used in kernel_size calc, see above + act_layer: optional non-linearity after conv, enables conv bias, this is an experiment + gate_layer: gating non-linearity to use """ - def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1): + def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid'): super(EcaModule, self).__init__() - assert kernel_size % 2 == 1 if channels is not None: t = int(abs(math.log(channels, 2) + beta) / gamma) kernel_size = max(t if t % 2 else t + 1, 3) - - self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False) + assert kernel_size % 2 == 1 + has_act = act_layer is not None + self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=has_act) + self.act = create_act_layer(act_layer) if has_act else nn.Identity() + self.gate = create_act_layer(gate_layer) def forward(self, x): y = x.mean((2, 3)).view(x.shape[0], 1, -1) # view for 1d conv y = self.conv(y) - y = y.view(x.shape[0], -1, 1, 1).sigmoid() + y = self.act(y) # NOTE: usually a no-op, added for experimentation + y = self.gate(y).view(x.shape[0], -1, 1, 1) return x * y.expand_as(x) @@ -86,27 +96,35 @@ class CecaModule(nn.Module): refer to original paper https://arxiv.org/pdf/1910.03151.pdf (default=None. if channel size not given, use k_size given for kernel size.) kernel_size: Adaptive selection of kernel size (default=3) + gamm: used in kernel_size calc, see above + beta: used in kernel_size calc, see above + act_layer: optional non-linearity after conv, enables conv bias, this is an experiment + gate_layer: gating non-linearity to use """ - def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1): + def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid'): super(CecaModule, self).__init__() - assert kernel_size % 2 == 1 if channels is not None: t = int(abs(math.log(channels, 2) + beta) / gamma) kernel_size = max(t if t % 2 else t + 1, 3) + has_act = act_layer is not None + assert kernel_size % 2 == 1 # PyTorch circular padding mode is buggy as of pytorch 1.4 # see https://github.com/pytorch/pytorch/pull/17240 # implement manual circular padding - self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=False) self.padding = (kernel_size - 1) // 2 + self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=has_act) + self.act = create_act_layer(act_layer) if has_act else nn.Identity() + self.gate = create_act_layer(gate_layer) def forward(self, x): y = x.mean((2, 3)).view(x.shape[0], 1, -1) # Manually implement circular padding, F.pad does not seemed to be bugged y = F.pad(y, (self.padding, self.padding), mode='circular') y = self.conv(y) - y = y.view(x.shape[0], -1, 1, 1).sigmoid() + y = self.act(y) # NOTE: usually a no-op, added for experimentation + y = self.gate(y).view(x.shape[0], -1, 1, 1) return x * y.expand_as(x) From 34522097b1d847f11263d9005d8dd1ff584c3edb Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Sun, 30 May 2021 21:12:10 -0700 Subject: [PATCH 07/12] See if we can use tcmalloc in test runner --- .github/workflows/tests.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9f7aebdb..f404085a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,11 +36,16 @@ jobs: run: pip 
install --no-cache-dir torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }} - name: Install torch on ubuntu if: startsWith(matrix.os, 'ubuntu') - run: pip install --no-cache-dir torch==${{ matrix.torch }}+cpu torchvision==${{ matrix.torchvision }}+cpu -f https://download.pytorch.org/whl/torch_stable.html + run: | + pip install --no-cache-dir torch==${{ matrix.torch }}+cpu torchvision==${{ matrix.torchvision }}+cpu -f https://download.pytorch.org/whl/torch_stable.html + sudo apt update + sudo apt install -y google-perftools - name: Install requirements run: | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi pip install --no-cache-dir git+https://github.com/mapillary/inplace_abn.git@v1.0.12 - name: Run tests + env: + LD_PRELOAD: /usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 run: | pytest -vv --durations=0 ./tests From 17dc47c8e64e1452a0f2be7883a55b2f618229eb Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Sun, 30 May 2021 22:00:43 -0700 Subject: [PATCH 08/12] Missed comma in test filters. --- tests/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index 1093e609..5a31935e 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -24,7 +24,7 @@ NUM_NON_STD = len(NON_STD_FILTERS) if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models EXCLUDE_FILTERS = [ - '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', '*101x3_bitm', '*50x3_bitm' + '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', '*101x3_bitm', '*50x3_bitm', '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*', '*resnetrs350*', '*resnetrs420*'] else: From 307a935b790b5af8d551ebecda053cb1a9b16fcb Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 31 May 2021 13:18:11 -0700 Subject: [PATCH 09/12] Add non-local and BAT attention. Merge attn and self-attn factories into one. Add attention references to README. Add mlp 'mode' to ECA. --- README.md | 16 ++- timm/models/byobnet.py | 6 +- timm/models/efficientnet.py | 6 +- timm/models/layers/__init__.py | 6 +- timm/models/layers/create_attn.py | 45 +++++++- timm/models/layers/create_self_attn.py | 25 ----- timm/models/layers/eca.py | 28 +++-- timm/models/layers/non_local_attn.py | 145 +++++++++++++++++++++++++ timm/models/layers/selective_kernel.py | 17 +-- timm/models/layers/split_attn.py | 39 +++---- timm/models/layers/squeeze_excite.py | 2 +- timm/models/resnest.py | 17 ++- timm/models/sknet.py | 16 +-- 13 files changed, 276 insertions(+), 92 deletions(-) delete mode 100644 timm/models/layers/create_self_attn.py create mode 100644 timm/models/layers/non_local_attn.py diff --git a/README.md b/README.md index 06aee7ec..0b878a0a 100644 --- a/README.md +++ b/README.md @@ -295,10 +295,24 @@ Several (less common) features that I often utilize in my projects are included. * SplitBachNorm - allows splitting batch norm layers between clean and augmented (auxiliary batch norm) data * DropPath aka "Stochastic Depth" (https://arxiv.org/abs/1603.09382) * DropBlock (https://arxiv.org/abs/1810.12890) -* Efficient Channel Attention - ECA (https://arxiv.org/abs/1910.03151) * Blur Pooling (https://arxiv.org/abs/1904.11486) * Space-to-Depth by [mrT23](https://github.com/mrT23/TResNet/blob/master/src/models/tresnet/layers/space_to_depth.py) (https://arxiv.org/abs/1801.04590) -- original paper? 
* Adaptive Gradient Clipping (https://arxiv.org/abs/2102.06171, https://github.com/deepmind/deepmind-research/tree/master/nfnets) +* An extensive selection of channel and/or spatial attention modules: + * Bottleneck Transformer - https://arxiv.org/abs/2101.11605 + * CBAM - https://arxiv.org/abs/1807.06521 + * Effective Squeeze-Excitation (ESE) - https://arxiv.org/abs/1911.06667 + * Efficient Channel Attention (ECA) - https://arxiv.org/abs/1910.03151 + * Gather-Excite (GE) - https://arxiv.org/abs/1810.12348 + * Global Context (GC) - https://arxiv.org/abs/1904.11492 + * Halo - https://arxiv.org/abs/2103.12731 + * Involution - https://arxiv.org/abs/2103.06255 + * Lambda Layer - https://arxiv.org/abs/2102.08602 + * Non-Local (NL) - https://arxiv.org/abs/1711.07971 + * Squeeze-and-Excitation (SE) - https://arxiv.org/abs/1709.01507 + * Selective Kernel (SK) - (https://arxiv.org/abs/1903.06586 + * Split (SPLAT) - https://arxiv.org/abs/2004.08955 + * Shifted Window (SWIN) - https://arxiv.org/abs/2103.14030 ## Results diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py index 8ec8690a..d41245f5 100644 --- a/timm/models/byobnet.py +++ b/timm/models/byobnet.py @@ -35,7 +35,7 @@ import torch.nn as nn from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .helpers import build_model_with_cfg from .layers import ClassifierHead, ConvBnAct, BatchNormAct2d, DropPath, AvgPool2dSame, \ - create_conv2d, get_act_layer, convert_norm_act, get_attn, get_self_attn, make_divisible, to_2tuple + create_conv2d, get_act_layer, convert_norm_act, get_attn, make_divisible, to_2tuple from .registry import register_model __all__ = ['ByobNet', 'ByoModelCfg', 'ByoBlockCfg', 'create_byob_stem', 'create_block'] @@ -935,7 +935,7 @@ def update_block_kwargs(block_kwargs: Dict[str, Any], block_cfg: ByoBlockCfg, mo else: self_attn_kwargs = override_kwargs(block_cfg.self_attn_kwargs, model_cfg.self_attn_kwargs) self_attn_layer = block_cfg.self_attn_layer or model_cfg.self_attn_layer - self_attn_layer = partial(get_self_attn(self_attn_layer), *self_attn_kwargs) \ + self_attn_layer = partial(get_attn(self_attn_layer), *self_attn_kwargs) \ if self_attn_layer is not None else None layer_fns = replace(layer_fns, self_attn=self_attn_layer) @@ -1010,7 +1010,7 @@ def get_layer_fns(cfg: ByoModelCfg): norm_act = convert_norm_act(norm_layer=cfg.norm_layer, act_layer=act) conv_norm_act = partial(ConvBnAct, norm_layer=cfg.norm_layer, act_layer=act) attn = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None - self_attn = partial(get_self_attn(cfg.self_attn_layer), **cfg.self_attn_kwargs) if cfg.self_attn_layer else None + self_attn = partial(get_attn(cfg.self_attn_layer), **cfg.self_attn_kwargs) if cfg.self_attn_layer else None layer_fn = LayerFn(conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn, self_attn=self_attn) return layer_fn diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py index 09e47684..6426b540 100644 --- a/timm/models/efficientnet.py +++ b/timm/models/efficientnet.py @@ -1234,7 +1234,8 @@ def eca_efficientnet_b0(pretrained=False, **kwargs): """ EfficientNet-B0 w/ ECA attn """ # NOTE experimental config model = _gen_efficientnet( - 'eca_efficientnet_b0', se_layer='eca', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + 'eca_efficientnet_b0', se_layer='ecam', channel_multiplier=1.0, depth_multiplier=1.0, + pretrained=pretrained, **kwargs) return model @@ -1243,7 +1244,8 @@ def gc_efficientnet_b0(pretrained=False, 
**kwargs): """ EfficientNet-B0 w/ GlobalContext """ # NOTE experminetal config model = _gen_efficientnet( - 'gc_efficientnet_b0', se_layer='gc', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + 'gc_efficientnet_b0', se_layer='gc', channel_multiplier=1.0, depth_multiplier=1.0, + pretrained=pretrained, **kwargs) return model diff --git a/timm/models/layers/__init__.py b/timm/models/layers/__init__.py index 30a1b40d..77d1026e 100644 --- a/timm/models/layers/__init__.py +++ b/timm/models/layers/__init__.py @@ -12,7 +12,6 @@ from .create_act import create_act_layer, get_act_layer, get_act_fn from .create_attn import get_attn, create_attn from .create_conv2d import create_conv2d from .create_norm_act import get_norm_act_layer, create_norm_act, convert_norm_act -from .create_self_attn import get_self_attn, create_self_attn from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path from .eca import EcaModule, CecaModule, EfficientChannelAttn, CircularEfficientChannelAttn from .evo_norm import EvoNormBatch2d, EvoNormSample2d @@ -24,16 +23,17 @@ from .involution import Involution from .linear import Linear from .mixed_conv2d import MixedConv2d from .mlp import Mlp, GluMlp, GatedMlp +from .non_local_attn import NonLocalAttn, BatNonLocalAttn from .norm import GroupNorm, LayerNorm2d from .norm_act import BatchNormAct2d, GroupNormAct from .padding import get_padding, get_same_padding, pad_same from .patch_embed import PatchEmbed from .pool2d_same import AvgPool2dSame, create_pool2d from .squeeze_excite import SEModule, SqueezeExcite, EffectiveSEModule, EffectiveSqueezeExcite -from .selective_kernel import SelectiveKernelConv +from .selective_kernel import SelectiveKernel from .separable_conv import SeparableConv2d, SeparableConvBnAct from .space_to_depth import SpaceToDepthModule -from .split_attn import SplitAttnConv2d +from .split_attn import SplitAttn from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model from .std_conv import StdConv2d, StdConv2dSame, ScaledStdConv2d, ScaledStdConv2dSame from .test_time_pool import TestTimePoolHead, apply_test_time_pool diff --git a/timm/models/layers/create_attn.py b/timm/models/layers/create_attn.py index de866eea..3fed646b 100644 --- a/timm/models/layers/create_attn.py +++ b/timm/models/layers/create_attn.py @@ -1,14 +1,23 @@ -""" Select AttentionFactory Method +""" Attention Factory -Hacked together by / Copyright 2020 Ross Wightman +Hacked together by / Copyright 2021 Ross Wightman """ import torch +from functools import partial +from .bottleneck_attn import BottleneckAttn from .cbam import CbamModule, LightCbamModule from .eca import EcaModule, CecaModule from .gather_excite import GatherExcite from .global_context import GlobalContext +from .halo_attn import HaloAttn +from .involution import Involution +from .lambda_layer import LambdaLayer +from .non_local_attn import NonLocalAttn, BatNonLocalAttn +from .selective_kernel import SelectiveKernel +from .split_attn import SplitAttn from .squeeze_excite import SEModule, EffectiveSEModule +from .swin_attn import WindowAttention def get_attn(attn_type): @@ -18,12 +27,16 @@ def get_attn(attn_type): if attn_type is not None: if isinstance(attn_type, str): attn_type = attn_type.lower() + # Lightweight attention modules (channel and/or coarse spatial). + # Typically added to existing network architecture blocks in addition to existing convolutions. 
if attn_type == 'se': module_cls = SEModule elif attn_type == 'ese': module_cls = EffectiveSEModule elif attn_type == 'eca': module_cls = EcaModule + elif attn_type == 'ecam': + module_cls = partial(EcaModule, use_mlp=True) elif attn_type == 'ceca': module_cls = CecaModule elif attn_type == 'ge': @@ -34,6 +47,34 @@ def get_attn(attn_type): module_cls = CbamModule elif attn_type == 'lcbam': module_cls = LightCbamModule + + # Attention / attention-like modules w/ significant params + # Typically replace some of the existing workhorse convs in a network architecture. + # All of these accept a stride argument and can spatially downsample the input. + elif attn_type == 'sk': + module_cls = SelectiveKernel + elif attn_type == 'splat': + module_cls = SplitAttn + + # Self-attention / attention-like modules w/ significant compute and/or params + # Typically replace some of the existing workhorse convs in a network architecture. + # All of these accept a stride argument and can spatially downsample the input. + elif attn_type == 'lambda': + return LambdaLayer + elif attn_type == 'bottleneck': + return BottleneckAttn + elif attn_type == 'halo': + return HaloAttn + elif attn_type == 'swin': + return WindowAttention + elif attn_type == 'involution': + return Involution + elif attn_type == 'nl': + module_cls = NonLocalAttn + elif attn_type == 'bat': + module_cls = BatNonLocalAttn + + # Woops! else: assert False, "Invalid attn module (%s)" % attn_type elif isinstance(attn_type, bool): diff --git a/timm/models/layers/create_self_attn.py b/timm/models/layers/create_self_attn.py deleted file mode 100644 index 448ddb34..00000000 --- a/timm/models/layers/create_self_attn.py +++ /dev/null @@ -1,25 +0,0 @@ -from .bottleneck_attn import BottleneckAttn -from .halo_attn import HaloAttn -from .involution import Involution -from .lambda_layer import LambdaLayer -from .swin_attn import WindowAttention - - -def get_self_attn(attn_type): - if attn_type == 'bottleneck': - return BottleneckAttn - elif attn_type == 'halo': - return HaloAttn - elif attn_type == 'lambda': - return LambdaLayer - elif attn_type == 'swin': - return WindowAttention - elif attn_type == 'involution': - return Involution - else: - assert False, f"Unknown attn type ({attn_type})" - - -def create_self_attn(attn_type, dim, stride=1, **kwargs): - attn_fn = get_self_attn(attn_type) - return attn_fn(dim, stride=stride, **kwargs) diff --git a/timm/models/layers/eca.py b/timm/models/layers/eca.py index 5c024108..e29be6ac 100644 --- a/timm/models/layers/eca.py +++ b/timm/models/layers/eca.py @@ -39,6 +39,7 @@ import torch.nn.functional as F from .create_act import create_act_layer +from .helpers import make_divisible class EcaModule(nn.Module): @@ -56,21 +57,36 @@ class EcaModule(nn.Module): act_layer: optional non-linearity after conv, enables conv bias, this is an experiment gate_layer: gating non-linearity to use """ - def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid'): + def __init__( + self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid', + rd_ratio=1/8, rd_channels=None, rd_divisor=8, use_mlp=False): super(EcaModule, self).__init__() if channels is not None: t = int(abs(math.log(channels, 2) + beta) / gamma) kernel_size = max(t if t % 2 else t + 1, 3) assert kernel_size % 2 == 1 - has_act = act_layer is not None - self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=has_act) - self.act = create_act_layer(act_layer) if has_act 
else nn.Identity() + padding = (kernel_size - 1) // 2 + if use_mlp: + # NOTE 'mlp' mode is a timm experiment, not in paper + assert channels is not None + if rd_channels is None: + rd_channels = make_divisible(channels * rd_ratio, divisor=rd_divisor) + act_layer = act_layer or nn.ReLU + self.conv = nn.Conv1d(1, rd_channels, kernel_size=1, padding=0, bias=True) + self.act = create_act_layer(act_layer) + self.conv2 = nn.Conv1d(rd_channels, 1, kernel_size=kernel_size, padding=padding, bias=True) + else: + self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) + self.act = None + self.conv2 = None self.gate = create_act_layer(gate_layer) def forward(self, x): y = x.mean((2, 3)).view(x.shape[0], 1, -1) # view for 1d conv y = self.conv(y) - y = self.act(y) # NOTE: usually a no-op, added for experimentation + if self.conv2 is not None: + y = self.act(y) + y = self.conv2(y) y = self.gate(y).view(x.shape[0], -1, 1, 1) return x * y.expand_as(x) @@ -115,7 +131,6 @@ class CecaModule(nn.Module): # implement manual circular padding self.padding = (kernel_size - 1) // 2 self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=has_act) - self.act = create_act_layer(act_layer) if has_act else nn.Identity() self.gate = create_act_layer(gate_layer) def forward(self, x): @@ -123,7 +138,6 @@ class CecaModule(nn.Module): # Manually implement circular padding, F.pad does not seemed to be bugged y = F.pad(y, (self.padding, self.padding), mode='circular') y = self.conv(y) - y = self.act(y) # NOTE: usually a no-op, added for experimentation y = self.gate(y).view(x.shape[0], -1, 1, 1) return x * y.expand_as(x) diff --git a/timm/models/layers/non_local_attn.py b/timm/models/layers/non_local_attn.py new file mode 100644 index 00000000..d20a5f3e --- /dev/null +++ b/timm/models/layers/non_local_attn.py @@ -0,0 +1,145 @@ +""" Bilinear-Attention-Transform and Non-Local Attention + +Paper: `Non-Local Neural Networks With Grouped Bilinear Attentional Transforms` + - https://openaccess.thecvf.com/content_CVPR_2020/html/Chi_Non-Local_Neural_Networks_With_Grouped_Bilinear_Attentional_Transforms_CVPR_2020_paper.html +Adapted from original code: https://github.com/BA-Transform/BAT-Image-Classification +""" +import torch +from torch import nn +from torch.nn import functional as F + +from .conv_bn_act import ConvBnAct +from .helpers import make_divisible + + +class NonLocalAttn(nn.Module): + """Spatial NL block for image classification. + + This was adapted from https://github.com/BA-Transform/BAT-Image-Classification + Their NonLocal impl inspired by https://github.com/facebookresearch/video-nonlocal-net. 
+ """ + + def __init__(self, in_channels, use_scale=True, rd_ratio=1/8, rd_channels=None, rd_divisor=8, **kwargs): + super(NonLocalAttn, self).__init__() + if rd_channels is None: + rd_channels = make_divisible(in_channels * rd_ratio, divisor=rd_divisor) + self.scale = in_channels ** -0.5 if use_scale else 1.0 + self.t = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) + self.p = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) + self.g = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) + self.z = nn.Conv2d(rd_channels, in_channels, kernel_size=1, stride=1, bias=True) + self.norm = nn.BatchNorm2d(in_channels) + self.reset_parameters() + + def forward(self, x): + shortcut = x + + t = self.t(x) + p = self.p(x) + g = self.g(x) + + B, C, H, W = t.size() + t = t.view(B, C, -1).permute(0, 2, 1) + p = p.view(B, C, -1) + g = g.view(B, C, -1).permute(0, 2, 1) + + att = torch.bmm(t, p) * self.scale + att = F.softmax(att, dim=2) + x = torch.bmm(att, g) + + x = x.permute(0, 2, 1).reshape(B, C, H, W) + x = self.z(x) + x = self.norm(x) + shortcut + + return x + + def reset_parameters(self): + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + if len(list(m.parameters())) > 1: + nn.init.constant_(m.bias, 0.0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 0) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.GroupNorm): + nn.init.constant_(m.weight, 0) + nn.init.constant_(m.bias, 0) + + +class BilinearAttnTransform(nn.Module): + + def __init__(self, in_channels, block_size, groups, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): + super(BilinearAttnTransform, self).__init__() + + self.conv1 = ConvBnAct(in_channels, groups, 1, act_layer=act_layer, norm_layer=norm_layer) + self.conv_p = nn.Conv2d(groups, block_size * block_size * groups, kernel_size=(block_size, 1)) + self.conv_q = nn.Conv2d(groups, block_size * block_size * groups, kernel_size=(1, block_size)) + self.conv2 = ConvBnAct(in_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer) + self.block_size = block_size + self.groups = groups + self.in_channels = in_channels + + def resize_mat(self, x, t): + B, C, block_size, block_size1 = x.shape + assert block_size == block_size1 + if t <= 1: + return x + x = x.view(B * C, -1, 1, 1) + x = x * torch.eye(t, t, dtype=x.dtype, device=x.device) + x = x.view(B * C, block_size, block_size, t, t) + x = torch.cat(torch.split(x, 1, dim=1), dim=3) + x = torch.cat(torch.split(x, 1, dim=2), dim=4) + x = x.view(B, C, block_size * t, block_size * t) + return x + + def forward(self, x): + assert x.shape[-1] % self.block_size == 0 and x.shape[-2] % self.block_size == 0 + B, C, H, W = x.shape + out = self.conv1(x) + rp = F.adaptive_max_pool2d(out, (self.block_size, 1)) + cp = F.adaptive_max_pool2d(out, (1, self.block_size)) + p = self.conv_p(rp).view(B, self.groups, self.block_size, self.block_size) + q = self.conv_q(cp).view(B, self.groups, self.block_size, self.block_size) + p = F.sigmoid(p) + q = F.sigmoid(q) + p = p / p.sum(dim=3, keepdim=True) + q = q / q.sum(dim=2, keepdim=True) + p = p.view(B, self.groups, 1, self.block_size, self.block_size).expand(x.size( + 0), self.groups, C // self.groups, self.block_size, self.block_size).contiguous() + p = p.view(B, C, self.block_size, self.block_size) + q = q.view(B, self.groups, 1, self.block_size, self.block_size).expand(x.size( + 0), self.groups, C // self.groups, 
self.block_size, self.block_size).contiguous() + q = q.view(B, C, self.block_size, self.block_size) + p = self.resize_mat(p, H // self.block_size) + q = self.resize_mat(q, W // self.block_size) + y = p.matmul(x) + y = y.matmul(q) + + y = self.conv2(y) + return y + + +class BatNonLocalAttn(nn.Module): + """ BAT + Adapted from: https://github.com/BA-Transform/BAT-Image-Classification + """ + + def __init__( + self, in_channels, block_size=7, groups=2, rd_ratio=0.25, rd_channels=None, rd_divisor=8, + drop_rate=0.2, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, **_): + super().__init__() + if rd_channels is None: + rd_channels = make_divisible(in_channels * rd_ratio, divisor=rd_divisor) + self.conv1 = ConvBnAct(in_channels, rd_channels, 1, act_layer=act_layer, norm_layer=norm_layer) + self.ba = BilinearAttnTransform(rd_channels, block_size, groups, act_layer=act_layer, norm_layer=norm_layer) + self.conv2 = ConvBnAct(rd_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer) + self.dropout = nn.Dropout2d(p=drop_rate) + + def forward(self, x): + xl = self.conv1(x) + y = self.ba(xl) + y = self.conv2(y) + y = self.dropout(y) + return y + x diff --git a/timm/models/layers/selective_kernel.py b/timm/models/layers/selective_kernel.py index 10bfd0e0..246f72a6 100644 --- a/timm/models/layers/selective_kernel.py +++ b/timm/models/layers/selective_kernel.py @@ -8,6 +8,7 @@ import torch from torch import nn as nn from .conv_bn_act import ConvBnAct +from .helpers import make_divisible def _kernel_valid(k): @@ -45,10 +46,10 @@ class SelectiveKernelAttn(nn.Module): return x -class SelectiveKernelConv(nn.Module): +class SelectiveKernel(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=None, stride=1, dilation=1, groups=1, - attn_reduction=16, min_attn_channels=32, keep_3x3=True, split_input=False, + def __init__(self, in_channels, out_channels=None, kernel_size=None, stride=1, dilation=1, groups=1, + rd_ratio=1./16, rd_channels=None, min_rd_channels=16, rd_divisor=8, keep_3x3=True, split_input=True, drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None): """ Selective Kernel Convolution Module @@ -66,8 +67,8 @@ class SelectiveKernelConv(nn.Module): stride (int): stride for convolutions dilation (int): dilation for module as a whole, impacts dilation of each branch groups (int): number of groups for each branch - attn_reduction (int, float): reduction factor for attention features - min_attn_channels (int): minimum attention feature channels + rd_ratio (int, float): reduction factor for attention features + min_rd_channels (int): minimum attention feature channels keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations split_input (bool): split input channels evenly across each convolution branch, keeps param count lower, can be viewed as grouping by path, output expands to module out_channels count @@ -75,7 +76,8 @@ class SelectiveKernelConv(nn.Module): act_layer (nn.Module): activation layer to use norm_layer (nn.Module): batchnorm/norm layer to use """ - super(SelectiveKernelConv, self).__init__() + super(SelectiveKernel, self).__init__() + out_channels = out_channels or in_channels kernel_size = kernel_size or [3, 5] # default to one 3x3 and one 5x5 branch. 
5x5 -> 3x3 + dilation _kernel_valid(kernel_size) if not isinstance(kernel_size, list): @@ -101,7 +103,8 @@ class SelectiveKernelConv(nn.Module): ConvBnAct(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs) for k, d in zip(kernel_size, dilation)]) - attn_channels = max(int(out_channels / attn_reduction), min_attn_channels) + attn_channels = rd_channels or make_divisible( + out_channels * rd_ratio, min_value=min_rd_channels, divisor=rd_divisor) self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels) self.drop_block = drop_block diff --git a/timm/models/layers/split_attn.py b/timm/models/layers/split_attn.py index 5615aa0b..dde601be 100644 --- a/timm/models/layers/split_attn.py +++ b/timm/models/layers/split_attn.py @@ -10,6 +10,8 @@ import torch import torch.nn.functional as F from torch import nn +from .helpers import make_divisible + class RadixSoftmax(nn.Module): def __init__(self, radix, cardinality): @@ -28,41 +30,37 @@ class RadixSoftmax(nn.Module): return x -class SplitAttnConv2d(nn.Module): - """Split-Attention Conv2d +class SplitAttn(nn.Module): + """Split-Attention (aka Splat) """ - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, - dilation=1, groups=1, bias=False, radix=2, reduction_factor=4, + def __init__(self, in_channels, out_channels=None, kernel_size=3, stride=1, padding=None, + dilation=1, groups=1, bias=False, radix=2, rd_ratio=0.25, rd_channels=None, rd_divisor=8, act_layer=nn.ReLU, norm_layer=None, drop_block=None, **kwargs): - super(SplitAttnConv2d, self).__init__() + super(SplitAttn, self).__init__() + out_channels = out_channels or in_channels self.radix = radix self.drop_block = drop_block mid_chs = out_channels * radix - attn_chs = max(in_channels * radix // reduction_factor, 32) + if rd_channels is None: + attn_chs = make_divisible(in_channels * radix * rd_ratio, min_value=32, divisor=rd_divisor) + else: + attn_chs = rd_channels * radix + padding = kernel_size // 2 if padding is None else padding self.conv = nn.Conv2d( in_channels, mid_chs, kernel_size, stride, padding, dilation, groups=groups * radix, bias=bias, **kwargs) - self.bn0 = norm_layer(mid_chs) if norm_layer is not None else None + self.bn0 = norm_layer(mid_chs) if norm_layer else nn.Identity() self.act0 = act_layer(inplace=True) self.fc1 = nn.Conv2d(out_channels, attn_chs, 1, groups=groups) - self.bn1 = norm_layer(attn_chs) if norm_layer is not None else None + self.bn1 = norm_layer(attn_chs) if norm_layer else nn.Identity() self.act1 = act_layer(inplace=True) self.fc2 = nn.Conv2d(attn_chs, mid_chs, 1, groups=groups) self.rsoftmax = RadixSoftmax(radix, groups) - @property - def in_channels(self): - return self.conv.in_channels - - @property - def out_channels(self): - return self.fc1.out_channels - def forward(self, x): x = self.conv(x) - if self.bn0 is not None: - x = self.bn0(x) + x = self.bn0(x) if self.drop_block is not None: x = self.drop_block(x) x = self.act0(x) @@ -73,10 +71,9 @@ class SplitAttnConv2d(nn.Module): x_gap = x.sum(dim=1) else: x_gap = x - x_gap = F.adaptive_avg_pool2d(x_gap, 1) + x_gap = x_gap.mean((2, 3), keepdim=True) x_gap = self.fc1(x_gap) - if self.bn1 is not None: - x_gap = self.bn1(x_gap) + x_gap = self.bn1(x_gap) x_gap = self.act1(x_gap) x_attn = self.fc2(x_gap) diff --git a/timm/models/layers/squeeze_excite.py b/timm/models/layers/squeeze_excite.py index 3e8a05bb..e5da29ef 100644 --- a/timm/models/layers/squeeze_excite.py +++ b/timm/models/layers/squeeze_excite.py @@ -56,7 +56,7 @@ class 
EffectiveSEModule(nn.Module): """ 'Effective Squeeze-Excitation From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 """ - def __init__(self, channels, add_maxpool=False, gate_layer='hard_sigmoid'): + def __init__(self, channels, add_maxpool=False, gate_layer='hard_sigmoid', **_): super(EffectiveSEModule, self).__init__() self.add_maxpool = add_maxpool self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0) diff --git a/timm/models/resnest.py b/timm/models/resnest.py index ac3b2559..31eebd80 100644 --- a/timm/models/resnest.py +++ b/timm/models/resnest.py @@ -11,7 +11,7 @@ from torch import nn from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .helpers import build_model_with_cfg -from .layers import SplitAttnConv2d +from .layers import SplitAttn from .registry import register_model from .resnet import ResNet @@ -83,11 +83,11 @@ class ResNestBottleneck(nn.Module): self.avd_first = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 and avd_first else None if self.radix >= 1: - self.conv2 = SplitAttnConv2d( + self.conv2 = SplitAttn( group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation, dilation=first_dilation, groups=cardinality, radix=radix, norm_layer=norm_layer, drop_block=drop_block) - self.bn2 = None # FIXME revisit, here to satisfy current torchscript fussyness - self.act2 = None + self.bn2 = nn.Identity() + self.act2 = nn.Identity() else: self.conv2 = nn.Conv2d( group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation, @@ -117,11 +117,10 @@ class ResNestBottleneck(nn.Module): out = self.avd_first(out) out = self.conv2(out) - if self.bn2 is not None: - out = self.bn2(out) - if self.drop_block is not None: - out = self.drop_block(out) - out = self.act2(out) + out = self.bn2(out) + if self.drop_block is not None: + out = self.drop_block(out) + out = self.act2(out) if self.avd_last is not None: out = self.avd_last(out) diff --git a/timm/models/sknet.py b/timm/models/sknet.py index eb7ad8c3..82ca5bfe 100644 --- a/timm/models/sknet.py +++ b/timm/models/sknet.py @@ -14,7 +14,7 @@ from torch import nn as nn from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .helpers import build_model_with_cfg -from .layers import SelectiveKernelConv, ConvBnAct, create_attn +from .layers import SelectiveKernel, ConvBnAct, create_attn from .registry import register_model from .resnet import ResNet @@ -59,7 +59,7 @@ class SelectiveKernelBasic(nn.Module): outplanes = planes * self.expansion first_dilation = first_dilation or dilation - self.conv1 = SelectiveKernelConv( + self.conv1 = SelectiveKernel( inplanes, first_planes, stride=stride, dilation=first_dilation, **conv_kwargs, **sk_kwargs) conv_kwargs['act_layer'] = None self.conv2 = ConvBnAct( @@ -107,7 +107,7 @@ class SelectiveKernelBottleneck(nn.Module): first_dilation = first_dilation or dilation self.conv1 = ConvBnAct(inplanes, first_planes, kernel_size=1, **conv_kwargs) - self.conv2 = SelectiveKernelConv( + self.conv2 = SelectiveKernel( first_planes, width, stride=stride, dilation=first_dilation, groups=cardinality, **conv_kwargs, **sk_kwargs) conv_kwargs['act_layer'] = None @@ -153,10 +153,7 @@ def skresnet18(pretrained=False, **kwargs): Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this variation splits the input channels to the selective convolutions to keep param count down. 
""" - sk_kwargs = dict( - min_attn_channels=16, - attn_reduction=8, - split_input=True) + sk_kwargs = dict(min_rd_channels=16, rd_ratio=1/8, split_input=True) model_args = dict( block=SelectiveKernelBasic, layers=[2, 2, 2, 2], block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs) @@ -170,10 +167,7 @@ def skresnet34(pretrained=False, **kwargs): Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this variation splits the input channels to the selective convolutions to keep param count down. """ - sk_kwargs = dict( - min_attn_channels=16, - attn_reduction=8, - split_input=True) + sk_kwargs = dict(min_rd_channels=16, rd_ratio=1/8, split_input=True) model_args = dict( block=SelectiveKernelBasic, layers=[3, 4, 6, 3], block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs) From a27f4aec4aaa22c6a6e82c7d8a9a69d73176525e Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 31 May 2021 14:06:34 -0700 Subject: [PATCH 10/12] Missed args for skresnext w/ refactoring. --- timm/models/layers/selective_kernel.py | 2 +- timm/models/sknet.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/timm/models/layers/selective_kernel.py b/timm/models/layers/selective_kernel.py index 246f72a6..bf7df4d2 100644 --- a/timm/models/layers/selective_kernel.py +++ b/timm/models/layers/selective_kernel.py @@ -49,7 +49,7 @@ class SelectiveKernelAttn(nn.Module): class SelectiveKernel(nn.Module): def __init__(self, in_channels, out_channels=None, kernel_size=None, stride=1, dilation=1, groups=1, - rd_ratio=1./16, rd_channels=None, min_rd_channels=16, rd_divisor=8, keep_3x3=True, split_input=True, + rd_ratio=1./16, rd_channels=None, min_rd_channels=32, rd_divisor=8, keep_3x3=True, split_input=True, drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None): """ Selective Kernel Convolution Module diff --git a/timm/models/sknet.py b/timm/models/sknet.py index 82ca5bfe..bba8bcf9 100644 --- a/timm/models/sknet.py +++ b/timm/models/sknet.py @@ -207,8 +207,9 @@ def skresnext50_32x4d(pretrained=False, **kwargs): """Constructs a Select Kernel ResNeXt50-32x4d model. This should be equivalent to the SKNet-50 model in the Select Kernel Paper """ + sk_kwargs = dict(min_rd_channels=32, rd_ratio=1/16, split_input=False) model_args = dict( block=SelectiveKernelBottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, - zero_init_last_bn=False, **kwargs) + block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs) return _create_skresnet('skresnext50_32x4d', pretrained, **model_args) From bda8ab015ac5ee0ec75b8c59d5c0c3b399abda94 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 31 May 2021 15:38:56 -0700 Subject: [PATCH 11/12] Remove min channels for SelectiveKernel, divisor should cover cases well enough. 
--- timm/models/layers/selective_kernel.py | 6 ++---- timm/models/sknet.py | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/timm/models/layers/selective_kernel.py b/timm/models/layers/selective_kernel.py index bf7df4d2..f28b8d2e 100644 --- a/timm/models/layers/selective_kernel.py +++ b/timm/models/layers/selective_kernel.py @@ -49,7 +49,7 @@ class SelectiveKernelAttn(nn.Module): class SelectiveKernel(nn.Module): def __init__(self, in_channels, out_channels=None, kernel_size=None, stride=1, dilation=1, groups=1, - rd_ratio=1./16, rd_channels=None, min_rd_channels=32, rd_divisor=8, keep_3x3=True, split_input=True, + rd_ratio=1./16, rd_channels=None, rd_divisor=8, keep_3x3=True, split_input=True, drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None): """ Selective Kernel Convolution Module @@ -68,7 +68,6 @@ class SelectiveKernel(nn.Module): dilation (int): dilation for module as a whole, impacts dilation of each branch groups (int): number of groups for each branch rd_ratio (int, float): reduction factor for attention features - min_rd_channels (int): minimum attention feature channels keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations split_input (bool): split input channels evenly across each convolution branch, keeps param count lower, can be viewed as grouping by path, output expands to module out_channels count @@ -103,8 +102,7 @@ class SelectiveKernel(nn.Module): ConvBnAct(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs) for k, d in zip(kernel_size, dilation)]) - attn_channels = rd_channels or make_divisible( - out_channels * rd_ratio, min_value=min_rd_channels, divisor=rd_divisor) + attn_channels = rd_channels or make_divisible(out_channels * rd_ratio, divisor=rd_divisor) self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels) self.drop_block = drop_block diff --git a/timm/models/sknet.py b/timm/models/sknet.py index bba8bcf9..4dc2aa53 100644 --- a/timm/models/sknet.py +++ b/timm/models/sknet.py @@ -153,7 +153,7 @@ def skresnet18(pretrained=False, **kwargs): Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this variation splits the input channels to the selective convolutions to keep param count down. """ - sk_kwargs = dict(min_rd_channels=16, rd_ratio=1/8, split_input=True) + sk_kwargs = dict(rd_ratio=1 / 8, rd_divisor=16, split_input=True) model_args = dict( block=SelectiveKernelBasic, layers=[2, 2, 2, 2], block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs) @@ -167,7 +167,7 @@ def skresnet34(pretrained=False, **kwargs): Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this variation splits the input channels to the selective convolutions to keep param count down. """ - sk_kwargs = dict(min_rd_channels=16, rd_ratio=1/8, split_input=True) + sk_kwargs = dict(rd_ratio=1 / 8, rd_divisor=16, split_input=True) model_args = dict( block=SelectiveKernelBasic, layers=[3, 4, 6, 3], block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs) @@ -207,7 +207,7 @@ def skresnext50_32x4d(pretrained=False, **kwargs): """Constructs a Select Kernel ResNeXt50-32x4d model. 
This should be equivalent to the SKNet-50 model in the Select Kernel Paper """ - sk_kwargs = dict(min_rd_channels=32, rd_ratio=1/16, split_input=False) + sk_kwargs = dict(rd_ratio=1/16, rd_divisor=32, split_input=False) model_args = dict( block=SelectiveKernelBottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs) From 02320c3e3d217c90860e2576f1d644fdac09c09b Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 31 May 2021 15:41:51 -0700 Subject: [PATCH 12/12] Bump version to 0.4.11 --- timm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timm/version.py b/timm/version.py index b94cbb01..d4f33464 100644 --- a/timm/version.py +++ b/timm/version.py @@ -1 +1 @@ -__version__ = '0.4.10' +__version__ = '0.4.11'
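
With the series applied, the experimental attention variants and the merged factory can be exercised with a quick smoke test along these lines (a sketch assuming this tree, i.e. timm 0.4.11, is installed; the experimental configs ship without pretrained weights):

import torch
import timm
from timm.models.layers import get_attn

model = timm.create_model('eca_efficientnet_b0', pretrained=False).eval()
with torch.no_grad():
    print(model(torch.randn(1, 3, 224, 224)).shape)   # torch.Size([1, 1000])

# the single factory now resolves both channel-attn and self-attn style modules by name
print(get_attn('eca'))    # EcaModule
print(get_attn('gc'))     # GlobalContext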