Adding support to EfficientNet / MobileNetV3 to for different attention layers in .se position

2025-06-03 15:01:08 +08:00 · 2020-02-24 18:07:09 -08:00 · 2020-02-24 18:07:09 -08:00 · 67e759f710
commit 67e759f710
parent 27b3680d49
12 changed files with 334 additions and 149 deletions
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@ -94,6 +94,14 @@ default_cfgs = {
        url='', input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954),
    'efficientnet_l2': _cfg(
        url='', input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.961),
+    'efficientnet_eca_b0': _cfg(
+        url=''),
+    'efficientnet_eca_b1': _cfg(
+        url='',
+        input_size=(3, 240, 240), pool_size=(8, 8)),
+    'efficientnet_eca_b2': _cfg(
+        url='',
+        input_size=(3, 260, 260), pool_size=(9, 9)),
    'efficientnet_es': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth'),
    'efficientnet_em': _cfg(
@ -254,7 +262,7 @@ class EfficientNet(nn.Module):
    def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32,
                 channel_multiplier=1.0, channel_divisor=8, channel_min=None,
                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
-                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
+                 attn_layer=None, attn_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
        super(EfficientNet, self).__init__()
        norm_kwargs = norm_kwargs or {}

@ -272,8 +280,8 @@ class EfficientNet(nn.Module):

        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
-            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG)
+            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer,
+            attn_layer, attn_kwargs, norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features
        self._in_chs = builder.in_chs
@ -334,7 +342,7 @@ class EfficientNetFeatures(nn.Module):
    def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                 in_chans=3, stem_size=32, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
-                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None):
+                 attn_layer=None, attn_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None):
        super(EfficientNetFeatures, self).__init__()
        norm_kwargs = norm_kwargs or {}

@ -354,8 +362,8 @@ class EfficientNetFeatures(nn.Module):

        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
-            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG)
+            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, attn_layer,
+            attn_kwargs, norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features  # builder provides info about feature channels for each block
        self._in_chs = builder.in_chs
@ -627,13 +635,13 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre

    """
    arch_def = [
-        ['ds_r1_k3_s1_e1_c16_se0.25'],
-        ['ir_r2_k3_s2_e6_c24_se0.25'],
-        ['ir_r2_k5_s2_e6_c40_se0.25'],
-        ['ir_r3_k3_s2_e6_c80_se0.25'],
-        ['ir_r3_k5_s1_e6_c112_se0.25'],
-        ['ir_r4_k5_s2_e6_c192_se0.25'],
-        ['ir_r1_k3_s1_e6_c320_se0.25'],
+        ['ds_r1_k3_s1_e1_c16'],
+        ['ir_r2_k3_s2_e6_c24'],
+        ['ir_r2_k5_s2_e6_c40'],
+        ['ir_r3_k3_s2_e6_c80'],
+        ['ir_r3_k5_s1_e6_c112'],
+        ['ir_r4_k5_s2_e6_c192'],
+        ['ir_r1_k3_s1_e6_c320'],
    ]
    model_kwargs = dict(
        block_args=decode_arch_def(arch_def, depth_multiplier),
@ -641,6 +649,8 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre
        stem_size=32,
        channel_multiplier=channel_multiplier,
        act_layer=Swish,
+        attn_layer='sev2',
+        attn_kwargs=dict(se_ratio=0.25),
        norm_kwargs=resolve_bn_args(kwargs),
        **kwargs,
    )
@ -707,6 +717,53 @@ def _gen_efficientnet_condconv(
    return model


+def _gen_efficientnet_eca(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates an EfficientNet model w/ ECA attention instead of SE.
+
+    Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
+    Paper: https://arxiv.org/abs/1905.11946
+
+    EfficientNet params
+    name: (channel_multiplier, depth_multiplier, resolution, dropout_rate)
+    'efficientnet-b0': (1.0, 1.0, 224, 0.2),
+    'efficientnet-b1': (1.0, 1.1, 240, 0.2),
+    'efficientnet-b2': (1.1, 1.2, 260, 0.3),
+    'efficientnet-b3': (1.2, 1.4, 300, 0.3),
+    'efficientnet-b4': (1.4, 1.8, 380, 0.4),
+    'efficientnet-b5': (1.6, 2.2, 456, 0.4),
+    'efficientnet-b6': (1.8, 2.6, 528, 0.5),
+    'efficientnet-b7': (2.0, 3.1, 600, 0.5),
+    'efficientnet-b8': (2.2, 3.6, 672, 0.5),
+    'efficientnet-l2': (4.3, 5.3, 800, 0.5),
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer
+      depth_multiplier: multiplier to number of repeats per stage
+
+    """
+    arch_def = [
+        ['ds_r1_k3_s1_e1_c16'],
+        ['ir_r2_k3_s2_e6_c24'],
+        ['ir_r2_k5_s2_e6_c40'],
+        ['ir_r3_k3_s2_e6_c80'],
+        ['ir_r3_k5_s1_e6_c112'],
+        ['ir_r4_k5_s2_e6_c192'],
+        ['ir_r1_k3_s1_e6_c320'],
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def, depth_multiplier),
+        num_features=round_channels(1280, channel_multiplier, 8, None),
+        stem_size=32,
+        channel_multiplier=channel_multiplier,
+        act_layer=Swish,
+        attn_layer='eca',
+        norm_kwargs=resolve_bn_args(kwargs),
+        **kwargs,
+    )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
+    return model
+
+
 def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
    """Creates a MixNet Small model.

@ -980,6 +1037,33 @@ def efficientnet_l2(pretrained=False, **kwargs):
    return model


+@register_model
+def efficientnet_eca_b0(pretrained=False, **kwargs):
+    """ EfficientNet-ECA-B0 """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet_eca(
+        'efficientnet_eca_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_eca_b1(pretrained=False, **kwargs):
+    """ EfficientNet-ECA-B1 """
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
+    model = _gen_efficientnet_eca(
+        'efficientnet_eca_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_eca_b2(pretrained=False, **kwargs):
+    """ EfficientNet-ECA-B2 """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet_eca(
+        'efficientnet_eca_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+    return model
+
+
@register_model
 def efficientnet_es(pretrained=False, **kwargs):
    """ EfficientNet-Edge Small. """
--- a/timm/models/efficientnet_blocks.py
+++ b/timm/models/efficientnet_blocks.py
@ -1,8 +1,7 @@
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
-from .layers.activations import sigmoid
-from .layers import create_conv2d, drop_path
+from .layers import create_conv2d, create_attn, drop_path


 # Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per
@ -30,26 +29,21 @@ def resolve_bn_args(kwargs):
    return bn_args


-_SE_ARGS_DEFAULT = dict(
-    gate_fn=sigmoid,
-    act_layer=None,
-    reduce_mid=False,
-    divisor=1)
-
-
-def resolve_se_args(kwargs, in_chs, act_layer=None):
-    se_kwargs = kwargs.copy() if kwargs is not None else {}
-    # fill in args that aren't specified with the defaults
-    for k, v in _SE_ARGS_DEFAULT.items():
-        se_kwargs.setdefault(k, v)
-    # some models, like MobilNetV3, calculate SE reduction chs from the containing block's mid_ch instead of in_ch
-    if not se_kwargs.pop('reduce_mid'):
-        se_kwargs['reduced_base_chs'] = in_chs
-    # act_layer override, if it remains None, the containing block's act_layer will be used
-    if se_kwargs['act_layer'] is None:
-        assert act_layer is not None
-        se_kwargs['act_layer'] = act_layer
-    return se_kwargs
+def resolve_attn_args(layer, kwargs, in_chs, act_layer=None):
+    attn_kwargs = kwargs.copy() if kwargs is not None else {}
+    if isinstance(layer, nn.Module):
+        is_se = 'SqueezeExciteV2' in layer.__name__
+    else:
+        is_se = layer == 'sev2'
+    if is_se:
+        # some models, like MobilNetV3, calculate SE reduction chs from the containing block's mid_ch instead of in_ch
+        if not attn_kwargs.pop('reduce_mid', False):
+            attn_kwargs['reduced_base_chs'] = in_chs
+        # if act_layer it is not defined by attn kwargs, the containing block's act_layer will be used for attn
+        if attn_kwargs.get('act_layer', None) is None:
+            assert act_layer is not None
+            attn_kwargs['act_layer'] = act_layer
+    return attn_kwargs


 def make_divisible(v, divisor=8, min_value=None):
@ -90,26 +84,6 @@ class ChannelShuffle(nn.Module):
        )


-class SqueezeExcite(nn.Module):
-    def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None,
-                 act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1, **_):
-        super(SqueezeExcite, self).__init__()
-        self.gate_fn = gate_fn
-        reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor)
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
-        self.act1 = act_layer(inplace=True)
-        self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
-
-    def forward(self, x):
-        x_se = self.avg_pool(x)
-        x_se = self.conv_reduce(x_se)
-        x_se = self.act1(x_se)
-        x_se = self.conv_expand(x_se)
-        x = x * self.gate_fn(x_se)
-        return x
-
-
 class ConvBnAct(nn.Module):
    def __init__(self, in_chs, out_chs, kernel_size,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU,
@ -140,11 +114,10 @@ class DepthwiseSeparableConv(nn.Module):
    """
    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
-                 pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None,
+                 pw_kernel_size=1, pw_act=False, attn_layer=None, attn_kwargs=None,
                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0.):
        super(DepthwiseSeparableConv, self).__init__()
        norm_kwargs = norm_kwargs or {}
-        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
        self.has_pw_act = pw_act  # activation after point-wise conv
        self.drop_path_rate = drop_path_rate
@ -154,10 +127,10 @@ class DepthwiseSeparableConv(nn.Module):
        self.bn1 = norm_layer(in_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)

-        # Squeeze-and-excitation
-        if has_se:
-            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
-            self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs)
+        # Attention block (Squeeze-Excitation, ECA, etc)
+        if attn_layer is not None:
+            attn_kwargs = resolve_attn_args(attn_layer, attn_kwargs, in_chs, act_layer)
+            self.se = create_attn(attn_layer, in_chs, **attn_kwargs)
        else:
            self.se = None

@ -199,13 +172,12 @@ class InvertedResidual(nn.Module):
    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
-                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+                 attn_layer=None, attn_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
                 conv_kwargs=None, drop_path_rate=0.):
        super(InvertedResidual, self).__init__()
        norm_kwargs = norm_kwargs or {}
        conv_kwargs = conv_kwargs or {}
        mid_chs = make_divisible(in_chs * exp_ratio)
-        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
        self.drop_path_rate = drop_path_rate

@ -221,10 +193,10 @@ class InvertedResidual(nn.Module):
        self.bn2 = norm_layer(mid_chs, **norm_kwargs)
        self.act2 = act_layer(inplace=True)

-        # Squeeze-and-excitation
-        if has_se:
-            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
-            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+        # Attention block (Squeeze-Excitation, ECA, etc)
+        if attn_layer is not None:
+            attn_kwargs = resolve_attn_args(attn_layer, attn_kwargs, in_chs, act_layer)
+            self.se = create_attn(attn_layer, mid_chs, **attn_kwargs)
        else:
            self.se = None

@ -256,7 +228,7 @@ class InvertedResidual(nn.Module):
        x = self.bn2(x)
        x = self.act2(x)

-        # Squeeze-and-excitation
+        # Attention
        if self.se is not None:
            x = self.se(x)

@ -278,7 +250,7 @@ class CondConvResidual(InvertedResidual):
    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
-                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+                 attn_layer=None, attn_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
                 num_experts=0, drop_path_rate=0.):

        self.num_experts = num_experts
@ -287,7 +259,7 @@ class CondConvResidual(InvertedResidual):
        super(CondConvResidual, self).__init__(
            in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation, pad_type=pad_type,
            act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size,
-            pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs,
+            pw_kernel_size=pw_kernel_size, attn_layer=attn_layer, attn_kwargs=attn_kwargs,
            norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs,
            drop_path_rate=drop_path_rate)

@ -310,7 +282,7 @@ class CondConvResidual(InvertedResidual):
        x = self.bn2(x)
        x = self.act2(x)

-        # Squeeze-and-excitation
+        # Attention
        if self.se is not None:
            x = self.se(x)

@ -330,7 +302,7 @@ class EdgeResidual(nn.Module):

    def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1,
-                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+                 attn_layer=None, attn_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
                 drop_path_rate=0.):
        super(EdgeResidual, self).__init__()
        norm_kwargs = norm_kwargs or {}
@ -338,7 +310,6 @@ class EdgeResidual(nn.Module):
            mid_chs = make_divisible(fake_in_chs * exp_ratio)
        else:
            mid_chs = make_divisible(in_chs * exp_ratio)
-        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
        self.drop_path_rate = drop_path_rate

@ -347,10 +318,10 @@ class EdgeResidual(nn.Module):
        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)

-        # Squeeze-and-excitation
-        if has_se:
-            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
-            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+        # Attention block (Squeeze-Excitation, ECA, etc)
+        if attn_layer is not None:
+            attn_kwargs = resolve_attn_args(attn_layer, attn_kwargs, in_chs, act_layer)
+            self.se = create_attn(attn_layer, mid_chs, **attn_kwargs)
        else:
            self.se = None

@ -378,7 +349,7 @@ class EdgeResidual(nn.Module):
        x = self.bn1(x)
        x = self.act1(x)

-        # Squeeze-and-excitation
+        # Attention
        if self.se is not None:
            x = self.se(x)

--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@ -79,6 +79,13 @@ def _decode_block_str(block_str):
    exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1
    pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1
    fake_in_chs = int(options['fc']) if 'fc' in options else 0  # FIXME hack to deal with in_chs issue in TPU def
+    attn_layer = None
+    attn_kwargs = None
+    if 'se' in options:
+        attn_layer = 'sev2'
+        attn_kwargs = dict(se_ratio=float(options['se']))
+    elif 'eca' in options:
+        attn_layer = 'eca'

    num_repeat = int(options['r'])
    # each type of block has different valid arguments, fill accordingly
@ -90,7 +97,8 @@ def _decode_block_str(block_str):
            pw_kernel_size=pw_kernel_size,
            out_chs=int(options['c']),
            exp_ratio=float(options['e']),
-            se_ratio=float(options['se']) if 'se' in options else None,
+            attn_layer=attn_layer,
+            attn_kwargs=attn_kwargs,
            stride=int(options['s']),
            act_layer=act_layer,
            noskip=noskip,
@ -103,7 +111,8 @@ def _decode_block_str(block_str):
            dw_kernel_size=_parse_ksize(options['k']),
            pw_kernel_size=pw_kernel_size,
            out_chs=int(options['c']),
-            se_ratio=float(options['se']) if 'se' in options else None,
+            attn_layer=attn_layer,
+            attn_kwargs=attn_kwargs,
            stride=int(options['s']),
            act_layer=act_layer,
            pw_act=block_type == 'dsa',
@ -117,7 +126,8 @@ def _decode_block_str(block_str):
            out_chs=int(options['c']),
            exp_ratio=float(options['e']),
            fake_in_chs=fake_in_chs,
-            se_ratio=float(options['se']) if 'se' in options else None,
+            attn_layer=attn_layer,
+            attn_kwargs=attn_kwargs,
            stride=int(options['s']),
            act_layer=act_layer,
            noskip=noskip,
@ -201,7 +211,7 @@ class EfficientNetBuilder:

    """
    def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 output_stride=32, pad_type='', act_layer=None, se_kwargs=None,
+                 output_stride=32, pad_type='', act_layer=None, attn_layer=None, attn_kwargs=None,
                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0., feature_location='',
                 verbose=False):
        self.channel_multiplier = channel_multiplier
@ -210,7 +220,8 @@ class EfficientNetBuilder:
        self.output_stride = output_stride
        self.pad_type = pad_type
        self.act_layer = act_layer
-        self.se_kwargs = se_kwargs
+        self.attn_layer = attn_layer
+        self.attn_kwargs = attn_kwargs
        self.norm_layer = norm_layer
        self.norm_kwargs = norm_kwargs
        self.drop_path_rate = drop_path_rate
@ -239,9 +250,19 @@ class EfficientNetBuilder:
        # block act fn overrides the model default
        ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
        assert ba['act_layer'] is not None
+        if 'attn_layer' in ba:
+            assert'attn_kwargs' in ba  # block args should have both or neither
+            # per-block attn layer overrides model default
+            ba['attn_layer'] = ba['attn_layer'] if ba['attn_layer'] is not None else self.attn_layer
+            if self.attn_kwargs is not None:
+                # merge per-block attn kwargs with model if both exist
+                if ba['attn_kwargs'] is None:
+                    ba['attn_kwargs'] = self.attn_kwargs
+                else:
+                    ba['attn_kwargs'].update(self.attn_kwargs)
+
        if bt == 'ir':
            ba['drop_path_rate'] = drop_path_rate
-            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  InvertedResidual {}, Args: {}'.format(block_idx, str(ba)))
            if ba.get('num_experts', 0) > 0:
@ -250,13 +271,11 @@ class EfficientNetBuilder:
                block = InvertedResidual(**ba)
        elif bt == 'ds' or bt == 'dsa':
            ba['drop_path_rate'] = drop_path_rate
-            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)))
            block = DepthwiseSeparableConv(**ba)
        elif bt == 'er':
            ba['drop_path_rate'] = drop_path_rate
-            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  EdgeResidual {}, Args: {}'.format(block_idx, str(ba)))
            block = EdgeResidual(**ba)
--- a/timm/models/gluon_resnet.py
+++ b/timm/models/gluon_resnet.py
@ -11,7 +11,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .layers import SEModule
+from .layers import SqueezeExcite
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

 from .resnet import ResNet, Bottleneck, BasicBlock
@ -321,7 +321,7 @@ def gluon_seresnext50_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kw
    default_cfg = default_cfgs['gluon_seresnext50_32x4d']
    model = ResNet(
        Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4,
-        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer=SEModule), **kwargs)
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer=SqueezeExcite), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -335,7 +335,7 @@ def gluon_seresnext101_32x4d(pretrained=False, num_classes=1000, in_chans=3, **k
    default_cfg = default_cfgs['gluon_seresnext101_32x4d']
    model = ResNet(
        Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4,
-        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer=SEModule), **kwargs)
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer=SqueezeExcite), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -347,7 +347,7 @@ def gluon_seresnext101_64x4d(pretrained=False, num_classes=1000, in_chans=3, **k
    """Constructs a SEResNeXt-101-64x4d model.
    """
    default_cfg = default_cfgs['gluon_seresnext101_64x4d']
-    block_args = dict(attn_layer=SEModule)
+    block_args = dict(attn_layer=SqueezeExcite)
    model = ResNet(
        Bottleneck, [3, 4, 23, 3], cardinality=64, base_width=4,
        num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
@ -362,7 +362,7 @@ def gluon_senet154(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs an SENet-154 model.
    """
    default_cfg = default_cfgs['gluon_senet154']
-    block_args = dict(attn_layer=SEModule)
+    block_args = dict(attn_layer=SqueezeExcite)
    model = ResNet(
        Bottleneck, [3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', down_kernel_size=3,
        block_reduce_first=2, num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
--- a/timm/models/layers/init.py
+++ b/timm/models/layers/init.py
@ -7,8 +7,8 @@ from .cond_conv2d import CondConv2d, get_condconv_initializer
 from .create_conv2d import create_conv2d
 from .create_attn import create_attn
 from .selective_kernel import SelectiveKernelConv
-from .se import SEModule
-from .eca import EcaModule, CecaModule
+from .se import SqueezeExcite, SqueezeExciteV2
+from .eca import EfficientChannelAttn, CircularEfficientChannelAttn
 from .activations import *
 from .adaptive_avgmax_pool import \
    adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d
--- a/timm/models/layers/cbam.py
+++ b/timm/models/layers/cbam.py
@ -75,9 +75,9 @@ class LightSpatialAttn(nn.Module):
        return x * x_attn.sigmoid()


-class CbamModule(nn.Module):
+class ConvBlockAttn(nn.Module):
    def __init__(self, channels, spatial_kernel_size=7):
-        super(CbamModule, self).__init__()
+        super(ConvBlockAttn, self).__init__()
        self.channel = ChannelAttn(channels)
        self.spatial = SpatialAttn(spatial_kernel_size)

@ -87,9 +87,9 @@ class CbamModule(nn.Module):
        return x


-class LightCbamModule(nn.Module):
+class LightConvBlockAttn(nn.Module):
    def __init__(self, channels, spatial_kernel_size=7):
-        super(LightCbamModule, self).__init__()
+        super(LightConvBlockAttn, self).__init__()
        self.channel = LightChannelAttn(channels)
        self.spatial = LightSpatialAttn(spatial_kernel_size)

--- a/timm/models/layers/create_attn.py
+++ b/timm/models/layers/create_attn.py
@ -3,9 +3,9 @@
 Hacked together by Ross Wightman
 """
 import torch
-from .se import SEModule
-from .eca import EcaModule, CecaModule
-from .cbam import CbamModule, LightCbamModule
+from .se import SqueezeExcite, SqueezeExciteV2
+from .eca import EfficientChannelAttn, CircularEfficientChannelAttn
+from .cbam import ConvBlockAttn, LightConvBlockAttn


 def create_attn(attn_type, channels, **kwargs):
@ -14,20 +14,19 @@ def create_attn(attn_type, channels, **kwargs):
        if isinstance(attn_type, str):
            attn_type = attn_type.lower()
            if attn_type == 'se':
-                module_cls = SEModule
+                module_cls = SqueezeExcite
+            elif attn_type == 'sev2':
+                module_cls = SqueezeExciteV2
            elif attn_type == 'eca':
-                module_cls = EcaModule
-            elif attn_type == 'eca':
-                module_cls = CecaModule
+                module_cls = EfficientChannelAttn
+            elif attn_type == 'ceca':
+                module_cls = CircularEfficientChannelAttn
            elif attn_type == 'cbam':
-                module_cls = CbamModule
+                module_cls = ConvBlockAttn
            elif attn_type == 'lcbam':
-                module_cls = LightCbamModule
+                module_cls = LightConvBlockAttn
            else:
                assert False, "Invalid attn module (%s)" % attn_type
-        elif isinstance(attn_type, bool):
-            if attn_type:
-                module_cls = SEModule
        else:
            module_cls = attn_type
    if module_cls is not None:
--- a/timm/models/layers/eca.py
+++ b/timm/models/layers/eca.py
@ -38,7 +38,7 @@ from torch import nn
 import torch.nn.functional as F


-class EcaModule(nn.Module):
+class EfficientChannelAttn(nn.Module):
    """Constructs an ECA module.

    Args:
@ -49,8 +49,8 @@ class EcaModule(nn.Module):
            (default=None. if channel size not given, use k_size given for kernel size.)
        kernel_size: Adaptive selection of kernel size (default=3)
    """
-    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
-        super(EcaModule, self).__init__()
+    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, gate_fn=None):
+        super(EfficientChannelAttn, self).__init__()
        assert kernel_size % 2 == 1

        if channels is not None:
@ -59,20 +59,18 @@ class EcaModule(nn.Module):

        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
+        self.gate_fn = gate_fn

    def forward(self, x):
-        # Feature descriptor on the global spatial information
-        y = self.avg_pool(x)
-        # Reshape for convolution
-        y = y.view(x.shape[0], 1, -1)
-        # Two different branches of ECA module
+        y = self.avg_pool(x)  # Feature descriptor on the global spatial information
+        y = y.view(x.shape[0], 1, -1)  # Reshape for convolution
        y = self.conv(y)
-        # Multi-scale information fusion
-        y = y.view(x.shape[0], -1, 1, 1).sigmoid()
+        y = y.view(x.shape[0], -1, 1, 1)
+        y = y.sigmoid() if self.gate_fn is None else self.gate_fn(y)
        return x * y.expand_as(x)


-class CecaModule(nn.Module):
+class CircularEfficientChannelAttn(nn.Module):
    """Constructs a circular ECA module.

    ECA module where the conv uses circular padding rather than zero padding.
@ -92,13 +90,14 @@ class CecaModule(nn.Module):
        kernel_size: Adaptive selection of kernel size (default=3)
    """

-    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
-        super(CecaModule, self).__init__()
+    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, gate_fn=None):
+        super(CircularEfficientChannelAttn, self).__init__()
        assert kernel_size % 2 == 1

        if channels is not None:
            t = int(abs(math.log(channels, 2) + beta) / gamma)
            kernel_size = max(t if t % 2 else t + 1, 3)
+        self.padding = (kernel_size - 1) // 2

        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        #pytorch circular padding mode is buggy as of pytorch 1.4
@ -106,19 +105,13 @@ class CecaModule(nn.Module):

        #implement manual circular padding
        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=False)
-        self.padding = (kernel_size - 1) // 2
+        self.gate_fn = gate_fn

    def forward(self, x):
-        # Feature descriptor on the global spatial information
-        y = self.avg_pool(x)
-
+        y = self.avg_pool(x)  # Feature descriptor on the global spatial information
        # Manually implement circular padding, F.pad does not seemed to be bugged
        y = F.pad(y.view(x.shape[0], 1, -1), (self.padding, self.padding), mode='circular')
-
-        # Two different branches of ECA module
        y = self.conv(y)
-
-        # Multi-scale information fusion
-        y = y.view(x.shape[0], -1, 1, 1).sigmoid()
-
+        y = y.view(x.shape[0], -1, 1, 1)
+        y = y.sigmoid() if self.gate_fn is None else self.gate_fn(y)
        return x * y.expand_as(x)
--- a/timm/models/layers/helpers.py
+++ b/timm/models/layers/helpers.py
@ -21,7 +21,13 @@ tup_triple = _ntuple(3)
 tup_quadruple = _ntuple(4)


-
+def make_divisible(v, divisor=8, min_value=None):
+    min_value = min_value or divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v



--- a/timm/models/layers/se.py
+++ b/timm/models/layers/se.py
@ -1,12 +1,22 @@
+import torch
 from torch import nn as nn

+from .helpers import make_divisible

-class SEModule(nn.Module):

-    def __init__(self, channels, reduction=16, act_layer=nn.ReLU):
-        super(SEModule, self).__init__()
+class SqueezeExcite(nn.Module):
+    """ Squeeze-and-Excitation module as used in Pytorch SENet, SE-ResNeXt implementations
+
+    Args:
+        channels (int): number of input and output channels
+        reduction (int, float): divisor for attention (squeezed) channels
+        act_layer (nn.Module): override the default ReLU activation
+    """
+
+    def __init__(self, channels, reduction=16, act_layer=nn.ReLU, divisible_by=1):
+        super(SqueezeExcite, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        reduction_channels = max(channels // reduction, 8)
+        reduction_channels = make_divisible(channels // reduction, divisible_by)
        self.fc1 = nn.Conv2d(
            channels, reduction_channels, kernel_size=1, padding=0, bias=True)
        self.act = act_layer(inplace=True)
@ -19,3 +29,38 @@ class SEModule(nn.Module):
        x_se = self.act(x_se)
        x_se = self.fc2(x_se)
        return x * x_se.sigmoid()
+
+
+class SqueezeExciteV2(nn.Module):
+    """ Squeeze-and-Excitation module as used in EfficientNet, MobileNetV3, related models
+
+    Differs from the original SqueezeExcite impl in that:
+      * reduction is specified as a float multiplier instead of divisor (se_ratio)
+      * gate function is changeable from sigmoid to alternate (ie hard_sigmoid)
+      * layer names match those in weights for the EfficientNet/MobileNetV3 families
+
+    Args:
+        channels (int): number of input and output channels
+        se_ratio (float): multiplier for attention (squeezed) channels
+        reduced_base_chs (int): specify alternate channel count to base the reduction channels on
+        act_layer (nn.Module): override the default ReLU activation
+        gate_fn (callable): override the default gate function
+    """
+
+    def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None,
+                 act_layer=nn.ReLU, gate_fn=torch.sigmoid, divisible_by=1, **_):
+        super(SqueezeExciteV2, self).__init__()
+        self.gate_fn = gate_fn
+        reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisible_by)
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
+        self.act1 = act_layer(inplace=True)
+        self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
+
+    def forward(self, x):
+        x_se = self.avg_pool(x)
+        x_se = self.conv_reduce(x_se)
+        x_se = self.act1(x_se)
+        x_se = self.conv_expand(x_se)
+        x = x * self.gate_fn(x_se)
+        return x
--- a/timm/models/mobilenetv3.py
+++ b/timm/models/mobilenetv3.py
@ -30,10 +30,11 @@ def _cfg(url='', **kwargs):


 default_cfgs = {
-    'mobilenetv3_large_075': _cfg(url=''),
-    'mobilenetv3_large_100': _cfg(url=''),
-    'mobilenetv3_small_075': _cfg(url=''),
-    'mobilenetv3_small_100': _cfg(url=''),
+    'mobilenetv3_large_075': _cfg(url='', interoplation='bicubic'),
+    'mobilenetv3_large_100': _cfg(url='', interoplation='bicubic'),
+    'mobilenetv3_small_075': _cfg(url='', interoplation='bicubic'),
+    'mobilenetv3_small_100': _cfg(url='', interoplation='bicubic'),
+    'mobilenetv3_eca_large': _cfg(url='', interoplation='bicubic'),
    'mobilenetv3_rw': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth',
        interpolation='bicubic'),
@ -72,7 +73,7 @@ class MobileNetV3(nn.Module):

    def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True,
                 channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
-                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
+                 attn_layer=None, attn_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
        super(MobileNetV3, self).__init__()
        
        self.num_classes = num_classes
@ -89,7 +90,7 @@ class MobileNetV3(nn.Module):

        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
-            channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs,
+            channel_multiplier, 8, None, 32, pad_type, act_layer, attn_layer, attn_kwargs,
            norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features
@ -148,7 +149,7 @@ class MobileNetV3Features(nn.Module):

    def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                 in_chans=3, stem_size=16, channel_multiplier=1.0, output_stride=32, pad_type='',
-                 act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., se_kwargs=None,
+                 act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., attn_layer=None, attn_kwargs=None,
                 norm_layer=nn.BatchNorm2d, norm_kwargs=None):
        super(MobileNetV3Features, self).__init__()
        norm_kwargs = norm_kwargs or {}
@ -169,7 +170,7 @@ class MobileNetV3Features(nn.Module):

        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
-            channel_multiplier, 8, None, output_stride, pad_type, act_layer, se_kwargs,
+            channel_multiplier, 8, None, output_stride, pad_type, act_layer, attn_layer, attn_kwargs,
            norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features  # builder provides info about feature channels for each block
@ -256,7 +257,7 @@ def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kw
        channel_multiplier=channel_multiplier,
        norm_kwargs=resolve_bn_args(kwargs),
        act_layer=HardSwish,
-        se_kwargs=dict(gate_fn=hard_sigmoid, reduce_mid=True, divisor=1),
+        attn_kwargs=dict(gate_fn=hard_sigmoid, reduce_mid=True, divisor=1),
        **kwargs,
    )
    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -352,7 +353,68 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
        channel_multiplier=channel_multiplier,
        norm_kwargs=resolve_bn_args(kwargs),
        act_layer=act_layer,
-        se_kwargs=dict(act_layer=nn.ReLU, gate_fn=hard_sigmoid, reduce_mid=True, divisor=8),
+        attn_kwargs=dict(act_layer=nn.ReLU, gate_fn=hard_sigmoid, reduce_mid=True, divisible_by=8),
+        **kwargs,
+    )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
+    return model
+
+
+def _gen_mobilenet_v3_eca(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a MobileNet-V3 model.
+
+    Ref impl: ?
+    Paper: https://arxiv.org/abs/1905.02244
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    if 'small' in variant:
+        num_features = 1024
+        act_layer = HardSwish
+        arch_def = [
+            # stage 0, 112x112 in
+            ['ds_r1_k3_s2_e1_c16_nre'],  # relu
+            # stage 1, 56x56 in
+            ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'],  # relu
+            # stage 2, 28x28 in
+            ['ir_r1_k5_s2_e4_c40', 'ir_r2_k5_s1_e6_c40'],  # hard-swish
+            # stage 3, 14x14 in
+            ['ir_r2_k5_s1_e3_c48'],  # hard-swish
+            # stage 4, 14x14in
+            ['ir_r3_k5_s2_e6_c96'],  # hard-swish
+            # stage 6, 7x7 in
+            ['cn_r1_k1_s1_c576'],  # hard-swish
+        ]
+    else:
+        num_features = 1280
+        act_layer = HardSwish
+        arch_def = [
+            # stage 0, 112x112 in
+            ['ds_r1_k3_s1_e1_c16_nre'],  # relu
+            # stage 1, 112x112 in
+            ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'],  # relu
+            # stage 2, 56x56 in
+            ['ir_r3_k5_s2_e3_c40_nre'],  # relu
+            # stage 3, 28x28 in
+            ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],  # hard-swish
+            # stage 4, 14x14in
+            ['ir_r2_k3_s1_e6_c112'],  # hard-swish
+            # stage 5, 14x14in
+            ['ir_r3_k5_s2_e6_c160'],  # hard-swish
+            # stage 6, 7x7 in
+            ['cn_r1_k1_s1_c960'],  # hard-swish
+        ]
+
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        num_features=num_features,
+        stem_size=16,
+        channel_multiplier=channel_multiplier,
+        norm_kwargs=resolve_bn_args(kwargs),
+        act_layer=act_layer,
+        attn_layer='eca',
+        attn_kwargs=dict(gate_fn=hard_sigmoid),
        **kwargs,
    )
    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -382,12 +444,18 @@ def mobilenetv3_small_075(pretrained=False, **kwargs):

@register_model
 def mobilenetv3_small_100(pretrained=False, **kwargs):
-    print(kwargs)
    """ MobileNet V3 """
    model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
    return model


+@register_model
+def mobilenetv3_eca_large(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    model = _gen_mobilenet_v3_eca('mobilenetv3_eca_large', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
@register_model
 def mobilenetv3_rw(pretrained=False, **kwargs):
    """ MobileNet V3 """
--- a/timm/models/res2net.py
+++ b/timm/models/res2net.py
@ -11,7 +11,7 @@ import torch.nn.functional as F
 from .resnet import ResNet
 from .registry import register_model
 from .helpers import load_pretrained
-from .layers import SEModule
+from .layers import SqueezeExcite
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

 __all__ = []