diff --git a/timm/layers/__init__.py b/timm/layers/__init__.py index b44e1161..3f023572 100644 --- a/timm/layers/__init__.py +++ b/timm/layers/__init__.py @@ -4,7 +4,7 @@ from .adaptive_avgmax_pool import \ from .attention2d import MultiQueryAttention2d, Attention2d, MultiQueryAttentionV2 from .attention_pool import AttentionPoolLatent from .attention_pool2d import AttentionPool2d, RotAttentionPool2d, RotaryEmbedding -from .blur_pool import BlurPool2d +from .blur_pool import BlurPool2d, create_aa from .classifier import ClassifierHead, create_classifier, NormMlpClassifierHead from .cond_conv2d import CondConv2d, get_condconv_initializer from .config import is_exportable, is_scriptable, is_no_jit, use_fused_attn, \ diff --git a/timm/layers/blur_pool.py b/timm/layers/blur_pool.py index e73d8863..6a4b668c 100644 --- a/timm/layers/blur_pool.py +++ b/timm/layers/blur_pool.py @@ -5,12 +5,16 @@ BlurPool layer inspired by Hacked together by Chris Ha and Ross Wightman """ +from functools import partial +from typing import Optional, Type import torch import torch.nn as nn import torch.nn.functional as F import numpy as np + from .padding import get_padding +from .typing import LayerType class BlurPool2d(nn.Module): @@ -26,17 +30,62 @@ class BlurPool2d(nn.Module): Returns: torch.Tensor: the transformed tensor. """ - def __init__(self, channels, filt_size=3, stride=2) -> None: + def __init__( + self, + channels: Optional[int] = None, + filt_size: int = 3, + stride: int = 2, + pad_mode: str = 'reflect', + ) -> None: super(BlurPool2d, self).__init__() assert filt_size > 1 self.channels = channels self.filt_size = filt_size self.stride = stride + self.pad_mode = pad_mode self.padding = [get_padding(filt_size, stride, dilation=1)] * 4 + coeffs = torch.tensor((np.poly1d((0.5, 0.5)) ** (self.filt_size - 1)).coeffs.astype(np.float32)) - blur_filter = (coeffs[:, None] * coeffs[None, :])[None, None, :, :].repeat(self.channels, 1, 1, 1) + blur_filter = (coeffs[:, None] * coeffs[None, :])[None, None, :, :] + if channels is not None: + blur_filter = blur_filter.repeat(self.channels, 1, 1, 1) self.register_buffer('filt', blur_filter, persistent=False) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = F.pad(x, self.padding, 'reflect') - return F.conv2d(x, self.filt, stride=self.stride, groups=self.channels) + x = F.pad(x, self.padding, mode=self.pad_mode) + if self.channels is None: + channels = x.shape[1] + weight = self.filt.expand(channels, 1, self.filt_size, self.filt_size) + else: + channels = self.channels + weight = self.filt + return F.conv2d(x, weight, stride=self.stride, groups=channels) + + +def create_aa( + aa_layer: LayerType, + channels: Optional[int] = None, + stride: int = 2, + enable: bool = True, + noop: Optional[Type[nn.Module]] = nn.Identity +) -> nn.Module: + """ Anti-aliasing """ + if not aa_layer or not enable: + return noop() if noop is not None else None + + if isinstance(aa_layer, str): + aa_layer = aa_layer.lower().replace('_', '').replace('-', '') + if aa_layer == 'avg' or aa_layer == 'avgpool': + aa_layer = nn.AvgPool2d + elif aa_layer == 'blur' or aa_layer == 'blurpool': + aa_layer = BlurPool2d + elif aa_layer == 'blurpc': + aa_layer = partial(BlurPool2d, pad_mode='constant') + + else: + assert False, f"Unknown anti-aliasing layer ({aa_layer})." 
+ + try: + return aa_layer(channels=channels, stride=stride) + except TypeError as e: + return aa_layer(stride) diff --git a/timm/layers/conv_bn_act.py b/timm/layers/conv_bn_act.py index 84aaf4bf..17847d76 100644 --- a/timm/layers/conv_bn_act.py +++ b/timm/layers/conv_bn_act.py @@ -2,9 +2,12 @@ Hacked together by / Copyright 2020 Ross Wightman """ -import functools +from typing import Any, Dict, Optional, Type + from torch import nn as nn +from .typing import LayerType, PadType +from .blur_pool import create_aa from .create_conv2d import create_conv2d from .create_norm_act import get_norm_act_layer @@ -12,28 +15,38 @@ from .create_norm_act import get_norm_act_layer class ConvNormAct(nn.Module): def __init__( self, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding='', - dilation=1, - groups=1, - bias=False, - apply_act=True, - norm_layer=nn.BatchNorm2d, - norm_kwargs=None, - act_layer=nn.ReLU, - act_kwargs=None, - drop_layer=None, + in_channels: int, + out_channels: int, + kernel_size: int = 1, + stride: int = 1, + padding: PadType = '', + dilation: int = 1, + groups: int = 1, + bias: bool = False, + apply_act: bool = True, + norm_layer: LayerType = nn.BatchNorm2d, + act_layer: LayerType = nn.ReLU, + drop_layer: Optional[Type[nn.Module]] = None, + conv_kwargs: Optional[Dict[str, Any]] = None, + norm_kwargs: Optional[Dict[str, Any]] = None, + act_kwargs: Optional[Dict[str, Any]] = None, ): super(ConvNormAct, self).__init__() + conv_kwargs = conv_kwargs or {} norm_kwargs = norm_kwargs or {} act_kwargs = act_kwargs or {} self.conv = create_conv2d( - in_channels, out_channels, kernel_size, stride=stride, - padding=padding, dilation=dilation, groups=groups, bias=bias) + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + **conv_kwargs, + ) # NOTE for backwards compatibility with models that use separate norm and act layer definitions norm_act_layer = get_norm_act_layer(norm_layer, act_layer) @@ -64,54 +77,53 @@ class ConvNormAct(nn.Module): ConvBnAct = ConvNormAct -def create_aa(aa_layer, channels, stride=2, enable=True): - if not aa_layer or not enable: - return nn.Identity() - if isinstance(aa_layer, functools.partial): - if issubclass(aa_layer.func, nn.AvgPool2d): - return aa_layer() - else: - return aa_layer(channels) - elif issubclass(aa_layer, nn.AvgPool2d): - return aa_layer(stride) - else: - return aa_layer(channels=channels, stride=stride) - - class ConvNormActAa(nn.Module): def __init__( self, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding='', - dilation=1, - groups=1, - bias=False, - apply_act=True, - norm_layer=nn.BatchNorm2d, - norm_kwargs=None, - act_layer=nn.ReLU, - act_kwargs=None, - aa_layer=None, - drop_layer=None, + in_channels: int, + out_channels: int, + kernel_size: int = 1, + stride: int = 1, + padding: PadType = '', + dilation: int = 1, + groups: int = 1, + bias: bool = False, + apply_act: bool = True, + norm_layer: LayerType = nn.BatchNorm2d, + act_layer: LayerType = nn.ReLU, + aa_layer: Optional[LayerType] = None, + drop_layer: Optional[Type[nn.Module]] = None, + conv_kwargs: Optional[Dict[str, Any]] = None, + norm_kwargs: Optional[Dict[str, Any]] = None, + act_kwargs: Optional[Dict[str, Any]] = None, ): super(ConvNormActAa, self).__init__() use_aa = aa_layer is not None and stride == 2 + conv_kwargs = conv_kwargs or {} norm_kwargs = norm_kwargs or {} act_kwargs = act_kwargs or {} self.conv = create_conv2d( - in_channels, out_channels, kernel_size, 
stride=1 if use_aa else stride, - padding=padding, dilation=dilation, groups=groups, bias=bias) + in_channels, out_channels, kernel_size, + stride=1 if use_aa else stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + **conv_kwargs, + ) # NOTE for backwards compatibility with models that use separate norm and act layer definitions norm_act_layer = get_norm_act_layer(norm_layer, act_layer) # NOTE for backwards (weight) compatibility, norm layer name remains `.bn` if drop_layer: norm_kwargs['drop_layer'] = drop_layer - self.bn = norm_act_layer(out_channels, apply_act=apply_act, act_kwargs=act_kwargs, **norm_kwargs) + self.bn = norm_act_layer( + out_channels, + apply_act=apply_act, + act_kwargs=act_kwargs, + **norm_kwargs, + ) self.aa = create_aa(aa_layer, out_channels, stride=stride, enable=use_aa) @property diff --git a/timm/models/_efficientnet_blocks.py b/timm/models/_efficientnet_blocks.py index be00b01c..f33dacd5 100644 --- a/timm/models/_efficientnet_blocks.py +++ b/timm/models/_efficientnet_blocks.py @@ -2,22 +2,24 @@ Hacked together by / Copyright 2019, Ross Wightman """ -from typing import Optional +from typing import Callable, Dict, Optional, Type import torch import torch.nn as nn from torch.nn import functional as F -from timm.layers import create_conv2d, DropPath, make_divisible, create_act_layer, to_2tuple,\ - get_norm_act_layer, MultiQueryAttention2d, MultiQueryAttentionV2, Attention2d +from timm.layers import create_conv2d, DropPath, make_divisible, create_act_layer, create_aa, to_2tuple, LayerType,\ + ConvNormAct, ConvNormActAa, get_norm_act_layer, MultiQueryAttention2d, Attention2d __all__ = [ 'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv', 'InvertedResidual', 'CondConvResidual', 'EdgeResidual', 'UniversalInvertedResidual', 'MobileAttention' ] +ModuleType = Type[nn.Module] -def num_groups(group_size, channels): + +def num_groups(group_size: Optional[int], channels: int): if not group_size: # 0 or None return 1 # normal conv with 1 group else: @@ -40,13 +42,13 @@ class SqueezeExcite(nn.Module): def __init__( self, - in_chs, - rd_ratio=0.25, - rd_channels=None, - act_layer=nn.ReLU, - gate_layer=nn.Sigmoid, - force_act_layer=None, - rd_round_fn=None, + in_chs: int, + rd_ratio: float = 0.25, + rd_channels: Optional[int] = None, + act_layer: LayerType = nn.ReLU, + gate_layer: LayerType = nn.Sigmoid, + force_act_layer: Optional[LayerType] = None, + rd_round_fn: Optional[Callable] = None, ): super(SqueezeExcite, self).__init__() if rd_channels is None: @@ -71,27 +73,31 @@ class ConvBnAct(nn.Module): """ def __init__( self, - in_chs, - out_chs, - kernel_size, - stride=1, - dilation=1, - group_size=0, - pad_type='', - skip=False, - act_layer=nn.ReLU, - norm_layer=nn.BatchNorm2d, - drop_path_rate=0., + in_chs: int, + out_chs: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + group_size: int = 0, + pad_type: str = '', + skip: bool = False, + act_layer: LayerType = nn.ReLU, + norm_layer: LayerType = nn.BatchNorm2d, + aa_layer: Optional[LayerType] = None, + drop_path_rate: float = 0., ): super(ConvBnAct, self).__init__() norm_act_layer = get_norm_act_layer(norm_layer, act_layer) groups = num_groups(group_size, in_chs) self.has_skip = skip and stride == 1 and in_chs == out_chs + use_aa = aa_layer is not None and stride > 1 # FIXME handle dilation self.conv = create_conv2d( in_chs, out_chs, kernel_size, - stride=stride, dilation=dilation, groups=groups, padding=pad_type) + stride=1 if use_aa else stride, + dilation=dilation, 
groups=groups, padding=pad_type) self.bn1 = norm_act_layer(out_chs, inplace=True) + self.aa = create_aa(aa_layer, channels=out_chs, stride=stride, enable=use_aa) self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() def feature_info(self, location): @@ -104,6 +110,7 @@ class ConvBnAct(nn.Module): shortcut = x x = self.conv(x) x = self.bn1(x) + x = self.aa(x) if self.has_skip: x = self.drop_path(x) + shortcut return x @@ -116,37 +123,38 @@ class DepthwiseSeparableConv(nn.Module): """ def __init__( self, - in_chs, - out_chs, - dw_kernel_size=3, - stride=1, - dilation=1, - group_size=1, - pad_type='', - noskip=False, - pw_kernel_size=1, - pw_act=False, - s2d=0, - act_layer=nn.ReLU, - norm_layer=nn.BatchNorm2d, - se_layer=None, - drop_path_rate=0., + in_chs: int, + out_chs: int, + dw_kernel_size: int = 3, + stride: int = 1, + dilation: int = 1, + group_size: int = 1, + pad_type: str = '', + noskip: bool = False, + pw_kernel_size: int = 1, + pw_act: bool = False, + s2d: int = 0, + act_layer: LayerType = nn.ReLU, + norm_layer: LayerType = nn.BatchNorm2d, + aa_layer: Optional[LayerType] = None, + se_layer: Optional[ModuleType] = None, + drop_path_rate: float = 0., ): super(DepthwiseSeparableConv, self).__init__() norm_act_layer = get_norm_act_layer(norm_layer, act_layer) self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip self.has_pw_act = pw_act # activation after point-wise conv + use_aa = aa_layer is not None and stride > 1 # FIXME handle dilation # Space to depth if s2d == 1: sd_chs = int(in_chs * 4) - #sd_pad_type = 'sam' - self.conv_s2d = create_conv2d( - in_chs, sd_chs, kernel_size=2, stride=2, padding=0) #'same') + self.conv_s2d = create_conv2d(in_chs, sd_chs, kernel_size=2, stride=2, padding='same') self.bn_s2d = norm_act_layer(sd_chs, sd_chs) dw_kernel_size = (dw_kernel_size + 1) // 2 dw_pad_type = 'same' if dw_kernel_size == 2 else pad_type in_chs = sd_chs + use_aa = False # disable AA else: self.conv_s2d = None self.bn_s2d = None @@ -156,8 +164,10 @@ class DepthwiseSeparableConv(nn.Module): self.conv_dw = create_conv2d( in_chs, in_chs, dw_kernel_size, - stride=stride, dilation=dilation, padding=dw_pad_type, groups=groups) + stride=1 if use_aa else stride, + dilation=dilation, padding=dw_pad_type, groups=groups) self.bn1 = norm_act_layer(in_chs, inplace=True) + self.aa = create_aa(aa_layer, channels=out_chs, stride=stride, enable=use_aa) # Squeeze-and-excitation self.se = se_layer(in_chs, act_layer=act_layer) if se_layer else nn.Identity() @@ -174,13 +184,12 @@ class DepthwiseSeparableConv(nn.Module): def forward(self, x): shortcut = x - #print('ii', x.shape) # FIXME debug s2d if self.conv_s2d is not None: x = self.conv_s2d(x) x = self.bn_s2d(x) - #print('id', x.shape) # FIXME debug s2d x = self.conv_dw(x) x = self.bn1(x) + x = self.aa(x) x = self.se(x) x = self.conv_pw(x) x = self.bn2(x) @@ -201,37 +210,40 @@ class InvertedResidual(nn.Module): def __init__( self, - in_chs, - out_chs, - dw_kernel_size=3, - stride=1, - dilation=1, - group_size=1, - pad_type='', - noskip=False, - exp_ratio=1.0, - exp_kernel_size=1, - pw_kernel_size=1, - s2d=0, - act_layer=nn.ReLU, - norm_layer=nn.BatchNorm2d, - se_layer=None, - conv_kwargs=None, - drop_path_rate=0., + in_chs: int, + out_chs: int, + dw_kernel_size: int = 3, + stride: int = 1, + dilation: int = 1, + group_size: int = 1, + pad_type: str = '', + noskip: bool = False, + exp_ratio: float = 1.0, + exp_kernel_size: int = 1, + pw_kernel_size: int = 1, + s2d: int = 0, + act_layer: LayerType = nn.ReLU, + 
norm_layer: LayerType = nn.BatchNorm2d, + aa_layer: Optional[LayerType] = None, + se_layer: Optional[ModuleType] = None, + conv_kwargs: Optional[Dict] = None, + drop_path_rate: float = 0., ): super(InvertedResidual, self).__init__() norm_act_layer = get_norm_act_layer(norm_layer, act_layer) conv_kwargs = conv_kwargs or {} self.has_skip = (in_chs == out_chs and stride == 1) and not noskip + use_aa = aa_layer is not None and stride > 1 # FIXME handle dilation # Space to depth if s2d == 1: sd_chs = int(in_chs * 4) - self.conv_s2d = create_conv2d(in_chs, sd_chs, kernel_size=2, stride=2, padding=pad_type) + self.conv_s2d = create_conv2d(in_chs, sd_chs, kernel_size=2, stride=2, padding='same') self.bn_s2d = norm_act_layer(sd_chs, sd_chs) dw_kernel_size = (dw_kernel_size + 1) // 2 dw_pad_type = 'same' if dw_kernel_size == 2 else pad_type in_chs = sd_chs + use_aa = False # disable AA else: self.conv_s2d = None self.bn_s2d = None @@ -247,8 +259,10 @@ class InvertedResidual(nn.Module): # Depth-wise convolution self.conv_dw = create_conv2d( mid_chs, mid_chs, dw_kernel_size, - stride=stride, dilation=dilation, groups=groups, padding=dw_pad_type, **conv_kwargs) + stride=1 if use_aa else stride, + dilation=dilation, groups=groups, padding=dw_pad_type, **conv_kwargs) self.bn2 = norm_act_layer(mid_chs, inplace=True) + self.aa = create_aa(aa_layer, channels=mid_chs, stride=stride, enable=use_aa) # Squeeze-and-excitation self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() @@ -273,6 +287,7 @@ class InvertedResidual(nn.Module): x = self.bn1(x) x = self.conv_dw(x) x = self.bn2(x) + x = self.aa(x) x = self.se(x) x = self.conv_pwl(x) x = self.bn3(x) @@ -282,7 +297,7 @@ class InvertedResidual(nn.Module): class LayerScale2d(nn.Module): - def __init__(self, dim, init_values=1e-5, inplace=False): + def __init__(self, dim: int, init_values: float = 1e-5, inplace: bool = False): super().__init__() self.inplace = inplace self.gamma = nn.Parameter(init_values * torch.ones(dim)) @@ -293,7 +308,7 @@ class LayerScale2d(nn.Module): class UniversalInvertedResidual(nn.Module): - """ Universal Inverted Residual Block + """ Universal Inverted Residual Block (aka Universal Inverted Bottleneck, UIB) For MobileNetV4 - https://arxiv.org/abs/, referenced from https://github.com/tensorflow/models/blob/d93c7e932de27522b2fa3b115f58d06d6f640537/official/vision/modeling/layers/nn_blocks.py#L778 @@ -301,89 +316,109 @@ class UniversalInvertedResidual(nn.Module): def __init__( self, - in_chs, - out_chs, + in_chs: int, + out_chs: int, dw_kernel_size_start: int = 0, dw_kernel_size_mid: int = 3, dw_kernel_size_end: int = 0, - stride=1, - dilation=1, - group_size=1, - pad_type='', - noskip=False, - exp_ratio=1.0, - act_layer=nn.ReLU, - dw_act_layer=None, - norm_layer=nn.BatchNorm2d, - se_layer=None, - conv_kwargs=None, - drop_path_rate=0., + stride: int = 1, + dilation: int = 1, + group_size: int = 1, + pad_type: str = '', + noskip: bool = False, + exp_ratio: float = 1.0, + act_layer: LayerType = nn.ReLU, + norm_layer: LayerType = nn.BatchNorm2d, + aa_layer: Optional[LayerType] = None, + se_layer: Optional[ModuleType] = None, + conv_kwargs: Optional[Dict] = None, + drop_path_rate: float = 0., layer_scale_init_value: Optional[float] = 1e-5, ): super(UniversalInvertedResidual, self).__init__() - norm_act_layer = get_norm_act_layer(norm_layer, act_layer) - dw_act_layer = dw_act_layer or act_layer - dw_norm_act_layer = get_norm_act_layer(norm_layer, dw_act_layer) conv_kwargs = conv_kwargs or {} self.has_skip = (in_chs == 
out_chs and stride == 1) and not noskip + if stride > 1: + assert dw_kernel_size_start or dw_kernel_size_mid or dw_kernel_size_end # FIXME dilation isn't right w/ extra ks > 1 convs if dw_kernel_size_start: - self.conv_dw_start = create_conv2d( + dw_start_stride = stride if not dw_kernel_size_mid else 1 + dw_start_groups = num_groups(group_size, in_chs) + self.dw_start = ConvNormActAa( in_chs, in_chs, dw_kernel_size_start, + stride=dw_start_stride, dilation=dilation, # FIXME - depthwise=True, + groups=dw_start_groups, padding=pad_type, + apply_act=False, + act_layer=act_layer, + norm_layer=norm_layer, + aa_layer=aa_layer, **conv_kwargs, ) - self.norm_dw_start = dw_norm_act_layer(in_chs, apply_act=False) else: - # start is None when not used for cleaner repr - self.conv_dw_start = None - self.norm_dw_start = None + self.dw_start = nn.Identity() # Point-wise expansion mid_chs = make_divisible(in_chs * exp_ratio) - self.conv_pw = create_conv2d(in_chs, mid_chs, 1, padding=pad_type, **conv_kwargs) - self.norm_pw = norm_act_layer(mid_chs, inplace=True) + self.pw_exp = ConvNormAct( + in_chs, mid_chs, 1, + padding=pad_type, + act_layer=act_layer, + norm_layer=norm_layer, + **conv_kwargs, + ) - # Depth-wise convolution + # Middle depth-wise convolution if dw_kernel_size_mid: groups = num_groups(group_size, mid_chs) - self.conv_dw_mid = create_conv2d( + self.dw_mid = ConvNormActAa( mid_chs, mid_chs, dw_kernel_size_mid, stride=stride, dilation=dilation, # FIXME groups=groups, padding=pad_type, + act_layer=act_layer, + norm_layer=norm_layer, + aa_layer=aa_layer, **conv_kwargs, ) - self.norm_dw_mid = dw_norm_act_layer(mid_chs, inplace=True) else: # keeping mid as identity so it can be hooked more easily for features - self.conv_dw_mid = nn.Identity() - self.norm_dw_mid = nn.Identity() + self.dw_mid = nn.Identity() # Squeeze-and-excitation self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() # Point-wise linear projection - self.conv_pwl = create_conv2d(mid_chs, out_chs, 1, padding=pad_type, **conv_kwargs) - self.norm_pwl = norm_act_layer(out_chs, apply_act=False) + self.pw_proj = ConvNormAct( + mid_chs, out_chs, 1, + padding=pad_type, + apply_act=False, + act_layer=act_layer, + norm_layer=norm_layer, + **conv_kwargs, + ) if dw_kernel_size_end: - self.conv_dw_end = create_conv2d( + dw_end_stride = stride if not dw_kernel_size_start and not dw_kernel_size_mid else 1 + dw_end_groups = num_groups(group_size, out_chs) + if dw_end_stride > 1: + assert not aa_layer + self.dw_end = ConvNormAct( out_chs, out_chs, dw_kernel_size_end, + stride=dw_end_stride, dilation=dilation, - depthwise=True, + groups=dw_end_groups, padding=pad_type, + apply_act=False, + act_layer=act_layer, + norm_layer=norm_layer, **conv_kwargs, ) - self.norm_dw_end = dw_norm_act_layer(out_chs, apply_act=False) else: - # end is None when not in use for cleaner repr - self.conv_dw_end = None - self.norm_dw_end = None + self.dw_end = nn.Identity() if layer_scale_init_value is not None: self.layer_scale = LayerScale2d(out_chs, layer_scale_init_value) @@ -393,25 +428,18 @@ class UniversalInvertedResidual(nn.Module): def feature_info(self, location): if location == 'expansion': # after SE, input to PWL - return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) + return dict(module='pw_proj.conv', hook_type='forward_pre', num_chs=self.pw_proj.conv.in_channels) else: # location == 'bottleneck', block output - return dict(module='', num_chs=self.conv_pwl.out_channels) + return 
dict(module='', num_chs=self.pw_proj.conv.out_channels) def forward(self, x): shortcut = x - if self.conv_dw_start is not None: - x = self.conv_dw_start(x) - x = self.norm_dw_start(x) - x = self.conv_pw(x) - x = self.norm_pw(x) - x = self.conv_dw_mid(x) - x = self.norm_dw_mid(x) + x = self.dw_start(x) + x = self.pw_exp(x) + x = self.dw_mid(x) x = self.se(x) - x = self.conv_pwl(x) - x = self.norm_pwl(x) - if self.conv_dw_end is not None: - x = self.conv_dw_end(x) - x = self.norm_dw_end(x) + x = self.pw_proj(x) + x = self.dw_end(x) x = self.layer_scale(x) if self.has_skip: x = self.drop_path(x) + shortcut @@ -426,29 +454,30 @@ class MobileAttention(nn.Module): """ def __init__( self, - in_chs, - out_chs, - stride=1, - dw_kernel_size=3, - dilation=1, - group_size=1, - pad_type='', + in_chs: int, + out_chs: int, + stride: int = 1, + dw_kernel_size: int = 3, + dilation: int = 1, + group_size: int = 1, + pad_type: str = '', num_heads: int = 8, key_dim: int = 64, value_dim: int = 64, use_multi_query: bool = False, query_strides: int = (1, 1), kv_stride: int = 1, - cpe_dw_kernel_size=3, - noskip=False, - act_layer=nn.ReLU, - norm_layer=nn.BatchNorm2d, - drop_path_rate=0., - attn_drop=0.0, - proj_drop=0.0, + cpe_dw_kernel_size: int = 3, + noskip: bool = False, + act_layer: LayerType = nn.ReLU, + norm_layer: LayerType = nn.BatchNorm2d, + aa_layer: Optional[LayerType] = None, + drop_path_rate: float = 0., + attn_drop: float = 0.0, + proj_drop: float = 0.0, layer_scale_init_value: Optional[float] = 1e-5, - use_bias=False, - use_cpe=False, + use_bias: bool = False, + use_cpe: bool = False, ): super(MobileAttention, self).__init__() norm_act_layer = get_norm_act_layer(norm_layer, act_layer) @@ -512,7 +541,6 @@ class MobileAttention(nn.Module): self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() - def feature_info(self, location): if location == 'expansion': # after SE, input to PW return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) @@ -539,22 +567,23 @@ class CondConvResidual(InvertedResidual): def __init__( self, - in_chs, - out_chs, - dw_kernel_size=3, - stride=1, - dilation=1, - group_size=1, - pad_type='', - noskip=False, - exp_ratio=1.0, - exp_kernel_size=1, - pw_kernel_size=1, - act_layer=nn.ReLU, - norm_layer=nn.BatchNorm2d, - se_layer=None, - num_experts=0, - drop_path_rate=0., + in_chs: int, + out_chs: int, + dw_kernel_size: int = 3, + stride: int = 1, + dilation: int = 1, + group_size: int = 1, + pad_type: str = '', + noskip: bool = False, + exp_ratio: float = 1.0, + exp_kernel_size: int = 1, + pw_kernel_size: int = 1, + act_layer: LayerType = nn.ReLU, + norm_layer: LayerType = nn.BatchNorm2d, + aa_layer: Optional[LayerType] = None, + se_layer: Optional[ModuleType] = None, + num_experts: int = 0, + drop_path_rate: float = 0., ): self.num_experts = num_experts @@ -567,13 +596,14 @@ class CondConvResidual(InvertedResidual): dilation=dilation, group_size=group_size, pad_type=pad_type, - act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size, pw_kernel_size=pw_kernel_size, - se_layer=se_layer, + act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer, + se_layer=se_layer, conv_kwargs=conv_kwargs, drop_path_rate=drop_path_rate, ) @@ -609,21 +639,22 @@ class EdgeResidual(nn.Module): def __init__( self, - in_chs, - out_chs, - exp_kernel_size=3, - stride=1, - dilation=1, - group_size=0, - pad_type='', - force_in_chs=0, - noskip=False, - exp_ratio=1.0, - pw_kernel_size=1, - act_layer=nn.ReLU, - 
norm_layer=nn.BatchNorm2d, - se_layer=None, - drop_path_rate=0., + in_chs: int, + out_chs: int, + exp_kernel_size: int = 3, + stride: int = 1, + dilation: int = 1, + group_size: int = 0, + pad_type: str = '', + force_in_chs: int = 0, + noskip: bool = False, + exp_ratio: float = 1.0, + pw_kernel_size: int = 1, + act_layer: LayerType = nn.ReLU, + norm_layer: LayerType = nn.BatchNorm2d, + aa_layer: Optional[LayerType] = None, + se_layer: Optional[ModuleType] = None, + drop_path_rate: float = 0., ): super(EdgeResidual, self).__init__() norm_act_layer = get_norm_act_layer(norm_layer, act_layer) @@ -633,13 +664,17 @@ class EdgeResidual(nn.Module): mid_chs = make_divisible(in_chs * exp_ratio) groups = num_groups(group_size, in_chs) self.has_skip = (in_chs == out_chs and stride == 1) and not noskip + use_aa = aa_layer is not None and stride > 1 # FIXME handle dilation # Expansion convolution self.conv_exp = create_conv2d( in_chs, mid_chs, exp_kernel_size, - stride=stride, dilation=dilation, groups=groups, padding=pad_type) + stride=1 if use_aa else stride, + dilation=dilation, groups=groups, padding=pad_type) self.bn1 = norm_act_layer(mid_chs, inplace=True) + self.aa = create_aa(aa_layer, channels=mid_chs, stride=stride, enable=use_aa) + # Squeeze-and-excitation self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() @@ -658,6 +693,7 @@ class EdgeResidual(nn.Module): shortcut = x x = self.conv_exp(x) x = self.bn1(x) + x = self.aa(x) x = self.se(x) x = self.conv_pwl(x) x = self.bn2(x) diff --git a/timm/models/_efficientnet_builder.py b/timm/models/_efficientnet_builder.py index 7d96216a..e9b789a4 100644 --- a/timm/models/_efficientnet_builder.py +++ b/timm/models/_efficientnet_builder.py @@ -17,7 +17,7 @@ from typing import Any, Dict, List import torch.nn as nn from ._efficientnet_blocks import * -from timm.layers import CondConv2d, get_condconv_initializer, get_act_layer, get_attn, make_divisible +from timm.layers import CondConv2d, get_condconv_initializer, get_act_layer, get_attn, make_divisible, LayerType __all__ = ["EfficientNetBuilder", "decode_arch_def", "efficientnet_init_weights", 'resolve_bn_args', 'resolve_act_layer', 'round_channels', 'BN_MOMENTUM_TF_DEFAULT', 'BN_EPS_TF_DEFAULT'] @@ -326,9 +326,10 @@ class EfficientNetBuilder: pad_type: str = '', round_chs_fn: Callable = round_channels, se_from_exp: bool = False, - act_layer: Optional[Callable] = None, - norm_layer: Optional[Callable] = None, - se_layer: Optional[Callable] = None, + act_layer: Optional[LayerType] = None, + norm_layer: Optional[LayerType] = None, + aa_layer: Optional[LayerType] = None, + se_layer: Optional[LayerType] = None, drop_path_rate: float = 0., layer_scale_init_value: Optional[float] = None, feature_location: str = '', @@ -339,6 +340,7 @@ class EfficientNetBuilder: self.se_from_exp = se_from_exp # calculate se channel reduction from expanded (mid) chs self.act_layer = act_layer self.norm_layer = norm_layer + self.aa_layer = aa_layer self.se_layer = get_attn(se_layer) try: self.se_layer(8, rd_ratio=1.0) # test if attn layer accepts rd_ratio arg @@ -378,6 +380,9 @@ class EfficientNetBuilder: ba['norm_layer'] = self.norm_layer ba['drop_path_rate'] = drop_path_rate + if self.aa_layer is not None: + ba['aa_layer'] = self.aa_layer + se_ratio = ba.pop('se_ratio', None) if se_ratio and self.se_layer is not None: if not self.se_from_exp: @@ -461,6 +466,7 @@ class EfficientNetBuilder: space2depth = 1 if space2depth > 0: + # FIXME s2d is a WIP if space2depth == 2 and block_args['stride'] == 2: 
block_args['stride'] = 1 # to end s2d region, need to correct expansion and se ratio relative to input diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py index dcb0db7e..46c4e81e 100644 --- a/timm/models/efficientnet.py +++ b/timm/models/efficientnet.py @@ -36,7 +36,7 @@ the models and weights open source! Hacked together by / Copyright 2019, Ross Wightman """ from functools import partial -from typing import List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -44,10 +44,10 @@ import torch.nn.functional as F from torch.utils.checkpoint import checkpoint from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD -from timm.layers import create_conv2d, create_classifier, get_norm_act_layer, GroupNormAct +from timm.layers import create_conv2d, create_classifier, get_norm_act_layer, GroupNormAct, LayerType from ._builder import build_model_with_cfg, pretrained_cfg_for_features from ._efficientnet_blocks import SqueezeExcite -from ._efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights, \ +from ._efficientnet_builder import BlockArgs, EfficientNetBuilder, decode_arch_def, efficientnet_init_weights, \ round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT from ._features import FeatureInfo, FeatureHooks, feature_take_indices from ._manipulate import checkpoint_seq @@ -74,21 +74,22 @@ class EfficientNet(nn.Module): def __init__( self, - block_args, - num_classes=1000, - num_features=1280, - in_chans=3, - stem_size=32, - fix_stem=False, - output_stride=32, - pad_type='', - round_chs_fn=round_channels, - act_layer=None, - norm_layer=None, - se_layer=None, - drop_rate=0., - drop_path_rate=0., - global_pool='avg' + block_args: BlockArgs, + num_classes: int = 1000, + num_features: int = 1280, + in_chans: int = 3, + stem_size: int = 32, + fix_stem: bool = False, + output_stride: int = 32, + pad_type: str = '', + act_layer: Optional[LayerType] = None, + norm_layer: Optional[LayerType] = None, + aa_layer: Optional[LayerType] = None, + se_layer: Optional[LayerType] = None, + round_chs_fn: Callable = round_channels, + drop_rate: float = 0., + drop_path_rate: float = 0., + global_pool: str = 'avg' ): super(EfficientNet, self).__init__() act_layer = act_layer or nn.ReLU @@ -113,6 +114,7 @@ class EfficientNet(nn.Module): round_chs_fn=round_chs_fn, act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer, se_layer=se_layer, drop_path_rate=drop_path_rate, ) @@ -270,20 +272,21 @@ class EfficientNetFeatures(nn.Module): def __init__( self, - block_args, - out_indices=(0, 1, 2, 3, 4), - feature_location='bottleneck', - in_chans=3, - stem_size=32, - fix_stem=False, - output_stride=32, - pad_type='', - round_chs_fn=round_channels, - act_layer=None, - norm_layer=None, - se_layer=None, - drop_rate=0., - drop_path_rate=0. + block_args: BlockArgs, + out_indices: Tuple[int, ...] 
= (0, 1, 2, 3, 4), + feature_location: str = 'bottleneck', + in_chans: int = 3, + stem_size: int = 32, + fix_stem: bool = False, + output_stride: int = 32, + pad_type: str = '', + act_layer: Optional[LayerType] = None, + norm_layer: Optional[LayerType] = None, + aa_layer: Optional[LayerType] = None, + se_layer: Optional[LayerType] = None, + round_chs_fn: Callable = round_channels, + drop_rate: float = 0., + drop_path_rate: float = 0., ): super(EfficientNetFeatures, self).__init__() act_layer = act_layer or nn.ReLU @@ -306,6 +309,7 @@ class EfficientNetFeatures(nn.Module): round_chs_fn=round_chs_fn, act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer, se_layer=se_layer, drop_path_rate=drop_path_rate, feature_location=feature_location, @@ -1154,6 +1158,7 @@ default_cfgs = generate_default_cfgs({ input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0), 'efficientnet_b3_g8_gn.untrained': _cfg( input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0), + 'efficientnet_blur_b0.untrained': _cfg(), 'efficientnet_es.ra_in1k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth', @@ -1850,6 +1855,17 @@ def efficientnet_b3_g8_gn(pretrained=False, **kwargs) -> EfficientNet: return model +@register_model +def efficientnet_blur_b0(pretrained=False, **kwargs) -> EfficientNet: + """ EfficientNet-B0 w/ BlurPool """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_blur_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, + aa_layer='blurpc', **kwargs + ) + return model + + @register_model def efficientnet_es(pretrained=False, **kwargs) -> EfficientNet: """ EfficientNet-Edge Small. """ diff --git a/timm/models/mobilenetv3.py b/timm/models/mobilenetv3.py index 40f201b9..b25d87ba 100644 --- a/timm/models/mobilenetv3.py +++ b/timm/models/mobilenetv3.py @@ -40,6 +40,7 @@ class MobileNetV3(nn.Module): * HardCoRe-NAS - https://arxiv.org/abs/2102.11646 (defn in hardcorenas.py uses this class) * FBNet-V3 - https://arxiv.org/abs/2006.02049 * LCNet - https://arxiv.org/abs/2109.15099 + * MobileNet-V4 - https://arxiv.org/abs/2404.10518 """ def __init__( @@ -52,9 +53,10 @@ class MobileNetV3(nn.Module): num_features: int = 1280, head_bias: bool = True, head_norm: bool = False, - pad_type: PadType = '', + pad_type: str = '', act_layer: Optional[LayerType] = None, norm_layer: Optional[LayerType] = None, + aa_layer: Optional[LayerType] = None, se_layer: Optional[LayerType] = None, se_from_exp: bool = True, round_chs_fn: Callable = round_channels, @@ -75,6 +77,7 @@ class MobileNetV3(nn.Module): pad_type: Type of padding to use for convolution layers. act_layer: Type of activation layer. norm_layer: Type of normalization layer. + aa_layer: Type of anti-aliasing layer. se_layer: Type of Squeeze-and-Excite layer. se_from_exp: If True, calculate SE channel reduction from expanded mid channels. round_chs_fn: Callable to round number of filters based on depth multiplier. 
@@ -107,6 +110,7 @@ class MobileNetV3(nn.Module): se_from_exp=se_from_exp, act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer, se_layer=se_layer, drop_path_rate=drop_path_rate, layer_scale_init_value=layer_scale_init_value, @@ -291,6 +295,7 @@ class MobileNetV3Features(nn.Module): se_from_exp: bool = True, act_layer: Optional[LayerType] = None, norm_layer: Optional[LayerType] = None, + aa_layer: Optional[LayerType] = None, se_layer: Optional[LayerType] = None, drop_rate: float = 0., drop_path_rate: float = 0., @@ -337,6 +342,7 @@ class MobileNetV3Features(nn.Module): se_from_exp=se_from_exp, act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer, se_layer=se_layer, drop_path_rate=drop_path_rate, layer_scale_init_value=layer_scale_init_value, @@ -649,15 +655,17 @@ def _gen_mobilenet_v4(variant: str, channel_multiplier: float = 1.0, pretrained: Args: channel_multiplier: multiplier to number of channels per layer. """ + num_features = 1280 if 'hybrid' in variant: layer_scale_init_value = 1e-5 if 'medium' in variant: stem_size = 32 - num_features = 1280 act_layer = resolve_act_layer(kwargs, 'relu') arch_def = [ # stage 0, 112x112 in - ['er_r1_k3_s2_e4_c48'], # FusedIB (EdgeResidual) + [ + 'er_r1_k3_s2_e4_c48' # FusedIB (EdgeResidual) + ], # stage 1, 56x56 in [ 'uir_r1_a3_k5_s2_e4_c80', # ExtraDW @@ -689,23 +697,26 @@ def _gen_mobilenet_v4(variant: str, channel_multiplier: float = 1.0, pretrained: 'uir_r1_a0_k0_s1_e4_c256', # FFN 'mqa_r1_k3_h4_s1_d64_c256', # MQA 'uir_r1_a3_k0_s1_e4_c256', # ConvNeXt - 'mqa_r1_k3_h4_s1_d64_c256', # MQA + 'mqa_r1_k3_h4_s1_d64_c256', # MQA 'uir_r1_a5_k5_s1_e4_c256', # ExtraDW - 'mqa_r1_k3_h4_s1_d64_c256', # MQA + 'mqa_r1_k3_h4_s1_d64_c256', # MQA 'uir_r1_a5_k0_s1_e4_c256', # ConvNeXt 'mqa_r1_k3_h4_s1_d64_c256', # MQA 'uir_r1_a5_k0_s1_e4_c256', # ConvNeXt ], # stage 4, 7x7 in - ['cn_r1_k1_s1_c960'], # Conv + [ + 'cn_r1_k1_s1_c960' # Conv + ], ] elif 'large' in variant: stem_size = 24 - num_features = 1280 act_layer = resolve_act_layer(kwargs, 'gelu') arch_def = [ # stage 0, 112x112 in - ['er_r1_k3_s2_e4_c48'], # FusedIB (EdgeResidual) + [ + 'er_r1_k3_s2_e4_c48', # FusedIB (EdgeResidual) + ], # stage 1, 56x56 in [ 'uir_r1_a3_k5_s2_e4_c96', # ExtraDW @@ -734,17 +745,19 @@ def _gen_mobilenet_v4(variant: str, channel_multiplier: float = 1.0, pretrained: 'uir_r2_a5_k0_s1_e4_c512', # ConvNeXt 'uir_r1_a5_k3_s1_e4_c512', # ExtraDW 'uir_r1_a5_k5_s1_e4_c512', # ExtraDW - 'mqa_r1_k3_h8_s1_d64_c512', # MQA + 'mqa_r1_k3_h8_s1_d64_c512', # MQA 'uir_r1_a5_k0_s1_e4_c512', # ConvNeXt - 'mqa_r1_k3_h8_s1_d64_c512', # MQA + 'mqa_r1_k3_h8_s1_d64_c512', # MQA 'uir_r1_a5_k0_s1_e4_c512', # ConvNeXt - 'mqa_r1_k3_h8_s1_d64_c512', # MQA + 'mqa_r1_k3_h8_s1_d64_c512', # MQA 'uir_r1_a5_k0_s1_e4_c512', # ConvNeXt - 'mqa_r1_k3_h8_s1_d64_c512', # MQA + 'mqa_r1_k3_h8_s1_d64_c512', # MQA 'uir_r1_a5_k0_s1_e4_c512', # ConvNeXt ], # stage 4, 7x7 in - ['cn_r1_k1_s1_c960'], + [ + 'cn_r1_k1_s1_c960', # Conv + ], ] else: assert False, f'Unknown variant {variant}.' 
@@ -752,7 +765,6 @@ def _gen_mobilenet_v4(variant: str, channel_multiplier: float = 1.0, pretrained: layer_scale_init_value = None if 'small' in variant: stem_size = 32 - num_features = 1280 act_layer = resolve_act_layer(kwargs, 'relu') arch_def = [ # stage 0, 112x112 in @@ -780,15 +792,18 @@ def _gen_mobilenet_v4(variant: str, channel_multiplier: float = 1.0, pretrained: 'uir_r2_a0_k3_s1_e4_c128', # IR ], # stage 4, 7x7 in - ['cn_r1_k1_s1_c960'], # Conv + [ + 'cn_r1_k1_s1_c960', # Conv + ], ] elif 'medium' in variant: stem_size = 32 - num_features = 1280 act_layer = resolve_act_layer(kwargs, 'relu') arch_def = [ # stage 0, 112x112 in - ['er_r1_k3_s2_e4_c48'], # FusedIB (EdgeResidual) + [ + 'er_r1_k3_s2_e4_c48', # FusedIB (EdgeResidual) + ], # stage 1, 56x56 in [ 'uir_r1_a3_k5_s2_e4_c80', # ExtraDW @@ -817,15 +832,18 @@ def _gen_mobilenet_v4(variant: str, channel_multiplier: float = 1.0, pretrained: 'uir_r1_a5_k0_s1_e2_c256', # ConvNeXt ], # stage 4, 7x7 in - ['cn_r1_k1_s1_c960'], # Conv + [ + 'cn_r1_k1_s1_c960', # Conv + ], ] elif 'large' in variant: stem_size = 24 - num_features = 1280 act_layer = resolve_act_layer(kwargs, 'relu') arch_def = [ # stage 0, 112x112 in - ['er_r1_k3_s2_e4_c48'], # FusedIB (EdgeResidual) + [ + 'er_r1_k3_s2_e4_c48', # FusedIB (EdgeResidual) + ], # stage 1, 56x56 in [ 'uir_r1_a3_k5_s2_e4_c96', # ExtraDW @@ -851,24 +869,23 @@ def _gen_mobilenet_v4(variant: str, channel_multiplier: float = 1.0, pretrained: ], # stage 4, 7x7 in - ['cn_r1_k1_s1_c960'], # Conv + [ + 'cn_r1_k1_s1_c960', # Conv + ], ] else: assert False, f'Unknown variant {variant}.' - # NOTE SE not used in initial MobileNet-v4 definitions - se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU, rd_round_fn=round_channels) model_kwargs = dict( block_args=decode_arch_def(arch_def), head_bias=False, head_norm=True, num_features=num_features, stem_size=stem_size, - fix_stem=channel_multiplier < 0.75, + fix_stem=channel_multiplier < 1.0, round_chs_fn=partial(round_channels, multiplier=channel_multiplier), norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), act_layer=act_layer, - se_layer=se_layer, layer_scale_init_value=layer_scale_init_value, **kwargs, ) @@ -904,9 +921,6 @@ default_cfgs = generate_default_cfgs({ origin_url='https://github.com/Alibaba-MIIL/ImageNet21K', paper_ids='arXiv:2104.10972v4', interpolation='bilinear', mean=(0., 0., 0.), std=(1., 1., 1.), num_classes=11221), - 'mobilenetv3_large_150.untrained': _cfg( - interpolation='bicubic'), - 'mobilenetv3_small_050.lamb_in1k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_050_lambc-4b7bbe87.pth', @@ -985,28 +999,48 @@ default_cfgs = generate_default_cfgs({ 'mobilenetv4_conv_small': _cfg( # hf_hub_id='timm/', interpolation='bicubic'), - 'mobilenetv4_conv_medium': _cfg( - #hf_hub_id='timm/', - interpolation='bicubic'), - 'mobilenetv4_conv_large': _cfg( + 'mobilenetv4_conv_medium.r224': _cfg( # hf_hub_id='timm/', - interpolation='bicubic'), + crop_pct=0.95, interpolation='bicubic'), + 'mobilenetv4_conv_medium.r256': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.95, interpolation='bicubic'), + 'mobilenetv4_conv_large.r256': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.95, interpolation='bicubic'), + 'mobilenetv4_conv_large.r384': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=0.95, interpolation='bicubic'), 
'mobilenetv4_hybrid_small': _cfg( # hf_hub_id='timm/', interpolation='bicubic'), - 'mobilenetv4_hybrid_medium': _cfg( + 'mobilenetv4_hybrid_medium.r224': _cfg( # hf_hub_id='timm/', - interpolation='bicubic'), - 'mobilenetv4_hybrid_large': _cfg( + crop_pct=0.95, interpolation='bicubic'), + 'mobilenetv4_hybrid_medium.r256': _cfg( # hf_hub_id='timm/', - interpolation='bicubic'), + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.95, interpolation='bicubic'), + 'mobilenetv4_hybrid_large.r256': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.95, interpolation='bicubic'), + 'mobilenetv4_hybrid_large.r384': _cfg( + # hf_hub_id='timm/', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=0.95, interpolation='bicubic'), + + # experimental + 'mobilenetv4_conv_aa_medium.r256': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.95, interpolation='bicubic'), + 'mobilenetv4_conv_blur_medium.r256': _cfg( + # hf_hub_id='timm/', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.95, interpolation='bicubic'), 'mobilenetv4_hybrid_medium_075': _cfg( # hf_hub_id='timm/', - interpolation='bicubic'), - 'mobilenetv4_hybrid_medium_150': _cfg( + crop_pct=0.95, interpolation='bicubic'), + 'mobilenetv4_hybrid_large_075.r256': _cfg( # hf_hub_id='timm/', - interpolation='bicubic'), + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.95, interpolation='bicubic'), }) @@ -1024,13 +1058,6 @@ def mobilenetv3_large_100(pretrained: bool = False, **kwargs) -> MobileNetV3: return model -@register_model -def mobilenetv3_large_150(pretrained: bool = False, **kwargs) -> MobileNetV3: - """ MobileNet V3 """ - model = _gen_mobilenet_v3('mobilenetv3_large_150', 1.5, pretrained=pretrained, **kwargs) - return model - - @register_model def mobilenetv3_small_050(pretrained: bool = False, **kwargs) -> MobileNetV3: """ MobileNet V3 """ @@ -1191,13 +1218,6 @@ def mobilenetv4_conv_large(pretrained: bool = False, **kwargs) -> MobileNetV3: return model -@register_model -def mobilenetv4_hybrid_medium_075(pretrained: bool = False, **kwargs) -> MobileNetV3: - """ MobileNet V4 Hybrid """ - model = _gen_mobilenet_v4('mobilenetv4_hybrid_medium_075', 0.75, pretrained=pretrained, **kwargs) - return model - - @register_model def mobilenetv4_hybrid_medium(pretrained: bool = False, **kwargs) -> MobileNetV3: """ MobileNet V4 Hybrid """ @@ -1205,13 +1225,6 @@ def mobilenetv4_hybrid_medium(pretrained: bool = False, **kwargs) -> MobileNetV3 return model -@register_model -def mobilenetv4_hybrid_medium_150(pretrained: bool = False, **kwargs) -> MobileNetV3: - """ MobileNet V4 Hybrid """ - model = _gen_mobilenet_v4('mobilenetv4_hybrid_medium_150', 1.5, pretrained=pretrained, **kwargs) - return model - - @register_model def mobilenetv4_hybrid_large(pretrained: bool = False, **kwargs) -> MobileNetV3: """ MobileNet V4 Hybrid""" @@ -1219,6 +1232,33 @@ def mobilenetv4_hybrid_large(pretrained: bool = False, **kwargs) -> MobileNetV3: return model +@register_model +def mobilenetv4_conv_aa_medium(pretrained: bool = False, **kwargs) -> MobileNetV3: + """ MobileNet V4 w/ AvgPool AA """ + model = _gen_mobilenet_v4('mobilenetv4_conv_aa_medium', 1.0, pretrained=pretrained, aa_layer='avg', **kwargs) + return model + + +@register_model +def mobilenetv4_conv_blur_medium(pretrained: bool = False, **kwargs) -> MobileNetV3: + """ MobileNet V4 Conv w/ Blur AA """ + model = _gen_mobilenet_v4('mobilenetv4_conv_blur_medium', 1.0, pretrained=pretrained, aa_layer='blurpc', **kwargs) + return 
model
+
+
+@register_model
+def mobilenetv4_hybrid_medium_075(pretrained: bool = False, **kwargs) -> MobileNetV3:
+    """ MobileNet V4 Hybrid """
+    model = _gen_mobilenet_v4('mobilenetv4_hybrid_medium_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv4_hybrid_large_075(pretrained: bool = False, **kwargs) -> MobileNetV3:
+    """ MobileNet V4 Hybrid """
+    model = _gen_mobilenet_v4('mobilenetv4_hybrid_large_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
 register_model_deprecations(__name__, {
     'mobilenetv3_large_100_miil': 'mobilenetv3_large_100.miil_in21k_ft_in1k',
diff --git a/timm/models/resnet.py b/timm/models/resnet.py
index 53dfab9c..15f16997 100644
--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@@ -17,7 +17,7 @@ import torch.nn.functional as F
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from timm.layers import DropBlock2d, DropPath, AvgPool2dSame, BlurPool2d, GroupNorm, LayerType, create_attn, \
-    get_attn, get_act_layer, get_norm_layer, create_classifier
+    get_attn, get_act_layer, get_norm_layer, create_classifier, create_aa
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
 from ._manipulate import checkpoint_seq
@@ -31,15 +31,6 @@ def get_padding(kernel_size: int, stride: int, dilation: int = 1) -> int:
     return padding
 
 
-def create_aa(aa_layer: Type[nn.Module], channels: int, stride: int = 2, enable: bool = True) -> nn.Module:
-    if not aa_layer or not enable:
-        return nn.Identity()
-    if issubclass(aa_layer, nn.AvgPool2d):
-        return aa_layer(stride)
-    else:
-        return aa_layer(channels=channels, stride=stride)
-
-
 class BasicBlock(nn.Module):
     expansion = 1
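
Quick usage sketch (not part of the patch): the snippet below exercises the new anti-aliasing plumbing, assuming this diff is applied to a local timm checkout. The model names, the string aliases accepted by create_aa ('avg'/'avgpool', 'blur'/'blurpool', 'blurpc'), and the channels=None behaviour of BlurPool2d are taken from the changes above; timm.create_model is the standard timm factory entry point.

import torch
import timm
from timm.layers import BlurPool2d, create_aa

x = torch.randn(1, 32, 64, 64)

# BlurPool2d no longer requires a fixed channel count; with channels=None the blur
# filter is expanded to the input's channel dim at runtime ('blurpc' == constant pad).
aa_blur = BlurPool2d(channels=None, filt_size=3, stride=2, pad_mode='constant')
print(aa_blur(x).shape)  # torch.Size([1, 32, 32, 32])

# create_aa resolves string aliases as well as layer classes/partials, and returns
# nn.Identity() when anti-aliasing is disabled (enable=False or aa_layer is None).
aa_avg = create_aa('avg', channels=32, stride=2)
aa_off = create_aa('blur', channels=32, stride=2, enable=False)
print(aa_avg(x).shape, type(aa_off).__name__)

# One of the experimental AA variants registered in this diff (untrained cfg);
# efficientnet_blur_b0 and mobilenetv4_conv_aa_medium work the same way.
model = timm.create_model('mobilenetv4_conv_blur_medium', pretrained=False)
print(model(torch.randn(1, 3, 256, 256)).shape)  # torch.Size([1, 1000])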