From a9eb48483564336049ad5d2283dea0cb334a7510 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Sat, 19 Oct 2019 14:48:30 -0700
Subject: [PATCH 01/35] Add memory efficient Swish impl

---
 timm/models/gen_efficientnet.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/timm/models/gen_efficientnet.py b/timm/models/gen_efficientnet.py
index c11782a7..00c2c86c 100644
--- a/timm/models/gen_efficientnet.py
+++ b/timm/models/gen_efficientnet.py
@@ -371,11 +371,30 @@ def _decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil'):
     return arch_args
 
 
-def swish(x, inplace=False):
-    if inplace:
-        return x.mul_(x.sigmoid())
-    else:
-        return x * x.sigmoid()
+_USE_SWISH_OPT = True
+if _USE_SWISH_OPT:
+    class SwishAutoFn(torch.autograd.Function):
+        """ Memory Efficient Swish
+        From: https://blog.ceshine.net/post/pytorch-memory-swish/
+        """
+        @staticmethod
+        def forward(ctx, x):
+            result = x.mul(torch.sigmoid(x))
+            ctx.save_for_backward(x)
+            return result
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            x = ctx.saved_tensors[0]
+            sigmoid_x = torch.sigmoid(x)
+            return grad_output * (sigmoid_x * (1 + x * (1 - sigmoid_x)))
+
+
+    def swish(x, inplace=False):
+        return SwishAutoFn.apply(x)
+else:
+    def swish(x, inplace=False):
+        return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
 
 
 def sigmoid(x, inplace=False):

From b93fcf0708c5c15692e9bb172a693ac88da8adea Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Sat, 19 Oct 2019 17:05:37 -0700
Subject: [PATCH 02/35] Add Facebook Research Semi-Supervised and Semi-Weakly
 Supervised ResNet model weights.

---
 sotabench.py           |  59 ++++++++++
 timm/models/helpers.py |  16 +--
 timm/models/resnet.py  | 242 ++++++++++++++++++++++++++++++++++-------
 3 files changed, 270 insertions(+), 47 deletions(-)

diff --git a/sotabench.py b/sotabench.py
index 56215cc4..fb967968 100644
--- a/sotabench.py
+++ b/sotabench.py
@@ -167,6 +167,65 @@ model_list = [
     _entry('ig_resnext101_32x48d', 'ResNeXt-101 32x48d (288x288 Mean-Max Pooling)', '1805.00932',
            ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 8),
 
+    ## Facebook SSL weights
+    _entry('ssl_resnet18', 'ResNet-18', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnet50', 'ResNet-50', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnext50_32x4d', 'ResNeXt-50 32x4d', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnext101_32x4d', 'ResNeXt-101 32x4d', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnext101_32x8d', 'ResNeXt-101 32x8d', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnext101_32x16d', 'ResNeXt-101 32x16d', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+
+    _entry('ssl_resnet50', 'ResNet-50 (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnext50_32x4d', 'ResNeXt-50 32x4d (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnext101_32x4d', 'ResNeXt-101 32x4d (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnext101_32x8d', 'ResNeXt-101 32x8d (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('ssl_resnext101_32x16d', 'ResNeXt-101 32x16d (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 2,
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+
+    ## Facebook SWSL weights
+    _entry('swsl_resnet18', 'ResNet-18', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnet50', 'ResNet-50', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnext50_32x4d', 'ResNeXt-50 32x4d', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnext101_32x4d', 'ResNeXt-101 32x4d', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnext101_32x8d', 'ResNeXt-101 32x8d', '1905.00546',
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnext101_32x16d', 'ResNeXt-101 32x16d', '1905.00546'),
+
+    _entry('swsl_resnet50', 'ResNet-50 (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnext50_32x4d', 'ResNeXt-50 32x4d (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnext101_32x4d', 'ResNeXt-101 32x4d (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnext101_32x8d', 'ResNeXt-101 32x8d (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+    _entry('swsl_resnext101_32x16d', 'ResNeXt-101 32x16d (288x288 Mean-Max Pooling)', '1905.00546',
+           ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 2,
+           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+
     ## DLA official impl weights (to remove if sotabench added to source)
     _entry('dla34', 'DLA-34', '1707.06484'),
     _entry('dla46_c', 'DLA-46-C', '1707.06484'),
diff --git a/timm/models/helpers.py b/timm/models/helpers.py
index ef5b53d7..919bb997 100644
--- a/timm/models/helpers.py
+++ b/timm/models/helpers.py
@@ -57,15 +57,17 @@ def resume_checkpoint(model, checkpoint_path):
         raise FileNotFoundError()
 
 
-def load_pretrained(model, default_cfg, num_classes=1000, in_chans=3, filter_fn=None):
-    if 'url' not in default_cfg or not default_cfg['url']:
+def load_pretrained(model, cfg=None, num_classes=1000, in_chans=3, filter_fn=None):
+    if cfg is None:
+        cfg = getattr(model, 'default_cfg', None)
+    if cfg is None or 'url' not in cfg or not cfg['url']:
         logging.warning("Pretrained model URL is invalid, using random initialization.")
         return
 
-    state_dict = model_zoo.load_url(default_cfg['url'], progress=False)
+    state_dict = model_zoo.load_url(cfg['url'], progress=False)
 
     if in_chans == 1:
-        conv1_name = default_cfg['first_conv']
+        conv1_name = cfg['first_conv']
         logging.info('Converting first conv (%s) from 3 to 1 channel' % conv1_name)
         conv1_weight = state_dict[conv1_name + '.weight']
         state_dict[conv1_name + '.weight'] = conv1_weight.sum(dim=1, keepdim=True)
@@ -73,14 +75,14 @@ def load_pretrained(model, default_cfg, num_classes=1000, in_chans=3, filter_fn=
         assert False, "Invalid in_chans for pretrained weights"
 
     strict = True
-    classifier_name = default_cfg['classifier']
-    if num_classes == 1000 and default_cfg['num_classes'] == 1001:
+    classifier_name = cfg['classifier']
+    if num_classes == 1000 and cfg['num_classes'] == 1001:
         # special case for imagenet trained models with extra background class in pretrained weights
         classifier_weight = state_dict[classifier_name + '.weight']
         state_dict[classifier_name + '.weight'] = classifier_weight[1:]
         classifier_bias = state_dict[classifier_name + '.bias']
         state_dict[classifier_name + '.bias'] = classifier_bias[1:]
-    elif num_classes != default_cfg['num_classes']:
+    elif num_classes != cfg['num_classes']:
         # completely discard fully connected for all other differences between pretrained and created model
         del state_dict[classifier_name + '.weight']
         del state_dict[classifier_name + '.bias']
diff --git a/timm/models/resnet.py b/timm/models/resnet.py
index c6b2290d..bedd303d 100644
--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@@ -67,6 +67,30 @@ default_cfgs = {
     'ig_resnext101_32x16d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x16-c6f796b0.pth'),
     'ig_resnext101_32x32d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x32-e4b90b00.pth'),
     'ig_resnext101_32x48d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x48-3e41cc8a.pth'),
+    'ssl_resnet18':  _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnet18-d92f0530.pth'),
+    'ssl_resnet50':  _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnet50-08389792.pth'),
+    'ssl_resnext50_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext50_32x4-ddb3e555.pth'),
+    'ssl_resnext101_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x4-dc43570a.pth'),
+    'ssl_resnext101_32x8d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x8-2cfe2f8b.pth'),
+    'ssl_resnext101_32x16d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x16-15fffa57.pth'),
+    'swsl_resnet18': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnet18-118f1556.pth'),
+    'swsl_resnet50': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnet50-16a12f1b.pth'),
+    'swsl_resnext50_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext50_32x4-72679e44.pth'),
+    'swsl_resnext101_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x4-3f87e46b.pth'),
+    'swsl_resnext101_32x8d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x8-b4712904.pth'),
+    'swsl_resnext101_32x16d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x16-f3559a9c.pth'),
 }
 
 
@@ -621,80 +645,218 @@ def tv_resnext50_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
 
 
 @register_model
-def ig_resnext101_32x8d(pretrained=True, num_classes=1000, in_chans=3, **kwargs):
+def ig_resnext101_32x8d(pretrained=True, **kwargs):
     """Constructs a ResNeXt-101 32x8 model pre-trained on weakly-supervised data
     and finetuned on ImageNet from Figure 5 in
     `"Exploring the Limits of Weakly Supervised Pretraining" <https://arxiv.org/abs/1805.00932>`_
     Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/
-    Args:
-        pretrained (bool): load pretrained weights
-        num_classes (int): number of classes for classifier (default: 1000 for pretrained)
-        in_chans (int): number of input planes (default: 3 for pretrained / color)
     """
-    default_cfg = default_cfgs['ig_resnext101_32x8d']
-    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=8,
-                   num_classes=1000, in_chans=3, **kwargs)
-    model.default_cfg = default_cfg
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=8, **kwargs)
+    model.default_cfg = default_cfgs['ig_resnext101_32x8d']
     if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
     return model
 
 
 @register_model
-def ig_resnext101_32x16d(pretrained=True, num_classes=1000, in_chans=3, **kwargs):
+def ig_resnext101_32x16d(pretrained=True, **kwargs):
     """Constructs a ResNeXt-101 32x16 model pre-trained on weakly-supervised data
     and finetuned on ImageNet from Figure 5 in
     `"Exploring the Limits of Weakly Supervised Pretraining" <https://arxiv.org/abs/1805.00932>`_
     Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/
-    Args:
-        pretrained (bool): load pretrained weights
-        num_classes (int): number of classes for classifier (default: 1000 for pretrained)
-        in_chans (int): number of input planes (default: 3 for pretrained / color)
     """
-    default_cfg = default_cfgs['ig_resnext101_32x16d']
-    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=16,
-                   num_classes=1000, in_chans=3, **kwargs)
-    model.default_cfg = default_cfg
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=16, **kwargs)
+    model.default_cfg = default_cfgs['ig_resnext101_32x16d']
     if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
     return model
 
 
 @register_model
-def ig_resnext101_32x32d(pretrained=True, num_classes=1000, in_chans=3, **kwargs):
+def ig_resnext101_32x32d(pretrained=True, **kwargs):
     """Constructs a ResNeXt-101 32x32 model pre-trained on weakly-supervised data
     and finetuned on ImageNet from Figure 5 in
     `"Exploring the Limits of Weakly Supervised Pretraining" <https://arxiv.org/abs/1805.00932>`_
     Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/
-    Args:
-        pretrained (bool): load pretrained weights
-        num_classes (int): number of classes for classifier (default: 1000 for pretrained)
-        in_chans (int): number of input planes (default: 3 for pretrained / color)
     """
-    default_cfg = default_cfgs['ig_resnext101_32x32d']
-    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=32,
-                   num_classes=1000, in_chans=3, **kwargs)
-    model.default_cfg = default_cfg
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=32, **kwargs)
+    model.default_cfg = default_cfgs['ig_resnext101_32x32d']
     if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
     return model
 
 
 @register_model
-def ig_resnext101_32x48d(pretrained=True, num_classes=1000, in_chans=3, **kwargs):
+def ig_resnext101_32x48d(pretrained=True, **kwargs):
     """Constructs a ResNeXt-101 32x48 model pre-trained on weakly-supervised data
     and finetuned on ImageNet from Figure 5 in
     `"Exploring the Limits of Weakly Supervised Pretraining" <https://arxiv.org/abs/1805.00932>`_
     Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/
-    Args:
-        pretrained (bool): load pretrained weights
-        num_classes (int): number of classes for classifier (default: 1000 for pretrained)
-        in_chans (int): number of input planes (default: 3 for pretrained / color)
     """
-    default_cfg = default_cfgs['ig_resnext101_32x48d']
-    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=48,
-                   num_classes=1000, in_chans=3, **kwargs)
-    model.default_cfg = default_cfg
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=48, **kwargs)
+    model.default_cfg = default_cfgs['ig_resnext101_32x48d']
     if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def ssl_resnet18(pretrained=True, **kwargs):
+    """Constructs a semi-supervised ResNet-18 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
+    model.default_cfg = default_cfgs['ssl_resnet18']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def ssl_resnet50(pretrained=True, **kwargs):
+    """Constructs a semi-supervised ResNet-50 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
+    model.default_cfg = default_cfgs['ssl_resnet50']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def ssl_resnext50_32x4d(pretrained=True, **kwargs):
+    """Constructs a semi-supervised ResNeXt-50 32x4 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4, **kwargs)
+    model.default_cfg = default_cfgs['ssl_resnext50_32x4d']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def ssl_resnext101_32x4d(pretrained=True, **kwargs):
+    """Constructs a semi-supervised ResNeXt-101 32x4 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4, **kwargs)
+    model.default_cfg = default_cfgs['ssl_resnext101_32x4d']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def ssl_resnext101_32x8d(pretrained=True, **kwargs):
+    """Constructs a semi-supervised ResNeXt-101 32x8 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=8, **kwargs)
+    model.default_cfg = default_cfgs['ssl_resnext101_32x8d']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def ssl_resnext101_32x16d(pretrained=True, **kwargs):
+    """Constructs a semi-supervised ResNeXt-101 32x16 model pre-trained on YFCC100M dataset and finetuned on ImageNet
+    `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+    Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=16, **kwargs)
+    model.default_cfg = default_cfgs['ssl_resnext101_32x16d']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def swsl_resnet18(pretrained=True, **kwargs):
+    """Constructs a semi-weakly supervised Resnet-18 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
+    model.default_cfg = default_cfgs['swsl_resnet18']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def swsl_resnet50(pretrained=True, **kwargs):
+    """Constructs a semi-weakly supervised ResNet-50 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
+    model.default_cfg = default_cfgs['swsl_resnet50']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def swsl_resnext50_32x4d(pretrained=True, **kwargs):
+    """Constructs a semi-weakly supervised ResNeXt-50 32x4 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4, **kwargs)
+    model.default_cfg = default_cfgs['swsl_resnext50_32x4d']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def swsl_resnext101_32x4d(pretrained=True, **kwargs):
+    """Constructs a semi-weakly supervised ResNeXt-101 32x4 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4, **kwargs)
+    model.default_cfg = default_cfgs['swsl_resnext101_32x4d']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def swsl_resnext101_32x8d(pretrained=True, **kwargs):
+    """Constructs a semi-weakly supervised ResNeXt-101 32x8 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=8, **kwargs)
+    model.default_cfg = default_cfgs['swsl_resnext101_32x8d']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def swsl_resnext101_32x16d(pretrained=True, **kwargs):
+    """Constructs a semi-weakly supervised ResNeXt-101 32x16 model pre-trained on 1B weakly supervised
+       image dataset and finetuned on ImageNet.
+       `"Billion-scale Semi-Supervised Learning for Image Classification" <https://arxiv.org/abs/1905.00546>`_
+       Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/
+    """
+    model = ResNet(Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=16, **kwargs)
+    model.default_cfg = default_cfgs['swsl_resnext101_32x16d']
+    if pretrained:
+        load_pretrained(model, num_classes=kwargs.get('num_classes', 1000), in_chans=kwargs.get('in_chans', 3))
     return model

From c099374771ab386522f98cb777d4fa97a5cf95c2 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Sat, 19 Oct 2019 17:27:46 -0700
Subject: [PATCH 03/35] Map pretrained checkpoint to cpu to avoid issue with
 some pretrained checkpoints still having CUDA tensors. Fixes #42

---
 timm/models/helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/timm/models/helpers.py b/timm/models/helpers.py
index 919bb997..9ac728da 100644
--- a/timm/models/helpers.py
+++ b/timm/models/helpers.py
@@ -64,7 +64,7 @@ def load_pretrained(model, cfg=None, num_classes=1000, in_chans=3, filter_fn=Non
         logging.warning("Pretrained model URL is invalid, using random initialization.")
         return
 
-    state_dict = model_zoo.load_url(cfg['url'], progress=False)
+    state_dict = model_zoo.load_url(cfg['url'], progress=False, map_location='cpu')
 
     if in_chans == 1:
         conv1_name = cfg['first_conv']

From 62105ed4a0ccae81b48a6dbe27554426b018c61d Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Sun, 20 Oct 2019 13:17:25 -0700
Subject: [PATCH 04/35] Better differentiate sotabench WSL, SSL, and SWSL
 models via model_desc

---
 sotabench.py | 70 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/sotabench.py b/sotabench.py
index fb967968..9521836f 100644
--- a/sotabench.py
+++ b/sotabench.py
@@ -154,77 +154,87 @@ model_list = [
     # _entry('wide_resnet101_2', , ),
 
     ## Facebook WSL weights
-    _entry('ig_resnext101_32x8d', 'ResNeXt-101 32x8d', '1805.00932'),
-    _entry('ig_resnext101_32x16d', 'ResNeXt-101 32x16d', '1805.00932'),
-    _entry('ig_resnext101_32x32d', 'ResNeXt-101 32x32d', '1805.00932', batch_size=BATCH_SIZE // 2),
-    _entry('ig_resnext101_32x48d', 'ResNeXt-101 32x48d', '1805.00932', batch_size=BATCH_SIZE // 4),
+    _entry('ig_resnext101_32x8d', 'ResNeXt-101 32x8d', '1805.00932',
+           model_desc='Weakly-Supervised pre-training on 1B Instagram hashtag dataset by Facebook Research'),
+    _entry('ig_resnext101_32x16d', 'ResNeXt-101 32x16d', '1805.00932',
+           model_desc='Weakly-Supervised pre-training on 1B Instagram hashtag dataset by Facebook Research'),
+    _entry('ig_resnext101_32x32d', 'ResNeXt-101 32x32d', '1805.00932', batch_size=BATCH_SIZE // 2,
+           model_desc='Weakly-Supervised pre-training on 1B Instagram hashtag dataset by Facebook Research'),
+    _entry('ig_resnext101_32x48d', 'ResNeXt-101 32x48d', '1805.00932', batch_size=BATCH_SIZE // 4,
+           model_desc='Weakly-Supervised pre-training on 1B Instagram hashtag dataset by Facebook Research'),
+
     _entry('ig_resnext101_32x8d', 'ResNeXt-101 32x8d (288x288 Mean-Max Pooling)', '1805.00932',
-           ttp=True, args=dict(img_size=288)),
+           ttp=True, args=dict(img_size=288),
+           model_desc='Weakly-Supervised pre-training on 1B Instagram hashtag dataset by Facebook Research'),
     _entry('ig_resnext101_32x16d', 'ResNeXt-101 32x16d (288x288 Mean-Max Pooling)', '1805.00932',
-           ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 2),
+           ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 2,
+           model_desc='Weakly-Supervised pre-training on 1B Instagram hashtag dataset by Facebook Research'),
     _entry('ig_resnext101_32x32d', 'ResNeXt-101 32x32d (288x288 Mean-Max Pooling)', '1805.00932',
-           ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 4),
+           ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 4,
+           model_desc='Weakly-Supervised pre-training on 1B Instagram hashtag dataset by Facebook Research'),
     _entry('ig_resnext101_32x48d', 'ResNeXt-101 32x48d (288x288 Mean-Max Pooling)', '1805.00932',
-           ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 8),
+           ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 8,
+           model_desc='Weakly-Supervised pre-training on 1B Instagram hashtag dataset by Facebook Research'),
 
     ## Facebook SSL weights
     _entry('ssl_resnet18', 'ResNet-18', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnet50', 'ResNet-50', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnext50_32x4d', 'ResNeXt-50 32x4d', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnext101_32x4d', 'ResNeXt-101 32x4d', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnext101_32x8d', 'ResNeXt-101 32x8d', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnext101_32x16d', 'ResNeXt-101 32x16d', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
 
     _entry('ssl_resnet50', 'ResNet-50 (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288),
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnext50_32x4d', 'ResNeXt-50 32x4d (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288),
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnext101_32x4d', 'ResNeXt-101 32x4d (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288),
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnext101_32x8d', 'ResNeXt-101 32x8d (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288),
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
     _entry('ssl_resnext101_32x16d', 'ResNeXt-101 32x16d (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 2,
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Supervised pre-training on YFCC100M dataset by Facebook Research'),
 
     ## Facebook SWSL weights
     _entry('swsl_resnet18', 'ResNet-18', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
     _entry('swsl_resnet50', 'ResNet-50', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
     _entry('swsl_resnext50_32x4d', 'ResNeXt-50 32x4d', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
     _entry('swsl_resnext101_32x4d', 'ResNeXt-101 32x4d', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
     _entry('swsl_resnext101_32x8d', 'ResNeXt-101 32x8d', '1905.00546',
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
-    _entry('swsl_resnext101_32x16d', 'ResNeXt-101 32x16d', '1905.00546'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
+    _entry('swsl_resnext101_32x16d', 'ResNeXt-101 32x16d', '1905.00546',
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
 
     _entry('swsl_resnet50', 'ResNet-50 (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288),
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
     _entry('swsl_resnext50_32x4d', 'ResNeXt-50 32x4d (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288),
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
     _entry('swsl_resnext101_32x4d', 'ResNeXt-101 32x4d (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288),
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
     _entry('swsl_resnext101_32x8d', 'ResNeXt-101 32x8d (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288),
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
     _entry('swsl_resnext101_32x16d', 'ResNeXt-101 32x16d (288x288 Mean-Max Pooling)', '1905.00546',
            ttp=True, args=dict(img_size=288), batch_size=BATCH_SIZE // 2,
-           model_desc='Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/'),
+           model_desc='Semi-Weakly-Supervised pre-training on 1 billion unlabelled dataset by Facebook Research'),
 
     ## DLA official impl weights (to remove if sotabench added to source)
     _entry('dla34', 'DLA-34', '1707.06484'),

From 0d58c50fb1d5fca98a388dd028a6f6126c9a8ead Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Wed, 30 Oct 2019 16:47:39 -0700
Subject: [PATCH 05/35] Add TF RandAug weights for B5/B7 EfficientNet models.

---
 README.md                       | 8 ++++----
 sotabench.py                    | 4 ++--
 timm/models/gen_efficientnet.py | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f622f51f..615e68b1 100644
--- a/README.md
+++ b/README.md
@@ -105,12 +105,12 @@ I've leveraged the training scripts in this repository to train a few of the mod
 
 | Model | Prec@1 (Err) | Prec@5 (Err) | Param # | Image Scaling | Image Size | Source |
 |---|---|---|---|---|---|---|
-| tf_efficientnet_b7 *tfp  | 84.480 (15.520) | 96.870 (3.130) | 66.35  | bicubic | 600 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b7       | 84.420 (15.580) | 96.906 (3.094) | 66.35  | bicubic | 600 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
+| tf_efficientnet_b7 *tfp  | 84.940 (15.060) | 97.214 (2.786) | 66.35  | bicubic | 600 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
+| tf_efficientnet_b7       | 84.932 (15.068) | 97.208 (2.792) | 66.35  | bicubic | 600 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
 | tf_efficientnet_b6 *tfp  | 84.140 (15.860) | 96.852 (3.148) | 43.04  | bicubic | 528 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
 | tf_efficientnet_b6       | 84.110 (15.890) | 96.886 (3.114) | 43.04  | bicubic | 528 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b5 *tfp  | 83.694 (16.306) | 96.696 (3.304) | 30.39  | bicubic | 456 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b5       | 83.688 (16.312) | 96.714 (3.286) | 30.39  | bicubic | 456 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
+| tf_efficientnet_b5 *tfp  | 83.822 (16.178) | 96.756 (3.244) | 30.39  | bicubic | 456 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
+| tf_efficientnet_b5       | 83.812 (16.188) | 96.748 (3.252) | 30.39  | bicubic | 456 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
 | tf_efficientnet_b4       | 83.022 (16.978) | 96.300 (3.700) | 19.34  | bicubic | 380 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
 | tf_efficientnet_b4 *tfp  | 82.948 (17.052) | 96.308 (3.692) | 19.34  | bicubic | 380 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
 | tf_efficientnet_b3 *tfp  | 81.576 (18.424) | 95.662 (4.338) | 12.23  | bicubic | 300 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
diff --git a/sotabench.py b/sotabench.py
index 9521836f..5b61a93f 100644
--- a/sotabench.py
+++ b/sotabench.py
@@ -108,11 +108,11 @@ model_list = [
            model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_efficientnet_b4', 'EfficientNet-B4 (AutoAugment)', '1905.11946', batch_size=BATCH_SIZE//2,
            model_desc='Ported from official Google AI Tensorflow weights'),
-    _entry('tf_efficientnet_b5', 'EfficientNet-B5 (AutoAugment)', '1905.11946', batch_size=BATCH_SIZE//4,
+    _entry('tf_efficientnet_b5', 'EfficientNet-B5 (RandAugment)', '1905.11946', batch_size=BATCH_SIZE//4,
            model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_efficientnet_b6', 'EfficientNet-B6 (AutoAugment)', '1905.11946', batch_size=BATCH_SIZE//8,
            model_desc='Ported from official Google AI Tensorflow weights'),
-    _entry('tf_efficientnet_b7', 'EfficientNet-B7 (AutoAugment)', '1905.11946', batch_size=BATCH_SIZE//8,
+    _entry('tf_efficientnet_b7', 'EfficientNet-B7 (RandAugment)', '1905.11946', batch_size=BATCH_SIZE//8,
            model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_efficientnet_es', 'EfficientNet-EdgeTPU-S', '1905.11946',
            model_desc='Ported from official Google AI Tensorflow weights'),
diff --git a/timm/models/gen_efficientnet.py b/timm/models/gen_efficientnet.py
index 00c2c86c..d3a5cb60 100644
--- a/timm/models/gen_efficientnet.py
+++ b/timm/models/gen_efficientnet.py
@@ -112,13 +112,13 @@ default_cfgs = {
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_aa-818f208c.pth',
         input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922),
     'tf_efficientnet_b5': _cfg(
-        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_aa-99018a74.pth',
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ra-9a3e5369.pth',
         input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934),
     'tf_efficientnet_b6': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_aa-80ba17e4.pth',
         input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942),
     'tf_efficientnet_b7': _cfg(
-        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_aa-076e3472.pth',
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth',
         input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949),
     'tf_efficientnet_es': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth',

From 4748c6dff2ead22f1d2785e7aa5e20e2d6db9140 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Sat, 2 Nov 2019 17:42:55 -0700
Subject: [PATCH 06/35] Fix non-prefetch variant of Mixup. Fixes #50

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index d4dd7332..b0e18bdd 100644
--- a/train.py
+++ b/train.py
@@ -439,7 +439,7 @@ def train_epoch(
                 lam = 1.
                 if not args.mixup_off_epoch or epoch < args.mixup_off_epoch:
                     lam = np.random.beta(args.mixup, args.mixup)
-                input.mul_(lam).add_(1 - lam, input.flip(0))
+                input = input.mul(lam).add_(1 - lam, input.flip(0))
                 target = mixup_target(target, args.num_classes, lam, args.smoothing)
 
         output = model(input)

From 4243f076f1b1dbc67d342a731405407e098c49d1 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 21 Nov 2019 21:14:33 -0800
Subject: [PATCH 07/35] Adding RandAugment to AutoAugment impl, some tweaks to
 AA included

---
 timm/data/__init__.py     |   2 +
 timm/data/auto_augment.py | 312 ++++++++++++++++++++++++++++++--------
 timm/data/transforms.py   |   9 +-
 3 files changed, 253 insertions(+), 70 deletions(-)

diff --git a/timm/data/__init__.py b/timm/data/__init__.py
index 61ad83ea..49c4bc60 100644
--- a/timm/data/__init__.py
+++ b/timm/data/__init__.py
@@ -4,3 +4,5 @@ from .dataset import Dataset, DatasetTar
 from .transforms import *
 from .loader import create_loader, create_transform
 from .mixup import mixup_target, FastCollateMixup
+from .auto_augment import RandAugment, AutoAugment, rand_augment_ops, auto_augment_policy,\
+    rand_augment_transform, auto_augment_transform
diff --git a/timm/data/auto_augment.py b/timm/data/auto_augment.py
index 04c0b60a..9d711cd1 100644
--- a/timm/data/auto_augment.py
+++ b/timm/data/auto_augment.py
@@ -7,11 +7,13 @@ Hacked together by Ross Wightman
 """
 import random
 import math
+import re
 from PIL import Image, ImageOps, ImageEnhance
 import PIL
 import numpy as np
 
 
+
 _PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]])
 
 _FILL = (128, 128, 128)
@@ -25,11 +27,11 @@ _HPARAMS_DEFAULT = dict(
     img_mean=_FILL,
 )
 
-_RANDOM_INTERPOLATION = (Image.NEAREST, Image.BILINEAR, Image.BICUBIC)
+_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)
 
 
 def _interpolation(kwargs):
-    interpolation = kwargs.pop('resample', Image.NEAREST)
+    interpolation = kwargs.pop('resample', Image.BILINEAR)
     if isinstance(interpolation, (list, tuple)):
         return random.choice(interpolation)
     else:
@@ -140,7 +142,6 @@ def solarize_add(img, add, thresh=128, **__):
 def posterize(img, bits_to_keep, **__):
     if bits_to_keep >= 8:
         return img
-    bits_to_keep = max(1, bits_to_keep)  # prevent all 0 images
     return ImageOps.posterize(img, bits_to_keep)
 
 
@@ -165,61 +166,89 @@ def _randomly_negate(v):
     return -v if random.random() > 0.5 else v
 
 
-def _rotate_level_to_arg(level):
+def _rotate_level_to_arg(level, _hparams):
     # range [-30, 30]
     level = (level / _MAX_LEVEL) * 30.
     level = _randomly_negate(level)
-    return (level,)
+    return level,
 
 
-def _enhance_level_to_arg(level):
+def _enhance_level_to_arg(level, _hparams):
     # range [0.1, 1.9]
-    return ((level / _MAX_LEVEL) * 1.8 + 0.1,)
+    return (level / _MAX_LEVEL) * 1.8 + 0.1,
 
 
-def _shear_level_to_arg(level):
+def _shear_level_to_arg(level, _hparams):
     # range [-0.3, 0.3]
     level = (level / _MAX_LEVEL) * 0.3
     level = _randomly_negate(level)
-    return (level,)
+    return level,
 
 
-def _translate_abs_level_to_arg(level, translate_const):
+def _translate_abs_level_to_arg(level, hparams):
+    translate_const = hparams['translate_const']
     level = (level / _MAX_LEVEL) * float(translate_const)
     level = _randomly_negate(level)
-    return (level,)
+    return level,
 
 
-def _translate_rel_level_to_arg(level):
+def _translate_rel_level_to_arg(level, _hparams):
     # range [-0.45, 0.45]
     level = (level / _MAX_LEVEL) * 0.45
     level = _randomly_negate(level)
-    return (level,)
+    return level,
 
 
-def level_to_arg(hparams):
-    return {
-        'AutoContrast': lambda level: (),
-        'Equalize': lambda level: (),
-        'Invert': lambda level: (),
-        'Rotate': _rotate_level_to_arg,
-        # FIXME these are both different from original impl as I believe there is a bug,
-        # not sure what is the correct alternative, hence 2 options that look better
-        'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4) + 4,),  # range [4, 8]
-        'Posterize2': lambda level: (4 - int((level / _MAX_LEVEL) * 4),),  # range [4, 0]
-        'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256),),  # range [0, 256]
-        'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110),),  # range [0, 110]
-        'Color': _enhance_level_to_arg,
-        'Contrast': _enhance_level_to_arg,
-        'Brightness': _enhance_level_to_arg,
-        'Sharpness': _enhance_level_to_arg,
-        'ShearX': _shear_level_to_arg,
-        'ShearY': _shear_level_to_arg,
-        'TranslateX': lambda level: _translate_abs_level_to_arg(level, hparams['translate_const']),
-        'TranslateY': lambda level: _translate_abs_level_to_arg(level, hparams['translate_const']),
-        'TranslateXRel': lambda level: _translate_rel_level_to_arg(level),
-        'TranslateYRel': lambda level: _translate_rel_level_to_arg(level),
-    }
+def _posterize_original_level_to_arg(level, _hparams):
+    # As per original AutoAugment paper description
+    # range [4, 8], 'keep 4 up to 8 MSB of image'
+    return int((level / _MAX_LEVEL) * 4) + 4,
+
+
+def _posterize_research_level_to_arg(level, _hparams):
+    # As per Tensorflow models research and UDA impl
+    # range [4, 0], 'keep 4 down to 0 MSB of original image'
+    return 4 - int((level / _MAX_LEVEL) * 4),
+
+
+def _posterize_tpu_level_to_arg(level, _hparams):
+    # As per Tensorflow TPU EfficientNet impl
+    # range [0, 4], 'keep 0 up to 4 MSB of original image'
+    return int((level / _MAX_LEVEL) * 4),
+
+
+def _solarize_level_to_arg(level, _hparams):
+    # range [0, 256]
+    return int((level / _MAX_LEVEL) * 256),
+
+
+def _solarize_add_level_to_arg(level, _hparams):
+    # range [0, 110]
+    return int((level / _MAX_LEVEL) * 110),
+
+
+LEVEL_TO_ARG = {
+    'AutoContrast': None,
+    'Equalize': None,
+    'Invert': None,
+    'Rotate': _rotate_level_to_arg,
+    # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers
+    'PosterizeOriginal': _posterize_original_level_to_arg,
+    'PosterizeResearch': _posterize_research_level_to_arg,
+    'PosterizeTpu': _posterize_tpu_level_to_arg,
+    'Solarize': _solarize_level_to_arg,
+    'SolarizeAdd': _solarize_add_level_to_arg,
+    'Color': _enhance_level_to_arg,
+    'Contrast': _enhance_level_to_arg,
+    'Brightness': _enhance_level_to_arg,
+    'Sharpness': _enhance_level_to_arg,
+    'ShearX': _shear_level_to_arg,
+    'ShearY': _shear_level_to_arg,
+    'TranslateX': _translate_abs_level_to_arg,
+    'TranslateY': _translate_abs_level_to_arg,
+    'TranslateXRel': _translate_rel_level_to_arg,
+    'TranslateYRel': _translate_rel_level_to_arg,
+}
 
 
 NAME_TO_OP = {
@@ -227,8 +256,9 @@ NAME_TO_OP = {
     'Equalize': equalize,
     'Invert': invert,
     'Rotate': rotate,
-    'Posterize': posterize,
-    'Posterize2': posterize,
+    'PosterizeOriginal': posterize,
+    'PosterizeResearch': posterize,
+    'PosterizeTpu': posterize,
     'Solarize': solarize,
     'SolarizeAdd': solarize_add,
     'Color': color,
@@ -246,35 +276,37 @@ NAME_TO_OP = {
 
 class AutoAugmentOp:
 
-    def __init__(self, name, prob, magnitude, hparams={}):
+    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
+        hparams = hparams or _HPARAMS_DEFAULT
         self.aug_fn = NAME_TO_OP[name]
-        self.level_fn = level_to_arg(hparams)[name]
+        self.level_fn = LEVEL_TO_ARG[name]
         self.prob = prob
         self.magnitude = magnitude
-        # If std deviation of magnitude is > 0, we introduce some randomness
-        # in the usually fixed policy and sample magnitude from normal dist
-        # with mean magnitude and std-dev of magnitude_std.
-        # NOTE This is being tested as it's not in paper or reference impl.
-        self.magnitude_std = 0.5  # FIXME add arg/hparam
-        self.kwargs = {
-            'fillcolor': hparams['img_mean'] if 'img_mean' in hparams else _FILL,
-            'resample': hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION
-        }
+        self.hparams = hparams.copy()
+        self.kwargs = dict(
+            fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL,
+            resample=hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION,
+        )
+
+        # If magnitude_noise is > 0, we introduce some randomness
+        # in the usually fixed policy and sample magnitude from a normal distribution
+        # with mean `magnitude` and std-dev of `magnitude_noise`.
+        # NOTE This is my own hack, being tested, not in papers or reference impls.
+        self.magnitude_noise = self.hparams.get('magnitude_noise', 0)
 
     def __call__(self, img):
-        if self.prob < random.random():
+        if random.random() > self.prob:
             return img
         magnitude = self.magnitude
-        if self.magnitude_std and self.magnitude_std > 0:
-            magnitude = random.gauss(magnitude, self.magnitude_std)
-        magnitude = min(_MAX_LEVEL, max(0, magnitude))
-        level_args = self.level_fn(magnitude)
+        if self.magnitude_noise and self.magnitude_noise > 0:
+            magnitude = random.gauss(magnitude, self.magnitude_noise)
+        magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range
+        level_args = self.level_fn(magnitude, self.hparams) if self.level_fn is not None else tuple()
         return self.aug_fn(img, *level_args, **self.kwargs)
 
 
-def auto_augment_policy_v0(hparams=_HPARAMS_DEFAULT):
-    # ImageNet policy from TPU EfficientNet impl, cannot find
-    # a paper reference.
+def auto_augment_policy_v0(hparams):
+    # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference.
     policy = [
         [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
         [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
@@ -288,7 +320,7 @@ def auto_augment_policy_v0(hparams=_HPARAMS_DEFAULT):
         [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
         [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
         [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
-        [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
+        [('PosterizeTpu', 0.4, 6), ('AutoContrast', 0.4, 7)],
         [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
         [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
         [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)],
@@ -298,27 +330,60 @@ def auto_augment_policy_v0(hparams=_HPARAMS_DEFAULT):
         [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
         [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
         [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
-        [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
+        [('PosterizeTpu', 0.8, 2), ('Solarize', 0.6, 10)],  # This results in black image with Tpu posterize
         [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
         [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
     ]
-    pc = [[AutoAugmentOp(*a, hparams) for a in sp] for sp in policy]
+    pc = [[AutoAugmentOp(*a, hparams=hparams) for a in sp] for sp in policy]
     return pc
 
 
-def auto_augment_policy_original(hparams=_HPARAMS_DEFAULT):
+def auto_augment_policy_v0r(hparams):
+    # ImageNet v0 policy from TPU EfficientNet impl, with research variation of Posterize
+    policy = [
+        [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
+        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
+        [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
+        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
+        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
+        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
+        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
+        [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
+        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
+        [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
+        [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
+        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
+        [('PosterizeResearch', 0.4, 6), ('AutoContrast', 0.4, 7)],
+        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
+        [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
+        [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)],
+        [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
+        [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
+        [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
+        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
+        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
+        [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
+        [('PosterizeResearch', 0.8, 2), ('Solarize', 0.6, 10)],
+        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
+        [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
+    ]
+    pc = [[AutoAugmentOp(*a, hparams=hparams) for a in sp] for sp in policy]
+    return pc
+
+
+def auto_augment_policy_original(hparams):
     # ImageNet policy from https://arxiv.org/abs/1805.09501
     policy = [
-        [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)],
+        [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)],
         [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
         [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
-        [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)],
+        [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)],
         [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
         [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)],
         [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)],
-        [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)],
+        [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)],
         [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)],
-        [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)],
+        [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)],
         [('Rotate', 0.8, 8), ('Color', 0.4, 0)],
         [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)],
         [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)],
@@ -335,15 +400,53 @@ def auto_augment_policy_original(hparams=_HPARAMS_DEFAULT):
         [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
         [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
     ]
-    pc = [[AutoAugmentOp(*a, hparams) for a in sp] for sp in policy]
+    pc = [[AutoAugmentOp(*a, hparams=hparams) for a in sp] for sp in policy]
     return pc
 
 
-def auto_augment_policy(name='v0', hparams=_HPARAMS_DEFAULT):
+def auto_augment_policy_originalr(hparams):
+    # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation
+    policy = [
+        [('PosterizeResearch', 0.4, 8), ('Rotate', 0.6, 9)],
+        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
+        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
+        [('PosterizeResearch', 0.6, 7), ('PosterizeResearch', 0.6, 6)],
+        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
+        [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)],
+        [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)],
+        [('PosterizeResearch', 0.8, 5), ('Equalize', 1.0, 2)],
+        [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)],
+        [('Equalize', 0.6, 8), ('PosterizeResearch', 0.4, 6)],
+        [('Rotate', 0.8, 8), ('Color', 0.4, 0)],
+        [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)],
+        [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)],
+        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
+        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
+        [('Rotate', 0.8, 8), ('Color', 1.0, 2)],
+        [('Color', 0.8, 8), ('Solarize', 0.8, 7)],
+        [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)],
+        [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)],
+        [('Color', 0.4, 0), ('Equalize', 0.6, 3)],
+        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
+        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
+        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
+        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
+        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
+    ]
+    pc = [[AutoAugmentOp(*a, hparams=hparams) for a in sp] for sp in policy]
+    return pc
+
+
+def auto_augment_policy(name='v0', hparams=None):
+    hparams = hparams or _HPARAMS_DEFAULT
     if name == 'original':
         return auto_augment_policy_original(hparams)
+    elif name == 'originalr':
+        return auto_augment_policy_originalr(hparams)
     elif name == 'v0':
         return auto_augment_policy_v0(hparams)
+    elif name == 'v0r':
+        return auto_augment_policy_v0r(hparams)
     else:
         assert False, 'Unknown AA policy (%s)' % name
 
@@ -358,3 +461,78 @@ class AutoAugment:
         for op in sub_policy:
             img = op(img)
         return img
+
+
+def auto_augment_transform(config_str, hparams):
+    config = config_str.split('-')
+    policy_name = config[0]
+    config = config[1:]
+    for c in config:
+        cs = re.split(r'(\d.*)', c)
+        if len(cs) >= 2:
+            key, val = cs[:2]
+            if key == 'noise':
+                # noise param injected via hparams for now
+                hparams.setdefault('magnitude_noise', float(val))
+    aa_policy = auto_augment_policy(policy_name, hparams=hparams)
+    return AutoAugment(aa_policy)
+
+
+_RAND_TRANSFORMS = [
+    'AutoContrast',
+    'Equalize',
+    'Invert',
+    'Rotate',
+    'PosterizeTpu',
+    'Solarize',
+    'SolarizeAdd',
+    'Color',
+    'Contrast',
+    'Brightness',
+    'Sharpness',
+    'ShearX',
+    'ShearY',
+    'TranslateXRel',
+    'TranslateYRel',
+    #'Cutout'  # FIXME I implement this as random erasing separately
+]
+
+
+def rand_augment_ops(magnitude=10, hparams=None, transforms=None):
+    hparams = hparams or _HPARAMS_DEFAULT
+    transforms = transforms or _RAND_TRANSFORMS
+    return [AutoAugmentOp(
+        name, prob=0.5, magnitude=magnitude, hparams=hparams) for name in transforms]
+
+
+class RandAugment:
+    def __init__(self, ops, num_layers=2):
+        self.ops = ops
+        self.num_layers = num_layers
+
+    def __call__(self, img):
+        for _ in range(self.num_layers):
+            op = random.choice(self.ops)
+            img = op(img)
+        return img
+
+
+def rand_augment_transform(config_str, hparams):
+    magnitude = 10
+    num_layers = 2
+    config = config_str.split('-')
+    assert config[0] == 'rand'
+    config = config[1:]
+    for c in config:
+        cs = re.split(r'(\d.*)', c)
+        if len(cs) >= 2:
+            key, val = cs[:2]
+            if key == 'noise':
+                # noise param injected via hparams for now
+                hparams.setdefault('magnitude_noise', float(val))
+            elif key == 'm':
+                magnitude = int(val)
+            elif key == 'n':
+                num_layers = int(val)
+    ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams)
+    return RandAugment(ra_ops, num_layers)
diff --git a/timm/data/transforms.py b/timm/data/transforms.py
index 33911638..ac03b098 100644
--- a/timm/data/transforms.py
+++ b/timm/data/transforms.py
@@ -9,7 +9,7 @@ import numpy as np
 
 from .constants import DEFAULT_CROP_PCT, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .random_erasing import RandomErasing
-from .auto_augment import AutoAugment, auto_augment_policy
+from .auto_augment import auto_augment_transform, rand_augment_transform
 
 
 class ToNumpy:
@@ -179,6 +179,7 @@ def transforms_imagenet_train(
         transforms.RandomHorizontalFlip()
     ]
     if auto_augment:
+        assert isinstance(auto_augment, str)
         if isinstance(img_size, tuple):
             img_size_min = min(img_size)
         else:
@@ -189,8 +190,10 @@ def transforms_imagenet_train(
         )
         if interpolation and interpolation != 'random':
             aa_params['interpolation'] = _pil_interp(interpolation)
-        aa_policy = auto_augment_policy(auto_augment, aa_params)
-        tfl += [AutoAugment(aa_policy)]
+        if 'rand' in auto_augment:
+            tfl += [rand_augment_transform(auto_augment, aa_params)]
+        else:
+            tfl += [auto_augment_transform(auto_augment, aa_params)]
     else:
         # color jitter is enabled when not using AA
         if isinstance(color_jitter, (list, tuple)):

From 31453b039ef1217171cfef128ad6ca4e595787ce Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Fri, 22 Nov 2019 13:00:24 -0800
Subject: [PATCH 08/35] Update Auto/RandAugment comments, README, more. * Add a
 weighted choice option for RandAugment * Adjust magnitude noise/std naming,
 config

---
 README.md                 |   1 +
 timm/data/auto_augment.py | 127 ++++++++++++++++++++++++++++++--------
 timm/data/transforms.py   |   2 +-
 3 files changed, 102 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 615e68b1..bb7f4206 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,7 @@ Several (less common) features that I often utilize in my projects are included.
 * Training schedules and techniques that provide competitive results (Cosine LR, Random Erasing, Label Smoothing, etc)
 * Mixup (as in https://arxiv.org/abs/1710.09412) - currently implementing/testing
 * An inference script that dumps output to CSV is provided as an example
+* AutoAugment (https://arxiv.org/abs/1805.09501) and RandAugment (https://arxiv.org/abs/1909.13719) ImageNet configurations modeled after impl for EfficientNet training (https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py)
 
 ## Results
 
diff --git a/timm/data/auto_augment.py b/timm/data/auto_augment.py
index 9d711cd1..d730c266 100644
--- a/timm/data/auto_augment.py
+++ b/timm/data/auto_augment.py
@@ -1,7 +1,7 @@
-""" Auto Augment
+""" AutoAugment and RandAugment
 Implementation adapted from:
     https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py
-Papers: https://arxiv.org/abs/1805.09501 and https://arxiv.org/abs/1906.11172
+Papers: https://arxiv.org/abs/1805.09501, https://arxiv.org/abs/1906.11172, and https://arxiv.org/abs/1909.13719
 
 Hacked together by Ross Wightman
 """
@@ -288,18 +288,18 @@ class AutoAugmentOp:
             resample=hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION,
         )
 
-        # If magnitude_noise is > 0, we introduce some randomness
+        # If magnitude_std is > 0, we introduce some randomness
         # in the usually fixed policy and sample magnitude from a normal distribution
-        # with mean `magnitude` and std-dev of `magnitude_noise`.
+        # with mean `magnitude` and std-dev of `magnitude_std`.
         # NOTE This is my own hack, being tested, not in papers or reference impls.
-        self.magnitude_noise = self.hparams.get('magnitude_noise', 0)
+        self.magnitude_std = self.hparams.get('magnitude_std', 0)
 
     def __call__(self, img):
         if random.random() > self.prob:
             return img
         magnitude = self.magnitude
-        if self.magnitude_noise and self.magnitude_noise > 0:
-            magnitude = random.gauss(magnitude, self.magnitude_noise)
+        if self.magnitude_std and self.magnitude_std > 0:
+            magnitude = random.gauss(magnitude, self.magnitude_std)
         magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range
         level_args = self.level_fn(magnitude, self.hparams) if self.level_fn is not None else tuple()
         return self.aug_fn(img, *level_args, **self.kwargs)
@@ -464,16 +464,32 @@ class AutoAugment:
 
 
 def auto_augment_transform(config_str, hparams):
+    """
+    Create an AutoAugment transform
+
+    :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by
+    dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr').
+    The remaining sections, not order specific, determine
+        'mstd' -  float std deviation of magnitude noise applied
+    Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5
+
+    :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme
+
+    :return: A PyTorch compatible Transform
+    """
     config = config_str.split('-')
     policy_name = config[0]
     config = config[1:]
     for c in config:
         cs = re.split(r'(\d.*)', c)
-        if len(cs) >= 2:
-            key, val = cs[:2]
-            if key == 'noise':
-                # noise param injected via hparams for now
-                hparams.setdefault('magnitude_noise', float(val))
+        if len(cs) < 2:
+            continue
+        key, val = cs[:2]
+        if key == 'mstd':
+            # noise param injected via hparams for now
+            hparams.setdefault('magnitude_std', float(val))
+        else:
+            assert False, 'Unknown AutoAugment config section'
     aa_policy = auto_augment_policy(policy_name, hparams=hparams)
     return AutoAugment(aa_policy)
 
@@ -498,6 +514,36 @@ _RAND_TRANSFORMS = [
 ]
 
 
+# These experimental weights are based loosely on the relative improvements mentioned in paper.
+# They may not result in increased performance, but could likely be tuned to do so.
+_RAND_CHOICE_WEIGHTS_0 = {
+    'Rotate': 0.3,
+    'ShearX': 0.2,
+    'ShearY': 0.2,
+    'TranslateXRel': 0.1,
+    'TranslateYRel': 0.1,
+    'Color': .025,
+    'Sharpness': 0.025,
+    'AutoContrast': 0.025,
+    'Solarize': .005,
+    'SolarizeAdd': .005,
+    'Contrast': .005,
+    'Brightness': .005,
+    'Equalize': .005,
+    'PosterizeTpu': 0,
+    'Invert': 0,
+}
+
+
+def _select_rand_weights(weight_idx=0, transforms=None):
+    transforms = transforms or _RAND_TRANSFORMS
+    assert weight_idx == 0  # only one set of weights currently
+    rand_weights = _RAND_CHOICE_WEIGHTS_0
+    probs = [rand_weights[k] for k in transforms]
+    probs /= np.sum(probs)
+    return probs
+
+
 def rand_augment_ops(magnitude=10, hparams=None, transforms=None):
     hparams = hparams or _HPARAMS_DEFAULT
     transforms = transforms or _RAND_TRANSFORMS
@@ -506,33 +552,60 @@ def rand_augment_ops(magnitude=10, hparams=None, transforms=None):
 
 
 class RandAugment:
-    def __init__(self, ops, num_layers=2):
+    def __init__(self, ops, num_layers=2, choice_weights=None):
         self.ops = ops
         self.num_layers = num_layers
+        self.choice_weights = choice_weights
 
     def __call__(self, img):
-        for _ in range(self.num_layers):
-            op = random.choice(self.ops)
+        # no replacement when using weighted choice
+        ops = np.random.choice(
+            self.ops, self.num_layers, replace=self.choice_weights is None, p=self.choice_weights)
+        for op in ops:
             img = op(img)
         return img
 
 
 def rand_augment_transform(config_str, hparams):
-    magnitude = 10
-    num_layers = 2
+    """
+    Create a RandAugment transform
+
+    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
+    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
+    sections, not order specific, determine
+        'm' - integer magnitude of rand augment
+        'n' - integer num layers (number of transform ops selected per image)
+        'w' - integer probability weight index (index of a set of weights to influence choice of op)
+        'mstd' -  float std deviation of magnitude noise applied
+    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
+    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2
+
+    :param hparams: Other hparams (kwargs) for the RandAugmentation scheme
+
+    :return: A PyTorch compatible Transform
+    """
+    magnitude = _MAX_LEVEL  # default to _MAX_LEVEL for magnitude (currently 10)
+    num_layers = 2  # default to 2 ops per image
+    weight_idx = None  # default to no probability weights for op choice
     config = config_str.split('-')
     assert config[0] == 'rand'
     config = config[1:]
     for c in config:
         cs = re.split(r'(\d.*)', c)
-        if len(cs) >= 2:
-            key, val = cs[:2]
-            if key == 'noise':
-                # noise param injected via hparams for now
-                hparams.setdefault('magnitude_noise', float(val))
-            elif key == 'm':
-                magnitude = int(val)
-            elif key == 'n':
-                num_layers = int(val)
+        if len(cs) < 2:
+            continue
+        key, val = cs[:2]
+        if key == 'mstd':
+            # noise param injected via hparams for now
+            hparams.setdefault('magnitude_std', float(val))
+        elif key == 'm':
+            magnitude = int(val)
+        elif key == 'n':
+            num_layers = int(val)
+        elif key == 'w':
+            weight_idx = int(val)
+        else:
+            assert False, 'Unknown RandAugment config section'
     ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams)
-    return RandAugment(ra_ops, num_layers)
+    choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx)
+    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)
diff --git a/timm/data/transforms.py b/timm/data/transforms.py
index ac03b098..41f2a63e 100644
--- a/timm/data/transforms.py
+++ b/timm/data/transforms.py
@@ -190,7 +190,7 @@ def transforms_imagenet_train(
         )
         if interpolation and interpolation != 'random':
             aa_params['interpolation'] = _pil_interp(interpolation)
-        if 'rand' in auto_augment:
+        if auto_augment.startswith('rand'):
             tfl += [rand_augment_transform(auto_augment, aa_params)]
         else:
             tfl += [auto_augment_transform(auto_augment, aa_params)]

From 7b83e67f77122d5c07ccafaa4c09719f947e00b2 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Fri, 22 Nov 2019 13:27:43 -0800
Subject: [PATCH 09/35] Pass drop connect arg through to EfficientNet models

---
 timm/models/factory.py | 7 ++++---
 train.py               | 5 ++++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/timm/models/factory.py b/timm/models/factory.py
index d807a342..3c051e75 100644
--- a/timm/models/factory.py
+++ b/timm/models/factory.py
@@ -25,12 +25,13 @@ def create_model(
     """
     margs = dict(pretrained=pretrained, num_classes=num_classes, in_chans=in_chans)
 
-    # Not all models have support for batchnorm params passed as args, only gen_efficientnet variants
-    supports_bn_params = is_model_in_modules(model_name, ['gen_efficientnet'])
-    if not supports_bn_params and any([x in kwargs for x in ['bn_tf', 'bn_momentum', 'bn_eps']]):
+    # Only gen_efficientnet models have support for batchnorm params or drop_connect_rate passed as args
+    is_efficientnet = is_model_in_modules(model_name, ['gen_efficientnet'])
+    if not is_efficientnet:
         kwargs.pop('bn_tf', None)
         kwargs.pop('bn_momentum', None)
         kwargs.pop('bn_eps', None)
+        kwargs.pop('drop_connect_rate', None)
 
     if is_model(model_name):
         create_fn = model_entrypoint(model_name)
diff --git a/train.py b/train.py
index b0e18bdd..776e5ef2 100644
--- a/train.py
+++ b/train.py
@@ -65,6 +65,8 @@ parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N',
                     help='input batch size for training (default: 32)')
 parser.add_argument('--drop', type=float, default=0.0, metavar='DROP',
                     help='Dropout rate (default: 0.)')
+parser.add_argument('--drop-connect', type=float, default=0.0, metavar='DROP',
+                    help='Drop connect rate (default: 0.)')
 # Optimizer parameters
 parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER',
                     help='Optimizer (default: "sgd"')
@@ -208,6 +210,7 @@ def main():
         pretrained=args.pretrained,
         num_classes=args.num_classes,
         drop_rate=args.drop,
+        drop_connect_rate=args.drop_connect,
         global_pool=args.gp,
         bn_tf=args.bn_tf,
         bn_momentum=args.bn_momentum,
@@ -253,7 +256,7 @@ def main():
             if args.local_rank == 0:
                 logging.info('Restoring NVIDIA AMP state from checkpoint')
             amp.load_state_dict(resume_state['amp'])
-    resume_state = None
+    resume_state = None  # clear it
 
     model_ema = None
     if args.model_ema:

From 1f39d15f154536eba0d1876652d7ee3dad3e78f9 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Fri, 22 Nov 2019 13:28:24 -0800
Subject: [PATCH 10/35] Allow float decay epochs arg for training, works out
 with step lr math

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 776e5ef2..b79a342e 100644
--- a/train.py
+++ b/train.py
@@ -89,7 +89,7 @@ parser.add_argument('--epochs', type=int, default=200, metavar='N',
                     help='number of epochs to train (default: 2)')
 parser.add_argument('--start-epoch', default=None, type=int, metavar='N',
                     help='manual epoch number (useful on restarts)')
-parser.add_argument('--decay-epochs', type=int, default=30, metavar='N',
+parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
                     help='epoch interval to decay LR')
 parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N',
                     help='epochs to warmup LR, if scheduler supports')

From 576d360f20c8299cfd909c86edad4afff45d3d01 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Fri, 22 Nov 2019 13:57:45 -0800
Subject: [PATCH 11/35] Bring in JIT version of optimized swish activation from
 gen_efficientnet as default (while working on feature extraction
 functionality here).

---
 timm/models/gen_efficientnet.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/timm/models/gen_efficientnet.py b/timm/models/gen_efficientnet.py
index d3a5cb60..a7191025 100644
--- a/timm/models/gen_efficientnet.py
+++ b/timm/models/gen_efficientnet.py
@@ -373,25 +373,37 @@ def _decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil'):
 
 _USE_SWISH_OPT = True
 if _USE_SWISH_OPT:
-    class SwishAutoFn(torch.autograd.Function):
-        """ Memory Efficient Swish
-        From: https://blog.ceshine.net/post/pytorch-memory-swish/
+    @torch.jit.script
+    def swish_jit_fwd(x):
+        return x.mul(torch.sigmoid(x))
+
+
+    @torch.jit.script
+    def swish_jit_bwd(x, grad_output):
+        x_sigmoid = torch.sigmoid(x)
+        return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
+
+
+    class SwishJitAutoFn(torch.autograd.Function):
+        """ torch.jit.script optimised Swish
+        Inspired by conversation btw Jeremy Howard & Adam Pazske
+        https://twitter.com/jeremyphoward/status/1188251041835315200
         """
+
         @staticmethod
         def forward(ctx, x):
-            result = x.mul(torch.sigmoid(x))
             ctx.save_for_backward(x)
-            return result
+            return swish_jit_fwd(x)
 
         @staticmethod
         def backward(ctx, grad_output):
-            x = ctx.saved_variables[0]
-            sigmoid_x = torch.sigmoid(x)
-            return grad_output * (sigmoid_x * (1 + x * (1 - sigmoid_x)))
+            x = ctx.saved_tensors[0]
+            return swish_jit_bwd(x, grad_output)
 
 
     def swish(x, inplace=False):
-        return SwishAutoFn.apply(x)
+        # inplace ignored
+        return SwishJitAutoFn.apply(x)
 else:
     def swish(x, inplace=False):
         return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())

From 506df0e3d0136da4063b2a2881958e57cf43c784 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Tue, 22 Oct 2019 23:42:04 -0700
Subject: [PATCH 12/35] Add CondConv support for EfficientNet into WIP for
 GenEfficientNet Feature extraction setup

---
 timm/models/conv2d_helpers.py   |  120 ---
 timm/models/conv2d_layers.py    |  255 ++++++
 timm/models/gen_efficientnet.py | 1291 +++++++++++++++++--------------
 timm/models/helpers.py          |    3 +-
 4 files changed, 976 insertions(+), 693 deletions(-)
 delete mode 100644 timm/models/conv2d_helpers.py
 create mode 100644 timm/models/conv2d_layers.py

diff --git a/timm/models/conv2d_helpers.py b/timm/models/conv2d_helpers.py
deleted file mode 100644
index 674eadca..00000000
--- a/timm/models/conv2d_helpers.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import math
-
-
-def _is_static_pad(kernel_size, stride=1, dilation=1, **_):
-    return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0
-
-
-def _get_padding(kernel_size, stride=1, dilation=1, **_):
-    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
-    return padding
-
-
-def _calc_same_pad(i, k, s, d):
-    return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
-
-
-def _split_channels(num_chan, num_groups):
-    split = [num_chan // num_groups for _ in range(num_groups)]
-    split[0] += num_chan - sum(split)
-    return split
-
-
-class Conv2dSame(nn.Conv2d):
-    """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions
-    """
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True):
-        super(Conv2dSame, self).__init__(
-            in_channels, out_channels, kernel_size, stride, 0, dilation,
-            groups, bias)
-
-    def forward(self, x):
-        ih, iw = x.size()[-2:]
-        kh, kw = self.weight.size()[-2:]
-        pad_h = _calc_same_pad(ih, kh, self.stride[0], self.dilation[0])
-        pad_w = _calc_same_pad(iw, kw, self.stride[1], self.dilation[1])
-        if pad_h > 0 or pad_w > 0:
-            x = F.pad(x, [pad_w//2, pad_w - pad_w//2, pad_h//2, pad_h - pad_h//2])
-        return F.conv2d(x, self.weight, self.bias, self.stride,
-                        self.padding, self.dilation, self.groups)
-
-
-def conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
-    padding = kwargs.pop('padding', '')
-    kwargs.setdefault('bias', False)
-    if isinstance(padding, str):
-        # for any string padding, the padding will be calculated for you, one of three ways
-        padding = padding.lower()
-        if padding == 'same':
-            # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact
-            if _is_static_pad(kernel_size, **kwargs):
-                # static case, no extra overhead
-                padding = _get_padding(kernel_size, **kwargs)
-                return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
-            else:
-                # dynamic padding
-                return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
-        elif padding == 'valid':
-            # 'VALID' padding, same as padding=0
-            return nn.Conv2d(in_chs, out_chs, kernel_size, padding=0, **kwargs)
-        else:
-            # Default to PyTorch style 'same'-ish symmetric padding
-            padding = _get_padding(kernel_size, **kwargs)
-            return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
-    else:
-        # padding was specified as a number or pair
-        return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
-
-
-class MixedConv2d(nn.Module):
-    """ Mixed Grouped Convolution
-    Based on MDConv and GroupedConv in MixNet impl:
-      https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
-    """
-
-    def __init__(self, in_channels, out_channels, kernel_size=3,
-                 stride=1, padding='', dilated=False, depthwise=False, **kwargs):
-        super(MixedConv2d, self).__init__()
-
-        kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size]
-        num_groups = len(kernel_size)
-        in_splits = _split_channels(in_channels, num_groups)
-        out_splits = _split_channels(out_channels, num_groups)
-        for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)):
-            d = 1
-            # FIXME make compat with non-square kernel/dilations/strides
-            if stride == 1 and dilated:
-                d, k = (k - 1) // 2, 3
-            conv_groups = out_ch if depthwise else 1
-            # use add_module to keep key space clean
-            self.add_module(
-                str(idx),
-                conv2d_pad(
-                    in_ch, out_ch, k, stride=stride,
-                    padding=padding, dilation=d, groups=conv_groups, **kwargs)
-            )
-        self.splits = in_splits
-
-    def forward(self, x):
-        x_split = torch.split(x, self.splits, 1)
-        x_out = [c(x) for x, c in zip(x_split, self._modules.values())]
-        x = torch.cat(x_out, 1)
-        return x
-
-
-# helper method
-def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
-    assert 'groups' not in kwargs  # only use 'depthwise' bool arg
-    if isinstance(kernel_size, list):
-        # We're going to use only lists for defining the MixedConv2d kernel groups,
-        # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
-        return MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
-    else:
-        depthwise = kwargs.pop('depthwise', False)
-        groups = out_chs if depthwise else 1
-        return conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
-
diff --git a/timm/models/conv2d_layers.py b/timm/models/conv2d_layers.py
new file mode 100644
index 00000000..cd52b885
--- /dev/null
+++ b/timm/models/conv2d_layers.py
@@ -0,0 +1,255 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch._six import container_abcs
+from itertools import repeat
+from functools import partial
+import numpy as np
+import math
+
+
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, container_abcs.Iterable):
+            return x
+        return tuple(repeat(x, n))
+    return parse
+
+
+_single = _ntuple(1)
+_pair = _ntuple(2)
+_triple = _ntuple(3)
+_quadruple = _ntuple(4)
+
+
+def _is_static_pad(kernel_size, stride=1, dilation=1, **_):
+    return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0
+
+
+def _get_padding(kernel_size, stride=1, dilation=1, **_):
+    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
+    return padding
+
+
+def _calc_same_pad(i, k, s, d):
+    return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
+
+
+def _split_channels(num_chan, num_groups):
+    split = [num_chan // num_groups for _ in range(num_groups)]
+    split[0] += num_chan - sum(split)
+    return split
+
+
+# pylint: disable=unused-argument
+def conv2d_same(x, weight, bias=None, stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1):
+    ih, iw = x.size()[-2:]
+    kh, kw = weight.size()[-2:]
+    pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0])
+    pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1])
+    if pad_h > 0 or pad_w > 0:
+        x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
+    return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups)
+
+
+class Conv2dSame(nn.Conv2d):
+    """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions
+    """
+
+    # pylint: disable=unused-argument
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True):
+        super(Conv2dSame, self).__init__(
+            in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
+
+    def forward(self, x):
+        return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+def get_padding_value(padding, kernel_size, **kwargs):
+    dynamic = False
+    if isinstance(padding, str):
+        # for any string padding, the padding will be calculated for you, one of three ways
+        padding = padding.lower()
+        if padding == 'same':
+            # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact
+            if _is_static_pad(kernel_size, **kwargs):
+                # static case, no extra overhead
+                padding = _get_padding(kernel_size, **kwargs)
+            else:
+                # dynamic padding
+                padding = 0
+                dynamic = True
+        elif padding == 'valid':
+            # 'VALID' padding, same as padding=0
+            padding = 0
+        else:
+            # Default to PyTorch style 'same'-ish symmetric padding
+            padding = _get_padding(kernel_size, **kwargs)
+    return padding, dynamic
+
+
+def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
+    padding = kwargs.pop('padding', '')
+    kwargs.setdefault('bias', False)
+    padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs)
+    if is_dynamic:
+        return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
+    else:
+        return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
+
+
+class MixedConv2d(nn.Module):
+    """ Mixed Grouped Convolution
+    Based on MDConv and GroupedConv in MixNet impl:
+      https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size=3,
+                 stride=1, padding='', dilation=1, mixed_dilated=False, depthwise=False, **kwargs):
+        super(MixedConv2d, self).__init__()
+
+        kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size]
+        num_groups = len(kernel_size)
+        in_splits = _split_channels(in_channels, num_groups)
+        out_splits = _split_channels(out_channels, num_groups)
+        self.in_channels = sum(in_splits)
+        self.out_channels = sum(out_splits)
+        for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)):
+            d = dilation
+            # FIXME make compat with non-square kernel/dilations/strides
+            if stride == 1 and mixed_dilated:
+                d, k = (k - 1) // 2, 3
+            conv_groups = out_ch if depthwise else 1
+            # use add_module to keep key space clean
+            self.add_module(
+                str(idx),
+                create_conv2d_pad(
+                    in_ch, out_ch, k, stride=stride,
+                    padding=padding, dilation=d, groups=conv_groups, **kwargs)
+            )
+        self.splits = in_splits
+
+    def forward(self, x):
+        x_split = torch.split(x, self.splits, 1)
+        x_out = [c(x) for x, c in zip(x_split, self._modules.values())]
+        x = torch.cat(x_out, 1)
+        return x
+
+
+def get_condconv_initializer(initializer, num_experts, expert_shape):
+    def condconv_initializer(weight):
+        """CondConv initializer function."""
+        num_params = np.prod(expert_shape)
+        if (len(weight.shape) != 2 or weight.shape[0] != num_experts or
+                weight.shape[1] != num_params):
+            raise (ValueError(
+                'CondConv variables must have shape [num_experts, num_params]'))
+        for i in range(num_experts):
+            initializer(weight[i].view(expert_shape))
+    return condconv_initializer
+
+
+class CondConv2d(nn.Module):
+    """ Conditional Convolution
+    Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size=3,
+                 stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
+        super(CondConv2d, self).__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        padding_val, is_padding_dynamic = get_padding_value(
+            padding, kernel_size, stride=stride, dilation=dilation)
+        self.conv_fn = conv2d_same if is_padding_dynamic else F.conv2d
+        self.padding = _pair(padding_val)
+        self.dilation = _pair(dilation)
+        self.transposed = False
+        self.output_padding = _pair(0)
+        self.groups = groups
+        self.padding_mode = 'zero'
+        self.num_experts = num_experts
+
+        self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
+        weight_num_param = 1
+        for wd in self.weight_shape:
+            weight_num_param *= wd
+        self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
+
+        # FIXME I haven't tested bias yet
+        if bias:
+            self.bias_shape = (self.out_channels,)
+            condconv_bias_shape = (self.num_experts, self.out_channels)
+            self.bias = torch.nn.Parameter(torch.Tensor(condconv_bias_shape))
+        else:
+            self.register_parameter('bias', None)
+
+        self.reset_parameters()
+        # FIXME once I'm satisfied this works, remove the looping path?
+        self._use_groups = True  # use groups for parallel per-batch-element kernel convolution
+
+    def reset_parameters(self):
+        init_weight = get_condconv_initializer(
+            partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
+        init_weight(self.weight)
+        if self.bias is not None:
+            # FIXME bias not tested
+            fan_in = np.prod(self.weight_shape[1:])
+            bound = 1 / math.sqrt(fan_in)
+            init_bias = get_condconv_initializer(
+                partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
+            init_bias(self.bias)
+
+    def forward(self, x, routing_weights):
+        weight = torch.matmul(routing_weights, self.weight)
+        bias = torch.matmul(routing_weights, self.bias) if self.bias is not None else None
+        B, C, H, W = x.shape
+        if self._use_groups:
+            new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
+            weight = weight.view(new_weight_shape)
+            x = x.view(1, B * C, H, W)
+            out = self.conv_fn(
+                x, weight, bias, stride=self.stride, padding=self.padding,
+                dilation=self.dilation, groups=self.groups * B)
+            out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
+        else:
+            x = torch.split(x, 1, 0)
+            weight = torch.split(weight, 1, 0)
+            if self.bias is not None:
+                bias = torch.matmul(routing_weights, self.bias)
+                bias = torch.split(bias, 1, 0)
+            else:
+                bias = [None] * B
+            out = []
+            for xi, wi, bi in zip(x, weight, bias):
+                wi = wi.view(*self.weight_shape)
+                if bi is not None:
+                    bi = bi.view(*self.bias_shape)
+                out.append(self.conv_fn(
+                    xi, wi, bi, stride=self.stride, padding=self.padding,
+                    dilation=self.dilation, groups=self.groups))
+            out = torch.cat(out, 0)
+        return out
+
+
+# helper method
+def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
+    assert 'groups' not in kwargs  # only use 'depthwise' bool arg
+    if isinstance(kernel_size, list):
+        assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
+        # We're going to use only lists for defining the MixedConv2d kernel groups,
+        # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
+        return MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
+    else:
+        depthwise = kwargs.pop('depthwise', False)
+        groups = out_chs if depthwise else 1
+        if 'num_experts' in kwargs and kwargs['num_experts'] > 0:
+            create_fn = CondConv2d
+        else:
+            create_fn = create_conv2d_pad
+        return create_fn(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+
diff --git a/timm/models/gen_efficientnet.py b/timm/models/gen_efficientnet.py
index a7191025..e51bab2a 100644
--- a/timm/models/gen_efficientnet.py
+++ b/timm/models/gen_efficientnet.py
@@ -19,16 +19,19 @@ import math
 import re
 import logging
 from copy import deepcopy
+from functools import partial
+from collections import OrderedDict, defaultdict
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from .registry import register_model
+from timm.models.activations import Swish, sigmoid, HardSwish, hard_sigmoid
+from .registry import register_model, model_entrypoint
 from .helpers import load_pretrained
 from .adaptive_avgmax_pool import SelectAdaptivePool2d
-from .conv2d_helpers import select_conv2d
-from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .conv2d_layers import select_conv2d
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 
 
 __all__ = ['GenEfficientNet']
@@ -96,6 +99,9 @@ default_cfgs = {
     'efficientnet_el': _cfg(
         url='',
         input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+    'efficientnet_cc_b0_4e': _cfg(url=''),
+    'efficientnet_cc_b0_8e': _cfg(url=''),
+    'efficientnet_cc_b1_8e': _cfg(url='', input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
     'tf_efficientnet_b0': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_aa-827b6e33.pth',
         input_size=(3, 224, 224)),
@@ -132,6 +138,16 @@ default_cfgs = {
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_el-5143854e.pth',
         mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
         input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+    'tf_efficientnet_cc_b0_4e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_4e-4362b6b2.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_efficientnet_cc_b0_8e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_8e-66184a25.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_efficientnet_cc_b1_8e': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b1_8e-f7c79ae1.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
     'mixnet_s': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_s-a907afbc.pth'),
     'mixnet_m': _cfg(
@@ -150,7 +166,7 @@ default_cfgs = {
 }
 
 
-_DEBUG = False
+_DEBUG = True
 
 # Default args for PyTorch BN impl
 _BN_MOMENTUM_PT_DEFAULT = 0.1
@@ -201,7 +217,7 @@ def _parse_ksize(ss):
         return [int(k) for k in ss.split('.')]
 
 
-def _decode_block_str(block_str, depth_multiplier=1.0):
+def _decode_block_str(block_str):
     """ Decode block definition string
 
     Gets a list of block arg (dicts) through a string notation of arguments.
@@ -241,13 +257,13 @@ def _decode_block_str(block_str, depth_multiplier=1.0):
             key = op[0]
             v = op[1:]
             if v == 're':
-                value = F.relu
+                value = nn.ReLU
             elif v == 'r6':
-                value = F.relu6
+                value = nn.ReLU6
             elif v == 'hs':
-                value = hard_swish
+                value = HardSwish
             elif v == 'sw':
-                value = swish
+                value = Swish
             else:
                 continue
             options[key] = value
@@ -258,8 +274,8 @@ def _decode_block_str(block_str, depth_multiplier=1.0):
                 key, value = splits[:2]
                 options[key] = value
 
-    # if act_fn is None, the model default (passed to model init) will be used
-    act_fn = options['n'] if 'n' in options else None
+    # if act_layer is None, the model default (passed to model init) will be used
+    act_layer = options['n'] if 'n' in options else None
     exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1
     pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1
     fake_in_chs = int(options['fc']) if 'fc' in options else 0  # FIXME hack to deal with in_chs issue in TPU def
@@ -276,8 +292,9 @@ def _decode_block_str(block_str, depth_multiplier=1.0):
             exp_ratio=float(options['e']),
             se_ratio=float(options['se']) if 'se' in options else None,
             stride=int(options['s']),
-            act_fn=act_fn,
+            act_layer=act_layer,
             noskip=noskip,
+            num_experts=int(options['cc']) if 'cc' in options else 0
         )
     elif block_type == 'ds' or block_type == 'dsa':
         block_args = dict(
@@ -287,7 +304,7 @@ def _decode_block_str(block_str, depth_multiplier=1.0):
             out_chs=int(options['c']),
             se_ratio=float(options['se']) if 'se' in options else None,
             stride=int(options['s']),
-            act_fn=act_fn,
+            act_layer=act_layer,
             pw_act=block_type == 'dsa',
             noskip=block_type == 'dsa' or noskip,
         )
@@ -301,7 +318,7 @@ def _decode_block_str(block_str, depth_multiplier=1.0):
             fake_in_chs=fake_in_chs,
             se_ratio=float(options['se']) if 'se' in options else None,
             stride=int(options['s']),
-            act_fn=act_fn,
+            act_layer=act_layer,
             noskip=noskip,
         )
     elif block_type == 'cn':
@@ -310,7 +327,7 @@ def _decode_block_str(block_str, depth_multiplier=1.0):
             kernel_size=int(options['k']),
             out_chs=int(options['c']),
             stride=int(options['s']),
-            act_fn=act_fn,
+            act_layer=act_layer,
         )
     else:
         assert False, 'Unknown block type (%s)' % block_type
@@ -356,7 +373,7 @@ def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='c
     return sa_scaled
 
 
-def _decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil'):
+def _decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1):
     arch_args = []
     for stack_idx, block_strings in enumerate(arch_def):
         assert isinstance(block_strings, list)
@@ -365,6 +382,8 @@ def _decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil'):
         for block_str in block_strings:
             assert isinstance(block_str, str)
             ba, rep = _decode_block_str(block_str)
+            if ba.get('num_experts', 0) > 0 and experts_multiplier > 1:
+                ba['num_experts'] *= experts_multiplier
             stack_args.append(ba)
             repeats.append(rep)
         arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc))
@@ -437,61 +456,67 @@ class _BlockBuilder:
 
     """
     def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 pad_type='', act_fn=None, se_gate_fn=sigmoid, se_reduce_mid=False,
-                 bn_args=_BN_ARGS_PT, drop_connect_rate=0., verbose=False):
+                 output_stride=32, pad_type='', act_layer=None, se_gate_fn=sigmoid, se_reduce_mid=False,
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT, drop_connect_rate=0., feature_location='',
+                 verbose=False):
         self.channel_multiplier = channel_multiplier
         self.channel_divisor = channel_divisor
         self.channel_min = channel_min
+        self.output_stride = output_stride
         self.pad_type = pad_type
-        self.act_fn = act_fn
+        self.act_layer = act_layer
         self.se_gate_fn = se_gate_fn
         self.se_reduce_mid = se_reduce_mid
-        self.bn_args = bn_args
+        self.norm_layer = norm_layer
+        self.norm_kwargs = norm_kwargs
         self.drop_connect_rate = drop_connect_rate
+        self.feature_location = feature_location
+        assert feature_location in ('pre_pwl', 'post_exp', '')
         self.verbose = verbose
 
-        # updated during build
+        # state updated during build, consumed by model
         self.in_chs = None
-        self.block_idx = 0
-        self.block_count = 0
+        self.features = OrderedDict()
 
     def _round_channels(self, chs):
         return _round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min)
 
-    def _make_block(self, ba):
+    def _make_block(self, ba, block_idx, block_count):
+        drop_connect_rate = self.drop_connect_rate * block_idx / block_count
         bt = ba.pop('block_type')
         ba['in_chs'] = self.in_chs
         ba['out_chs'] = self._round_channels(ba['out_chs'])
         if 'fake_in_chs' in ba and ba['fake_in_chs']:
             # FIXME this is a hack to work around mismatch in origin impl input filters
             ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
-        ba['bn_args'] = self.bn_args
+        ba['norm_layer'] = self.norm_layer
+        ba['norm_kwargs'] = self.norm_kwargs
         ba['pad_type'] = self.pad_type
         # block act fn overrides the model default
-        ba['act_fn'] = ba['act_fn'] if ba['act_fn'] is not None else self.act_fn
-        assert ba['act_fn'] is not None
+        ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
+        assert ba['act_layer'] is not None
         if bt == 'ir':
-            ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+            ba['drop_connect_rate'] = drop_connect_rate
             ba['se_gate_fn'] = self.se_gate_fn
             ba['se_reduce_mid'] = self.se_reduce_mid
             if self.verbose:
-                logging.info('  InvertedResidual {}, Args: {}'.format(self.block_idx, str(ba)))
+                logging.info('  InvertedResidual {}, Args: {}'.format(block_idx, str(ba)))
             block = InvertedResidual(**ba)
         elif bt == 'ds' or bt == 'dsa':
-            ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+            ba['drop_connect_rate'] = drop_connect_rate
             if self.verbose:
-                logging.info('  DepthwiseSeparable {}, Args: {}'.format(self.block_idx, str(ba)))
+                logging.info('  DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)))
             block = DepthwiseSeparableConv(**ba)
         elif bt == 'er':
-            ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count
+            ba['drop_connect_rate'] = drop_connect_rate
             ba['se_gate_fn'] = self.se_gate_fn
             ba['se_reduce_mid'] = self.se_reduce_mid
             if self.verbose:
-                logging.info('  EdgeResidual {}, Args: {}'.format(self.block_idx, str(ba)))
+                logging.info('  EdgeResidual {}, Args: {}'.format(block_idx, str(ba)))
             block = EdgeResidual(**ba)
         elif bt == 'cn':
             if self.verbose:
-                logging.info('  ConvBnAct {}, Args: {}'.format(self.block_idx, str(ba)))
+                logging.info('  ConvBnAct {}, Args: {}'.format(block_idx, str(ba)))
             block = ConvBnAct(**ba)
         else:
             assert False, 'Uknkown block type (%s) while building model.' % bt
@@ -499,46 +524,96 @@ class _BlockBuilder:
 
         return block
 
-    def _make_stack(self, stack_args):
-        blocks = []
-        # each stack (stage) contains a list of block arguments
-        for i, ba in enumerate(stack_args):
-            if self.verbose:
-                logging.info(' Block: {}'.format(i))
-            if i >= 1:
-                # only the first block in any stack can have a stride > 1
-                ba['stride'] = 1
-            block = self._make_block(ba)
-            blocks.append(block)
-            self.block_idx += 1  # incr global idx (across all stacks)
-        return nn.Sequential(*blocks)
-
-    def __call__(self, in_chs, block_args):
+    def __call__(self, in_chs, model_block_args):
         """ Build the blocks
         Args:
             in_chs: Number of input-channels passed to first block
-            block_args: A list of lists, outer list defines stages, inner
+            model_block_args: A list of lists, outer list defines stages, inner
                 list contains strings defining block configuration(s)
         Return:
              List of block stacks (each stack wrapped in nn.Sequential)
         """
         if self.verbose:
-            logging.info('Building model trunk with %d stages...' % len(block_args))
+            logging.info('Building model trunk with %d stages...' % len(model_block_args))
         self.in_chs = in_chs
-        self.block_count = sum([len(x) for x in block_args])
-        self.block_idx = 0
-        blocks = []
+        total_block_count = sum([len(x) for x in model_block_args])
+        total_block_idx = 0
+        current_stride = 2
+        current_dilation = 1
+        feature_idx = 0
+        stages = []
         # outer list of block_args defines the stacks ('stages' by some conventions)
-        for stack_idx, stack in enumerate(block_args):
+        for stage_idx, stage_block_args in enumerate(model_block_args):
+            last_stack = stage_idx == (len(model_block_args) - 1)
             if self.verbose:
-                logging.info('Stack: {}'.format(stack_idx))
-            assert isinstance(stack, list)
-            stack = self._make_stack(stack)
-            blocks.append(stack)
-        return blocks
+                logging.info('Stack: {}'.format(stage_idx))
+            assert isinstance(stage_block_args, list)
+
+            blocks = []
+            # each stack (stage) contains a list of block arguments
+            for block_idx, block_args in enumerate(stage_block_args):
+                last_block = block_idx == (len(stage_block_args) - 1)
+                extract_features = ''  # No features extracted
+                if self.verbose:
+                    logging.info(' Block: {}'.format(block_idx))
+
+                # Sort out stride, dilation, and feature extraction details
+                assert block_args['stride'] in (1, 2)
+                if block_idx >= 1:
+                    # only the first block in any stack can have a stride > 1
+                    block_args['stride'] = 1
+
+                do_extract = False
+                if self.feature_location == 'pre_pwl':
+                    if last_block:
+                        next_stage_idx = stage_idx + 1
+                        if next_stage_idx >= len(model_block_args):
+                            do_extract = True
+                        else:
+                            do_extract = model_block_args[next_stage_idx][0]['stride'] > 1
+                elif self.feature_location == 'post_exp':
+                    if block_args['stride'] > 1 or (last_stack and last_block) :
+                        do_extract = True
+                if do_extract:
+                    extract_features = self.feature_location
+
+                next_dilation = current_dilation
+                if block_args['stride'] > 1:
+                    next_output_stride = current_stride * block_args['stride']
+                    if next_output_stride > self.output_stride:
+                        next_dilation = current_dilation * block_args['stride']
+                        block_args['stride'] = 1
+                        if self.verbose:
+                            logging.info('  Converting stride to dilation to maintain output_stride=={}'.format(
+                                self.output_stride))
+                    else:
+                        current_stride = next_output_stride
+                block_args['dilation'] = current_dilation
+                if next_dilation != current_dilation:
+                    current_dilation = next_dilation
+
+                # create the block
+                block = self._make_block(block_args, total_block_idx, total_block_count)
+                blocks.append(block)
+
+                # stash feature module name and channel info for model feature extraction
+                if extract_features:
+                    feature_module = block.feature_module(extract_features)
+                    if feature_module:
+                        feature_module = 'blocks.{}.{}.'.format(stage_idx, block_idx) + feature_module
+                    feature_channels = block.feature_channels(extract_features)
+                    self.features[feature_idx] = dict(
+                        name=feature_module,
+                        num_chs=feature_channels
+                    )
+                    feature_idx += 1
+
+                total_block_idx += 1  # incr global block idx (across all stacks)
+            stages.append(nn.Sequential(*blocks))
+        return stages
 
 
-def _initialize_weight_goog(m):
+def _init_weight_goog(m):
     # weight init as per Tensorflow Official impl
     # https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
     if isinstance(m, nn.Conv2d):
@@ -556,7 +631,7 @@ def _initialize_weight_goog(m):
         m.bias.data.zero_()
 
 
-def _initialize_weight_default(m):
+def _init_weight_default(m):
     if isinstance(m, nn.Conv2d):
         nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
     elif isinstance(m, nn.BatchNorm2d):
@@ -601,19 +676,19 @@ class ChannelShuffle(nn.Module):
 
 
 class SqueezeExcite(nn.Module):
-    def __init__(self, in_chs, reduce_chs=None, act_fn=F.relu, gate_fn=sigmoid):
+    def __init__(self, in_chs, reduce_chs=None, act_layer=nn.ReLU, gate_fn=sigmoid):
         super(SqueezeExcite, self).__init__()
-        self.act_fn = act_fn
         self.gate_fn = gate_fn
         reduced_chs = reduce_chs or in_chs
         self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
+        self.act1 = act_layer(inplace=True)
         self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
 
     def forward(self, x):
         # NOTE adaptiveavgpool can be used here, but seems to cause issues with NVIDIA AMP performance
         x_se = x.view(x.size(0), x.size(1), -1).mean(-1).view(x.size(0), x.size(1), 1, 1)
         x_se = self.conv_reduce(x_se)
-        x_se = self.act_fn(x_se, inplace=True)
+        x_se = self.act1(x_se)
         x_se = self.conv_expand(x_se)
         x = x * self.gate_fn(x_se)
         return x
@@ -621,17 +696,24 @@ class SqueezeExcite(nn.Module):
 
 class ConvBnAct(nn.Module):
     def __init__(self, in_chs, out_chs, kernel_size,
-                 stride=1, pad_type='', act_fn=F.relu, bn_args=_BN_ARGS_PT):
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU,
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,):
         super(ConvBnAct, self).__init__()
         assert stride in [1, 2]
-        self.act_fn = act_fn
-        self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, padding=pad_type)
-        self.bn1 = nn.BatchNorm2d(out_chs, **bn_args)
+        self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type)
+        self.bn1 = norm_layer(out_chs, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+
+    def feature_module(self, location):
+        return 'act1'
+
+    def feature_channels(self, location):
+        return self.conv.out_channels
 
     def forward(self, x):
         x = self.conv(x)
         x = self.bn1(x)
-        x = self.act_fn(x, inplace=True)
+        x = self.act1(x)
         return x
 
 
@@ -639,29 +721,41 @@ class EdgeResidual(nn.Module):
     """ Residual block with expansion convolution followed by pointwise-linear w/ stride"""
 
     def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0,
-                 stride=1, pad_type='', act_fn=F.relu, noskip=False, pw_kernel_size=1,
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1,
                  se_ratio=0., se_reduce_mid=False, se_gate_fn=sigmoid,
-                 bn_args=_BN_ARGS_PT, drop_connect_rate=0.):
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT, drop_connect_rate=0.):
         super(EdgeResidual, self).__init__()
         mid_chs = int(fake_in_chs * exp_ratio) if fake_in_chs > 0 else int(in_chs * exp_ratio)
         self.has_se = se_ratio is not None and se_ratio > 0.
         self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.act_fn = act_fn
         self.drop_connect_rate = drop_connect_rate
 
         # Expansion convolution
         self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
-        self.bn1 = nn.BatchNorm2d(mid_chs, **bn_args)
+        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
 
         # Squeeze-and-excitation
         if self.has_se:
             se_base_chs = mid_chs if se_reduce_mid else in_chs
             self.se = SqueezeExcite(
-                mid_chs, reduce_chs=max(1, int(se_base_chs * se_ratio)), act_fn=act_fn, gate_fn=se_gate_fn)
+                mid_chs, reduce_chs=max(1, int(se_base_chs * se_ratio)), act_layer=act_layer, gate_fn=se_gate_fn)
 
         # Point-wise linear projection
-        self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, stride=stride, padding=pad_type)
-        self.bn2 = nn.BatchNorm2d(out_chs, **bn_args)
+        self.conv_pwl = select_conv2d(
+            mid_chs, out_chs, pw_kernel_size, stride=stride, dilation=dilation, padding=pad_type)
+        self.bn2 = norm_layer(out_chs, **norm_kwargs)
+
+    def feature_module(self, location):
+        if location == 'post_exp':
+            return 'act1'
+        return 'conv_pwl'
+
+    def feature_channels(self, location):
+        if location == 'post_exp':
+            return self.conv_exp.out_channels
+        # location == 'pre_pwl'
+        return self.conv_pwl.in_channels
 
     def forward(self, x):
         residual = x
@@ -669,7 +763,7 @@ class EdgeResidual(nn.Module):
         # Expansion convolution
         x = self.conv_exp(x)
         x = self.bn1(x)
-        x = self.act_fn(x, inplace=True)
+        x = self.act1(x)
 
         # Squeeze-and-excitation
         if self.has_se:
@@ -693,44 +787,50 @@ class DepthwiseSeparableConv(nn.Module):
     factor of 1.0. This is an alternative to having a IR with an optional first pw conv.
     """
     def __init__(self, in_chs, out_chs, dw_kernel_size=3,
-                 stride=1, pad_type='', act_fn=F.relu, noskip=False,
-                 pw_kernel_size=1, pw_act=False,
-                 se_ratio=0., se_gate_fn=sigmoid,
-                 bn_args=_BN_ARGS_PT, drop_connect_rate=0.):
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+                 pw_kernel_size=1, pw_act=False, se_ratio=0., se_gate_fn=sigmoid,
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT, drop_connect_rate=0.):
         super(DepthwiseSeparableConv, self).__init__()
         assert stride in [1, 2]
         self.has_se = se_ratio is not None and se_ratio > 0.
         self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
         self.has_pw_act = pw_act  # activation after point-wise conv
-        self.act_fn = act_fn
         self.drop_connect_rate = drop_connect_rate
 
         self.conv_dw = select_conv2d(
-            in_chs, in_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True)
-        self.bn1 = nn.BatchNorm2d(in_chs, **bn_args)
+            in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, depthwise=True)
+        self.bn1 = norm_layer(in_chs, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
 
         # Squeeze-and-excitation
         if self.has_se:
             self.se = SqueezeExcite(
-                in_chs, reduce_chs=max(1, int(in_chs * se_ratio)), act_fn=act_fn, gate_fn=se_gate_fn)
+                in_chs, reduce_chs=max(1, int(in_chs * se_ratio)), act_layer=act_layer, gate_fn=se_gate_fn)
 
         self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
-        self.bn2 = nn.BatchNorm2d(out_chs, **bn_args)
+        self.bn2 = norm_layer(out_chs, **norm_kwargs)
+        self.act2 = act_layer(inplace=True) if self.has_pw_act else nn.Identity()
+
+    def feature_module(self, location):
+        # no expansion in this block, so the input to conv_pw is the only feature extraction point
+        return 'conv_pw'
+
+    def feature_channels(self, location):
+        return self.conv_pw.in_channels
 
     def forward(self, x):
         residual = x
 
         x = self.conv_dw(x)
         x = self.bn1(x)
-        x = self.act_fn(x, inplace=True)
+        x = self.act1(x)
 
         if self.has_se:
             x = self.se(x)
 
         x = self.conv_pw(x)
         x = self.bn2(x)
-        if self.has_pw_act:
-            x = self.act_fn(x, inplace=True)
+        x = self.act2(x)
 
         if self.has_residual:
             if self.drop_connect_rate > 0.:
@@ -740,67 +840,87 @@ class DepthwiseSeparableConv(nn.Module):
 
 
 class InvertedResidual(nn.Module):
-    """ Inverted residual block w/ optional SE"""
+    """ Inverted residual block w/ optional SE and CondConv routing"""
 
     def __init__(self, in_chs, out_chs, dw_kernel_size=3,
-                 stride=1, pad_type='', act_fn=F.relu, noskip=False,
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                  exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
                  se_ratio=0., se_reduce_mid=False, se_gate_fn=sigmoid,
-                 shuffle_type=None, bn_args=_BN_ARGS_PT, drop_connect_rate=0.):
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,
+                 num_experts=0, drop_connect_rate=0.):
         super(InvertedResidual, self).__init__()
         mid_chs = int(in_chs * exp_ratio)
         self.has_se = se_ratio is not None and se_ratio > 0.
         self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.act_fn = act_fn
         self.drop_connect_rate = drop_connect_rate
 
-        # Point-wise expansion
-        self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
-        self.bn1 = nn.BatchNorm2d(mid_chs, **bn_args)
+        self.num_experts = num_experts
+        extra_args = dict()
+        if num_experts > 0:
+            extra_args = dict(num_experts=self.num_experts)
+            self.routing_fn = nn.Linear(in_chs, self.num_experts)
+            self.routing_act = torch.sigmoid
 
-        self.shuffle_type = shuffle_type
-        if shuffle_type is not None and isinstance(exp_kernel_size, list):
-            self.shuffle = ChannelShuffle(len(exp_kernel_size))
+        # Point-wise expansion
+        self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **extra_args)
+        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
 
         # Depth-wise convolution
         self.conv_dw = select_conv2d(
-            mid_chs, mid_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True)
-        self.bn2 = nn.BatchNorm2d(mid_chs, **bn_args)
+            mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
+            padding=pad_type, depthwise=True, **extra_args)
+        self.bn2 = norm_layer(mid_chs, **norm_kwargs)
+        self.act2 = act_layer(inplace=True)
 
         # Squeeze-and-excitation
         if self.has_se:
             se_base_chs = mid_chs if se_reduce_mid else in_chs
             self.se = SqueezeExcite(
-                mid_chs, reduce_chs=max(1, int(se_base_chs * se_ratio)), act_fn=act_fn, gate_fn=se_gate_fn)
+                mid_chs, reduce_chs=max(1, int(se_base_chs * se_ratio)), act_layer=act_layer, gate_fn=se_gate_fn)
 
         # Point-wise linear projection
-        self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type)
-        self.bn3 = nn.BatchNorm2d(out_chs, **bn_args)
+        self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **extra_args)
+        self.bn3 = norm_layer(out_chs, **norm_kwargs)
+
+    def feature_module(self, location):
+        if location == 'post_exp':
+            return 'act1'
+        return 'conv_pwl'
+
+    def feature_channels(self, location):
+        if location == 'post_exp':
+            return self.conv_pw.out_channels
+        # location == 'pre_pwl'
+        return self.conv_pwl.in_channels
 
     def forward(self, x):
         residual = x
 
-        # Point-wise expansion
-        x = self.conv_pw(x)
-        x = self.bn1(x)
-        x = self.act_fn(x, inplace=True)
+        conv_pw, conv_dw, conv_pwl = self.conv_pw, self.conv_dw, self.conv_pwl
+        if self.num_experts > 0:
+            pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1)
+            routing_weights = self.routing_act(self.routing_fn(pooled_inputs))
+            conv_pw = partial(self.conv_pw, routing_weights=routing_weights)
+            conv_dw = partial(self.conv_dw, routing_weights=routing_weights)
+            conv_pwl = partial(self.conv_pwl, routing_weights=routing_weights)
 
-        # FIXME haven't tried this yet
-        # for channel shuffle when using groups with pointwise convs as per FBNet variants
-        if self.shuffle_type == "mid":
-            x = self.shuffle(x)
+        # Point-wise expansion
+        x = conv_pw(x)
+        x = self.bn1(x)
+        x = self.act1(x)
 
         # Depth-wise convolution
-        x = self.conv_dw(x)
+        x = conv_dw(x)
         x = self.bn2(x)
-        x = self.act_fn(x, inplace=True)
+        x = self.act2(x)
 
         # Squeeze-and-excitation
         if self.has_se:
             x = self.se(x)
 
         # Point-wise linear projection
-        x = self.conv_pwl(x)
+        x = conv_pwl(x)
         x = self.bn3(x)
 
         if self.has_residual:
@@ -808,12 +928,52 @@ class InvertedResidual(nn.Module):
                 x = drop_connect(x, self.training, self.drop_connect_rate)
             x += residual
 
-        # NOTE maskrcnn_benchmark building blocks have an SE module defined here for some variants
-
         return x
 
 
-class GenEfficientNet(nn.Module):
+class _GenEfficientNet(nn.Module):
+    """ Generic EfficientNet Base
+    """
+
+    def __init__(self, block_args, in_chans=3, stem_size=32,
+                 channel_multiplier=1.0, channel_divisor=8, channel_min=None,
+                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 se_gate_fn=sigmoid, se_reduce_mid=False, norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,
+                 feature_location='pre_pwl'):
+        super(_GenEfficientNet, self).__init__()
+        self.drop_rate = drop_rate
+        self._in_chs = in_chans
+
+        # Stem
+        stem_size = _round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
+        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.bn1 = norm_layer(stem_size, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+        self._in_chs = stem_size
+
+        # Middle stages (IR/ER/DS Blocks)
+        builder = _BlockBuilder(
+            channel_multiplier, channel_divisor, channel_min,
+            output_stride, pad_type, act_layer, se_gate_fn, se_reduce_mid,
+            norm_layer, norm_kwargs, drop_connect_rate, feature_location=feature_location, verbose=_DEBUG)
+        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
+        self.feature_info = builder.features
+        self._in_chs = builder.in_chs
+
+    def as_sequential(self):
+        layers = [self.conv_stem, self.bn1, self.act1]
+        layers.extend(self.blocks)
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        x = self.blocks(x)
+        return x
+
+
+class GenEfficientNet(_GenEfficientNet):
     """ Generic EfficientNet
 
     An implementation of efficient network architectures, in many cases mobile optimized networks:
@@ -828,46 +988,77 @@ class GenEfficientNet(nn.Module):
       * MixNet S, M, L
     """
 
-    def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=32, num_features=1280,
+    def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32,
                  channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 pad_type='', act_fn=F.relu, drop_rate=0., drop_connect_rate=0.,
-                 se_gate_fn=sigmoid, se_reduce_mid=False, bn_args=_BN_ARGS_PT,
+                 pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 se_gate_fn=sigmoid, se_reduce_mid=False,
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,
                  global_pool='avg', head_conv='default', weight_init='goog'):
-        super(GenEfficientNet, self).__init__()
+
         self.num_classes = num_classes
-        self.drop_rate = drop_rate
-        self.act_fn = act_fn
         self.num_features = num_features
+        super(GenEfficientNet, self).__init__(  # FIXME it would be nice if Python made this nicer
+            block_args, in_chans=in_chans, stem_size=stem_size,
+            pad_type=pad_type, act_layer=act_layer, drop_rate=drop_rate, drop_connect_rate=drop_connect_rate,
+            channel_multiplier=channel_multiplier, channel_divisor=channel_divisor, channel_min=channel_min,
+            se_gate_fn=se_gate_fn, se_reduce_mid=se_reduce_mid, norm_layer=norm_layer, norm_kwargs=norm_kwargs)
 
-        stem_size = _round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
-        self.conv_stem = select_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
-        self.bn1 = nn.BatchNorm2d(stem_size, **bn_args)
-        in_chs = stem_size
+        # Head + Pooling (pooling is always created: the classifier sizing below calls feat_mult() on it)
+        self.conv_head = None
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.act2 = None
+        self.forward_head = None
+        self.head_conv = head_conv
+        if head_conv == 'efficient':
+            self._create_head_efficient(global_pool, pad_type, act_layer)
+        elif head_conv == 'default':
+            self._create_head_default(global_pool, pad_type, act_layer, norm_layer, norm_kwargs)
 
-        builder = _BlockBuilder(
-            channel_multiplier, channel_divisor, channel_min,
-            pad_type, act_fn, se_gate_fn, se_reduce_mid,
-            bn_args, drop_connect_rate, verbose=_DEBUG)
-        self.blocks = nn.Sequential(*builder(in_chs, block_args))
-        in_chs = builder.in_chs
-
-        if not head_conv or head_conv == 'none':
-            self.efficient_head = False
-            self.conv_head = None
-            assert in_chs == self.num_features
-        else:
-            self.efficient_head = head_conv == 'efficient'
-            self.conv_head = select_conv2d(in_chs, self.num_features, 1, padding=pad_type)
-            self.bn2 = None if self.efficient_head else nn.BatchNorm2d(self.num_features, **bn_args)
-
-        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        # Classifier
         self.classifier = nn.Linear(self.num_features * self.global_pool.feat_mult(), self.num_classes)
 
         for m in self.modules():
             if weight_init == 'goog':
-                _initialize_weight_goog(m)
+                _init_weight_goog(m)
             else:
-                _initialize_weight_default(m)
+                _init_weight_default(m)
+
+    def _create_head_default(self, global_pool, pad_type, act_layer, norm_layer, norm_kwargs):  # builds conv_head, bn2, act2, pool
+        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
+        self.bn2 = norm_layer(self.num_features, **norm_kwargs)
+        self.act2 = act_layer(inplace=True)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)  # applied in forward(), not in the head itself
+
+    def _create_head_efficient(self, global_pool, pad_type, act_layer):  # builds pool, conv_head, act2; no norm layer
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
+        self.act2 = act_layer(inplace=True)
+
+    def _forward_head_default(self, x):  # conv_head -> bn2 -> act2; pooling is left to forward()
+        x = self.conv_head(x)
+        x = self.bn2(x)
+        x = self.act2(x)
+        return x
+
+    def _forward_head_efficient(self, x):  # global_pool -> conv_head -> act2 (pooling happens before the conv)
+        x = self.global_pool(x)
+        x = self.conv_head(x)
+        x = self.act2(x)
+        return x
+
+    def as_sequential(self):  # stem, blocks, head, flatten and classifier as one flat nn.Sequential (no dropout)
+        layers = [self.conv_stem, self.bn1, self.act1]
+        layers.extend(self.blocks)
+        if self.head_conv == 'efficient':
+            layers.extend([self.global_pool, self.conv_head, self.act2])  # same order as _forward_head_efficient
+        else:
+            layers.extend([self.conv_head, self.bn2, self.act2])
+            if self.global_pool is not None:
+                layers.append(self.global_pool)
+        layers.append(nn.Flatten())  # mirrors x.flatten(1) in forward(); the classifier is an nn.Linear
+        layers.append(self.classifier)
+        return nn.Sequential(*layers)
+
 
     def get_classifier(self):
         return self.classifier
@@ -882,38 +1073,121 @@ class GenEfficientNet(nn.Module):
         else:
             self.classifier = None
 
-    def forward_features(self, x, pool=True):
-        x = self.conv_stem(x)
-        x = self.bn1(x)
-        x = self.act_fn(x, inplace=True)
-        x = self.blocks(x)
-        if self.efficient_head:
-            # efficient head, currently only mobilenet-v3 performs pool before last 1x1 conv
-            x = self.global_pool(x)  # always need to pool here regardless of flag
-            x = self.conv_head(x)
-            # no BN
-            x = self.act_fn(x, inplace=True)
-            if pool:
-                # expect flattened output if pool is true, otherwise keep dim
-                x = x.view(x.size(0), -1)
-        else:
-            if self.conv_head is not None:
-                x = self.conv_head(x)
-                x = self.bn2(x)
-            x = self.act_fn(x, inplace=True)
-            if pool:
-                x = self.global_pool(x)
-                x = x.view(x.size(0), -1)
+    def forward_features(self, x):
+        x = super(GenEfficientNet, self).forward(x)
+        if self.head_conv == 'efficient':
+            x = self._forward_head_efficient(x)
+        elif self.head_conv == 'default':
+            x = self._forward_head_default(x)
         return x
 
     def forward(self, x):
         x = self.forward_features(x)
+        if self.global_pool is not None and (x.shape[-1] > 1 or x.shape[-2] > 1):
+            x = self.global_pool(x)  # skipped when both trailing dims are already 1
+        x = x.flatten(1)
         if self.drop_rate > 0.:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         return self.classifier(x)
 
 
-def _gen_mnasnet_a1(channel_multiplier, num_classes=1000, **kwargs):
+class GenEfficientNetFeatures(_GenEfficientNet):
+    """ Generic EfficientNet Feature Extractor: forward() returns a tuple of the tensors captured by
+    hooks registered on the modules named by feature_info[idx]['name'] for idx in out_indices. """
+
+    def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
+                 in_chans=3, stem_size=32, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
+                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 se_gate_fn=sigmoid, se_reduce_mid=False, norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,
+                 weight_init='goog'):
+
+        # validate and modify block arguments and out indices for feature extraction
+        num_stages = max(out_indices) + 1  # FIXME reduce num stages created if not needed
+        #assert len(block_args) >= num_stages - 1
+        #block_args = block_args[:num_stages - 1]
+
+        super(GenEfficientNetFeatures, self).__init__(  # FIXME it would be nice if Python made this nicer
+            block_args, in_chans=in_chans, stem_size=stem_size,
+            output_stride=output_stride, pad_type=pad_type, act_layer=act_layer,
+            drop_rate=drop_rate, drop_connect_rate=drop_connect_rate, feature_location=feature_location,
+            channel_multiplier=channel_multiplier, channel_divisor=channel_divisor, channel_min=channel_min,
+            se_gate_fn=se_gate_fn, se_reduce_mid=se_reduce_mid, norm_layer=norm_layer, norm_kwargs=norm_kwargs)
+        self.out_indices = out_indices  # read by feature_channels(); previously never assigned
+        for m in self.modules():
+            if weight_init == 'goog':
+                _init_weight_goog(m)
+            else:
+                _init_weight_default(m)
+
+        if _DEBUG:
+            for k, v in self.feature_info.items():
+                print('Feature idx: {}: Name: {}, Channels: {}'.format(k, v['name'], v['num_chs']))
+        hook_type = 'forward_pre' if feature_location == 'pre_pwl' else 'forward'
+        hooks = [dict(name=self.feature_info[idx]['name'], type=hook_type) for idx in out_indices]
+        self._feature_outputs = None  # replaced by a defaultdict at the end of _register_hooks()
+        self._register_hooks(hooks)
+
+    def _collect_output_hook(self, name, *args):
+        x = args[-1]  # tensor we want is last argument, output for fwd, input for fwd_pre
+        if isinstance(x, tuple):
+            x = x[0]  # unwrap input tuple
+        self._feature_outputs[x.device][name] = x  # one OrderedDict of captures per tensor device
+
+    def _get_output(self, device):
+        output = tuple(self._feature_outputs[device].values())[::-1]  # reversed: last captured comes first
+        self._feature_outputs[device] = OrderedDict()  # reset so the next forward starts empty
+        return output
+
+    def _register_hooks(self, hooks):
+        # setup feature hooks
+        modules = dict(self.named_modules())
+        for h in hooks:
+            hook_name = h['name']
+            m = modules[hook_name]
+            hook_fn = partial(self._collect_output_hook, hook_name)  # bind the feature name as first argument
+            if h['type'] == 'forward_pre':
+                m.register_forward_pre_hook(hook_fn)
+            else:
+                m.register_forward_hook(hook_fn)
+        self._feature_outputs = defaultdict(OrderedDict)
+
+    def feature_channels(self, idx=None):
+        if isinstance(idx, int):
+            return self.feature_info[idx]['num_chs']
+        return [self.feature_info[i]['num_chs'] for i in self.out_indices]  # one entry per requested feature
+
+    def forward(self, x):
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        self.blocks(x)  # return value unused; the registered hooks capture the features
+        return self._get_output(x.device)
+
+
+def _create_model(model_kwargs, default_cfg, pretrained=False):  # NOTE: pops keys from the caller's model_kwargs
+    if model_kwargs.pop('features_only', False):
+        load_strict = False  # presumably tolerates checkpoint keys for the absent head; confirm in load_pretrained
+        model_kwargs.pop('num_classes', 0)  # head-only kwargs: GenEfficientNetFeatures.__init__ does not accept them
+        model_kwargs.pop('num_features', 0)
+        model_kwargs.pop('head_conv', None)
+        model_class = GenEfficientNetFeatures
+    else:
+        load_strict = True
+        model_class = GenEfficientNet
+
+    model = model_class(**model_kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=model_kwargs.get('num_classes', 0),  # NOTE(review): 0 if omitted, but the model default is 1000
+            in_chans=model_kwargs.get('in_chans', 3),
+            strict=load_strict)
+    return model
+
+
+def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """Creates a mnasnet-a1 model.
 
     Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
@@ -938,18 +1212,18 @@ def _gen_mnasnet_a1(channel_multiplier, num_classes=1000, **kwargs):
         # stage 6, 7x7 in
         ['ir_r1_k3_s1_e6_c320'],
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_mnasnet_b1(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """Creates a mnasnet-b1 model.
 
     Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
@@ -974,18 +1248,18 @@ def _gen_mnasnet_b1(channel_multiplier, num_classes=1000, **kwargs):
         # stage 6, 7x7 in
         ['ir_r1_k3_s1_e6_c320_noskip']
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_mnasnet_small(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """Creates a mnasnet-b1 model.
 
     Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet
@@ -1003,18 +1277,18 @@ def _gen_mnasnet_small(channel_multiplier, num_classes=1000, **kwargs):
         ['ir_r3_k5_s2_e6_c88_se0.25'],
         ['ir_r1_k3_s1_e6_c144']
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=8,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_mobilenet_v1(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_mobilenet_v1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """ Generate MobileNet-V1 network
     Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py
     Paper: https://arxiv.org/abs/1801.04381
@@ -1026,21 +1300,21 @@ def _gen_mobilenet_v1(channel_multiplier, num_classes=1000, **kwargs):
         ['dsa_r6_k3_s2_c512'],
         ['dsa_r2_k3_s2_c1024'],
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=32,
         num_features=1024,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
-        act_fn=F.relu6,
+        norm_kwargs=_resolve_bn_args(kwargs),
+        act_layer=nn.ReLU6,
         head_conv='none',
         **kwargs
-        )
+    )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_mobilenet_v2(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_mobilenet_v2(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """ Generate MobileNet-V2 network
     Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py
     Paper: https://arxiv.org/abs/1801.04381
@@ -1054,19 +1328,19 @@ def _gen_mobilenet_v2(channel_multiplier, num_classes=1000, **kwargs):
         ['ir_r3_k3_s2_e6_c160'],
         ['ir_r1_k3_s1_e6_c320'],
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
-        act_fn=F.relu6,
+        norm_kwargs=_resolve_bn_args(kwargs),
+        act_layer=nn.ReLU6,
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_mobilenet_v3(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """Creates a MobileNet-V3 model.
 
     Ref impl: ?
@@ -1091,22 +1365,22 @@ def _gen_mobilenet_v3(channel_multiplier, num_classes=1000, **kwargs):
         # stage 6, 7x7 in
         ['cn_r1_k1_s1_c960'],  # hard-swish
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=16,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
-        act_fn=hard_swish,
+        norm_kwargs=_resolve_bn_args(kwargs),
+        act_layer=HardSwish,
         se_gate_fn=hard_sigmoid,
         se_reduce_mid=True,
         head_conv='efficient',
-        **kwargs
+        **kwargs,
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_chamnet_v1(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_chamnet_v1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """ Generate Chameleon Network (ChamNet)
 
     Paper: https://arxiv.org/abs/1812.08934
@@ -1123,19 +1397,19 @@ def _gen_chamnet_v1(channel_multiplier, num_classes=1000, **kwargs):
         ['ir_r4_k3_s2_e7_c152'],
         ['ir_r1_k3_s1_e10_c104'],
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=32,
         num_features=1280,  # no idea what this is? try mobile/mnasnet default?
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_chamnet_v2(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_chamnet_v2(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """ Generate Chameleon Network (ChamNet)
 
     Paper: https://arxiv.org/abs/1812.08934
@@ -1152,19 +1426,19 @@ def _gen_chamnet_v2(channel_multiplier, num_classes=1000, **kwargs):
         ['ir_r6_k3_s2_e2_c152'],
         ['ir_r1_k3_s1_e6_c112'],
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=32,
         num_features=1280,  # no idea what this is? try mobile/mnasnet default?
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_fbnetc(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """ FBNet-C
 
         Paper: https://arxiv.org/abs/1812.03443
@@ -1182,19 +1456,19 @@ def _gen_fbnetc(channel_multiplier, num_classes=1000, **kwargs):
         ['ir_r4_k5_s2_e6_c184'],
         ['ir_r1_k3_s1_e6_c352'],
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=16,
         num_features=1984,  # paper suggests this, but is not 100% clear
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_spnasnet(channel_multiplier, num_classes=1000, **kwargs):
+def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """Creates the Single-Path NAS model from search targeted for Pixel1 phone.
 
     Paper: https://arxiv.org/abs/1904.02877
@@ -1218,18 +1492,18 @@ def _gen_spnasnet(channel_multiplier, num_classes=1000, **kwargs):
         # stage 6, 7x7 in
         ['ir_r1_k3_s1_e6_c320_noskip']
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_efficientnet(channel_multiplier=1.0, depth_multiplier=1.0, num_classes=1000, **kwargs):
+def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
     """Creates an EfficientNet model.
 
     Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
@@ -1260,21 +1534,20 @@ def _gen_efficientnet(channel_multiplier=1.0, depth_multiplier=1.0, num_classes=
         ['ir_r4_k5_s2_e6_c192_se0.25'],
         ['ir_r1_k3_s1_e6_c320_se0.25'],
     ]
-    num_features = _round_channels(1280, channel_multiplier, 8, None)
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def, depth_multiplier),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def, depth_multiplier),
+        num_features=_round_channels(1280, channel_multiplier, 8, None),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        num_features=num_features,
-        bn_args=_resolve_bn_args(kwargs),
-        act_fn=swish,
-        **kwargs
+        norm_kwargs=_resolve_bn_args(kwargs),
+        act_layer=Swish,
+        **kwargs,
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_efficientnet_edge(channel_multiplier=1.0, depth_multiplier=1.0, num_classes=1000, **kwargs):
+def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
     arch_def = [
         # NOTE `fc` is present to override a mismatch between stem channels and in chs not
         # present in other models
@@ -1285,21 +1558,46 @@ def _gen_efficientnet_edge(channel_multiplier=1.0, depth_multiplier=1.0, num_cla
         ['ir_r4_k5_s1_e8_c144'],
         ['ir_r2_k5_s2_e8_c192'],
     ]
-    num_features = _round_channels(1280, channel_multiplier, 8, None)
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def, depth_multiplier),
-        num_classes=num_classes,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def, depth_multiplier),
+        num_features=_round_channels(1280, channel_multiplier, 8, None),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        num_features=num_features,
-        bn_args=_resolve_bn_args(kwargs),
-        act_fn=F.relu,
-        **kwargs
+        norm_kwargs=_resolve_bn_args(kwargs),
+        act_layer=nn.ReLU,
+        **kwargs,
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_mixnet_s(channel_multiplier=1.0, num_classes=1000, **kwargs):
+def _gen_efficientnet_condconv(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs):
+
+    """Creates an efficientnet-condconv model."""
+    arch_def = [
+      ['ds_r1_k3_s1_e1_c16_se0.25'],
+      ['ir_r2_k3_s2_e6_c24_se0.25'],
+      ['ir_r2_k5_s2_e6_c40_se0.25'],
+      ['ir_r3_k3_s2_e6_c80_se0.25'],
+      ['ir_r3_k5_s1_e6_c112_se0.25_cc4'],
+      ['ir_r4_k5_s2_e6_c192_se0.25_cc4'],
+      ['ir_r1_k3_s1_e6_c320_se0.25_cc4'],
+    ]
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier),
+        num_features=_round_channels(1280, channel_multiplier, 8, None),
+        stem_size=32,
+        channel_multiplier=channel_multiplier,
+        norm_kwargs=_resolve_bn_args(kwargs),
+        act_layer=Swish,
+        **kwargs,
+    )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
+    return model
+
+
+def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """Creates a MixNet Small model.
 
     Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet
@@ -1320,20 +1618,19 @@ def _gen_mixnet_s(channel_multiplier=1.0, num_classes=1000, **kwargs):
         ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'],  # swish
         # 7x7
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def),
-        num_classes=num_classes,
-        stem_size=16,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def),
         num_features=1536,
+        stem_size=16,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
-        act_fn=F.relu,
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
-def _gen_mixnet_m(channel_multiplier=1.0, depth_multiplier=1.0, num_classes=1000, **kwargs):
+def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
     """Creates a MixNet Medium-Large model.
 
     Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet
@@ -1354,672 +1651,524 @@ def _gen_mixnet_m(channel_multiplier=1.0, depth_multiplier=1.0, num_classes=1000
         ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'],  # swish
         # 7x7
     ]
-    model = GenEfficientNet(
-        _decode_arch_def(arch_def, depth_multiplier=depth_multiplier, depth_trunc='round'),
-        num_classes=num_classes,
-        stem_size=24,
+    model_kwargs = dict(
+        block_args=_decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'),
         num_features=1536,
+        stem_size=24,
         channel_multiplier=channel_multiplier,
-        bn_args=_resolve_bn_args(kwargs),
-        act_fn=F.relu,
+        norm_kwargs=_resolve_bn_args(kwargs),
         **kwargs
     )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
     return model
 
 
 @register_model
-def mnasnet_050(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mnasnet_050(pretrained=False, **kwargs):
     """ MNASNet B1, depth multiplier of 0.5. """
-    default_cfg = default_cfgs['mnasnet_050']
-    model = _gen_mnasnet_b1(0.5, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_b1('mnasnet_050', 0.5, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mnasnet_075(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mnasnet_075(pretrained=False, **kwargs):
     """ MNASNet B1, depth multiplier of 0.75. """
-    default_cfg = default_cfgs['mnasnet_075']
-    model = _gen_mnasnet_b1(0.75, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_b1('mnasnet_075', 0.75, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mnasnet_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mnasnet_100(pretrained=False, **kwargs):
     """ MNASNet B1, depth multiplier of 1.0. """
-    default_cfg = default_cfgs['mnasnet_100']
-    model = _gen_mnasnet_b1(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_b1('mnasnet_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mnasnet_b1(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mnasnet_b1(pretrained=False, **kwargs):
     """ MNASNet B1, depth multiplier of 1.0. """
-    return mnasnet_100(pretrained, num_classes, in_chans, **kwargs)
+    return mnasnet_100(pretrained, **kwargs)
 
 
 @register_model
-def mnasnet_140(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mnasnet_140(pretrained=False, **kwargs):
     """ MNASNet B1,  depth multiplier of 1.4 """
-    default_cfg = default_cfgs['mnasnet_140']
-    model = _gen_mnasnet_b1(1.4, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_b1('mnasnet_140', 1.4, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def semnasnet_050(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def semnasnet_050(pretrained=False, **kwargs):
     """ MNASNet A1 (w/ SE), depth multiplier of 0.5 """
-    default_cfg = default_cfgs['semnasnet_050']
-    model = _gen_mnasnet_a1(0.5, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_a1('semnasnet_050', 0.5, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def semnasnet_075(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def semnasnet_075(pretrained=False, **kwargs):
     """ MNASNet A1 (w/ SE),  depth multiplier of 0.75. """
-    default_cfg = default_cfgs['semnasnet_075']
-    model = _gen_mnasnet_a1(0.75, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_a1('semnasnet_075', 0.75, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def semnasnet_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def semnasnet_100(pretrained=False, **kwargs):
     """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """
-    default_cfg = default_cfgs['semnasnet_100']
-    model = _gen_mnasnet_a1(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_a1('semnasnet_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mnasnet_a1(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mnasnet_a1(pretrained=False, **kwargs):
     """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """
-    return semnasnet_100(pretrained, num_classes, in_chans, **kwargs)
+    return semnasnet_100(pretrained, **kwargs)
 
 
 @register_model
-def semnasnet_140(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def semnasnet_140(pretrained=False, **kwargs):
     """ MNASNet A1 (w/ SE), depth multiplier of 1.4. """
-    default_cfg = default_cfgs['semnasnet_140']
-    model = _gen_mnasnet_a1(1.4, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_a1('semnasnet_140', 1.4, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mnasnet_small(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mnasnet_small(pretrained=False, **kwargs):
     """ MNASNet Small,  depth multiplier of 1.0. """
-    default_cfg = default_cfgs['mnasnet_small']
-    model = _gen_mnasnet_small(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mnasnet_small('mnasnet_small', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mobilenetv1_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mobilenetv1_100(pretrained=False, **kwargs):
     """ MobileNet V1 """
-    default_cfg = default_cfgs['mobilenetv1_100']
-    model = _gen_mobilenet_v1(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mobilenet_v1('mobilenetv1_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mobilenetv2_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mobilenetv2_100(pretrained=False, **kwargs):
     """ MobileNet V2 """
-    default_cfg = default_cfgs['mobilenetv2_100']
-    model = _gen_mobilenet_v2(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mobilenet_v2('mobilenetv2_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mobilenetv3_050(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mobilenetv3_050(pretrained=False, **kwargs):
     """ MobileNet V3 """
-    default_cfg = default_cfgs['mobilenetv3_050']
-    model = _gen_mobilenet_v3(0.5, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mobilenet_v3('mobilenetv3_050', 0.5, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mobilenetv3_075(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mobilenetv3_075(pretrained=False, **kwargs):
     """ MobileNet V3 """
-    default_cfg = default_cfgs['mobilenetv3_075']
-    model = _gen_mobilenet_v3(0.75, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mobilenet_v3('mobilenetv3_075', 0.75, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mobilenetv3_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mobilenetv3_100(pretrained=False, **kwargs):
     """ MobileNet V3 """
-    default_cfg = default_cfgs['mobilenetv3_100']
     if pretrained:
         # pretrained model trained with non-default BN epsilon
         kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
-    model = _gen_mobilenet_v3(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_mobilenet_v3('mobilenetv3_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def fbnetc_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def fbnetc_100(pretrained=False, **kwargs):
     """ FBNet-C """
-    default_cfg = default_cfgs['fbnetc_100']
     if pretrained:
         # pretrained model trained with non-default BN epsilon
         kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
-    model = _gen_fbnetc(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def chamnetv1_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def chamnetv1_100(pretrained=False, **kwargs):
     """ ChamNet """
-    default_cfg = default_cfgs['chamnetv1_100']
-    model = _gen_chamnet_v1(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_chamnet_v1('chamnetv1_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def chamnetv2_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def chamnetv2_100(pretrained=False, **kwargs):
     """ ChamNet """
-    default_cfg = default_cfgs['chamnetv2_100']
-    model = _gen_chamnet_v2(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_chamnet_v2('chamnetv2_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def spnasnet_100(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def spnasnet_100(pretrained=False, **kwargs):
     """ Single-Path NAS Pixel1"""
-    default_cfg = default_cfgs['spnasnet_100']
-    model = _gen_spnasnet(1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+    model = _gen_spnasnet('spnasnet_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_b0(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_b0(pretrained=False, **kwargs):
     """ EfficientNet-B0 """
-    default_cfg = default_cfgs['efficientnet_b0']
     # NOTE for train, drop_rate should be 0.2
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
     model = _gen_efficientnet(
-        channel_multiplier=1.0, depth_multiplier=1.0,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_b1(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_b1(pretrained=False, **kwargs):
     """ EfficientNet-B1 """
-    default_cfg = default_cfgs['efficientnet_b1']
     # NOTE for train, drop_rate should be 0.2
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
     model = _gen_efficientnet(
-        channel_multiplier=1.0, depth_multiplier=1.1,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_b2(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_b2(pretrained=False, **kwargs):
     """ EfficientNet-B2 """
-    default_cfg = default_cfgs['efficientnet_b2']
     # NOTE for train, drop_rate should be 0.3
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
     model = _gen_efficientnet(
-        channel_multiplier=1.1, depth_multiplier=1.2,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_b3(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_b3(pretrained=False, **kwargs):
     """ EfficientNet-B3 """
-    default_cfg = default_cfgs['efficientnet_b3']
     # NOTE for train, drop_rate should be 0.3
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
     model = _gen_efficientnet(
-        channel_multiplier=1.2, depth_multiplier=1.4,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_b4(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_b4(pretrained=False, **kwargs):
     """ EfficientNet-B4 """
-    default_cfg = default_cfgs['efficientnet_b4']
     # NOTE for train, drop_rate should be 0.4
     #kwargs['drop_connect_rate'] = 0.2  #  set when training, TODO add as cmd arg
     model = _gen_efficientnet(
-        channel_multiplier=1.4, depth_multiplier=1.8,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_b5(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_b5(pretrained=False, **kwargs):
     """ EfficientNet-B5 """
     # NOTE for train, drop_rate should be 0.4
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
-    default_cfg = default_cfgs['efficientnet_b5']
     model = _gen_efficientnet(
-        channel_multiplier=1.6, depth_multiplier=2.2,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_b6(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_b6(pretrained=False, **kwargs):
     """ EfficientNet-B6 """
     # NOTE for train, drop_rate should be 0.5
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
-    default_cfg = default_cfgs['efficientnet_b6']
     model = _gen_efficientnet(
-        channel_multiplier=1.8, depth_multiplier=2.6,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_b7(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_b7(pretrained=False, **kwargs):
     """ EfficientNet-B7 """
     # NOTE for train, drop_rate should be 0.5
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
-    default_cfg = default_cfgs['efficientnet_b7']
     model = _gen_efficientnet(
-        channel_multiplier=2.0, depth_multiplier=3.1,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_es(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_es(pretrained=False, **kwargs):
     """ EfficientNet-Edge Small. """
-    default_cfg = default_cfgs['efficientnet_es']
     model = _gen_efficientnet_edge(
-        channel_multiplier=1.0, depth_multiplier=1.0,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_em(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_em(pretrained=False, **kwargs):
     """ EfficientNet-Edge-Medium. """
-    default_cfg = default_cfgs['efficientnet_em']
     model = _gen_efficientnet_edge(
-        channel_multiplier=1.0, depth_multiplier=1.1,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def efficientnet_el(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_el(pretrained=False, **kwargs):
     """ EfficientNet-Edge-Large. """
-    default_cfg = default_cfgs['efficientnet_el']
     model = _gen_efficientnet_edge(
-        channel_multiplier=1.2, depth_multiplier=1.4,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_b0(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 4 Experts """
+    # NOTE for train, drop_rate should be 0.2
+    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    model = _gen_efficientnet_condconv(
+        'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 8 Experts """
+    # NOTE for train, drop_rate should be 0.2
+    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    model = _gen_efficientnet_condconv(
+        'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
+        pretrained=pretrained, **kwargs)
+    return model
+
+@register_model
+def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B1 w/ 8 Experts """
+    # NOTE for train, drop_rate should be 0.2
+    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    model = _gen_efficientnet_condconv(
+        'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b0(pretrained=False, **kwargs):
     """ EfficientNet-B0. Tensorflow compatible variant  """
-    default_cfg = default_cfgs['tf_efficientnet_b0']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
-        channel_multiplier=1.0, depth_multiplier=1.0,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_b1(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b1(pretrained=False, **kwargs):
     """ EfficientNet-B1. Tensorflow compatible variant  """
-    default_cfg = default_cfgs['tf_efficientnet_b1']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
-        channel_multiplier=1.0, depth_multiplier=1.1,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_b2(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b2(pretrained=False, **kwargs):
     """ EfficientNet-B2. Tensorflow compatible variant  """
-    default_cfg = default_cfgs['tf_efficientnet_b2']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
-        channel_multiplier=1.1, depth_multiplier=1.2,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_b3(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b3(pretrained=False, **kwargs):
     """ EfficientNet-B3. Tensorflow compatible variant """
-    default_cfg = default_cfgs['tf_efficientnet_b3']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
-        channel_multiplier=1.2, depth_multiplier=1.4,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_b4(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b4(pretrained=False, **kwargs):
     """ EfficientNet-B4. Tensorflow compatible variant """
-    default_cfg = default_cfgs['tf_efficientnet_b4']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
-        channel_multiplier=1.4, depth_multiplier=1.8,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_b5(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b5(pretrained=False, **kwargs):
     """ EfficientNet-B5. Tensorflow compatible variant """
-    default_cfg = default_cfgs['tf_efficientnet_b5']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
-        channel_multiplier=1.6, depth_multiplier=2.2,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_b6(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b6(pretrained=False, **kwargs):
     """ EfficientNet-B6. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.5
-    default_cfg = default_cfgs['tf_efficientnet_b6']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
-        channel_multiplier=1.8, depth_multiplier=2.6,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_b7(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b7(pretrained=False, **kwargs):
     """ EfficientNet-B7. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.5
-    default_cfg = default_cfgs['tf_efficientnet_b7']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
-        channel_multiplier=2.0, depth_multiplier=3.1,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_es(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_es(pretrained=False, **kwargs):
     """ EfficientNet-Edge Small. Tensorflow compatible variant  """
-    default_cfg = default_cfgs['tf_efficientnet_es']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_edge(
-        channel_multiplier=1.0, depth_multiplier=1.0,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_em(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_em(pretrained=False, **kwargs):
     """ EfficientNet-Edge-Medium. Tensorflow compatible variant  """
-    default_cfg = default_cfgs['tf_efficientnet_em']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_edge(
-        channel_multiplier=1.0, depth_multiplier=1.1,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_efficientnet_el(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_el(pretrained=False, **kwargs):
     """ EfficientNet-Edge-Large. Tensorflow compatible variant  """
-    default_cfg = default_cfgs['tf_efficientnet_el']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_edge(
-        channel_multiplier=1.2, depth_multiplier=1.4,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mixnet_s(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.2
+    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_condconv(
+        'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.2
+    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_condconv(
+        'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
+        pretrained=pretrained, **kwargs)
+    return model
+
+@register_model
+def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
+    """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.2
+    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet_condconv(
+        'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
+        pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mixnet_s(pretrained=False, **kwargs):
     """Creates a MixNet Small model.
     """
-    default_cfg = default_cfgs['mixnet_s']
     model = _gen_mixnet_s(
-        channel_multiplier=1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mixnet_m(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mixnet_m(pretrained=False, **kwargs):
     """Creates a MixNet Medium model.
     """
-    default_cfg = default_cfgs['mixnet_m']
     model = _gen_mixnet_m(
-        channel_multiplier=1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mixnet_l(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mixnet_l(pretrained=False, **kwargs):
     """Creates a MixNet Large model.
     """
-    default_cfg = default_cfgs['mixnet_l']
     model = _gen_mixnet_m(
-        channel_multiplier=1.3, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mixnet_xl(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mixnet_xl(pretrained=False, **kwargs):
     """Creates a MixNet Extra-Large model.
     Not a paper spec, experimental def by RW w/ depth scaling.
     """
-    default_cfg = default_cfgs['mixnet_xl']
-    #kwargs['drop_connect_rate'] = 0.2
     model = _gen_mixnet_m(
-        channel_multiplier=1.6, depth_multiplier=1.2, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'mixnet_xl', channel_multiplier=1.6, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def mixnet_xxl(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def mixnet_xxl(pretrained=False, **kwargs):
     """Creates a MixNet Double Extra Large model.
     Not a paper spec, experimental def by RW w/ depth scaling.
     """
-    default_cfg = default_cfgs['mixnet_xxl']
     # kwargs['drop_connect_rate'] = 0.2
     model = _gen_mixnet_m(
-        channel_multiplier=2.4, depth_multiplier=1.3, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_mixnet_s(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_mixnet_s(pretrained=False, **kwargs):
     """Creates a MixNet Small model. Tensorflow compatible variant
     """
-    default_cfg = default_cfgs['tf_mixnet_s']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_mixnet_s(
-        channel_multiplier=1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_mixnet_m(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_mixnet_m(pretrained=False, **kwargs):
     """Creates a MixNet Medium model. Tensorflow compatible variant
     """
-    default_cfg = default_cfgs['tf_mixnet_m']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_mixnet_m(
-        channel_multiplier=1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
 
 
 @register_model
-def tf_mixnet_l(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_mixnet_l(pretrained=False, **kwargs):
     """Creates a MixNet Large model. Tensorflow compatible variant
     """
-    default_cfg = default_cfgs['tf_mixnet_l']
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_mixnet_m(
-        channel_multiplier=1.3, num_classes=num_classes, in_chans=in_chans, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(model, default_cfg, num_classes, in_chans)
+        'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
     return model
 
 
diff --git a/timm/models/helpers.py b/timm/models/helpers.py
index 9ac728da..7460f4a2 100644
--- a/timm/models/helpers.py
+++ b/timm/models/helpers.py
@@ -57,7 +57,7 @@ def resume_checkpoint(model, checkpoint_path):
         raise FileNotFoundError()
 
 
-def load_pretrained(model, cfg=None, num_classes=1000, in_chans=3, filter_fn=None):
+def load_pretrained(model, cfg=None, num_classes=1000, in_chans=3, filter_fn=None, strict=True):
     if cfg is None:
         cfg = getattr(model, 'default_cfg')
     if cfg is None or 'url' not in cfg or not cfg['url']:
@@ -74,7 +74,6 @@ def load_pretrained(model, cfg=None, num_classes=1000, in_chans=3, filter_fn=Non
     elif in_chans != 3:
         assert False, "Invalid in_chans for pretrained weights"
 
-    strict = True
     classifier_name = cfg['classifier']
     if num_classes == 1000 and cfg['num_classes'] == 1001:
         # special case for imagenet trained models with extra background class in pretrained weights

From 7ac6db4543a07d44c6b30327a9e8fe31a4ee8e08 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Tue, 22 Oct 2019 23:45:30 -0700
Subject: [PATCH 13/35] Missed activations.py

---
 timm/models/activations.py | 180 +++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 timm/models/activations.py

diff --git a/timm/models/activations.py b/timm/models/activations.py
new file mode 100644
index 00000000..aa29b84d
--- /dev/null
+++ b/timm/models/activations.py
@@ -0,0 +1,180 @@
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+
+_USE_MEM_EFFICIENT_ISH = True
+if _USE_MEM_EFFICIENT_ISH:
+    # This version reduces memory overhead of Swish during training by
+    # recomputing torch.sigmoid(x) in backward instead of saving it.
+    class SwishAutoFn(torch.autograd.Function):
+        """Swish - Described in: https://arxiv.org/abs/1710.05941
+        Memory efficient variant from:
+         https://medium.com/the-artificial-impostor/more-memory-efficient-swish-activation-function-e07c22c12a76
+        """
+        @staticmethod
+        def forward(ctx, x):
+            # only the input is saved; sigmoid(x) is recomputed in backward instead of being kept alive
+            ctx.save_for_backward(x)
+            return x.mul(torch.sigmoid(x))
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            x = ctx.saved_tensors[0]  # `saved_variables` is the deprecated alias of `saved_tensors`
+            sigmoid_x = torch.sigmoid(x)
+            return grad_output.mul(sigmoid_x * (1 + x * (1 - sigmoid_x)))  # d/dx[x*s(x)] = s(x)*(1 + x*(1 - s(x)))
+
+    def swish(x, inplace=False):
+        # `inplace` is accepted but ignored: only `x` is forwarded to SwishAutoFn.apply
+        return SwishAutoFn.apply(x)
+
+
+    class MishAutoFn(torch.autograd.Function):
+        """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
+        Experimental memory-efficient variant
+        """
+
+        @staticmethod
+        def forward(ctx, x):
+            ctx.save_for_backward(x)
+            # x * tanh(ln(1 + exp(x))); only x is saved, softplus/tanh are recomputed in backward
+            return x.mul(torch.tanh(F.softplus(x)))
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            x = ctx.saved_tensors[0]  # `saved_variables` is the deprecated alias of `saved_tensors`
+            x_sigmoid = torch.sigmoid(x)  # derivative of softplus(x)
+            x_tanh_sp = F.softplus(x).tanh()
+            return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
+
+    def mish(x, inplace=False):
+        # `inplace` is accepted but ignored: only `x` is forwarded to MishAutoFn.apply
+        return MishAutoFn.apply(x)
+
+
+    class WishAutoFn(torch.autograd.Function):
+        """Wish: My own mistaken creation while fiddling with Mish. Did well in some experiments.
+        Experimental memory-efficient variant
+        """
+
+        @staticmethod
+        def forward(ctx, x):
+            ctx.save_for_backward(x)
+            # x * tanh(exp(x)); only x is saved, exp/tanh are recomputed in backward
+            return x.mul(torch.tanh(torch.exp(x)))
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            x = ctx.saved_tensors[0]  # `saved_variables` is the deprecated alias of `saved_tensors`
+            x_exp = x.exp()
+            x_tanh_exp = x_exp.tanh()
+            return grad_output.mul(x_tanh_exp + x * x_exp * (1 - x_tanh_exp * x_tanh_exp))  # product + chain rule
+
+    def wish(x, inplace=False):
+        # `inplace` is accepted but ignored: only `x` is forwarded to WishAutoFn.apply
+        return WishAutoFn.apply(x)
+else:
+    def swish(x, inplace=False):
+        """Swish - Described in: https://arxiv.org/abs/1710.05941
+        """
+        return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
+
+
+    def mish(x, inplace=False):
+        """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
+        """
+        inner = F.softplus(x).tanh()
+        return x.mul_(inner) if inplace else x.mul(inner)
+
+
+    def wish(x, inplace=False):
+        """Wish: My own mistaken creation while fiddling with Mish. Did well in some experiments.
+        """
+        inner = x.exp().tanh()
+        return x.mul_(inner) if inplace else x.mul(inner)
+
+
+class Swish(nn.Module):
+    def __init__(self, inplace=False):
+        super(Swish, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return swish(x, self.inplace)
+
+
+class Mish(nn.Module):
+    def __init__(self, inplace=False):
+        super(Mish, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return mish(x, self.inplace)
+
+
+class Wish(nn.Module):
+    def __init__(self, inplace=False):
+        super(Wish, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return wish(x, self.inplace)
+
+
+def sigmoid(x, inplace=False):
+    return x.sigmoid_() if inplace else x.sigmoid()
+
+
+# PyTorch has this, but not with a consistent inplace argument interface
+class Sigmoid(nn.Module):
+    def __init__(self, inplace=False):
+        super(Sigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return x.sigmoid_() if self.inplace else x.sigmoid()
+
+
+def tanh(x, inplace=False):
+    return x.tanh_() if inplace else x.tanh()
+
+
+# PyTorch has this, but not with a consistent inplace argument interface
+class Tanh(nn.Module):
+    def __init__(self, inplace=False):
+        super(Tanh, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return x.tanh_() if self.inplace else x.tanh()
+
+
+def hard_swish(x, inplace=False):
+    inner = F.relu6(x + 3.).div_(6.)
+    return x.mul_(inner) if inplace else x.mul(inner)
+
+
+class HardSwish(nn.Module):
+    def __init__(self, inplace=False):
+        super(HardSwish, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return hard_swish(x, self.inplace)
+
+
+def hard_sigmoid(x, inplace=False):
+    if inplace:
+        return x.add_(3.).clamp_(0., 6.).div_(6.)
+    else:
+        return F.relu6(x + 3.) / 6.
+
+
+class HardSigmoid(nn.Module):
+    def __init__(self, inplace=False):
+        super(HardSigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return hard_sigmoid(x, self.inplace)
+

From 35e8f0c5e726c87bce183e412588eb77912497d3 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Wed, 23 Oct 2019 11:37:03 -0700
Subject: [PATCH 14/35] Fixup a few comments, add PyTorch version aware Flatten
 and finish as_sequential for GenEfficientNet

---
 timm/models/conv2d_layers.py    |  9 ++++++++-
 timm/models/gen_efficientnet.py | 29 ++++++++++++++++++++---------
 timm/models/layers.py           | 31 +++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 10 deletions(-)
 create mode 100644 timm/models/layers.py

diff --git a/timm/models/conv2d_layers.py b/timm/models/conv2d_layers.py
index cd52b885..ea72d07c 100644
--- a/timm/models/conv2d_layers.py
+++ b/timm/models/conv2d_layers.py
@@ -8,6 +8,7 @@ import numpy as np
 import math
 
 
+# Tuple helpers ripped from PyTorch
 def _ntuple(n):
     def parse(x):
         if isinstance(x, container_abcs.Iterable):
@@ -77,7 +78,7 @@ def get_padding_value(padding, kernel_size, **kwargs):
                 # static case, no extra overhead
                 padding = _get_padding(kernel_size, **kwargs)
             else:
-                # dynamic padding
+                # dynamic 'SAME' padding, has runtime/GPU memory overhead
                 padding = 0
                 dynamic = True
         elif padding == 'valid':
@@ -101,6 +102,7 @@ def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
 
 class MixedConv2d(nn.Module):
     """ Mixed Grouped Convolution
+
     Based on MDConv and GroupedConv in MixNet impl:
       https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
     """
@@ -152,7 +154,11 @@ def get_condconv_initializer(initializer, num_experts, expert_shape):
 
 class CondConv2d(nn.Module):
     """ Conditional Convolution
+
     Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
+
+    Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
+    https://github.com/pytorch/pytorch/issues/17983
     """
 
     def __init__(self, in_channels, out_channels, kernel_size=3,
@@ -211,6 +217,7 @@ class CondConv2d(nn.Module):
         if self._use_groups:
             new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
             weight = weight.view(new_weight_shape)
+            # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
             x = x.view(1, B * C, H, W)
             out = self.conv_fn(
                 x, weight, bias, stride=self.stride, padding=self.padding,
diff --git a/timm/models/gen_efficientnet.py b/timm/models/gen_efficientnet.py
index e51bab2a..216ea6ee 100644
--- a/timm/models/gen_efficientnet.py
+++ b/timm/models/gen_efficientnet.py
@@ -2,6 +2,8 @@
 
 A generic class with building blocks to support a variety of models with efficient architectures:
 * EfficientNet (B0-B7)
+* EfficientNet-EdgeTPU
+* EfficientNet-CondConv
 * MixNet (Small, Medium, and Large)
 * MnasNet B1, A1 (SE), Small
 * MobileNet V1, V2, and V3
@@ -31,6 +33,7 @@ from .registry import register_model, model_entrypoint
 from .helpers import load_pretrained
 from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from .conv2d_layers import select_conv2d
+from .layers import Flatten
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 
 
@@ -1050,16 +1053,14 @@ class GenEfficientNet(_GenEfficientNet):
         layers = [self.conv_stem, self.bn1, self.act1]
         layers.extend(self.blocks)
         if self.head_conv == 'efficient':
-            layers.extend([self.global_pool, self.bn2, self.act2])
+            layers.extend([self.global_pool, self.conv_head, self.act2])
         else:
             layers.extend([self.conv_head, self.bn2, self.act2])
             if self.global_pool is not None:
                 layers.append(self.global_pool)
-        #append flatten layer
-        layers.append(self.classifier)
+        layers.extend([Flatten(), nn.Dropout(self.drop_rate), self.classifier])
         return nn.Sequential(*layers)
 
-
     def get_classifier(self):
         return self.classifier
 
@@ -1106,7 +1107,8 @@ class GenEfficientNetFeatures(_GenEfficientNet):
         #assert len(block_args) >= num_stages - 1
         #block_args = block_args[:num_stages - 1]
 
-        super(GenEfficientNetFeatures, self).__init__(  # FIXME it would be nice if Python made this nicer
+        # FIXME it would be nice if Python made this nicer without using kwargs and erasing IDE hints, etc
+        super(GenEfficientNetFeatures, self).__init__(
             block_args, in_chans=in_chans, stem_size=stem_size,
             output_stride=output_stride, pad_type=pad_type, act_layer=act_layer,
             drop_rate=drop_rate, drop_connect_rate=drop_connect_rate, feature_location=feature_location,
@@ -1548,6 +1550,11 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre
 
 
 def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+    """ Creates an EfficientNet-EdgeTPU model
+
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu
+    """
+
     arch_def = [
         # NOTE `fc` is present to override a mismatch between stem channels and in chs not
         # present in other models
@@ -1573,8 +1580,10 @@ def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0
 
 def _gen_efficientnet_condconv(
         variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs):
+    """Creates an EfficientNet-CondConv model.
 
-    """Creates an efficientnet-condconv model."""
+    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv
+    """
     arch_def = [
       ['ds_r1_k3_s1_e1_c16_se0.25'],
       ['ir_r2_k3_s2_e6_c24_se0.25'],
@@ -1584,6 +1593,8 @@ def _gen_efficientnet_condconv(
       ['ir_r4_k5_s2_e6_c192_se0.25_cc4'],
       ['ir_r1_k3_s1_e6_c320_se0.25_cc4'],
     ]
+    # NOTE unlike official impl, this one uses `cc<x>` option where x is the base number of experts for each stage and
+    # the expert_multiplier increases that on a per-model basis as with depth/channel multipliers
     model_kwargs = dict(
         block_args=_decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier),
         num_features=_round_channels(1280, channel_multiplier, 8, None),
@@ -2056,7 +2067,7 @@ def tf_efficientnet_el(pretrained=False, **kwargs):
 
 @register_model
 def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
-    """ EfficientNet-B0 """
+    """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.2
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
@@ -2068,7 +2079,7 @@ def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
 
 @register_model
 def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
-    """ EfficientNet-B0 """
+    """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.2
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
@@ -2080,7 +2091,7 @@ def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
 
 @register_model
 def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
-    """ EfficientNet-B0 """
+    """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.2
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
     kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
diff --git a/timm/models/layers.py b/timm/models/layers.py
new file mode 100644
index 00000000..c8e0a837
--- /dev/null
+++ b/timm/models/layers.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def versiontuple(v):  # (major, minor, patch) from leading digits; tolerates '1.0.1.post2', '1.3.0+cu92', '1.4.0a0'
+    return tuple(int(p[:len(p) - len(p.lstrip('0123456789'))] or 0) for p in v.split('.')[:3])
+
+
+if versiontuple(torch.__version__) >= versiontuple('1.2.0'):
+    Flatten = nn.Flatten
+else:
+    class Flatten(nn.Module):
+        r"""
+        Flattens a contiguous range of dims into a tensor. For use with :class:`~nn.Sequential`.
+        Args:
+            start_dim: first dim to flatten (default = 1).
+            end_dim: last dim to flatten (default = -1).
+        Shape:
+            - Input: :math:`(N, *dims)`
+            - Output: :math:`(N, \prod *dims)` (for the default case).
+        """
+        __constants__ = ['start_dim', 'end_dim']
+
+        def __init__(self, start_dim=1, end_dim=-1):
+            super(Flatten, self).__init__()
+            self.start_dim = start_dim
+            self.end_dim = end_dim
+
+        def forward(self, input):
+            return input.flatten(self.start_dim, self.end_dim)

From 23937086502fca9d308a2b9a6aba9d7ac13a5165 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Tue, 5 Nov 2019 09:55:55 -0800
Subject: [PATCH 15/35] Missed stashing of out_indices in model

---
 timm/models/gen_efficientnet.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/timm/models/gen_efficientnet.py b/timm/models/gen_efficientnet.py
index 216ea6ee..c3b1b0e2 100644
--- a/timm/models/gen_efficientnet.py
+++ b/timm/models/gen_efficientnet.py
@@ -1106,6 +1106,7 @@ class GenEfficientNetFeatures(_GenEfficientNet):
         num_stages = max(out_indices) + 1  # FIXME reduce num stages created if not needed
         #assert len(block_args) >= num_stages - 1
         #block_args = block_args[:num_stages - 1]
+        self.out_indices = out_indices
 
         # FIXME it would be nice if Python made this nicer without using kwargs and erasing IDE hints, etc
         super(GenEfficientNetFeatures, self).__init__(

From ad9334754800f820732b68532281aff6886707c0 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Mon, 25 Nov 2019 22:34:04 -0800
Subject: [PATCH 16/35] Initial HRNet classification model commit

---
 timm/models/__init__.py |   1 +
 timm/models/hrnet.py    | 869 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 870 insertions(+)
 create mode 100644 timm/models/hrnet.py

diff --git a/timm/models/__init__.py b/timm/models/__init__.py
index 3c7e8e47..08bc1699 100644
--- a/timm/models/__init__.py
+++ b/timm/models/__init__.py
@@ -13,6 +13,7 @@ from .gluon_resnet import *
 from .gluon_xception import *
 from .res2net import *
 from .dla import *
+from .hrnet import *
 
 from .registry import *
 from .factory import create_model
diff --git a/timm/models/hrnet.py b/timm/models/hrnet.py
new file mode 100644
index 00000000..59ded4ab
--- /dev/null
+++ b/timm/models/hrnet.py
@@ -0,0 +1,869 @@
+""" HRNet
+
+Copied from https://github.com/HRNet/HRNet-Image-Classification
+
+Original header:
+  Copyright (c) Microsoft
+  Licensed under the MIT License.
+  Written by Bin Xiao (Bin.Xiao@microsoft.com)
+  Modified by Ke Sun (sunk@mail.ustc.edu.cn)
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import logging
+import functools
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch._utils
+import torch.nn.functional as F
+
+from .registry import register_model
+from .helpers import load_pretrained
+from .helpers import load_pretrained
+from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+
+BN_MOMENTUM = 0.1
+logger = logging.getLogger(__name__)
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv1', 'classifier': 'fc',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'hrnet_w18_small': _cfg(url=''),
+    'hrnet_w18_small_v2': _cfg(url=''),
+    'hrnet_w18': _cfg(url=''),
+    'hrnet_w30': _cfg(url=''),
+    'hrnet_w32': _cfg(url=''),
+    'hrnet_w40': _cfg(url=''),
+    'hrnet_w44': _cfg(url=''),
+    'hrnet_w48': _cfg(url=''),
+}
+
+cfg_cls_hrnet_w18_small = dict(
+    STAGE1=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=1,
+        BLOCK='BOTTLENECK',
+        NUM_BLOCKS=(1,),
+        NUM_CHANNELS=(32,),
+        FUSE_METHOD='SUM',
+    ),
+    STAGE2=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=2,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(2, 2),
+        NUM_CHANNELS=(16, 32),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE3=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=3,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(2, 2, 2),
+        NUM_CHANNELS=(16, 32, 64),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE4=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=4,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(2, 2, 2, 2),
+        NUM_CHANNELS=(16, 32, 64, 128),
+        FUSE_METHOD='SUM',
+    ),
+)
+
+
+cfg_cls_hrnet_w18_small_v2 = dict(
+    STAGE1=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=1,
+        BLOCK='BOTTLENECK',
+        NUM_BLOCKS=(2,),
+        NUM_CHANNELS=(64,),
+        FUSE_METHOD='SUM',
+    ),
+    STAGE2=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=2,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(2, 2),
+        NUM_CHANNELS=(18, 36),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE3=dict(
+        NUM_MODULES=3,
+        NUM_BRANCHES=3,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(2, 2, 2),
+        NUM_CHANNELS=(18, 36, 72),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE4=dict(
+        NUM_MODULES=2,
+        NUM_BRANCHES=4,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(2, 2, 2, 2),
+        NUM_CHANNELS=(18, 36, 72, 144),
+        FUSE_METHOD='SUM',
+    ),
+)
+
+cfg_cls_hrnet_w18 = dict(
+    STAGE1=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=1,
+        BLOCK='BOTTLENECK',
+        NUM_BLOCKS=(4,),
+        NUM_CHANNELS=(64,),
+        FUSE_METHOD='SUM',
+    ),
+    STAGE2=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=2,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4),
+        NUM_CHANNELS=(18, 36),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE3=dict(
+        NUM_MODULES=4,
+        NUM_BRANCHES=3,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4),
+        NUM_CHANNELS=(18, 36, 72),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE4=dict(
+        NUM_MODULES=3,
+        NUM_BRANCHES=4,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4, 4),
+        NUM_CHANNELS=(18, 36, 72, 144),
+        FUSE_METHOD='SUM',
+    ),
+)
+
+
+cfg_cls_hrnet_w30 = dict(
+    STAGE1=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=1,
+        BLOCK='BOTTLENECK',
+        NUM_BLOCKS=(4,),
+        NUM_CHANNELS=(64,),
+        FUSE_METHOD='SUM',
+    ),
+    STAGE2=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=2,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4),
+        NUM_CHANNELS=(30, 60),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE3=dict(
+        NUM_MODULES=4,
+        NUM_BRANCHES=3,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4),
+        NUM_CHANNELS=(30, 60, 120),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE4=dict(
+        NUM_MODULES=3,
+        NUM_BRANCHES=4,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4, 4),
+        NUM_CHANNELS=(30, 60, 120, 240),
+        FUSE_METHOD='SUM',
+    ),
+)
+
+
+cfg_cls_hrnet_w32 = dict(
+    STAGE1=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=1,
+        BLOCK='BOTTLENECK',
+        NUM_BLOCKS=(4,),
+        NUM_CHANNELS=(64,),
+        FUSE_METHOD='SUM',
+    ),
+    STAGE2=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=2,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4),
+        NUM_CHANNELS=(32, 64),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE3=dict(
+        NUM_MODULES=4,
+        NUM_BRANCHES=3,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4),
+        NUM_CHANNELS=(32, 64, 128),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE4=dict(
+        NUM_MODULES=3,
+        NUM_BRANCHES=4,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4, 4),
+        NUM_CHANNELS=(32, 64, 128, 256),
+        FUSE_METHOD='SUM',
+    ),
+)
+
+cfg_cls_hrnet_w40 = dict(
+    STAGE1=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=1,
+        BLOCK='BOTTLENECK',
+        NUM_BLOCKS=(4,),
+        NUM_CHANNELS=(64,),
+        FUSE_METHOD='SUM',
+    ),
+    STAGE2=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=2,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4),
+        NUM_CHANNELS=(40, 80),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE3=dict(
+        NUM_MODULES=4,
+        NUM_BRANCHES=3,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4),
+        NUM_CHANNELS=(40, 80, 160),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE4=dict(
+        NUM_MODULES=3,
+        NUM_BRANCHES=4,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4, 4),
+        NUM_CHANNELS=(40, 80, 160, 320),
+        FUSE_METHOD='SUM',
+    ),
+)
+
+
+cfg_cls_hrnet_w44 = dict(
+    STAGE1=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=1,
+        BLOCK='BOTTLENECK',
+        NUM_BLOCKS=(4,),
+        NUM_CHANNELS=(64,),
+        FUSE_METHOD='SUM',
+    ),
+    STAGE2=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=2,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4),
+        NUM_CHANNELS=(44, 88),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE3=dict(
+        NUM_MODULES=4,
+        NUM_BRANCHES=3,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4),
+        NUM_CHANNELS=(44, 88, 176),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE4=dict(
+        NUM_MODULES=3,
+        NUM_BRANCHES=4,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4, 4),
+        NUM_CHANNELS=(44, 88, 176, 352),
+        FUSE_METHOD='SUM',
+    ),
+)
+
+
+cfg_cls_hrnet_w48 = dict(
+    STAGE1=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=1,
+        BLOCK='BOTTLENECK',
+        NUM_BLOCKS=(4,),
+        NUM_CHANNELS=(64,),
+        FUSE_METHOD='SUM',
+    ),
+    STAGE2=dict(
+        NUM_MODULES=1,
+        NUM_BRANCHES=2,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4),
+        NUM_CHANNELS=(48, 96),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE3=dict(
+        NUM_MODULES=4,
+        NUM_BRANCHES=3,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4),
+        NUM_CHANNELS=(48, 96, 192),
+        FUSE_METHOD='SUM'
+    ),
+    STAGE4=dict(
+        NUM_MODULES=3,
+        NUM_BRANCHES=4,
+        BLOCK='BASIC',
+        NUM_BLOCKS=(4, 4, 4, 4),
+        NUM_CHANNELS=(48, 96, 192, 384),
+        FUSE_METHOD='SUM',
+    ),
+)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """3x3 convolution with padding"""
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):  # residual block: two 3x3 conv + BN stages plus a skip connection
+    expansion = 1  # block output channels = planes * expansion
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+        # main path, first stage: 3x3 conv (carries the stride) -> BN -> ReLU
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        # main path, second stage: 3x3 conv -> BN; the activation is deferred until after the residual add
+        out = self.conv2(out)
+        out = self.bn2(out)
+        # skip path: use the caller-supplied `downsample` module when present, otherwise the unmodified input
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        # in-place add of the skip branch, then the same in-place ReLU module used above
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):  # residual block: 1x1 -> 3x3 -> 1x1 conv + BN stages plus a skip connection
+    expansion = 4  # block output channels = planes * expansion (see conv3 / bn3)
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(
+            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(
+            planes, planes * self.expansion, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(
+            planes * self.expansion, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+        # 1x1 conv from inplanes to planes -> BN -> ReLU
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        # 3x3 conv (the only strided layer in the block) -> BN -> ReLU
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+        # 1x1 conv to planes * expansion channels -> BN; the activation is deferred until after the residual add
+        out = self.conv3(out)
+        out = self.bn3(out)
+        # skip path: use the caller-supplied `downsample` module when present, otherwise the unmodified input
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        # in-place add of the skip branch, then the same in-place ReLU module used above
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class HighResolutionModule(nn.Module):  # per-branch block stacks followed by cross-branch fusion
+    def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
+                 num_channels, fuse_method, multi_scale_output=True):
+        super(HighResolutionModule, self).__init__()
+        self._check_branches(
+            num_branches, blocks, num_blocks, num_inchannels, num_channels)
+
+        self.num_inchannels = num_inchannels  # kept by reference (no copy); _make_one_branch writes into it
+        self.fuse_method = fuse_method  # stored only; not read by any method of this class
+        self.num_branches = num_branches
+
+        self.multi_scale_output = multi_scale_output
+        # NOTE: _make_branches updates self.num_inchannels, so it must run before _make_fuse_layers reads it
+        self.branches = self._make_branches(
+            num_branches, blocks, num_blocks, num_channels)
+        self.fuse_layers = self._make_fuse_layers()
+        self.relu = nn.ReLU(False)
+
+    def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels):
+        if num_branches != len(num_blocks):
+            error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
+                num_branches, len(num_blocks))
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if num_branches != len(num_channels):
+            error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
+                num_branches, len(num_channels))
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if num_branches != len(num_inchannels):
+            error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
+                num_branches, len(num_inchannels))
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+    def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
+                         stride=1):
+        downsample = None
+        if stride != 1 or \
+                self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.num_inchannels[branch_index], num_channels[branch_index] * block.expansion,
+                    kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM),
+            )
+        # only the first block gets the stride / projection; the branch's in-channel entry is then updated in place
+        layers = []
+        layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample))
+        self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
+        for i in range(1, num_blocks[branch_index]):
+            layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index]))
+
+        return nn.Sequential(*layers)
+
+    def _make_branches(self, num_branches, block, num_blocks, num_channels):
+        branches = []
+
+        for i in range(num_branches):
+            branches.append(self._make_one_branch(i, block, num_blocks, num_channels))
+
+        return nn.ModuleList(branches)
+
+    def _make_fuse_layers(self):
+        if self.num_branches == 1:
+            return None
+
+        num_branches = self.num_branches
+        num_inchannels = self.num_inchannels
+        fuse_layers = []
+        for i in range(num_branches if self.multi_scale_output else 1):
+            fuse_layer = []
+            for j in range(num_branches):
+                if j > i:  # source index above target: 1x1 conv + BN, then nearest upsample by 2 ** (j - i)
+                    fuse_layer.append(nn.Sequential(
+                        nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False),
+                        nn.BatchNorm2d(num_inchannels[i], momentum=BN_MOMENTUM),
+                        nn.Upsample(scale_factor=2 ** (j - i), mode='nearest')))
+                elif j == i:  # same branch: placeholder only, forward adds x[j] directly
+                    fuse_layer.append(None)
+                else:  # j < i: chain of (i - j) stride-2 3x3 convs; only the last changes channels and omits ReLU
+                    conv3x3s = []
+                    for k in range(i - j):
+                        if k == i - j - 1:
+                            num_outchannels_conv3x3 = num_inchannels[i]
+                            conv3x3s.append(nn.Sequential(
+                                nn.Conv2d(num_inchannels[j], num_outchannels_conv3x3, 3, 2, 1, bias=False),
+                                nn.BatchNorm2d(num_outchannels_conv3x3, momentum=BN_MOMENTUM)))
+                        else:
+                            num_outchannels_conv3x3 = num_inchannels[j]
+                            conv3x3s.append(nn.Sequential(
+                                nn.Conv2d(num_inchannels[j], num_outchannels_conv3x3, 3, 2, 1, bias=False),
+                                nn.BatchNorm2d(num_outchannels_conv3x3, momentum=BN_MOMENTUM),
+                                nn.ReLU(False)))
+                    fuse_layer.append(nn.Sequential(*conv3x3s))
+            fuse_layers.append(nn.ModuleList(fuse_layer))
+
+        return nn.ModuleList(fuse_layers)
+
+    def get_num_inchannels(self):
+        return self.num_inchannels
+
+    def forward(self, x):
+        if self.num_branches == 1:
+            return [self.branches[0](x[0])]
+        # NOTE: the loop below overwrites the entries of the caller's sequence `x` in place
+        for i in range(self.num_branches):
+            x[i] = self.branches[i](x[i])
+        # fused output i = ReLU(sum over branches j of x[j], passed through fuse_layers[i][j] when j != i)
+        x_fuse = []
+        for i in range(len(self.fuse_layers)):
+            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
+            for j in range(1, self.num_branches):
+                if i == j:
+                    y = y + x[j]
+                else:
+                    y = y + self.fuse_layers[i][j](x[j])
+            x_fuse.append(self.relu(y))
+
+        return x_fuse
+
+
+blocks_dict = {
+    'BASIC': BasicBlock,
+    'BOTTLENECK': Bottleneck
+}
+
+
+class HighResolutionNet(nn.Module):
+
+    def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg'):
+        super(HighResolutionNet, self).__init__()
+
+        self.conv1 = nn.Conv2d(in_chans, 64, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.stage1_cfg = cfg['STAGE1']
+        num_channels = self.stage1_cfg['NUM_CHANNELS'][0]
+        block = blocks_dict[self.stage1_cfg['BLOCK']]
+        num_blocks = self.stage1_cfg['NUM_BLOCKS'][0]
+        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
+        stage1_out_channel = block.expansion * num_channels
+
+        self.stage2_cfg = cfg['STAGE2']
+        num_channels = self.stage2_cfg['NUM_CHANNELS']
+        block = blocks_dict[self.stage2_cfg['BLOCK']]
+        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+        self.transition1 = self._make_transition_layer([stage1_out_channel], num_channels)
+        self.stage2, pre_stage_channels = self._make_stage(self.stage2_cfg, num_channels)
+
+        self.stage3_cfg = cfg['STAGE3']
+        num_channels = self.stage3_cfg['NUM_CHANNELS']
+        block = blocks_dict[self.stage3_cfg['BLOCK']]
+        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+        self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels)
+        self.stage3, pre_stage_channels = self._make_stage(self.stage3_cfg, num_channels)
+
+        self.stage4_cfg = cfg['STAGE4']
+        num_channels = self.stage4_cfg['NUM_CHANNELS']
+        block = blocks_dict[self.stage4_cfg['BLOCK']]
+        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+        self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels)
+        self.stage4, pre_stage_channels = self._make_stage(self.stage4_cfg, num_channels, multi_scale_output=True)
+
+        # Classification Head
+        self.incre_modules, self.downsamp_modules, self.final_layer = self._make_head(pre_stage_channels)
+
+        self.classifier = nn.Linear(2048, num_classes)
+
+        self.init_weights()
+
+    def _make_head(self, pre_stage_channels):
+        head_block = Bottleneck
+        head_channels = [32, 64, 128, 256]
+
+        # Increasing the #channels on each resolution
+        # from C, 2C, 4C, 8C to 128, 256, 512, 1024
+        incre_modules = []
+        for i, channels in enumerate(pre_stage_channels):
+            incre_modules.append(
+                self._make_layer(head_block, channels, head_channels[i], 1, stride=1))
+        incre_modules = nn.ModuleList(incre_modules)
+
+        # downsampling modules
+        downsamp_modules = []
+        for i in range(len(pre_stage_channels) - 1):
+            in_channels = head_channels[i] * head_block.expansion
+            out_channels = head_channels[i + 1] * head_block.expansion
+            downsamp_module = nn.Sequential(
+                nn.Conv2d(
+                    in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2, padding=1),
+                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM),
+                nn.ReLU(inplace=True)
+            )
+            downsamp_modules.append(downsamp_module)
+        downsamp_modules = nn.ModuleList(downsamp_modules)
+
+        final_layer = nn.Sequential(
+            nn.Conv2d(
+                in_channels=head_channels[3] * head_block.expansion,
+                out_channels=2048, kernel_size=1, stride=1, padding=0
+            ),
+            nn.BatchNorm2d(2048, momentum=BN_MOMENTUM),
+            nn.ReLU(inplace=True)
+        )
+
+        return incre_modules, downsamp_modules, final_layer
+
+    def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
+        num_branches_cur = len(num_channels_cur_layer)
+        num_branches_pre = len(num_channels_pre_layer)
+
+        transition_layers = []
+        for i in range(num_branches_cur):
+            if i < num_branches_pre:
+                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
+                    transition_layers.append(nn.Sequential(
+                        nn.Conv2d(num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False),
+                        nn.BatchNorm2d(num_channels_cur_layer[i], momentum=BN_MOMENTUM),
+                        nn.ReLU(inplace=True)))
+                else:
+                    transition_layers.append(None)
+            else:
+                conv3x3s = []
+                for j in range(i + 1 - num_branches_pre):
+                    inchannels = num_channels_pre_layer[-1]
+                    outchannels = num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels
+                    conv3x3s.append(nn.Sequential(
+                        nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False),
+                        nn.BatchNorm2d(outchannels, momentum=BN_MOMENTUM),
+                        nn.ReLU(inplace=True)))
+                transition_layers.append(nn.Sequential(*conv3x3s))
+
+        return nn.ModuleList(transition_layers)
+
+    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
+            )
+
+        layers = []
+        layers.append(block(inplanes, planes, stride, downsample))
+        inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _make_stage(self, layer_config, num_inchannels, multi_scale_output=True):
+        num_modules = layer_config['NUM_MODULES']
+        num_branches = layer_config['NUM_BRANCHES']
+        num_blocks = layer_config['NUM_BLOCKS']
+        num_channels = layer_config['NUM_CHANNELS']
+        block = blocks_dict[layer_config['BLOCK']]
+        fuse_method = layer_config['FUSE_METHOD']
+
+        modules = []
+        for i in range(num_modules):
+            # multi_scale_output only applies to the last module
+            if not multi_scale_output and i == num_modules - 1:
+                reset_multi_scale_output = False
+            else:
+                reset_multi_scale_output = True
+
+            modules.append(HighResolutionModule(
+                num_branches, block, num_blocks, num_inchannels, num_channels, fuse_method, reset_multi_scale_output)
+            )
+            num_inchannels = modules[-1].get_num_inchannels()
+
+        return nn.Sequential(*modules), num_inchannels
+
+    def init_weights(self, pretrained='', ):
+        logger.info('=> init weights from normal distribution')
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+        x = self.layer1(x)
+
+        x_list = []
+        for i in range(self.stage2_cfg['NUM_BRANCHES']):
+            if self.transition1[i] is not None:
+                x_list.append(self.transition1[i](x))
+            else:
+                x_list.append(x)
+        y_list = self.stage2(x_list)
+
+        x_list = []
+        for i in range(self.stage3_cfg['NUM_BRANCHES']):
+            if self.transition2[i] is not None:
+                x_list.append(self.transition2[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage3(x_list)
+
+        x_list = []
+        for i in range(self.stage4_cfg['NUM_BRANCHES']):
+            if self.transition3[i] is not None:
+                x_list.append(self.transition3[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage4(x_list)
+
+        # Classification Head
+        y = self.incre_modules[0](y_list[0])
+        for i in range(len(self.downsamp_modules)):
+            y = self.incre_modules[i + 1](y_list[i + 1]) + self.downsamp_modules[i](y)
+
+        y = self.final_layer(y)
+
+        if torch._C._get_tracing_state():
+            y = y.flatten(start_dim=2).mean(dim=2)
+        else:
+            y = F.avg_pool2d(y, kernel_size=y.size()[2:]).view(y.size(0), -1)
+
+        y = self.classifier(y)
+
+        return y
+
+
+
+@register_model
+def hrnet_w18_small(pretrained=True, **kwargs):
+    default_cfg = default_cfgs['hrnet_w18_small']
+    model = HighResolutionNet(cfg_cls_hrnet_w18_small, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=kwargs.get('num_classes', 0),
+            in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def hrnet_w18_small_v2(pretrained=True, **kwargs):
+    default_cfg = default_cfgs['hrnet_w18_small_v2']
+    model = HighResolutionNet(cfg_cls_hrnet_w18_small_v2, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=kwargs.get('num_classes', 0),
+            in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def hrnet_w18(pretrained=True, **kwargs):
+    default_cfg = default_cfgs['hrnet_w18']
+    model = HighResolutionNet(cfg_cls_hrnet_w18, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=kwargs.get('num_classes', 0),
+            in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def hrnet_w30(pretrained=True, **kwargs):
+    default_cfg = default_cfgs['hrnet_w30']
+    model = HighResolutionNet(cfg_cls_hrnet_w30, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=kwargs.get('num_classes', 0),
+            in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def hrnet_w32(pretrained=True, **kwargs):
+    default_cfg = default_cfgs['hrnet_w32']
+    model = HighResolutionNet(cfg_cls_hrnet_w32, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=kwargs.get('num_classes', 0),
+            in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def hrnet_w40(pretrained=True, **kwargs):
+    default_cfg = default_cfgs['hrnet_w40']
+    model = HighResolutionNet(cfg_cls_hrnet_w40, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=kwargs.get('num_classes', 0),
+            in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def hrnet_w44(pretrained=True, **kwargs):
+    default_cfg = default_cfgs['hrnet_w44']
+    model = HighResolutionNet(cfg_cls_hrnet_w44, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=kwargs.get('num_classes', 0),
+            in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def hrnet_w48(pretrained=True, **kwargs):
+    default_cfg = default_cfgs['hrnet_w48']
+    model = HighResolutionNet(cfg_cls_hrnet_w48, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model,
+            default_cfg,
+            num_classes=kwargs.get('num_classes', 0),
+            in_chans=kwargs.get('in_chans', 3))
+    return model

From a39cc433748ca509e627229f34b085709e3ee2b7 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 28 Nov 2019 16:40:35 -0800
Subject: [PATCH 17/35] Bring EfficientNet and MobileNetV3 up to date with my
 gen-efficientnet repo * Split MobileNetV3 and EfficientNet model files and
 put builder and blocks in own files (getting too large) * Finalize CondConv
 EfficientNet variant * Add the AdvProp weights files and B8 EfficientNet
 model * Refine the feature extraction module for EfficientNet and MobileNetV3

---
 timm/models/__init__.py             |    1 +
 timm/models/activations.py          |  105 +-
 timm/models/conv2d_layers.py        |   92 +-
 timm/models/efficientnet_blocks.py  |  404 ++++++++
 timm/models/efficientnet_builder.py |  402 ++++++++
 timm/models/feature_hooks.py        |   31 +
 timm/models/gen_efficientnet.py     | 1411 +++++----------------------
 timm/models/layers.py               |   31 -
 timm/models/mobilenetv3.py          |  439 +++++++++
 9 files changed, 1623 insertions(+), 1293 deletions(-)
 create mode 100644 timm/models/efficientnet_blocks.py
 create mode 100644 timm/models/efficientnet_builder.py
 create mode 100644 timm/models/feature_hooks.py
 delete mode 100644 timm/models/layers.py
 create mode 100644 timm/models/mobilenetv3.py

diff --git a/timm/models/__init__.py b/timm/models/__init__.py
index 08bc1699..4ef966ea 100644
--- a/timm/models/__init__.py
+++ b/timm/models/__init__.py
@@ -8,6 +8,7 @@ from .xception import *
 from .nasnet import *
 from .pnasnet import *
 from .gen_efficientnet import *
+from .mobilenetv3 import *
 from .inception_v3 import *
 from .gluon_resnet import *
 from .gluon_xception import *
diff --git a/timm/models/activations.py b/timm/models/activations.py
index aa29b84d..aafa290c 100644
--- a/timm/models/activations.py
+++ b/timm/models/activations.py
@@ -7,72 +7,64 @@ _USE_MEM_EFFICIENT_ISH = True
 if _USE_MEM_EFFICIENT_ISH:
     # This version reduces memory overhead of Swish during training by
     # recomputing torch.sigmoid(x) in backward instead of saving it.
-    class SwishAutoFn(torch.autograd.Function):
-        """Swish - Described in: https://arxiv.org/abs/1710.05941
-        Memory efficient variant from:
-         https://medium.com/the-artificial-impostor/more-memory-efficient-swish-activation-function-e07c22c12a76
-        """
-        @staticmethod
-        def forward(ctx, x):
-            result = x.mul(torch.sigmoid(x))
-            ctx.save_for_backward(x)
-            return result
-
-        @staticmethod
-        def backward(ctx, grad_output):
-            x = ctx.saved_variables[0]
-            sigmoid_x = torch.sigmoid(x)
-            return grad_output.mul(sigmoid_x * (1 + x * (1 - sigmoid_x)))
-
-    def swish(x, inplace=False):
-        # inplace ignored
-        return SwishAutoFn.apply(x)
+    @torch.jit.script
+    def swish_jit_fwd(x):
+        return x.mul(torch.sigmoid(x))
 
 
-    class MishAutoFn(torch.autograd.Function):
-        """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
-        Experimental memory-efficient variant
+    @torch.jit.script
+    def swish_jit_bwd(x, grad_output):
+        x_sigmoid = torch.sigmoid(x)
+        return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
+
+
+    class SwishJitAutoFn(torch.autograd.Function):
+        """ torch.jit.script optimised Swish
+        Inspired by conversation btw Jeremy Howard & Adam Paszke
+        https://twitter.com/jeremyphoward/status/1188251041835315200
         """
 
         @staticmethod
         def forward(ctx, x):
             ctx.save_for_backward(x)
-            y = x.mul(torch.tanh(F.softplus(x)))  # x * tanh(ln(1 + exp(x)))
-            return y
+            return swish_jit_fwd(x)
 
         @staticmethod
         def backward(ctx, grad_output):
-            x = ctx.saved_variables[0]
-            x_sigmoid = torch.sigmoid(x)
-            x_tanh_sp = F.softplus(x).tanh()
-            return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
-
-    def mish(x, inplace=False):
-        # inplace ignored
-        return MishAutoFn.apply(x)
+            x = ctx.saved_tensors[0]
+            return swish_jit_bwd(x, grad_output)
 
 
-    class WishAutoFn(torch.autograd.Function):
-        """Wish: My own mistaken creation while fiddling with Mish. Did well in some experiments.
-        Experimental memory-efficient variant
-        """
+    def swish(x, _inplace=False):
+        return SwishJitAutoFn.apply(x)
 
+
+    @torch.jit.script
+    def mish_jit_fwd(x):
+        return x.mul(torch.tanh(F.softplus(x)))
+
+
+    @torch.jit.script
+    def mish_jit_bwd(x, grad_output):
+        x_sigmoid = torch.sigmoid(x)
+        x_tanh_sp = F.softplus(x).tanh()
+        return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
+
+
+    class MishJitAutoFn(torch.autograd.Function):
         @staticmethod
         def forward(ctx, x):
             ctx.save_for_backward(x)
-            y = x.mul(torch.tanh(torch.exp(x)))
-            return y
+            return mish_jit_fwd(x)
 
         @staticmethod
         def backward(ctx, grad_output):
-            x = ctx.saved_variables[0]
-            x_exp = x.exp()
-            x_tanh_exp = x_exp.tanh()
-            return grad_output.mul(x_tanh_exp + x * x_exp * (1 - x_tanh_exp * x_tanh_exp))
+            x = ctx.saved_tensors[0]
+            return mish_jit_bwd(x, grad_output)
+
+    def mish(x, _inplace=False):
+        return MishJitAutoFn.apply(x)
 
-    def wish(x, inplace=False):
-        # inplace ignored
-        return WishAutoFn.apply(x)
 else:
     def swish(x, inplace=False):
         """Swish - Described in: https://arxiv.org/abs/1710.05941
@@ -80,18 +72,10 @@ else:
         return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
 
 
-    def mish(x, inplace=False):
+    def mish(x, _inplace=False):
         """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
         """
-        inner = F.softplus(x).tanh()
-        return x.mul_(inner) if inplace else x.mul(inner)
-
-
-    def wish(x, inplace=False):
-        """Wish: My own mistaken creation while fiddling with Mish. Did well in some experiments.
-        """
-        inner = x.exp().tanh()
-        return x.mul_(inner) if inplace else x.mul(inner)
+        return x.mul(F.softplus(x).tanh())
 
 
 class Swish(nn.Module):
@@ -112,15 +96,6 @@ class Mish(nn.Module):
         return mish(x, self.inplace)
 
 
-class Wish(nn.Module):
-    def __init__(self, inplace=False):
-        super(Wish, self).__init__()
-        self.inplace = inplace
-
-    def forward(self, x):
-        return wish(x, self.inplace)
-
-
 def sigmoid(x, inplace=False):
     return x.sigmoid_() if inplace else x.sigmoid()
 
diff --git a/timm/models/conv2d_layers.py b/timm/models/conv2d_layers.py
index ea72d07c..acd14fde 100644
--- a/timm/models/conv2d_layers.py
+++ b/timm/models/conv2d_layers.py
@@ -102,13 +102,14 @@ def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
 
 class MixedConv2d(nn.Module):
     """ Mixed Grouped Convolution
-
     Based on MDConv and GroupedConv in MixNet impl:
       https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
+
+    NOTE: This does not currently work with torch.jit.script
     """
 
     def __init__(self, in_channels, out_channels, kernel_size=3,
-                 stride=1, padding='', dilation=1, mixed_dilated=False, depthwise=False, **kwargs):
+                 stride=1, padding='', dilation=1, depthwise=False, **kwargs):
         super(MixedConv2d, self).__init__()
 
         kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size]
@@ -118,17 +119,13 @@ class MixedConv2d(nn.Module):
         self.in_channels = sum(in_splits)
         self.out_channels = sum(out_splits)
         for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)):
-            d = dilation
-            # FIXME make compat with non-square kernel/dilations/strides
-            if stride == 1 and mixed_dilated:
-                d, k = (k - 1) // 2, 3
             conv_groups = out_ch if depthwise else 1
             # use add_module to keep key space clean
             self.add_module(
                 str(idx),
                 create_conv2d_pad(
                     in_ch, out_ch, k, stride=stride,
-                    padding=padding, dilation=d, groups=conv_groups, **kwargs)
+                    padding=padding, dilation=dilation, groups=conv_groups, **kwargs)
             )
         self.splits = in_splits
 
@@ -154,12 +151,12 @@ def get_condconv_initializer(initializer, num_experts, expert_shape):
 
 class CondConv2d(nn.Module):
     """ Conditional Convolution
-
     Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
 
     Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
     https://github.com/pytorch/pytorch/issues/17983
     """
+    __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
 
     def __init__(self, in_channels, out_channels, kernel_size=3,
                  stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
@@ -171,13 +168,10 @@ class CondConv2d(nn.Module):
         self.stride = _pair(stride)
         padding_val, is_padding_dynamic = get_padding_value(
             padding, kernel_size, stride=stride, dilation=dilation)
-        self.conv_fn = conv2d_same if is_padding_dynamic else F.conv2d
+        self.dynamic_padding = is_padding_dynamic  # if in forward to work with torchscript
         self.padding = _pair(padding_val)
         self.dilation = _pair(dilation)
-        self.transposed = False
-        self.output_padding = _pair(0)
         self.groups = groups
-        self.padding_mode = 'zero'
         self.num_experts = num_experts
 
         self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
@@ -186,24 +180,19 @@ class CondConv2d(nn.Module):
             weight_num_param *= wd
         self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
 
-        # FIXME I haven't tested bias yet
         if bias:
             self.bias_shape = (self.out_channels,)
-            condconv_bias_shape = (self.num_experts, self.out_channels)
-            self.bias = torch.nn.Parameter(torch.Tensor(condconv_bias_shape))
+            self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
         else:
             self.register_parameter('bias', None)
 
         self.reset_parameters()
-        # FIXME once I'm satisfied this works, remove the looping path?
-        self._use_groups = True  # use groups for parallel per-batch-element kernel convolution
 
     def reset_parameters(self):
         init_weight = get_condconv_initializer(
             partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
         init_weight(self.weight)
         if self.bias is not None:
-            # FIXME bias not tested
             fan_in = np.prod(self.weight_shape[1:])
             bound = 1 / math.sqrt(fan_in)
             init_bias = get_condconv_initializer(
@@ -211,35 +200,43 @@ class CondConv2d(nn.Module):
             init_bias(self.bias)
 
     def forward(self, x, routing_weights):
-        weight = torch.matmul(routing_weights, self.weight)
-        bias = torch.matmul(routing_weights, self.bias) if self.bias is not None else None
         B, C, H, W = x.shape
-        if self._use_groups:
-            new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
-            weight = weight.view(new_weight_shape)
-            # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
-            x = x.view(1, B * C, H, W)
-            out = self.conv_fn(
+        weight = torch.matmul(routing_weights, self.weight)
+        new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
+        weight = weight.view(new_weight_shape)
+        bias = None
+        if self.bias is not None:
+            bias = torch.matmul(routing_weights, self.bias)
+            bias = bias.view(B * self.out_channels)
+        # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
+        x = x.view(1, B * C, H, W)
+        if self.dynamic_padding:
+            out = conv2d_same(
                 x, weight, bias, stride=self.stride, padding=self.padding,
                 dilation=self.dilation, groups=self.groups * B)
-            out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
         else:
-            x = torch.split(x, 1, 0)
-            weight = torch.split(weight, 1, 0)
-            if self.bias is not None:
-                bias = torch.matmul(routing_weights, self.bias)
-                bias = torch.split(bias, 1, 0)
-            else:
-                bias = [None] * B
-            out = []
-            for xi, wi, bi in zip(x, weight, bias):
-                wi = wi.view(*self.weight_shape)
-                if bi is not None:
-                    bi = bi.view(*self.bias_shape)
-                out.append(self.conv_fn(
-                    xi, wi, bi, stride=self.stride, padding=self.padding,
-                    dilation=self.dilation, groups=self.groups))
-            out = torch.cat(out, 0)
+            out = F.conv2d(
+                x, weight, bias, stride=self.stride, padding=self.padding,
+                dilation=self.dilation, groups=self.groups * B)
+        out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
+
+        # Literal port (from TF definition)
+        # x = torch.split(x, 1, 0)
+        # weight = torch.split(weight, 1, 0)
+        # if self.bias is not None:
+        #     bias = torch.matmul(routing_weights, self.bias)
+        #     bias = torch.split(bias, 1, 0)
+        # else:
+        #     bias = [None] * B
+        # out = []
+        # for xi, wi, bi in zip(x, weight, bias):
+        #     wi = wi.view(*self.weight_shape)
+        #     if bi is not None:
+        #         bi = bi.view(*self.bias_shape)
+        #     out.append(self.conv_fn(
+        #         xi, wi, bi, stride=self.stride, padding=self.padding,
+        #         dilation=self.dilation, groups=self.groups))
+        # out = torch.cat(out, 0)
         return out
 
 
@@ -250,13 +247,14 @@ def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
         assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
         # We're going to use only lists for defining the MixedConv2d kernel groups,
         # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
-        return MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
+        m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
     else:
         depthwise = kwargs.pop('depthwise', False)
         groups = out_chs if depthwise else 1
         if 'num_experts' in kwargs and kwargs['num_experts'] > 0:
-            create_fn = CondConv2d
+            m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
         else:
-            create_fn = create_conv2d_pad
-        return create_fn(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+            m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+    return m
+
 
diff --git a/timm/models/efficientnet_blocks.py b/timm/models/efficientnet_blocks.py
new file mode 100644
index 00000000..13ab051a
--- /dev/null
+++ b/timm/models/efficientnet_blocks.py
@@ -0,0 +1,404 @@
+
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .activations import sigmoid
+from .conv2d_layers import *
+
+
+# Defaults used for Google/Tensorflow training of mobile networks w/ RMSprop as per
+# papers and TF reference implementations. PT momentum equiv for TF decay is (1 - TF decay)
+# NOTE: momentum varies btw .99 and .9997 depending on source
+# .99 in official TF TPU impl
+# .9997 (w/ .999 in search space) for paper
+BN_MOMENTUM_TF_DEFAULT = 1 - 0.99  # PyTorch-style momentum for a TF decay of .99
+BN_EPS_TF_DEFAULT = 1e-3
+_BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT)  # shared template, copied by get_bn_args_tf
+
+
+def get_bn_args_tf():  # returns a copy so callers cannot mutate the shared _BN_ARGS_TF dict
+    return _BN_ARGS_TF.copy()
+
+
+def resolve_bn_args(kwargs):  # NOTE: pops 'bn_tf', 'bn_momentum' and 'bn_eps' from kwargs (mutates the caller's dict)
+    bn_args = get_bn_args_tf() if kwargs.pop('bn_tf', False) else {}  # TF defaults only when bn_tf is truthy
+    bn_momentum = kwargs.pop('bn_momentum', None)
+    if bn_momentum is not None:
+        bn_args['momentum'] = bn_momentum  # explicit value wins over the TF default
+    bn_eps = kwargs.pop('bn_eps', None)
+    if bn_eps is not None:
+        bn_args['eps'] = bn_eps  # explicit value wins over the TF default
+    return bn_args
+
+
+_SE_ARGS_DEFAULT = dict(  # fallback values that resolve_se_args fills into a block's se_kwargs
+    gate_fn=sigmoid,
+    act_layer=None,  # None -> resolve_se_args substitutes the act_layer passed by the containing block
+    reduce_mid=False,  # False -> resolve_se_args sets reduced_base_chs to the in_chs it is given
+    divisor=1)
+
+
+def resolve_se_args(kwargs, in_chs, act_layer=None):  # builds a new dict; the kwargs argument itself is not modified
+    se_kwargs = kwargs.copy() if kwargs is not None else {}
+    # fill in args that aren't specified with the defaults
+    for k, v in _SE_ARGS_DEFAULT.items():
+        se_kwargs.setdefault(k, v)
+    # some models, like MobileNetV3, calculate SE reduction chs from the containing block's mid_chs instead of in_chs
+    if not se_kwargs.pop('reduce_mid'):
+        se_kwargs['reduced_base_chs'] = in_chs
+    # act_layer override, if it remains None, the containing block's act_layer will be used
+    if se_kwargs['act_layer'] is None:
+        assert act_layer is not None
+        se_kwargs['act_layer'] = act_layer
+    return se_kwargs
+
+
+def make_divisible(v, divisor=8, min_value=None):  # round v to a multiple of divisor (halves round up), at least min_value
+    min_value = min_value or divisor  # a falsy min_value (None or 0) falls back to divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None):
+    """Scale channels by multiplier and round with make_divisible; a falsy multiplier returns channels untouched."""
+    if not multiplier:
+        return channels
+    channels *= multiplier
+    return make_divisible(channels, divisor, channel_min)
+
+
+def drop_connect(inputs, training=False, drop_connect_rate=0.):
+    """Zero whole samples (dim 0) with probability drop_connect_rate when training; kept ones are scaled by 1/keep_prob."""
+    if not training:
+        return inputs  # identity outside of training
+
+    keep_prob = 1 - drop_connect_rate
+    random_tensor = keep_prob + torch.rand(
+        (inputs.size()[0], 1, 1, 1), dtype=inputs.dtype, device=inputs.device)  # one draw per sample; 4-d shape
+    random_tensor.floor_()  # binarize: floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0
+    output = inputs.div(keep_prob) * random_tensor
+    return output
+
+
+class ChannelShuffle(nn.Module):
+    # FIXME haven't used yet
+    def __init__(self, groups):
+        super(ChannelShuffle, self).__init__()
+        self.groups = groups  # number of channel groups; forward asserts it divides the channel count
+
+    def forward(self, x):
+        """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]"""
+        N, C, H, W = x.size()
+        g = self.groups
+        assert C % g == 0, "Incompatible group size {} for input channel {}".format(
+            g, C
+        )
+        return (
+            x.view(N, g, int(C / g), H, W)
+            .permute(0, 2, 1, 3, 4)  # swap the group axis and the per-group channel axis
+            .contiguous()  # materialize the permuted layout before the final view
+            .view(N, C, H, W)
+        )
+
+
+class SqueezeExcite(nn.Module):  # channel gating: avg pool -> 1x1 reduce -> act -> 1x1 expand -> gate_fn -> scale x
+    def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None,
+                 act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1, **_):  # **_ absorbs any extra keyword args
+        super(SqueezeExcite, self).__init__()
+        self.gate_fn = gate_fn
+        reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor)  # width of the squeezed layer
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
+        self.act1 = act_layer(inplace=True)
+        self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
+
+    def forward(self, x):
+        x_se = self.avg_pool(x)  # pooled to spatial size 1x1
+        x_se = self.conv_reduce(x_se)
+        x_se = self.act1(x_se)
+        x_se = self.conv_expand(x_se)
+        x = x * self.gate_fn(x_se)  # gate is broadcast against x; builds a new tensor, x is not modified in place
+        return x
+
+
+class ConvBnAct(nn.Module):  # conv (built by select_conv2d) -> norm -> activation
+    def __init__(self, in_chs, out_chs, kernel_size,
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU,
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None):
+        super(ConvBnAct, self).__init__()
+        norm_kwargs = norm_kwargs or {}  # None -> no extra norm args
+        self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type)
+        self.bn1 = norm_layer(out_chs, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+
+    def feature_module(self, location):  # location is ignored; always names the act1 submodule
+        return 'act1'
+
+    def feature_channels(self, location):  # location is ignored
+        return self.conv.out_channels
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        return x
+
+
+class DepthwiseSeparableConv(nn.Module):
+    """ DepthwiseSeparable block
+    Used for DS convs in MobileNet-V1 and in the place of IR blocks that have no expansion
+    (factor of 1.0). This is an alternative to having a IR with an optional first pw conv.
+    """
+    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+                 pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None,
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+        super(DepthwiseSeparableConv, self).__init__()
+        norm_kwargs = norm_kwargs or {}
+        self.has_se = se_ratio is not None and se_ratio > 0.  # None or 0 disables squeeze-excite
+        self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip  # skip needs matching shapes
+        self.has_pw_act = pw_act  # activation after point-wise conv
+        self.drop_connect_rate = drop_connect_rate
+
+        self.conv_dw = select_conv2d(
+            in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, depthwise=True)
+        self.bn1 = norm_layer(in_chs, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+
+        # Squeeze-and-excitation
+        if self.has_se:
+            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+            self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs)
+
+        self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
+        self.bn2 = norm_layer(out_chs, **norm_kwargs)
+        self.act2 = act_layer(inplace=True) if self.has_pw_act else nn.Identity()  # Identity keeps forward branch-free
+
+    def feature_module(self, location):
+        # no expansion in this block, pre pw only feature extraction point
+        return 'conv_pw'
+
+    def feature_channels(self, location):  # location is ignored; channels entering conv_pw
+        return self.conv_pw.in_channels
+
+    def forward(self, x):
+        residual = x
+
+        x = self.conv_dw(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+
+        if self.has_se:
+            x = self.se(x)
+
+        x = self.conv_pw(x)
+        x = self.bn2(x)
+        x = self.act2(x)
+
+        if self.has_residual:
+            if self.drop_connect_rate > 0.:
+                x = drop_connect(x, self.training, self.drop_connect_rate)  # only applied on the residual branch
+            x += residual  # in-place add of the block input
+        return x
+
+
+class InvertedResidual(nn.Module):
+    """ Inverted residual block w/ optional SE; conv_kwargs are forwarded to every select_conv2d call"""
+
+    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
+                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+                 conv_kwargs=None, drop_connect_rate=0.):
+        super(InvertedResidual, self).__init__()
+        norm_kwargs = norm_kwargs or {}
+        conv_kwargs = conv_kwargs or {}
+        mid_chs = make_divisible(in_chs * exp_ratio)  # expanded width, rounded with the default divisor of 8
+        self.has_se = se_ratio is not None and se_ratio > 0.  # None or 0 disables squeeze-excite
+        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip  # skip needs matching shapes
+        self.drop_connect_rate = drop_connect_rate
+
+        # Point-wise expansion
+        self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
+        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+
+        # Depth-wise convolution
+        self.conv_dw = select_conv2d(
+            mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
+            padding=pad_type, depthwise=True, **conv_kwargs)
+        self.bn2 = norm_layer(mid_chs, **norm_kwargs)
+        self.act2 = act_layer(inplace=True)
+
+        # Squeeze-and-excitation
+        if self.has_se:
+            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+
+        # Point-wise linear projection
+        self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
+        self.bn3 = norm_layer(out_chs, **norm_kwargs)
+
+    def feature_module(self, location):  # name of the submodule to tap for the given feature location
+        if location == 'post_exp':
+            return 'act1'
+        return 'conv_pwl'
+
+    def feature_channels(self, location):
+        if location == 'post_exp':
+            return self.conv_pw.out_channels
+        # location == 'pre_pwl'
+        return self.conv_pwl.in_channels
+
+    def forward(self, x):
+        residual = x
+
+        # Point-wise expansion
+        x = self.conv_pw(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+
+        # Depth-wise convolution
+        x = self.conv_dw(x)
+        x = self.bn2(x)
+        x = self.act2(x)
+
+        # Squeeze-and-excitation
+        if self.has_se:
+            x = self.se(x)
+
+        # Point-wise linear projection
+        x = self.conv_pwl(x)
+        x = self.bn3(x)
+
+        if self.has_residual:
+            if self.drop_connect_rate > 0.:
+                x = drop_connect(x, self.training, self.drop_connect_rate)  # only applied on the residual branch
+            x += residual  # in-place add of the block input
+
+        return x
+
+
+class CondConvResidual(InvertedResidual):
+    """ Inverted residual block w/ CondConv routing"""
+
+    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
+                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
+                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+                 num_experts=0, drop_connect_rate=0.):
+
+        self.num_experts = num_experts  # assigned before super().__init__() because conv_kwargs below reads it
+        conv_kwargs = dict(num_experts=self.num_experts)  # passed through to every select_conv2d call in the parent
+
+        super(CondConvResidual, self).__init__(
+            in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation, pad_type=pad_type,
+            act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size,
+            pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs,
+            norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs,
+            drop_connect_rate=drop_connect_rate)
+
+        self.routing_fn = nn.Linear(in_chs, self.num_experts)  # maps pooled input channels to one weight per expert
+
+    def forward(self, x):
+        residual = x
+
+        # CondConv routing
+        pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1)  # global average pool, flattened from dim 1 onward
+        routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs))  # same weights are reused by all three convs
+
+        # Point-wise expansion
+        x = self.conv_pw(x, routing_weights)
+        x = self.bn1(x)
+        x = self.act1(x)
+
+        # Depth-wise convolution
+        x = self.conv_dw(x, routing_weights)
+        x = self.bn2(x)
+        x = self.act2(x)
+
+        # Squeeze-and-excitation
+        if self.has_se:
+            x = self.se(x)
+
+        # Point-wise linear projection
+        x = self.conv_pwl(x, routing_weights)
+        x = self.bn3(x)
+
+        if self.has_residual:
+            if self.drop_connect_rate > 0.:
+                x = drop_connect(x, self.training, self.drop_connect_rate)  # only applied on the residual branch
+            x += residual  # in-place add of the block input
+        return x
+
+
+class EdgeResidual(nn.Module):
+    """ Residual block with expansion convolution followed by pointwise-linear w/ stride"""
+
+    def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0,
+                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1,
+                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+                 drop_connect_rate=0.):
+        super(EdgeResidual, self).__init__()
+        norm_kwargs = norm_kwargs or {}
+        if fake_in_chs > 0:  # when given, the expanded width is derived from fake_in_chs instead of in_chs
+            mid_chs = make_divisible(fake_in_chs * exp_ratio)
+        else:
+            mid_chs = make_divisible(in_chs * exp_ratio)
+        self.has_se = se_ratio is not None and se_ratio > 0.  # None or 0 disables squeeze-excite
+        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip  # skip needs matching shapes
+        self.drop_connect_rate = drop_connect_rate
+
+        # Expansion convolution
+        self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
+        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+
+        # Squeeze-and-excitation
+        if self.has_se:
+            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
+            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+
+        # Point-wise linear projection
+        self.conv_pwl = select_conv2d(
+            mid_chs, out_chs, pw_kernel_size, stride=stride, dilation=dilation, padding=pad_type)
+        self.bn2 = norm_layer(out_chs, **norm_kwargs)
+
+    def feature_module(self, location):  # name of the submodule to tap for the given feature location
+        if location == 'post_exp':
+            return 'act1'
+        return 'conv_pwl'
+
+    def feature_channels(self, location):
+        if location == 'post_exp':
+            return self.conv_exp.out_channels
+        # location == 'pre_pwl'
+        return self.conv_pwl.in_channels
+
+    def forward(self, x):
+        residual = x
+
+        # Expansion convolution
+        x = self.conv_exp(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+
+        # Squeeze-and-excitation
+        if self.has_se:
+            x = self.se(x)
+
+        # Point-wise linear projection
+        x = self.conv_pwl(x)
+        x = self.bn2(x)
+
+        if self.has_residual:
+            if self.drop_connect_rate > 0.:
+                x = drop_connect(x, self.training, self.drop_connect_rate)  # only applied on the residual branch
+            x += residual  # in-place add of the block input
+
+        return x
diff --git a/timm/models/efficientnet_builder.py b/timm/models/efficientnet_builder.py
new file mode 100644
index 00000000..c2b3a801
--- /dev/null
+++ b/timm/models/efficientnet_builder.py
@@ -0,0 +1,402 @@
+import logging
+import math
+import re
+from collections.__init__ import OrderedDict
+from copy import deepcopy
+
+import torch.nn as nn
+from .activations import sigmoid, HardSwish, Swish
+from .efficientnet_blocks import *
+
+
+def _parse_ksize(ss):  # '3' -> 3, '3.5.7' -> [3, 5, 7]
+    if ss.isdigit():
+        return int(ss)
+    else:
+        return [int(k) for k in ss.split('.')]  # dotted form always yields a list of ints
+
+
+def _decode_block_str(block_str):
+    """ Decode block definition string
+
+    Builds one block arg dict (plus its repeat count) from a string notation of arguments.
+    E.g. ir_r2_k3_s2_e1_c16_se0.25_noskip
+
+    All args can exist in any order with the exception of the leading string which
+    is assumed to indicate the block type.
+
+    leading string - block type (ir = InvertedResidual, er = EdgeResidual,
+      ds = DepthwiseSep, dsa = DepthwiseSep with pw act, cn = ConvBnAct)
+    r - number of repeat blocks, k - kernel size ('3' or a dotted list such as '3.5.7'),
+    a - expansion kernel size (ir), p - point-wise kernel size, fc - fake in_chs (er),
+    s - strides (1-9), cc - number of CondConv experts (ir),
+    e - expansion ratio,
+    c - output channels,
+    se - squeeze/excitation ratio
+    n - activation fn ('re', 'r6', 'hs', or 'sw'); any other value is silently skipped
+    Args:
+        block_str: a string representation of block arguments.
+    Returns:
+        Tuple of (block args dict, number of repeats)
+    Raises:
+        AssertionError: non-str input or unknown block type. KeyError/ValueError: missing or malformed option.
+    """
+    assert isinstance(block_str, str)
+    ops = block_str.split('_')
+    block_type = ops[0]  # take the block type off the front
+    ops = ops[1:]
+    options = {}
+    noskip = False
+    for op in ops:
+        # string options being checked on individual basis, combine if they grow
+        if op == 'noskip':  # must be tested before the 'n' prefix branch, 'noskip' also starts with 'n'
+            noskip = True
+        elif op.startswith('n'):
+            # activation fn
+            key = op[0]
+            v = op[1:]
+            if v == 're':
+                value = nn.ReLU
+            elif v == 'r6':
+                value = nn.ReLU6
+            elif v == 'hs':
+                value = HardSwish
+            elif v == 'sw':
+                value = Swish
+            else:
+                continue
+            options[key] = value
+        else:
+            # all numeric options
+            splits = re.split(r'(\d.*)', op)  # e.g. 'se0.25' -> ['se', '0.25', '']; values are kept as strings
+            if len(splits) >= 2:
+                key, value = splits[:2]
+                options[key] = value
+
+    # if act_layer is None, the model default (passed to model init) will be used
+    act_layer = options['n'] if 'n' in options else None
+    exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1
+    pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1
+    fake_in_chs = int(options['fc']) if 'fc' in options else 0  # FIXME hack to deal with in_chs issue in TPU def
+
+    num_repeat = int(options['r'])  # 'r' is required for every block type
+    # each type of block has different valid arguments, fill accordingly
+    if block_type == 'ir':
+        block_args = dict(
+            block_type=block_type,
+            dw_kernel_size=_parse_ksize(options['k']),
+            exp_kernel_size=exp_kernel_size,
+            pw_kernel_size=pw_kernel_size,
+            out_chs=int(options['c']),
+            exp_ratio=float(options['e']),
+            se_ratio=float(options['se']) if 'se' in options else None,
+            stride=int(options['s']),
+            act_layer=act_layer,
+            noskip=noskip,
+        )
+        if 'cc' in options:
+            block_args['num_experts'] = int(options['cc'])
+    elif block_type == 'ds' or block_type == 'dsa':
+        block_args = dict(
+            block_type=block_type,
+            dw_kernel_size=_parse_ksize(options['k']),
+            pw_kernel_size=pw_kernel_size,
+            out_chs=int(options['c']),
+            se_ratio=float(options['se']) if 'se' in options else None,
+            stride=int(options['s']),
+            act_layer=act_layer,
+            pw_act=block_type == 'dsa',
+            noskip=block_type == 'dsa' or noskip,
+        )
+    elif block_type == 'er':
+        block_args = dict(
+            block_type=block_type,
+            exp_kernel_size=_parse_ksize(options['k']),
+            pw_kernel_size=pw_kernel_size,
+            out_chs=int(options['c']),
+            exp_ratio=float(options['e']),
+            fake_in_chs=fake_in_chs,
+            se_ratio=float(options['se']) if 'se' in options else None,
+            stride=int(options['s']),
+            act_layer=act_layer,
+            noskip=noskip,
+        )
+    elif block_type == 'cn':
+        block_args = dict(
+            block_type=block_type,
+            kernel_size=int(options['k']),
+            out_chs=int(options['c']),
+            stride=int(options['s']),
+            act_layer=act_layer,
+        )
+    else:
+        assert False, 'Unknown block type (%s)' % block_type
+
+    return block_args, num_repeat
+
+
+def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'):
+    """ Per-stage depth scaling
+    Scales the block repeats in each stage. This depth scaling impl maintains
+    compatibility with the EfficientNet scaling method, while allowing sensible
+    scaling for other models that may have multiple block arg definitions in each stage.
+    """
+
+    # We scale the total repeat count for each stage, there may be multiple
+    # block arg defs per stage so we need to sum.
+    num_repeat = sum(repeats)
+    if depth_trunc == 'round':
+        # Truncating to int by rounding allows stages with few repeats to remain
+        # proportionally smaller for longer. This is a good choice when stage definitions
+        # include single repeat stages that we'd prefer to keep that way as long as possible
+        num_repeat_scaled = max(1, round(num_repeat * depth_multiplier))
+    else:
+        # The default for EfficientNet truncates repeats to int via 'ceil'.
+        # Any multiplier > 1.0 will result in an increased depth for every stage.
+        num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier))
+
+    # Proportionally distribute repeat count scaling to each block definition in the stage.
+    # Allocation is done in reverse as it results in the first block being less likely to be scaled.
+    # The first block makes less sense to repeat in most of the arch definitions.
+    repeats_scaled = []
+    for r in repeats[::-1]:
+        rs = max(1, round((r / num_repeat * num_repeat_scaled)))  # every block def keeps at least one repeat
+        repeats_scaled.append(rs)
+        num_repeat -= r  # what remains (unscaled) for the block defs not yet visited
+        num_repeat_scaled -= rs  # what remains (scaled) for the block defs not yet visited
+    repeats_scaled = repeats_scaled[::-1]  # back to the original block order
+
+    # Apply the calculated scaling to each block arg in the stage
+    sa_scaled = []
+    for ba, rep in zip(stack_args, repeats_scaled):
+        sa_scaled.extend([deepcopy(ba) for _ in range(rep)])  # deepcopy so each repeat owns its own dict
+    return sa_scaled
+
+
+def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1):
+    arch_args = []  # one entry per stage: the list of block arg dicts returned by _scale_stage_depth
+    for stack_idx, block_strings in enumerate(arch_def):
+        assert isinstance(block_strings, list)
+        stack_args = []
+        repeats = []
+        for block_str in block_strings:
+            assert isinstance(block_str, str)
+            ba, rep = _decode_block_str(block_str)
+            if ba.get('num_experts', 0) > 0 and experts_multiplier > 1:
+                ba['num_experts'] *= experts_multiplier  # scale the expert count of CondConv blocks only
+            stack_args.append(ba)
+            repeats.append(rep)
+        arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc))
+    return arch_args
+
+
+class EfficientNetBuilder:
+    """ Build Trunk Blocks
+
+    This ended up being somewhat of a cross between
+    https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py
+    and
+    https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py
+
+    """
+    def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
+                 output_stride=32, pad_type='', act_layer=None, se_kwargs=None,
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0., feature_location='',
+                 verbose=False):
+        self.channel_multiplier = channel_multiplier
+        self.channel_divisor = channel_divisor
+        self.channel_min = channel_min
+        self.output_stride = output_stride  # strides that would exceed this are converted to dilation in __call__
+        self.pad_type = pad_type
+        self.act_layer = act_layer  # model default, used when a block does not specify its own act_layer
+        self.se_kwargs = se_kwargs
+        self.norm_layer = norm_layer
+        self.norm_kwargs = norm_kwargs
+        self.drop_connect_rate = drop_connect_rate  # maximum rate; scaled per block in _make_block
+        self.feature_location = feature_location  # '' disables feature bookkeeping
+        assert feature_location in ('pre_pwl', 'post_exp', '')
+        self.verbose = verbose
+
+        # state updated during build, consumed by model
+        self.in_chs = None
+        self.features = OrderedDict()
+
+    def _round_channels(self, chs):  # applies this builder's multiplier / divisor / minimum
+        return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min)
+
+    def _make_block(self, ba, block_idx, block_count):  # NOTE: mutates ba (pops 'block_type', adds builder-level args)
+        drop_connect_rate = self.drop_connect_rate * block_idx / block_count  # grows linearly with the block index
+        bt = ba.pop('block_type')
+        ba['in_chs'] = self.in_chs
+        ba['out_chs'] = self._round_channels(ba['out_chs'])
+        if 'fake_in_chs' in ba and ba['fake_in_chs']:
+            # FIXME this is a hack to work around mismatch in origin impl input filters
+            ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
+        ba['norm_layer'] = self.norm_layer
+        ba['norm_kwargs'] = self.norm_kwargs
+        ba['pad_type'] = self.pad_type
+        # block act fn overrides the model default
+        ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
+        assert ba['act_layer'] is not None
+        if bt == 'ir':
+            ba['drop_connect_rate'] = drop_connect_rate
+            ba['se_kwargs'] = self.se_kwargs
+            if self.verbose:
+                logging.info('  InvertedResidual {}, Args: {}'.format(block_idx, str(ba)))
+            if ba.get('num_experts', 0) > 0:  # a positive expert count selects the CondConv variant
+                block = CondConvResidual(**ba)
+            else:
+                block = InvertedResidual(**ba)
+        elif bt == 'ds' or bt == 'dsa':
+            ba['drop_connect_rate'] = drop_connect_rate
+            ba['se_kwargs'] = self.se_kwargs
+            if self.verbose:
+                logging.info('  DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)))
+            block = DepthwiseSeparableConv(**ba)
+        elif bt == 'er':
+            ba['drop_connect_rate'] = drop_connect_rate
+            ba['se_kwargs'] = self.se_kwargs
+            if self.verbose:
+                logging.info('  EdgeResidual {}, Args: {}'.format(block_idx, str(ba)))
+            block = EdgeResidual(**ba)
+        elif bt == 'cn':  # ConvBnAct receives neither drop_connect_rate nor se_kwargs
+            if self.verbose:
+                logging.info('  ConvBnAct {}, Args: {}'.format(block_idx, str(ba)))
+            block = ConvBnAct(**ba)
+        else:
+            assert False, 'Uknkown block type (%s) while building model.' % bt
+        self.in_chs = ba['out_chs']  # update in_chs for arg of next block
+
+        return block
+
+    def __call__(self, in_chs, model_block_args):
+        """ Build the blocks
+        Args:
+            in_chs: Number of input-channels passed to first block
+            model_block_args: A list of lists, outer list defines stages, inner
+                list contains block arg dicts (these dicts are modified while building)
+        Return:
+             List of block stacks (each stack wrapped in nn.Sequential)
+        """
+        if self.verbose:
+            logging.info('Building model trunk with %d stages...' % len(model_block_args))
+        self.in_chs = in_chs
+        total_block_count = sum([len(x) for x in model_block_args])
+        total_block_idx = 0
+        current_stride = 2  # presumably the stride already applied before these blocks (stem) - TODO confirm
+        current_dilation = 1
+        feature_idx = 0
+        stages = []
+        # outer list of block_args defines the stacks ('stages' by some conventions)
+        for stage_idx, stage_block_args in enumerate(model_block_args):
+            last_stack = stage_idx == (len(model_block_args) - 1)
+            if self.verbose:
+                logging.info('Stack: {}'.format(stage_idx))
+            assert isinstance(stage_block_args, list)
+
+            blocks = []
+            # each stack (stage) contains a list of block arguments
+            for block_idx, block_args in enumerate(stage_block_args):
+                last_block = block_idx == (len(stage_block_args) - 1)
+                extract_features = ''  # No features extracted
+                if self.verbose:
+                    logging.info(' Block: {}'.format(block_idx))
+
+                # Sort out stride, dilation, and feature extraction details
+                assert block_args['stride'] in (1, 2)
+                if block_idx >= 1:
+                    # only the first block in any stack can have a stride > 1
+                    block_args['stride'] = 1
+
+                do_extract = False
+                if self.feature_location == 'pre_pwl':  # tap the last block before a strided stage, or the final block
+                    if last_block:
+                        next_stage_idx = stage_idx + 1
+                        if next_stage_idx >= len(model_block_args):
+                            do_extract = True
+                        else:
+                            do_extract = model_block_args[next_stage_idx][0]['stride'] > 1
+                elif self.feature_location == 'post_exp':  # tap every strided block and the final block
+                    if block_args['stride'] > 1 or (last_stack and last_block) :
+                        do_extract = True
+                if do_extract:
+                    extract_features = self.feature_location
+
+                next_dilation = current_dilation
+                if block_args['stride'] > 1:
+                    next_output_stride = current_stride * block_args['stride']
+                    if next_output_stride > self.output_stride:
+                        next_dilation = current_dilation * block_args['stride']
+                        block_args['stride'] = 1
+                        if self.verbose:
+                            logging.info('  Converting stride to dilation to maintain output_stride=={}'.format(
+                                self.output_stride))
+                    else:
+                        current_stride = next_output_stride
+                block_args['dilation'] = current_dilation  # this block keeps the old dilation; later blocks get the new one
+                if next_dilation != current_dilation:
+                    current_dilation = next_dilation
+
+                # create the block
+                block = self._make_block(block_args, total_block_idx, total_block_count)
+                blocks.append(block)
+
+                # stash feature module name and channel info for model feature extraction
+                if extract_features:
+                    feature_module = block.feature_module(extract_features)
+                    if feature_module:
+                        feature_module = 'blocks.{}.{}.'.format(stage_idx, block_idx) + feature_module
+                    feature_channels = block.feature_channels(extract_features)
+                    self.features[feature_idx] = dict(
+                        name=feature_module,
+                        num_chs=feature_channels
+                    )
+                    feature_idx += 1
+
+                total_block_idx += 1  # incr global block idx (across all stacks)
+            stages.append(nn.Sequential(*blocks))
+        return stages
+
+
+def efficientnet_init_goog(m, n=''):  # m: module to initialize in place; n: its name, only checked for 'routing_fn'
+    # weight init as per Tensorflow Official impl
+    # https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
+    if isinstance(m, CondConv2d):
+        fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        init_weight_fn = get_condconv_initializer(
+            lambda w: w.data.normal_(0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape)
+        init_weight_fn(m.weight)
+        if m.bias is not None:
+            m.bias.data.zero_()
+    elif isinstance(m, nn.Conv2d):
+        fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))  # normal with std sqrt(2 / fan_out)
+        if m.bias is not None:
+            m.bias.data.zero_()
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1.0)
+        m.bias.data.zero_()
+    elif isinstance(m, nn.Linear):
+        fan_out = m.weight.size(0)  # fan-out
+        fan_in = 0  # fan-in only contributes for modules whose name contains 'routing_fn'
+        if 'routing_fn' in n:
+            fan_in = m.weight.size(1)
+        init_range = 1.0 / math.sqrt(fan_in + fan_out)
+        m.weight.data.uniform_(-init_range, init_range)
+        m.bias.data.zero_()  # NOTE(review): no None check here, unlike the conv branches - assumes bias is present
+
+
+def efficientnet_init_default(m, n=''):  # m: module to initialize in place; n is accepted but unused here
+    if isinstance(m, CondConv2d):
+        init_fn = get_condconv_initializer(partial(
+            nn.init.kaiming_normal_, mode='fan_out', nonlinearity='relu'), m.num_experts, m.weight_shape)
+        init_fn(m.weight)
+    elif isinstance(m, nn.Conv2d):
+        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1.0)
+        m.bias.data.zero_()
+    elif isinstance(m, nn.Linear):
+        nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='linear')  # biases are left untouched
+
+
diff --git a/timm/models/feature_hooks.py b/timm/models/feature_hooks.py
new file mode 100644
index 00000000..8ffcda86
--- /dev/null
+++ b/timm/models/feature_hooks.py
@@ -0,0 +1,49 @@
+from collections import defaultdict, OrderedDict
+from functools import partial
+
+
+class FeatureHooks:
+    """ Collect intermediate tensors from named modules via forward / forward-pre hooks.
+
+    Captured tensors are stored per device (keyed by `tensor.device`) and per hook name, and are
+    handed back, then cleared, by `get_output`.
+    """
+
+    def __init__(self, hooks, named_modules):
+        """
+        Args:
+            hooks: iterable of dicts, each with a 'name' key (a key of `named_modules`) and a
+                'type' key that is either 'forward_pre' or 'forward'
+            named_modules: iterable of (name, module) pairs
+        Raises:
+            KeyError: if a hook names a module that is not in `named_modules`
+            AssertionError: if a hook 'type' is not supported
+        """
+        # create the output store before any hook is registered
+        self._feature_outputs = defaultdict(OrderedDict)
+        modules = {k: v for k, v in named_modules}
+        for h in hooks:
+            hook_name = h['name']
+            m = modules[hook_name]
+            hook_fn = partial(self._collect_output_hook, hook_name)
+            if h['type'] == 'forward_pre':
+                m.register_forward_pre_hook(hook_fn)
+            elif h['type'] == 'forward':
+                m.register_forward_hook(hook_fn)
+            else:
+                # explicit raise (not `assert`) so the check survives `python -O`; the exception
+                # type and message are the same as the assert this replaces
+                raise AssertionError("Unsupported hook type")
+
+    def _collect_output_hook(self, name, *args):
+        """ Hook callback: stash the tensor handed over by the hook under `name` for its device. """
+        x = args[-1]  # tensor we want is last argument, output for fwd, input for fwd_pre
+        if isinstance(x, tuple):
+            x = x[0]  # unwrap input tuple
+        self._feature_outputs[x.device][name] = x
+
+    def get_output(self, device):
+        """ Return the tensors stored for `device` as a tuple (reversed storage order), then clear them. """
+        output = tuple(self._feature_outputs[device].values())[::-1]
+        self._feature_outputs[device] = OrderedDict()  # clear after reading
+        return output
diff --git a/timm/models/gen_efficientnet.py b/timm/models/gen_efficientnet.py
index c3b1b0e2..fe20ff13 100644
--- a/timm/models/gen_efficientnet.py
+++ b/timm/models/gen_efficientnet.py
@@ -7,8 +7,7 @@ A generic class with building blocks to support a variety of models with efficie
 * MixNet (Small, Medium, and Large)
 * MnasNet B1, A1 (SE), Small
 * MobileNet V1, V2, and V3
-* FBNet-C (TODO A & B)
-* ChamNet (TODO still guessing at architecture definition)
+* FBNet-C
 * Single-Path NAS Pixel1
 * And likely more...
 
@@ -16,28 +15,16 @@ TODO not all combinations and variations have been tested. Currently working on
 
 Hacked together by Ross Wightman
 """
-
-import math
-import re
-import logging
-from copy import deepcopy
-from functools import partial
-from collections import OrderedDict, defaultdict
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from timm.models.activations import Swish, sigmoid, HardSwish, hard_sigmoid
-from .registry import register_model, model_entrypoint
+from .efficientnet_builder import *
+from .feature_hooks import FeatureHooks
+from .registry import register_model
 from .helpers import load_pretrained
 from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from .conv2d_layers import select_conv2d
-from .layers import Flatten
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 
 
-__all__ = ['GenEfficientNet']
+__all__ = ['EfficientNet']
 
 
 def _cfg(url='', **kwargs):
@@ -62,14 +49,7 @@ default_cfgs = {
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_a1-d9418771.pth'),
     'semnasnet_140': _cfg(url=''),
     'mnasnet_small': _cfg(url=''),
-    'mobilenetv1_100': _cfg(url=''),
     'mobilenetv2_100': _cfg(url=''),
-    'mobilenetv3_050': _cfg(url=''),
-    'mobilenetv3_075': _cfg(url=''),
-    'mobilenetv3_100': _cfg(
-        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth'),
-    'chamnetv1_100': _cfg(url=''),
-    'chamnetv2_100': _cfg(url=''),
     'fbnetc_100': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetc_100-c345b898.pth',
         interpolation='bilinear'),
@@ -94,14 +74,14 @@ default_cfgs = {
         url='', input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942),
     'efficientnet_b7': _cfg(
         url='', input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949),
+    'efficientnet_b8': _cfg(
+        url='', input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954),
     'efficientnet_es': _cfg(
         url=''),
     'efficientnet_em': _cfg(
-        url='',
-        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+        url='', input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
     'efficientnet_el': _cfg(
-        url='',
-        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+        url='', input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
     'efficientnet_cc_b0_4e': _cfg(url=''),
     'efficientnet_cc_b0_8e': _cfg(url=''),
     'efficientnet_cc_b1_8e': _cfg(url='', input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
@@ -129,6 +109,41 @@ default_cfgs = {
     'tf_efficientnet_b7': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth',
         input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949),
+    'tf_efficientnet_b0_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, input_size=(3, 224, 224)),
+    'tf_efficientnet_b1_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ap-44ef0a3d.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+    'tf_efficientnet_b2_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ap-2f8e7636.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890),
+    'tf_efficientnet_b3_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ap-aad25bdd.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
+    'tf_efficientnet_b4_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ap-dedb23e6.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922),
+    'tf_efficientnet_b5_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ap-9e82fae8.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934),
+    'tf_efficientnet_b6_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ap-4ffb161f.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942),
+    'tf_efficientnet_b7_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ap-ddb28fec.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949),
+    'tf_efficientnet_b8_ap': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ap-00e169fa.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954),
     'tf_efficientnet_es': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth',
         mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
@@ -169,896 +184,72 @@ default_cfgs = {
 }
 
 
-_DEBUG = True
 
-# Default args for PyTorch BN impl
-_BN_MOMENTUM_PT_DEFAULT = 0.1
-_BN_EPS_PT_DEFAULT = 1e-5
-_BN_ARGS_PT = dict(momentum=_BN_MOMENTUM_PT_DEFAULT, eps=_BN_EPS_PT_DEFAULT)
-
-# Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per
-# papers and TF reference implementations. PT momentum equiv for TF decay is (1 - TF decay)
-# NOTE: momentum varies btw .99 and .9997 depending on source
-# .99 in official TF TPU impl
-# .9997 (/w .999 in search space) for paper
-_BN_MOMENTUM_TF_DEFAULT = 1 - 0.99
-_BN_EPS_TF_DEFAULT = 1e-3
-_BN_ARGS_TF = dict(momentum=_BN_MOMENTUM_TF_DEFAULT, eps=_BN_EPS_TF_DEFAULT)
+_DEBUG = False
 
 
-def _resolve_bn_args(kwargs):
-    bn_args = _BN_ARGS_TF.copy() if kwargs.pop('bn_tf', False) else _BN_ARGS_PT.copy()
-    bn_momentum = kwargs.pop('bn_momentum', None)
-    if bn_momentum is not None:
-        bn_args['momentum'] = bn_momentum
-    bn_eps = kwargs.pop('bn_eps', None)
-    if bn_eps is not None:
-        bn_args['eps'] = bn_eps
-    return bn_args
+class EfficientNet(nn.Module):
+    """ (Generic) EfficientNet
 
+    A flexible and performant PyTorch implementation of efficient network architectures, including:
+      * EfficientNet B0-B8
+      * EfficientNet-EdgeTPU
+      * EfficientNet-CondConv
+      * MixNet S, M, L, XL
+      * MnasNet A1, B1, and small
+      * FBNet C
+      * Single-Path NAS Pixel1
 
-def _round_channels(channels, multiplier=1.0, divisor=8, channel_min=None):
-    """Round number of filters based on depth multiplier."""
-    if not multiplier:
-        return channels
-
-    channels *= multiplier
-    channel_min = channel_min or divisor
-    new_channels = max(
-        int(channels + divisor / 2) // divisor * divisor,
-        channel_min)
-    # Make sure that round down does not go down by more than 10%.
-    if new_channels < 0.9 * channels:
-        new_channels += divisor
-    return new_channels
-
-
-def _parse_ksize(ss):
-    if ss.isdigit():
-        return int(ss)
-    else:
-        return [int(k) for k in ss.split('.')]
-
-
-def _decode_block_str(block_str):
-    """ Decode block definition string
-
-    Gets a list of block arg (dicts) through a string notation of arguments.
-    E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip
-
-    All args can exist in any order with the exception of the leading string which
-    is assumed to indicate the block type.
-
-    leading string - block type (
-      ir = InvertedResidual, ds = DepthwiseSep, dsa = DeptwhiseSep with pw act, cn = ConvBnAct)
-    r - number of repeat blocks,
-    k - kernel size,
-    s - strides (1-9),
-    e - expansion ratio,
-    c - output channels,
-    se - squeeze/excitation ratio
-    n - activation fn ('re', 'r6', 'hs', or 'sw')
-    Args:
-        block_str: a string representation of block arguments.
-    Returns:
-        A list of block args (dicts)
-    Raises:
-        ValueError: if the string def not properly specified (TODO)
-    """
-    assert isinstance(block_str, str)
-    ops = block_str.split('_')
-    block_type = ops[0]  # take the block type off the front
-    ops = ops[1:]
-    options = {}
-    noskip = False
-    for op in ops:
-        # string options being checked on individual basis, combine if they grow
-        if op == 'noskip':
-            noskip = True
-        elif op.startswith('n'):
-            # activation fn
-            key = op[0]
-            v = op[1:]
-            if v == 're':
-                value = nn.ReLU
-            elif v == 'r6':
-                value = nn.ReLU6
-            elif v == 'hs':
-                value = HardSwish
-            elif v == 'sw':
-                value = Swish
-            else:
-                continue
-            options[key] = value
-        else:
-            # all numeric options
-            splits = re.split(r'(\d.*)', op)
-            if len(splits) >= 2:
-                key, value = splits[:2]
-                options[key] = value
-
-    # if act_layer is None, the model default (passed to model init) will be used
-    act_layer = options['n'] if 'n' in options else None
-    exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1
-    pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1
-    fake_in_chs = int(options['fc']) if 'fc' in options else 0  # FIXME hack to deal with in_chs issue in TPU def
-
-    num_repeat = int(options['r'])
-    # each type of block has different valid arguments, fill accordingly
-    if block_type == 'ir':
-        block_args = dict(
-            block_type=block_type,
-            dw_kernel_size=_parse_ksize(options['k']),
-            exp_kernel_size=exp_kernel_size,
-            pw_kernel_size=pw_kernel_size,
-            out_chs=int(options['c']),
-            exp_ratio=float(options['e']),
-            se_ratio=float(options['se']) if 'se' in options else None,
-            stride=int(options['s']),
-            act_layer=act_layer,
-            noskip=noskip,
-            num_experts=int(options['cc']) if 'cc' in options else 0
-        )
-    elif block_type == 'ds' or block_type == 'dsa':
-        block_args = dict(
-            block_type=block_type,
-            dw_kernel_size=_parse_ksize(options['k']),
-            pw_kernel_size=pw_kernel_size,
-            out_chs=int(options['c']),
-            se_ratio=float(options['se']) if 'se' in options else None,
-            stride=int(options['s']),
-            act_layer=act_layer,
-            pw_act=block_type == 'dsa',
-            noskip=block_type == 'dsa' or noskip,
-        )
-    elif block_type == 'er':
-        block_args = dict(
-            block_type=block_type,
-            exp_kernel_size=_parse_ksize(options['k']),
-            pw_kernel_size=pw_kernel_size,
-            out_chs=int(options['c']),
-            exp_ratio=float(options['e']),
-            fake_in_chs=fake_in_chs,
-            se_ratio=float(options['se']) if 'se' in options else None,
-            stride=int(options['s']),
-            act_layer=act_layer,
-            noskip=noskip,
-        )
-    elif block_type == 'cn':
-        block_args = dict(
-            block_type=block_type,
-            kernel_size=int(options['k']),
-            out_chs=int(options['c']),
-            stride=int(options['s']),
-            act_layer=act_layer,
-        )
-    else:
-        assert False, 'Unknown block type (%s)' % block_type
-
-    return block_args, num_repeat
-
-
-def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'):
-    """ Per-stage depth scaling
-    Scales the block repeats in each stage. This depth scaling impl maintains
-    compatibility with the EfficientNet scaling method, while allowing sensible
-    scaling for other models that may have multiple block arg definitions in each stage.
     """
 
-    # We scale the total repeat count for each stage, there may be multiple
-    # block arg defs per stage so we need to sum.
-    num_repeat = sum(repeats)
-    if depth_trunc == 'round':
-        # Truncating to int by rounding allows stages with few repeats to remain
-        # proportionally smaller for longer. This is a good choice when stage definitions
-        # include single repeat stages that we'd prefer to keep that way as long as possible
-        num_repeat_scaled = max(1, round(num_repeat * depth_multiplier))
-    else:
-        # The default for EfficientNet truncates repeats to int via 'ceil'.
-        # Any multiplier > 1.0 will result in an increased depth for every stage.
-        num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier))
-
-    # Proportionally distribute repeat count scaling to each block definition in the stage.
-    # Allocation is done in reverse as it results in the first block being less likely to be scaled.
-    # The first block makes less sense to repeat in most of the arch definitions.
-    repeats_scaled = []
-    for r in repeats[::-1]:
-        rs = max(1, round((r / num_repeat * num_repeat_scaled)))
-        repeats_scaled.append(rs)
-        num_repeat -= r
-        num_repeat_scaled -= rs
-    repeats_scaled = repeats_scaled[::-1]
-
-    # Apply the calculated scaling to each block arg in the stage
-    sa_scaled = []
-    for ba, rep in zip(stack_args, repeats_scaled):
-        sa_scaled.extend([deepcopy(ba) for _ in range(rep)])
-    return sa_scaled
-
-
-def _decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1):
-    arch_args = []
-    for stack_idx, block_strings in enumerate(arch_def):
-        assert isinstance(block_strings, list)
-        stack_args = []
-        repeats = []
-        for block_str in block_strings:
-            assert isinstance(block_str, str)
-            ba, rep = _decode_block_str(block_str)
-            if ba.get('num_experts', 0) > 0 and experts_multiplier > 1:
-                ba['num_experts'] *= experts_multiplier
-            stack_args.append(ba)
-            repeats.append(rep)
-        arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc))
-    return arch_args
-
-
-_USE_SWISH_OPT = True
-if _USE_SWISH_OPT:
-    @torch.jit.script
-    def swish_jit_fwd(x):
-        return x.mul(torch.sigmoid(x))
-
-
-    @torch.jit.script
-    def swish_jit_bwd(x, grad_output):
-        x_sigmoid = torch.sigmoid(x)
-        return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
-
-
-    class SwishJitAutoFn(torch.autograd.Function):
-        """ torch.jit.script optimised Swish
-        Inspired by conversation btw Jeremy Howard & Adam Pazske
-        https://twitter.com/jeremyphoward/status/1188251041835315200
-        """
-
-        @staticmethod
-        def forward(ctx, x):
-            ctx.save_for_backward(x)
-            return swish_jit_fwd(x)
-
-        @staticmethod
-        def backward(ctx, grad_output):
-            x = ctx.saved_tensors[0]
-            return swish_jit_bwd(x, grad_output)
-
-
-    def swish(x, inplace=False):
-        # inplace ignored
-        return SwishJitAutoFn.apply(x)
-else:
-    def swish(x, inplace=False):
-        return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
-
-
-def sigmoid(x, inplace=False):
-    return x.sigmoid_() if inplace else x.sigmoid()
-
-
-def hard_swish(x, inplace=False):
-    if inplace:
-        return x.mul_(F.relu6(x + 3.) / 6.)
-    else:
-        return x * F.relu6(x + 3.) / 6.
-
-
-def hard_sigmoid(x, inplace=False):
-    if inplace:
-        return x.add_(3.).clamp_(0., 6.).div_(6.)
-    else:
-        return F.relu6(x + 3.) / 6.
-
-
-class _BlockBuilder:
-    """ Build Trunk Blocks
-
-    This ended up being somewhat of a cross between
-    https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py
-    and
-    https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py
-
-    """
-    def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 output_stride=32, pad_type='', act_layer=None, se_gate_fn=sigmoid, se_reduce_mid=False,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT, drop_connect_rate=0., feature_location='',
-                 verbose=False):
-        self.channel_multiplier = channel_multiplier
-        self.channel_divisor = channel_divisor
-        self.channel_min = channel_min
-        self.output_stride = output_stride
-        self.pad_type = pad_type
-        self.act_layer = act_layer
-        self.se_gate_fn = se_gate_fn
-        self.se_reduce_mid = se_reduce_mid
-        self.norm_layer = norm_layer
-        self.norm_kwargs = norm_kwargs
-        self.drop_connect_rate = drop_connect_rate
-        self.feature_location = feature_location
-        assert feature_location in ('pre_pwl', 'post_exp', '')
-        self.verbose = verbose
-
-        # state updated during build, consumed by model
-        self.in_chs = None
-        self.features = OrderedDict()
-
-    def _round_channels(self, chs):
-        return _round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min)
-
-    def _make_block(self, ba, block_idx, block_count):
-        drop_connect_rate = self.drop_connect_rate * block_idx / block_count
-        bt = ba.pop('block_type')
-        ba['in_chs'] = self.in_chs
-        ba['out_chs'] = self._round_channels(ba['out_chs'])
-        if 'fake_in_chs' in ba and ba['fake_in_chs']:
-            # FIXME this is a hack to work around mismatch in origin impl input filters
-            ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
-        ba['norm_layer'] = self.norm_layer
-        ba['norm_kwargs'] = self.norm_kwargs
-        ba['pad_type'] = self.pad_type
-        # block act fn overrides the model default
-        ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
-        assert ba['act_layer'] is not None
-        if bt == 'ir':
-            ba['drop_connect_rate'] = drop_connect_rate
-            ba['se_gate_fn'] = self.se_gate_fn
-            ba['se_reduce_mid'] = self.se_reduce_mid
-            if self.verbose:
-                logging.info('  InvertedResidual {}, Args: {}'.format(block_idx, str(ba)))
-            block = InvertedResidual(**ba)
-        elif bt == 'ds' or bt == 'dsa':
-            ba['drop_connect_rate'] = drop_connect_rate
-            if self.verbose:
-                logging.info('  DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)))
-            block = DepthwiseSeparableConv(**ba)
-        elif bt == 'er':
-            ba['drop_connect_rate'] = drop_connect_rate
-            ba['se_gate_fn'] = self.se_gate_fn
-            ba['se_reduce_mid'] = self.se_reduce_mid
-            if self.verbose:
-                logging.info('  EdgeResidual {}, Args: {}'.format(block_idx, str(ba)))
-            block = EdgeResidual(**ba)
-        elif bt == 'cn':
-            if self.verbose:
-                logging.info('  ConvBnAct {}, Args: {}'.format(block_idx, str(ba)))
-            block = ConvBnAct(**ba)
-        else:
-            assert False, 'Uknkown block type (%s) while building model.' % bt
-        self.in_chs = ba['out_chs']  # update in_chs for arg of next block
-
-        return block
-
-    def __call__(self, in_chs, model_block_args):
-        """ Build the blocks
-        Args:
-            in_chs: Number of input-channels passed to first block
-            model_block_args: A list of lists, outer list defines stages, inner
-                list contains strings defining block configuration(s)
-        Return:
-             List of block stacks (each stack wrapped in nn.Sequential)
-        """
-        if self.verbose:
-            logging.info('Building model trunk with %d stages...' % len(model_block_args))
-        self.in_chs = in_chs
-        total_block_count = sum([len(x) for x in model_block_args])
-        total_block_idx = 0
-        current_stride = 2
-        current_dilation = 1
-        feature_idx = 0
-        stages = []
-        # outer list of block_args defines the stacks ('stages' by some conventions)
-        for stage_idx, stage_block_args in enumerate(model_block_args):
-            last_stack = stage_idx == (len(model_block_args) - 1)
-            if self.verbose:
-                logging.info('Stack: {}'.format(stage_idx))
-            assert isinstance(stage_block_args, list)
-
-            blocks = []
-            # each stack (stage) contains a list of block arguments
-            for block_idx, block_args in enumerate(stage_block_args):
-                last_block = block_idx == (len(stage_block_args) - 1)
-                extract_features = ''  # No features extracted
-                if self.verbose:
-                    logging.info(' Block: {}'.format(block_idx))
-
-                # Sort out stride, dilation, and feature extraction details
-                assert block_args['stride'] in (1, 2)
-                if block_idx >= 1:
-                    # only the first block in any stack can have a stride > 1
-                    block_args['stride'] = 1
-
-                do_extract = False
-                if self.feature_location == 'pre_pwl':
-                    if last_block:
-                        next_stage_idx = stage_idx + 1
-                        if next_stage_idx >= len(model_block_args):
-                            do_extract = True
-                        else:
-                            do_extract = model_block_args[next_stage_idx][0]['stride'] > 1
-                elif self.feature_location == 'post_exp':
-                    if block_args['stride'] > 1 or (last_stack and last_block) :
-                        do_extract = True
-                if do_extract:
-                    extract_features = self.feature_location
-
-                next_dilation = current_dilation
-                if block_args['stride'] > 1:
-                    next_output_stride = current_stride * block_args['stride']
-                    if next_output_stride > self.output_stride:
-                        next_dilation = current_dilation * block_args['stride']
-                        block_args['stride'] = 1
-                        if self.verbose:
-                            logging.info('  Converting stride to dilation to maintain output_stride=={}'.format(
-                                self.output_stride))
-                    else:
-                        current_stride = next_output_stride
-                block_args['dilation'] = current_dilation
-                if next_dilation != current_dilation:
-                    current_dilation = next_dilation
-
-                # create the block
-                block = self._make_block(block_args, total_block_idx, total_block_count)
-                blocks.append(block)
-
-                # stash feature module name and channel info for model feature extraction
-                if extract_features:
-                    feature_module = block.feature_module(extract_features)
-                    if feature_module:
-                        feature_module = 'blocks.{}.{}.'.format(stage_idx, block_idx) + feature_module
-                    feature_channels = block.feature_channels(extract_features)
-                    self.features[feature_idx] = dict(
-                        name=feature_module,
-                        num_chs=feature_channels
-                    )
-                    feature_idx += 1
-
-                total_block_idx += 1  # incr global block idx (across all stacks)
-            stages.append(nn.Sequential(*blocks))
-        return stages
-
-
-def _init_weight_goog(m):
-    # weight init as per Tensorflow Official impl
-    # https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
-    if isinstance(m, nn.Conv2d):
-        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels  # fan-out
-        m.weight.data.normal_(0, math.sqrt(2.0 / n))
-        if m.bias is not None:
-            m.bias.data.zero_()
-    elif isinstance(m, nn.BatchNorm2d):
-        m.weight.data.fill_(1.0)
-        m.bias.data.zero_()
-    elif isinstance(m, nn.Linear):
-        n = m.weight.size(0)  # fan-out
-        init_range = 1.0 / math.sqrt(n)
-        m.weight.data.uniform_(-init_range, init_range)
-        m.bias.data.zero_()
-
-
-def _init_weight_default(m):
-    if isinstance(m, nn.Conv2d):
-        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-    elif isinstance(m, nn.BatchNorm2d):
-        m.weight.data.fill_(1.0)
-        m.bias.data.zero_()
-    elif isinstance(m, nn.Linear):
-        nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='linear')
-
-
-def drop_connect(inputs, training=False, drop_connect_rate=0.):
-    """Apply drop connect."""
-    if not training:
-        return inputs
-
-    keep_prob = 1 - drop_connect_rate
-    random_tensor = keep_prob + torch.rand(
-        (inputs.size()[0], 1, 1, 1), dtype=inputs.dtype, device=inputs.device)
-    random_tensor.floor_()  # binarize
-    output = inputs.div(keep_prob) * random_tensor
-    return output
-
-
-class ChannelShuffle(nn.Module):
-    # FIXME haven't used yet
-    def __init__(self, groups):
-        super(ChannelShuffle, self).__init__()
-        self.groups = groups
-
-    def forward(self, x):
-        """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]"""
-        N, C, H, W = x.size()
-        g = self.groups
-        assert C % g == 0, "Incompatible group size {} for input channel {}".format(
-            g, C
-        )
-        return (
-            x.view(N, g, int(C / g), H, W)
-            .permute(0, 2, 1, 3, 4)
-            .contiguous()
-            .view(N, C, H, W)
-        )
-
-
-class SqueezeExcite(nn.Module):
-    def __init__(self, in_chs, reduce_chs=None, act_layer=nn.ReLU, gate_fn=sigmoid):
-        super(SqueezeExcite, self).__init__()
-        self.gate_fn = gate_fn
-        reduced_chs = reduce_chs or in_chs
-        self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
-        self.act1 = act_layer(inplace=True)
-        self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
-
-    def forward(self, x):
-        # NOTE adaptiveavgpool can be used here, but seems to cause issues with NVIDIA AMP performance
-        x_se = x.view(x.size(0), x.size(1), -1).mean(-1).view(x.size(0), x.size(1), 1, 1)
-        x_se = self.conv_reduce(x_se)
-        x_se = self.act1(x_se)
-        x_se = self.conv_expand(x_se)
-        x = x * self.gate_fn(x_se)
-        return x
-
-
-class ConvBnAct(nn.Module):
-    def __init__(self, in_chs, out_chs, kernel_size,
-                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,):
-        super(ConvBnAct, self).__init__()
-        assert stride in [1, 2]
-        self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type)
-        self.bn1 = norm_layer(out_chs, **norm_kwargs)
-        self.act1 = act_layer(inplace=True)
-
-    def feature_module(self, location):
-        return 'act1'
-
-    def feature_channels(self, location):
-        return self.conv.out_channels
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.bn1(x)
-        x = self.act1(x)
-        return x
-
-
-class EdgeResidual(nn.Module):
-    """ Residual block with expansion convolution followed by pointwise-linear w/ stride"""
-
-    def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0,
-                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1,
-                 se_ratio=0., se_reduce_mid=False, se_gate_fn=sigmoid,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT, drop_connect_rate=0.):
-        super(EdgeResidual, self).__init__()
-        mid_chs = int(fake_in_chs * exp_ratio) if fake_in_chs > 0 else int(in_chs * exp_ratio)
-        self.has_se = se_ratio is not None and se_ratio > 0.
-        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.drop_connect_rate = drop_connect_rate
-
-        # Expansion convolution
-        self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
-        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
-        self.act1 = act_layer(inplace=True)
-
-        # Squeeze-and-excitation
-        if self.has_se:
-            se_base_chs = mid_chs if se_reduce_mid else in_chs
-            self.se = SqueezeExcite(
-                mid_chs, reduce_chs=max(1, int(se_base_chs * se_ratio)), act_layer=act_layer, gate_fn=se_gate_fn)
-
-        # Point-wise linear projection
-        self.conv_pwl = select_conv2d(
-            mid_chs, out_chs, pw_kernel_size, stride=stride, dilation=dilation, padding=pad_type)
-        self.bn2 = norm_layer(out_chs, **norm_kwargs)
-
-    def feature_module(self, location):
-        if location == 'post_exp':
-            return 'act1'
-        return 'conv_pwl'
-
-    def feature_channels(self, location):
-        if location == 'post_exp':
-            return self.conv_exp.out_channels
-        # location == 'pre_pw'
-        return self.conv_pwl.in_channels
-
-    def forward(self, x):
-        residual = x
-
-        # Expansion convolution
-        x = self.conv_exp(x)
-        x = self.bn1(x)
-        x = self.act1(x)
-
-        # Squeeze-and-excitation
-        if self.has_se:
-            x = self.se(x)
-
-        # Point-wise linear projection
-        x = self.conv_pwl(x)
-        x = self.bn2(x)
-
-        if self.has_residual:
-            if self.drop_connect_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
-            x += residual
-
-        return x
-
-
-class DepthwiseSeparableConv(nn.Module):
-    """ DepthwiseSeparable block
-    Used for DS convs in MobileNet-V1 and in the place of IR blocks with an expansion
-    factor of 1.0. This is an alternative to having a IR with an optional first pw conv.
-    """
-    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
-                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
-                 pw_kernel_size=1, pw_act=False, se_ratio=0., se_gate_fn=sigmoid,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT, drop_connect_rate=0.):
-        super(DepthwiseSeparableConv, self).__init__()
-        assert stride in [1, 2]
-        self.has_se = se_ratio is not None and se_ratio > 0.
-        self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
-        self.has_pw_act = pw_act  # activation after point-wise conv
-        self.drop_connect_rate = drop_connect_rate
-
-        self.conv_dw = select_conv2d(
-            in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, depthwise=True)
-        self.bn1 = norm_layer(in_chs, **norm_kwargs)
-        self.act1 = act_layer(inplace=True)
-
-        # Squeeze-and-excitation
-        if self.has_se:
-            self.se = SqueezeExcite(
-                in_chs, reduce_chs=max(1, int(in_chs * se_ratio)), act_layer=act_layer, gate_fn=se_gate_fn)
-
-        self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
-        self.bn2 = norm_layer(out_chs, **norm_kwargs)
-        self.act2 = act_layer(inplace=True) if self.has_pw_act else nn.Identity()
-
-    def feature_module(self, location):
-        # no expansion in this block, pre pw only feature extraction point
-        return 'conv_pw'
-
-    def feature_channels(self, location):
-        return self.conv_pw.in_channels
-
-    def forward(self, x):
-        residual = x
-
-        x = self.conv_dw(x)
-        x = self.bn1(x)
-        x = self.act1(x)
-
-        if self.has_se:
-            x = self.se(x)
-
-        x = self.conv_pw(x)
-        x = self.bn2(x)
-        x = self.act2(x)
-
-        if self.has_residual:
-            if self.drop_connect_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
-            x += residual
-        return x
-
-
-class InvertedResidual(nn.Module):
-    """ Inverted residual block w/ optional SE and CondConv routing"""
-
-    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
-                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
-                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
-                 se_ratio=0., se_reduce_mid=False, se_gate_fn=sigmoid,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,
-                 num_experts=0, drop_connect_rate=0.):
-        super(InvertedResidual, self).__init__()
-        mid_chs = int(in_chs * exp_ratio)
-        self.has_se = se_ratio is not None and se_ratio > 0.
-        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.drop_connect_rate = drop_connect_rate
-
-        self.num_experts = num_experts
-        extra_args = dict()
-        if num_experts > 0:
-            extra_args = dict(num_experts=self.num_experts)
-            self.routing_fn = nn.Linear(in_chs, self.num_experts)
-            self.routing_act = torch.sigmoid
-
-        # Point-wise expansion
-        self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **extra_args)
-        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
-        self.act1 = act_layer(inplace=True)
-
-        # Depth-wise convolution
-        self.conv_dw = select_conv2d(
-            mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
-            padding=pad_type, depthwise=True, **extra_args)
-        self.bn2 = norm_layer(mid_chs, **norm_kwargs)
-        self.act2 = act_layer(inplace=True)
-
-        # Squeeze-and-excitation
-        if self.has_se:
-            se_base_chs = mid_chs if se_reduce_mid else in_chs
-            self.se = SqueezeExcite(
-                mid_chs, reduce_chs=max(1, int(se_base_chs * se_ratio)), act_layer=act_layer, gate_fn=se_gate_fn)
-
-        # Point-wise linear projection
-        self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **extra_args)
-        self.bn3 = norm_layer(out_chs, **norm_kwargs)
-
-    def feature_module(self, location):
-        if location == 'post_exp':
-            return 'act1'
-        return 'conv_pwl'
-
-    def feature_channels(self, location):
-        if location == 'post_exp':
-            return self.conv_pw.out_channels
-        # location == 'pre_pw'
-        return self.conv_pwl.in_channels
-
-    def forward(self, x):
-        residual = x
-
-        conv_pw, conv_dw, conv_pwl = self.conv_pw, self.conv_dw, self.conv_pwl
-        if self.num_experts > 0:
-            pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1)
-            routing_weights = self.routing_act(self.routing_fn(pooled_inputs))
-            conv_pw = partial(self.conv_pw, routing_weights=routing_weights)
-            conv_dw = partial(self.conv_dw, routing_weights=routing_weights)
-            conv_pwl = partial(self.conv_pwl, routing_weights=routing_weights)
-
-        # Point-wise expansion
-        x = conv_pw(x)
-        x = self.bn1(x)
-        x = self.act1(x)
-
-        # Depth-wise convolution
-        x = conv_dw(x)
-        x = self.bn2(x)
-        x = self.act2(x)
-
-        # Squeeze-and-excitation
-        if self.has_se:
-            x = self.se(x)
-
-        # Point-wise linear projection
-        x = conv_pwl(x)
-        x = self.bn3(x)
-
-        if self.has_residual:
-            if self.drop_connect_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
-            x += residual
-
-        return x
-
-
-class _GenEfficientNet(nn.Module):
-    """ Generic EfficientNet Base
-    """
-
-    def __init__(self, block_args, in_chans=3, stem_size=32,
+    def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32,
                  channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
-                 se_gate_fn=sigmoid, se_reduce_mid=False, norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,
-                 feature_location='pre_pwl'):
-        super(_GenEfficientNet, self).__init__()
+                 pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+                 global_pool='avg', weight_init='goog'):
+        super(EfficientNet, self).__init__()
+        norm_kwargs = norm_kwargs or {}
+
+        self.num_classes = num_classes
+        self.num_features = num_features
         self.drop_rate = drop_rate
         self._in_chs = in_chans
 
         # Stem
-        stem_size = _round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
+        stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
         self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
         self.bn1 = norm_layer(stem_size, **norm_kwargs)
         self.act1 = act_layer(inplace=True)
         self._in_chs = stem_size
 
         # Middle stages (IR/ER/DS Blocks)
-        builder = _BlockBuilder(
-            channel_multiplier, channel_divisor, channel_min,
-            output_stride, pad_type, act_layer, se_gate_fn, se_reduce_mid,
-            norm_layer, norm_kwargs, drop_connect_rate, feature_location=feature_location, verbose=_DEBUG)
+        builder = EfficientNetBuilder(
+            channel_multiplier, channel_divisor, channel_min, 32, pad_type, act_layer, se_kwargs,
+            norm_layer, norm_kwargs, drop_connect_rate, verbose=_DEBUG)
         self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
         self.feature_info = builder.features
         self._in_chs = builder.in_chs
 
-    def as_sequential(self):
-        layers = [self.conv_stem, self.bn1, self.act1]
-        layers.extend(self.blocks)
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        x = self.conv_stem(x)
-        x = self.bn1(x)
-        x = self.act1(x)
-        x = self.blocks(x)
-        return x
-
-
-class GenEfficientNet(_GenEfficientNet):
-    """ Generic EfficientNet
-
-    An implementation of efficient network architectures, in many cases mobile optimized networks:
-      * MobileNet-V1
-      * MobileNet-V2
-      * MobileNet-V3
-      * MnasNet A1, B1, and small
-      * FBNet A, B, and C
-      * ChamNet (arch details are murky)
-      * Single-Path NAS Pixel1
-      * EfficientNet B0-B7
-      * MixNet S, M, L
-    """
-
-    def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32,
-                 channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
-                 se_gate_fn=sigmoid, se_reduce_mid=False,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,
-                 global_pool='avg', head_conv='default', weight_init='goog'):
-
-        self.num_classes = num_classes
-        self.num_features = num_features
-        super(GenEfficientNet, self).__init__(  # FIXME it would be nice if Python made this nicer
-            block_args, in_chans=in_chans, stem_size=stem_size,
-            pad_type=pad_type, act_layer=act_layer, drop_rate=drop_rate, drop_connect_rate=drop_connect_rate,
-            channel_multiplier=channel_multiplier, channel_divisor=channel_divisor, channel_min=channel_min,
-            se_gate_fn=se_gate_fn, se_reduce_mid=se_reduce_mid, norm_layer=norm_layer, norm_kwargs=norm_kwargs)
-
         # Head + Pooling
-        self.conv_head = None
-        self.global_pool = None
-        self.act2 = None
-        self.forward_head = None
-        self.head_conv = head_conv
-        if head_conv == 'efficient':
-            self._create_head_efficient(global_pool, pad_type, act_layer)
-        elif head_conv == 'default':
-            self._create_head_default(global_pool, pad_type, act_layer, norm_layer, norm_kwargs)
+        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
+        self.bn2 = norm_layer(self.num_features, **norm_kwargs)
+        self.act2 = act_layer(inplace=True)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
 
         # Classifier
         self.classifier = nn.Linear(self.num_features * self.global_pool.feat_mult(), self.num_classes)
 
         for m in self.modules():
             if weight_init == 'goog':
-                _init_weight_goog(m)
+                efficientnet_init_goog(m)
             else:
-                _init_weight_default(m)
-
-    def _create_head_default(self, global_pool, pad_type, act_layer, norm_layer, norm_kwargs):
-        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
-        self.bn2 = norm_layer(self.num_features, **norm_kwargs)
-        self.act2 = act_layer(inplace=True)
-        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-
-    def _create_head_efficient(self, global_pool, pad_type, act_layer):
-        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
-        self.act2 = act_layer(inplace=True)
-
-    def _forward_head_default(self, x):
-        x = self.conv_head(x)
-        x = self.bn2(x)
-        x = self.act2(x)
-        return x
-
-    def _forward_head_efficient(self, x):
-        x = self.global_pool(x)
-        x = self.conv_head(x)
-        x = self.act2(x)
-        return x
+                efficientnet_init_default(m)
 
     def as_sequential(self):
         layers = [self.conv_stem, self.bn1, self.act1]
         layers.extend(self.blocks)
-        if self.head_conv == 'efficient':
-            layers.extend([self.global_pool, self.conv_head, self.act2])
-        else:
-            layers.extend([self.conv_head, self.bn2, self.act2])
-            if self.global_pool is not None:
-                layers.append(self.global_pool)
-        layers.extend([Flatten(), nn.Dropout(self.drop_rate), self.classifier])
+        layers.extend([self.conv_head, self.bn2, self.act2, self.global_pool])
+        layers.extend([nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
         return nn.Sequential(*layers)
 
     def get_classifier(self):
@@ -1075,86 +266,80 @@ class GenEfficientNet(_GenEfficientNet):
             self.classifier = None
 
     def forward_features(self, x):
-        x = super(GenEfficientNet, self).forward(x)
-        if self.head_conv == 'efficient':
-            x = self._forward_head_efficient(x)
-        elif self.head_conv == 'default':
-            x = self._forward_head_default(x)
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        x = self.blocks(x)
+        x = self.conv_head(x)
+        x = self.bn2(x)
+        x = self.act2(x)
         return x
 
     def forward(self, x):
         x = self.forward_features(x)
-        if self.global_pool is not None and x.shape[-1] > 1 or x.shape[-2] > 1:
-            x = self.global_pool(x)
+        x = self.global_pool(x)
         x = x.flatten(1)
         if self.drop_rate > 0.:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         return self.classifier(x)
 
 
-class GenEfficientNetFeatures(_GenEfficientNet):
-    """ Generic EfficientNet Feature Extractor
+class EfficientNetFeatures(nn.Module):
+    """ EfficientNet Feature Extractor
+
+    A work-in-progress feature extraction module for EfficientNet, to use as a backbone for segmentation
+    and object detection models.
     """
 
     def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                  in_chans=3, stem_size=32, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
                  output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
-                 se_gate_fn=sigmoid, se_reduce_mid=False, norm_layer=nn.BatchNorm2d, norm_kwargs=_BN_ARGS_PT,
-                 weight_init='goog'):
+                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, weight_init='goog'):
+        super(EfficientNetFeatures, self).__init__()
+        norm_kwargs = norm_kwargs or {}
+
+        # TODO only create stages needed, currently all stages are created regardless of out_indices
+        num_stages = max(out_indices) + 1
 
-        # validate and modify block arguments and out indices for feature extraction
-        num_stages = max(out_indices) + 1  # FIXME reduce num stages created if not needed
-        #assert len(block_args) >= num_stages - 1
-        #block_args = block_args[:num_stages - 1]
         self.out_indices = out_indices
+        self.drop_rate = drop_rate
+        self._in_chs = in_chans
 
-        # FIXME it would be nice if Python made this nicer without using kwargs and erasing IDE hints, etc
-        super(GenEfficientNetFeatures, self).__init__(
-            block_args, in_chans=in_chans, stem_size=stem_size,
-            output_stride=output_stride, pad_type=pad_type, act_layer=act_layer,
-            drop_rate=drop_rate, drop_connect_rate=drop_connect_rate, feature_location=feature_location,
-            channel_multiplier=channel_multiplier, channel_divisor=channel_divisor, channel_min=channel_min,
-            se_gate_fn=se_gate_fn, se_reduce_mid=se_reduce_mid, norm_layer=norm_layer, norm_kwargs=norm_kwargs)
+        # Stem
+        stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
+        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.bn1 = norm_layer(stem_size, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+        self._in_chs = stem_size
+
+        # Middle stages (IR/ER/DS Blocks)
+        builder = EfficientNetBuilder(
+            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs,
+            norm_layer, norm_kwargs, drop_connect_rate, feature_location=feature_location, verbose=_DEBUG)
+        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
+        self.feature_info = builder.features  # builder provides info about feature channels for each block
+        self._in_chs = builder.in_chs
 
         for m in self.modules():
             if weight_init == 'goog':
-                _init_weight_goog(m)
+                efficientnet_init_goog(m)
             else:
-                _init_weight_default(m)
+                efficientnet_init_default(m)
 
         if _DEBUG:
             for k, v in self.feature_info.items():
                 print('Feature idx: {}: Name: {}, Channels: {}'.format(k, v['name'], v['num_chs']))
+
+        # Register feature extraction hooks with FeatureHooks helper
         hook_type = 'forward_pre' if feature_location == 'pre_pwl' else 'forward'
         hooks = [dict(name=self.feature_info[idx]['name'], type=hook_type) for idx in out_indices]
-        self._feature_outputs = None
-        self._register_hooks(hooks)
-
-    def _collect_output_hook(self, name, *args):
-        x = args[-1]  # tensor we want is last argument, output for fwd, input for fwd_pre
-        if isinstance(x, tuple):
-            x = x[0]  # unwrap input tuple
-        self._feature_outputs[x.device][name] = x
-
-    def _get_output(self, device):
-        output = tuple(self._feature_outputs[device].values())[::-1]
-        self._feature_outputs[device] = OrderedDict()
-        return output
-
-    def _register_hooks(self, hooks):
-        # setup feature hooks
-        modules = {k: v for k, v in self.named_modules()}
-        for h in hooks:
-            hook_name = h['name']
-            m = modules[hook_name]
-            hook_fn = partial(self._collect_output_hook, hook_name)
-            if h['type'] == 'forward_pre':
-                m.register_forward_pre_hook(hook_fn)
-            else:
-                m.register_forward_hook(hook_fn)
-        self._feature_outputs = defaultdict(OrderedDict)
+        self.feature_hooks = FeatureHooks(hooks, self.named_modules())
 
     def feature_channels(self, idx=None):
+        """ Feature Channel Shortcut
+        Returns feature channel count for each output index if idx == None. If idx is an integer, will
+        return feature channel count for that feature block index (independent of out_indices setting).
+        """
         if isinstance(idx, int):
             return self.feature_info[idx]['num_chs']
         return [self.feature_info[i]['num_chs'] for i in self.out_indices]
@@ -1164,7 +349,7 @@ class GenEfficientNetFeatures(_GenEfficientNet):
         x = self.bn1(x)
         x = self.act1(x)
         self.blocks(x)
-        return self._get_output(x.device)
+        return self.feature_hooks.get_output(x.device)
 
 
 def _create_model(model_kwargs, default_cfg, pretrained=False):
@@ -1173,10 +358,10 @@ def _create_model(model_kwargs, default_cfg, pretrained=False):
         model_kwargs.pop('num_classes', 0)
         model_kwargs.pop('num_features', 0)
         model_kwargs.pop('head_conv', None)
-        model_class = GenEfficientNetFeatures
+        model_class = EfficientNetFeatures
     else:
         load_strict = True
-        model_class = GenEfficientNet
+        model_class = EfficientNet
 
     model = model_class(**model_kwargs)
     model.default_cfg = default_cfg
@@ -1216,10 +401,10 @@ def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs)
         ['ir_r1_k3_s1_e6_c320'],
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
+        block_args=decode_arch_def(arch_def),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         **kwargs
     )
     model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@@ -1252,10 +437,10 @@ def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs)
         ['ir_r1_k3_s1_e6_c320_noskip']
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
+        block_args=decode_arch_def(arch_def),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         **kwargs
     )
     model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@@ -1281,36 +466,10 @@ def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwar
         ['ir_r1_k3_s1_e6_c144']
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
+        block_args=decode_arch_def(arch_def),
         stem_size=8,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
-        **kwargs
-    )
-    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
-    return model
-
-
-def _gen_mobilenet_v1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
-    """ Generate MobileNet-V1 network
-    Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py
-    Paper: https://arxiv.org/abs/1801.04381
-    """
-    arch_def = [
-        ['dsa_r1_k3_s1_c64'],
-        ['dsa_r2_k3_s2_c128'],
-        ['dsa_r2_k3_s2_c256'],
-        ['dsa_r6_k3_s2_c512'],
-        ['dsa_r2_k3_s2_c1024'],
-    ]
-    model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
-        stem_size=32,
-        num_features=1024,
-        channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
-        act_layer=nn.ReLU6,
-        head_conv='none',
+        norm_kwargs=resolve_bn_args(kwargs),
         **kwargs
     )
     model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@@ -1332,10 +491,10 @@ def _gen_mobilenet_v2(variant, channel_multiplier=1.0, pretrained=False, **kwarg
         ['ir_r1_k3_s1_e6_c320'],
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
+        block_args=decode_arch_def(arch_def),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         act_layer=nn.ReLU6,
         **kwargs
     )
@@ -1343,104 +502,6 @@ def _gen_mobilenet_v2(variant, channel_multiplier=1.0, pretrained=False, **kwarg
     return model
 
 
-def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
-    """Creates a MobileNet-V3 model.
-
-    Ref impl: ?
-    Paper: https://arxiv.org/abs/1905.02244
-
-    Args:
-      channel_multiplier: multiplier to number of channels per layer.
-    """
-    arch_def = [
-        # stage 0, 112x112 in
-        ['ds_r1_k3_s1_e1_c16_nre_noskip'],  # relu
-        # stage 1, 112x112 in
-        ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'],  # relu
-        # stage 2, 56x56 in
-        ['ir_r3_k5_s2_e3_c40_se0.25_nre'],  # relu
-        # stage 3, 28x28 in
-        ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],  # hard-swish
-        # stage 4, 14x14in
-        ['ir_r2_k3_s1_e6_c112_se0.25'],  # hard-swish
-        # stage 5, 14x14in
-        ['ir_r3_k5_s2_e6_c160_se0.25'],  # hard-swish
-        # stage 6, 7x7 in
-        ['cn_r1_k1_s1_c960'],  # hard-swish
-    ]
-    model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
-        stem_size=16,
-        channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
-        act_layer=HardSwish,
-        se_gate_fn=hard_sigmoid,
-        se_reduce_mid=True,
-        head_conv='efficient',
-        **kwargs,
-    )
-    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
-    return model
-
-
-def _gen_chamnet_v1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
-    """ Generate Chameleon Network (ChamNet)
-
-    Paper: https://arxiv.org/abs/1812.08934
-    Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py
-
-    FIXME: this a bit of an educated guess based on trunkd def in maskrcnn_benchmark
-    """
-    arch_def = [
-        ['ir_r1_k3_s1_e1_c24'],
-        ['ir_r2_k7_s2_e4_c48'],
-        ['ir_r5_k3_s2_e7_c64'],
-        ['ir_r7_k5_s2_e12_c56'],
-        ['ir_r5_k3_s1_e8_c88'],
-        ['ir_r4_k3_s2_e7_c152'],
-        ['ir_r1_k3_s1_e10_c104'],
-    ]
-    model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
-        stem_size=32,
-        num_features=1280,  # no idea what this is? try mobile/mnasnet default?
-        channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
-        **kwargs
-    )
-    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
-    return model
-
-
-def _gen_chamnet_v2(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
-    """ Generate Chameleon Network (ChamNet)
-
-    Paper: https://arxiv.org/abs/1812.08934
-    Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py
-
-    FIXME: this a bit of an educated guess based on trunk def in maskrcnn_benchmark
-    """
-    arch_def = [
-        ['ir_r1_k3_s1_e1_c24'],
-        ['ir_r4_k5_s2_e8_c32'],
-        ['ir_r6_k7_s2_e5_c48'],
-        ['ir_r3_k5_s2_e9_c56'],
-        ['ir_r6_k3_s1_e6_c56'],
-        ['ir_r6_k3_s2_e2_c152'],
-        ['ir_r1_k3_s1_e6_c112'],
-    ]
-    model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
-        stem_size=32,
-        num_features=1280,  # no idea what this is? try mobile/mnasnet default?
-        channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
-        **kwargs
-    )
-    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
-    return model
-
-
 def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
     """ FBNet-C
 
@@ -1460,11 +521,11 @@ def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
         ['ir_r1_k3_s1_e6_c352'],
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
+        block_args=decode_arch_def(arch_def),
         stem_size=16,
         num_features=1984,  # paper suggests this, but is not 100% clear
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         **kwargs
     )
     model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@@ -1496,10 +557,10 @@ def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
         ['ir_r1_k3_s1_e6_c320_noskip']
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
+        block_args=decode_arch_def(arch_def),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         **kwargs
     )
     model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@@ -1522,6 +583,7 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre
     'efficientnet-b5': (1.6, 2.2, 456, 0.4),
     'efficientnet-b6': (1.8, 2.6, 528, 0.5),
     'efficientnet-b7': (2.0, 3.1, 600, 0.5),
+    'efficientnet-b8': (2.2, 3.6, 672, 0.5),
 
     Args:
       channel_multiplier: multiplier to number of channels per layer
@@ -1538,12 +600,12 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre
         ['ir_r1_k3_s1_e6_c320_se0.25'],
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def, depth_multiplier),
-        num_features=_round_channels(1280, channel_multiplier, 8, None),
+        block_args=decode_arch_def(arch_def, depth_multiplier),
+        num_features=round_channels(1280, channel_multiplier, 8, None),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
         act_layer=Swish,
+        norm_kwargs=resolve_bn_args(kwargs),
         **kwargs,
     )
     model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@@ -1567,11 +629,11 @@ def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0
         ['ir_r2_k5_s2_e8_c192'],
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def, depth_multiplier),
-        num_features=_round_channels(1280, channel_multiplier, 8, None),
+        block_args=decode_arch_def(arch_def, depth_multiplier),
+        num_features=round_channels(1280, channel_multiplier, 8, None),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         act_layer=nn.ReLU,
         **kwargs,
     )
@@ -1597,11 +659,11 @@ def _gen_efficientnet_condconv(
     # NOTE unlike official impl, this one uses `cc<x>` option where x is the base number of experts for each stage and
     # the expert_multiplier increases that on a per-model basis as with depth/channel multipliers
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier),
-        num_features=_round_channels(1280, channel_multiplier, 8, None),
+        block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier),
+        num_features=round_channels(1280, channel_multiplier, 8, None),
         stem_size=32,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         act_layer=Swish,
         **kwargs,
     )
@@ -1631,11 +693,11 @@ def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
         # 7x7
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def),
+        block_args=decode_arch_def(arch_def),
         num_features=1536,
         stem_size=16,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         **kwargs
     )
     model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@@ -1664,11 +726,11 @@ def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrai
         # 7x7
     ]
     model_kwargs = dict(
-        block_args=_decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'),
+        block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'),
         num_features=1536,
         stem_size=24,
         channel_multiplier=channel_multiplier,
-        norm_kwargs=_resolve_bn_args(kwargs),
+        norm_kwargs=resolve_bn_args(kwargs),
         **kwargs
     )
     model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@@ -1750,13 +812,6 @@ def mnasnet_small(pretrained=False, **kwargs):
     return model
 
 
-@register_model
-def mobilenetv1_100(pretrained=False, **kwargs):
-    """ MobileNet V1 """
-    model = _gen_mobilenet_v1('mobilenetv1_100', 1.0, pretrained=pretrained, **kwargs)
-    return model
-
-
 @register_model
 def mobilenetv2_100(pretrained=False, **kwargs):
     """ MobileNet V2 """
@@ -1764,54 +819,16 @@ def mobilenetv2_100(pretrained=False, **kwargs):
     return model
 
 
-@register_model
-def mobilenetv3_050(pretrained=False, **kwargs):
-    """ MobileNet V3 """
-    model = _gen_mobilenet_v3('mobilenetv3_050', 0.5, pretrained=pretrained, **kwargs)
-    return model
-
-
-@register_model
-def mobilenetv3_075(pretrained=False, **kwargs):
-    """ MobileNet V3 """
-    model = _gen_mobilenet_v3('mobilenetv3_075', 0.75, pretrained=pretrained, **kwargs)
-    return model
-
-
-@register_model
-def mobilenetv3_100(pretrained=False, **kwargs):
-    """ MobileNet V3 """
-    if pretrained:
-        # pretrained model trained with non-default BN epsilon
-        kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
-    model = _gen_mobilenet_v3('mobilenetv3_100', 1.0, pretrained=pretrained, **kwargs)
-    return model
-
-
 @register_model
 def fbnetc_100(pretrained=False, **kwargs):
     """ FBNet-C """
     if pretrained:
         # pretrained model trained with non-default BN epsilon
-        kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+        kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs)
     return model
 
 
-@register_model
-def chamnetv1_100(pretrained=False, **kwargs):
-    """ ChamNet """
-    model = _gen_chamnet_v1('chamnetv1_100', 1.0, pretrained=pretrained, **kwargs)
-    return model
-
-
-@register_model
-def chamnetv2_100(pretrained=False, **kwargs):
-    """ ChamNet """
-    model = _gen_chamnet_v2('chamnetv2_100', 1.0, pretrained=pretrained, **kwargs)
-    return model
-
-
 @register_model
 def spnasnet_100(pretrained=False, **kwargs):
     """ Single-Path NAS Pixel1"""
@@ -1957,7 +974,7 @@ def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_b0(pretrained=False, **kwargs):
     """ EfficientNet-B0. Tensorflow compatible variant  """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
         'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
@@ -1967,7 +984,7 @@ def tf_efficientnet_b0(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_b1(pretrained=False, **kwargs):
     """ EfficientNet-B1. Tensorflow compatible variant  """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
         'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
@@ -1977,7 +994,7 @@ def tf_efficientnet_b1(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_b2(pretrained=False, **kwargs):
     """ EfficientNet-B2. Tensorflow compatible variant  """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
         'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
@@ -1987,7 +1004,7 @@ def tf_efficientnet_b2(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_b3(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
     """ EfficientNet-B3. Tensorflow compatible variant """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
         'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
@@ -1997,7 +1014,7 @@ def tf_efficientnet_b3(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
 @register_model
 def tf_efficientnet_b4(pretrained=False, **kwargs):
     """ EfficientNet-B4. Tensorflow compatible variant """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
         'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
@@ -2007,7 +1024,7 @@ def tf_efficientnet_b4(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_b5(pretrained=False, **kwargs):
     """ EfficientNet-B5. Tensorflow compatible variant """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
         'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
@@ -2018,7 +1035,7 @@ def tf_efficientnet_b5(pretrained=False, **kwargs):
 def tf_efficientnet_b6(pretrained=False, **kwargs):
     """ EfficientNet-B6. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.5
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
         'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
@@ -2029,17 +1046,111 @@ def tf_efficientnet_b6(pretrained=False, **kwargs):
 def tf_efficientnet_b7(pretrained=False, **kwargs):
     """ EfficientNet-B7. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.5
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet(
         'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
     return model
 
 
+@register_model
+def tf_efficientnet_b0_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B0. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b0_ap', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b1_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B1. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b1_ap', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b2_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B2. Tensorflow compatible variant  """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b2_ap', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b3_ap(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """ EfficientNet-B3. Tensorflow compatible variant """
+    # forward the explicitly named args as well, otherwise caller overrides of them are silently dropped
+    kwargs.update(bn_eps=BN_EPS_TF_DEFAULT, pad_type='same', num_classes=num_classes, in_chans=in_chans)
+    model = _gen_efficientnet(
+        'tf_efficientnet_b3_ap', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b4_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B4. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b4_ap', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b5_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B5. Tensorflow compatible variant """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b5_ap', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b6_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B6. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b6_ap', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b7_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B7. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b7_ap', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_efficientnet_b8_ap(pretrained=False, **kwargs):
+    """ EfficientNet-B8. Tensorflow compatible variant """
+    # NOTE for train, drop_rate should be 0.5
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_efficientnet(
+        'tf_efficientnet_b8_ap', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
+    return model
+
+
+
 @register_model
 def tf_efficientnet_es(pretrained=False, **kwargs):
     """ EfficientNet-Edge Small. Tensorflow compatible variant  """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_edge(
         'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
@@ -2049,7 +1160,7 @@ def tf_efficientnet_es(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_em(pretrained=False, **kwargs):
     """ EfficientNet-Edge-Medium. Tensorflow compatible variant  """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_edge(
         'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
@@ -2059,7 +1170,7 @@ def tf_efficientnet_em(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_el(pretrained=False, **kwargs):
     """ EfficientNet-Edge-Large. Tensorflow compatible variant  """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_edge(
         'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
@@ -2071,7 +1182,7 @@ def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.2
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_condconv(
         'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
@@ -2083,7 +1194,7 @@ def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.2
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_condconv(
         'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
@@ -2095,7 +1206,7 @@ def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
     # NOTE for train, drop_rate should be 0.2
     #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_condconv(
         'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
@@ -2155,7 +1266,7 @@ def mixnet_xxl(pretrained=False, **kwargs):
 def tf_mixnet_s(pretrained=False, **kwargs):
     """Creates a MixNet Small model. Tensorflow compatible variant
     """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_mixnet_s(
         'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
@@ -2166,7 +1277,7 @@ def tf_mixnet_s(pretrained=False, **kwargs):
 def tf_mixnet_m(pretrained=False, **kwargs):
     """Creates a MixNet Medium model. Tensorflow compatible variant
     """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_mixnet_m(
         'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs)
@@ -2177,7 +1288,7 @@ def tf_mixnet_m(pretrained=False, **kwargs):
 def tf_mixnet_l(pretrained=False, **kwargs):
     """Creates a MixNet Large model. Tensorflow compatible variant
     """
-    kwargs['bn_eps'] = _BN_EPS_TF_DEFAULT
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_mixnet_m(
         'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
diff --git a/timm/models/layers.py b/timm/models/layers.py
deleted file mode 100644
index c8e0a837..00000000
--- a/timm/models/layers.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-def versiontuple(v):
-    return tuple(map(int, (v.split("."))))[:3]
-
-
-if versiontuple(torch.__version__) >= versiontuple('1.2.0'):
-    Flatten = nn.Flatten
-else:
-    class Flatten(nn.Module):
-        r"""
-        Flattens a contiguous range of dims into a tensor. For use with :class:`~nn.Sequential`.
-        Args:
-            start_dim: first dim to flatten (default = 1).
-            end_dim: last dim to flatten (default = -1).
-        Shape:
-            - Input: :math:`(N, *dims)`
-            - Output: :math:`(N, \prod *dims)` (for the default case).
-        """
-        __constants__ = ['start_dim', 'end_dim']
-
-        def __init__(self, start_dim=1, end_dim=-1):
-            super(Flatten, self).__init__()
-            self.start_dim = start_dim
-            self.end_dim = end_dim
-
-        def forward(self, input):
-            return input.flatten(self.start_dim, self.end_dim)
diff --git a/timm/models/mobilenetv3.py b/timm/models/mobilenetv3.py
new file mode 100644
index 00000000..13fd16e6
--- /dev/null
+++ b/timm/models/mobilenetv3.py
@@ -0,0 +1,439 @@
+
+""" MobileNet V3
+
+A PyTorch impl of MobileNet-V3, compatible with TF weights from official impl.
+
+Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244
+
+Hacked together by Ross Wightman
+"""
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .efficientnet_builder import *
+from .activations import HardSwish, hard_sigmoid
+from .registry import register_model
+from .helpers import load_pretrained
+from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .conv2d_layers import select_conv2d
+from .feature_hooks import FeatureHooks
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
+
+__all__ = ['MobileNetV3']
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv_stem', 'classifier': 'classifier',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'mobilenetv3_large_075': _cfg(url=''),
+    'mobilenetv3_large_100': _cfg(url=''),
+    'mobilenetv3_rw': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth',
+        interpolation='bicubic'),
+    'tf_mobilenetv3_large_075': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_075-150ee8b0.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_large_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_100-427764d5.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_large_minimal_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_minimal_100-8596ae28.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_small_075': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_075-da427f52.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_small_100': _cfg(
+        url= 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_100-37f49e2b.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+    'tf_mobilenetv3_small_minimal_100': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
+}
+
+_DEBUG = False
+
+
+class MobileNetV3(nn.Module):
+    """ MobileNet-V3
+
+    Based on my EfficientNet implementation and building blocks, this model utilizes the MobileNet-v3 specific
+    'efficient head', where global pooling is done before the head convolution without a final batch-norm
+    layer before the classifier.
+
+    Paper: https://arxiv.org/abs/1905.02244
+    """
+
+    def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True,
+                 channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
+                 global_pool='avg', weight_init='goog'):
+        super(MobileNetV3, self).__init__()
+        norm_kwargs = norm_kwargs or {}  # the None default cannot be **-unpacked into norm_layer below
+        self.num_classes = num_classes
+        self.num_features = num_features
+        self.drop_rate = drop_rate
+        self._in_chs = in_chans
+
+        # Stem
+        stem_size = round_channels(stem_size, channel_multiplier)
+        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.bn1 = norm_layer(stem_size, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+        self._in_chs = stem_size
+
+        # Middle stages (IR/ER/DS Blocks)
+        builder = EfficientNetBuilder(
+            channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs,
+            norm_layer, norm_kwargs, drop_connect_rate, verbose=_DEBUG)
+        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
+        self.feature_info = builder.features
+        self._in_chs = builder.in_chs
+
+        # Head + Pooling (pooling runs before the 1x1 head conv, see forward_features)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type, bias=head_bias)
+        self.act2 = act_layer(inplace=True)
+
+        # Classifier
+        self.classifier = nn.Linear(self.num_features * self.global_pool.feat_mult(), self.num_classes)
+
+        for m in self.modules():
+            if weight_init == 'goog':
+                efficientnet_init_goog(m)
+            else:
+                efficientnet_init_default(m)
+
+    def as_sequential(self):
+        layers = [self.conv_stem, self.bn1, self.act1]
+        layers.extend(self.blocks)
+        layers.extend([self.global_pool, self.conv_head, self.act2])
+        layers.extend([nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
+        return nn.Sequential(*layers)
+
+    def get_classifier(self):
+        return self.classifier
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.num_classes = num_classes
+        del self.classifier
+        if num_classes:
+            self.classifier = nn.Linear(
+                self.num_features * self.global_pool.feat_mult(), num_classes)
+        else:
+            self.classifier = None
+
+    def forward_features(self, x):
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        x = self.blocks(x)
+        x = self.global_pool(x)
+        x = self.conv_head(x)
+        x = self.act2(x)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = x.flatten(1)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        return x if self.classifier is None else self.classifier(x)  # classifier is None after reset_classifier(0)
+
+
+class MobileNetV3Features(nn.Module):
+    """ MobileNetV3 Feature Extractor
+
+    A work-in-progress feature extraction module for MobileNet-V3 to use as a backbone for segmentation
+    and object detection models.
+    """
+
+    def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
+                 in_chans=3, stem_size=16, channel_multiplier=1.0, output_stride=32, pad_type='',
+                 act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0., se_kwargs=None,
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, weight_init='goog'):
+        super(MobileNetV3Features, self).__init__()
+        norm_kwargs = norm_kwargs or {}
+
+        # TODO only create stages needed, currently all stages are created regardless of out_indices
+        num_stages = max(out_indices) + 1
+
+        self.out_indices = out_indices
+        self.drop_rate = drop_rate
+        self._in_chs = in_chans
+
+        # Stem
+        stem_size = round_channels(stem_size, channel_multiplier)
+        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.bn1 = norm_layer(stem_size, **norm_kwargs)
+        self.act1 = act_layer(inplace=True)
+        self._in_chs = stem_size
+
+        # Middle stages (IR/ER/DS Blocks)
+        builder = EfficientNetBuilder(
+            channel_multiplier, 8, None, output_stride, pad_type, act_layer, se_kwargs,
+            norm_layer, norm_kwargs, drop_connect_rate, feature_location=feature_location, verbose=_DEBUG)
+        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
+        self.feature_info = builder.features  # builder provides info about feature channels for each block
+        self._in_chs = builder.in_chs
+
+        for m in self.modules():
+            if weight_init == 'goog':
+                efficientnet_init_goog(m)
+            else:
+                efficientnet_init_default(m)
+
+        if _DEBUG:
+            for k, v in self.feature_info.items():
+                print('Feature idx: {}: Name: {}, Channels: {}'.format(k, v['name'], v['num_chs']))
+
+        # Register feature extraction hooks with FeatureHooks helper
+        hook_type = 'forward_pre' if feature_location == 'pre_pwl' else 'forward'
+        hooks = [dict(name=self.feature_info[idx]['name'], type=hook_type) for idx in out_indices]
+        self.feature_hooks = FeatureHooks(hooks, self.named_modules())
+
+    def feature_channels(self, idx=None):
+        """ Feature Channel Shortcut
+        Returns feature channel count for each output index if idx == None. If idx is an integer, will
+        return feature channel count for that feature block index (independent of out_indices setting).
+        """
+        if isinstance(idx, int):
+            return self.feature_info[idx]['num_chs']
+        return [self.feature_info[i]['num_chs'] for i in self.out_indices]
+
+    def forward(self, x):
+        x = self.conv_stem(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        self.blocks(x)
+        return self.feature_hooks.get_output(x.device)
+
+
+def _create_model(model_kwargs, default_cfg, pretrained=False):
+    """ Build MobileNetV3, or MobileNetV3Features if `features_only` is set, and attach `default_cfg`.
+    Pretrained weights are loaded when `pretrained` is True (non-strict for the feature extractor).
+    """
+    if model_kwargs.pop('features_only', False):
+        load_strict = False
+        # strip classifier-head-only args, MobileNetV3Features.__init__ does not accept any of them
+        for head_arg in ('num_classes', 'num_features', 'head_conv', 'head_bias'):
+            model_kwargs.pop(head_arg, None)
+        model_class = MobileNetV3Features
+    else:
+        load_strict = True
+        model_class = MobileNetV3
+
+    model = model_class(**model_kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(
+            model, default_cfg, num_classes=model_kwargs.get('num_classes', 0),
+            in_chans=model_kwargs.get('in_chans', 3), strict=load_strict)
+    return model
+
+
+def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a MobileNet-V3 model.
+
+    Ref impl: ?
+    Paper: https://arxiv.org/abs/1905.02244
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    arch_def = [
+        # stage 0, 112x112 in
+        ['ds_r1_k3_s1_e1_c16_nre_noskip'],  # relu
+        # stage 1, 112x112 in
+        ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'],  # relu
+        # stage 2, 56x56 in
+        ['ir_r3_k5_s2_e3_c40_se0.25_nre'],  # relu
+        # stage 3, 28x28 in
+        ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],  # hard-swish
+        # stage 4, 14x14in
+        ['ir_r2_k3_s1_e6_c112_se0.25'],  # hard-swish
+        # stage 5, 14x14in
+        ['ir_r3_k5_s2_e6_c160_se0.25'],  # hard-swish
+        # stage 6, 7x7 in
+        ['cn_r1_k1_s1_c960'],  # hard-swish
+    ]
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        head_bias=False,
+        channel_multiplier=channel_multiplier,
+        norm_kwargs=resolve_bn_args(kwargs),
+        act_layer=HardSwish,
+        se_kwargs=dict(gate_fn=hard_sigmoid, reduce_mid=True, divisor=1),
+        **kwargs,
+    )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
+    return model
+
+
+def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
+    """Creates a MobileNet-V3 model.
+
+    Ref impl: ?
+    Paper: https://arxiv.org/abs/1905.02244
+
+    Args:
+      channel_multiplier: multiplier to number of channels per layer.
+    """
+    if 'small' in variant:
+        num_features = 1024
+        if 'minimal' in variant:
+            act_layer = nn.ReLU
+            arch_def = [
+                # stage 0, 112x112 in
+                ['ds_r1_k3_s2_e1_c16'],
+                # stage 1, 56x56 in
+                ['ir_r1_k3_s2_e4.5_c24', 'ir_r1_k3_s1_e3.67_c24'],
+                # stage 2, 28x28 in
+                ['ir_r1_k3_s2_e4_c40', 'ir_r2_k3_s1_e6_c40'],
+                # stage 3, 14x14 in
+                ['ir_r2_k3_s1_e3_c48'],
+                # stage 4, 14x14in
+                ['ir_r3_k3_s2_e6_c96'],
+                # stage 6, 7x7 in
+                ['cn_r1_k1_s1_c576'],
+            ]
+        else:
+            act_layer = HardSwish
+            arch_def = [
+                # stage 0, 112x112 in
+                ['ds_r1_k3_s2_e1_c16_se0.25_nre'],  # relu
+                # stage 1, 56x56 in
+                ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'],  # relu
+                # stage 2, 28x28 in
+                ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r2_k5_s1_e6_c40_se0.25'],  # hard-swish
+                # stage 3, 14x14 in
+                ['ir_r2_k5_s1_e3_c48_se0.25'],  # hard-swish
+                # stage 4, 14x14in
+                ['ir_r3_k5_s2_e6_c96_se0.25'],  # hard-swish
+                # stage 6, 7x7 in
+                ['cn_r1_k1_s1_c576'],  # hard-swish
+            ]
+    else:
+        num_features = 1280
+        if 'minimal' in variant:
+            act_layer = nn.ReLU
+            arch_def = [
+                # stage 0, 112x112 in
+                ['ds_r1_k3_s1_e1_c16'],
+                # stage 1, 112x112 in
+                ['ir_r1_k3_s2_e4_c24', 'ir_r1_k3_s1_e3_c24'],
+                # stage 2, 56x56 in
+                ['ir_r3_k3_s2_e3_c40'],
+                # stage 3, 28x28 in
+                ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],
+                # stage 4, 14x14in
+                ['ir_r2_k3_s1_e6_c112'],
+                # stage 5, 14x14in
+                ['ir_r3_k3_s2_e6_c160'],
+                # stage 6, 7x7 in
+                ['cn_r1_k1_s1_c960'],
+            ]
+        else:
+            act_layer = HardSwish
+            arch_def = [
+                # stage 0, 112x112 in
+                ['ds_r1_k3_s1_e1_c16_nre'],  # relu
+                # stage 1, 112x112 in
+                ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'],  # relu
+                # stage 2, 56x56 in
+                ['ir_r3_k5_s2_e3_c40_se0.25_nre'],  # relu
+                # stage 3, 28x28 in
+                ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'],  # hard-swish
+                # stage 4, 14x14in
+                ['ir_r2_k3_s1_e6_c112_se0.25'],  # hard-swish
+                # stage 5, 14x14in
+                ['ir_r3_k5_s2_e6_c160_se0.25'],  # hard-swish
+                # stage 6, 7x7 in
+                ['cn_r1_k1_s1_c960'],  # hard-swish
+            ]
+
+    model_kwargs = dict(
+        block_args=decode_arch_def(arch_def),
+        num_features=num_features,
+        stem_size=16,
+        channel_multiplier=channel_multiplier,
+        norm_kwargs=resolve_bn_args(kwargs),
+        act_layer=act_layer,
+        se_kwargs=dict(act_layer=nn.ReLU, gate_fn=hard_sigmoid, reduce_mid=True, divisor=8),
+        **kwargs,
+    )
+    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
+    return model
+
+
+@register_model
+def mobilenetv3_rw(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    if pretrained:
+        # pretrained model trained with non-default BN epsilon
+        kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    model = _gen_mobilenet_v3_rw('mobilenetv3_rw', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+
+@register_model
+def tf_mobilenetv3_large_075(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_large_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_large_minimal_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_small_075(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_small_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs):
+    """ MobileNet V3 """
+    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
+    kwargs['pad_type'] = 'same'
+    model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs)
+    return model

From 3b8f63084db5bef537dbef5f2b1afb209d0898b8 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 28 Nov 2019 16:42:37 -0800
Subject: [PATCH 18/35] Clean checkpoint renames pth w/ SHA hash in a torch hub
 compatible way

---
 clean_checkpoint.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/clean_checkpoint.py b/clean_checkpoint.py
index 59a6e306..d51e0d96 100644
--- a/clean_checkpoint.py
+++ b/clean_checkpoint.py
@@ -2,6 +2,7 @@ import torch
 import argparse
 import os
 import hashlib
+import shutil
 from collections import OrderedDict
 
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation')
@@ -31,10 +32,9 @@ def main():
             if state_dict_key in checkpoint:
                 state_dict = checkpoint[state_dict_key]
             else:
-                print("Error: No state_dict found in checkpoint {}.".format(args.checkpoint))
-                exit(1)
+                state_dict = checkpoint
         else:
-            state_dict = checkpoint
+            assert False
         for k, v in state_dict.items():
             name = k[7:] if k.startswith('module') else k
             new_state_dict[name] = v
@@ -43,7 +43,11 @@ def main():
         torch.save(new_state_dict, args.output)
         with open(args.output, 'rb') as f:
             sha_hash = hashlib.sha256(f.read()).hexdigest()
-        print("=> Saved state_dict to '{}, SHA256: {}'".format(args.output, sha_hash))
+
+        checkpoint_base = os.path.splitext(args.checkpoint)[0]
+        final_filename = '-'.join([checkpoint_base, sha_hash[:8]]) + '.pth'
+        shutil.move(args.output, final_filename)
+        print("=> Saved state_dict to '{}, SHA256: {}'".format(final_filename, sha_hash))
     else:
         print("Error: Checkpoint ({}) doesn't exist".format(args.checkpoint))
 

From 5a0a8de7e376df850c018131e916de1f57f89d0e Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 28 Nov 2019 16:46:21 -0800
Subject: [PATCH 19/35] ResNet updates: * remove redundant GluonResNet
 model/blocks and use the code in ResNet for Gluon weights * change SEModules
 back to using AdaptiveAvgPool instead of mean, PyTorch issue long fixed

---
 timm/models/gluon_resnet.py | 390 ++++--------------------------------
 timm/models/resnet.py       |   8 +-
 timm/models/senet.py        |   5 +-
 3 files changed, 50 insertions(+), 353 deletions(-)

diff --git a/timm/models/gluon_resnet.py b/timm/models/gluon_resnet.py
index 715e0950..3d0f926f 100644
--- a/timm/models/gluon_resnet.py
+++ b/timm/models/gluon_resnet.py
@@ -11,11 +11,9 @@ import torch.nn.functional as F
 
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 
-
-__all__ = ['GluonResNet']
+from .resnet import ResNet, Bottleneck, BasicBlock
 
 
 def _cfg(url='', **kwargs):
@@ -57,312 +55,12 @@ default_cfgs = {
 }
 
 
-def _get_padding(kernel_size, stride, dilation=1):
-    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
-    return padding
-
-
-class SEModule(nn.Module):
-
-    def __init__(self, channels, reduction_channels):
-        super(SEModule, self).__init__()
-        #self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.fc1 = nn.Conv2d(
-            channels, reduction_channels, kernel_size=1, padding=0, bias=True)
-        self.relu = nn.ReLU()
-        self.fc2 = nn.Conv2d(
-            reduction_channels, channels, kernel_size=1, padding=0, bias=True)
-        self.sigmoid = nn.Sigmoid()
-
-    def forward(self, x):
-        module_input = x
-        #x = self.avg_pool(x)
-        x = x.view(x.size(0), x.size(1), -1).mean(-1).view(x.size(0), x.size(1), 1, 1)
-        x = self.fc1(x)
-        x = self.relu(x)
-        x = self.fc2(x)
-        x = self.sigmoid(x)
-        return module_input * x
-
-
-class BasicBlockGl(nn.Module):
-    expansion = 1
-
-    def __init__(self, inplanes, planes, stride=1, downsample=None,
-                 cardinality=1, base_width=64, use_se=False,
-                 reduce_first=1, dilation=1, previous_dilation=1, norm_layer=nn.BatchNorm2d):
-        super(BasicBlockGl, self).__init__()
-
-        assert cardinality == 1, 'BasicBlock only supports cardinality of 1'
-        assert base_width == 64, 'BasicBlock doest not support changing base width'
-        first_planes = planes // reduce_first
-        outplanes = planes * self.expansion
-
-        self.conv1 = nn.Conv2d(
-            inplanes, first_planes, kernel_size=3, stride=stride, padding=dilation,
-            dilation=dilation, bias=False)
-        self.bn1 = norm_layer(first_planes)
-        self.relu = nn.ReLU()
-        self.conv2 = nn.Conv2d(
-            first_planes, outplanes, kernel_size=3, padding=previous_dilation,
-            dilation=previous_dilation, bias=False)
-        self.bn2 = norm_layer(outplanes)
-        self.se = SEModule(outplanes, planes // 4) if use_se else None
-        self.downsample = downsample
-        self.stride = stride
-        self.dilation = dilation
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-        out = self.conv2(out)
-        out = self.bn2(out)
-
-        if self.se is not None:
-            out = self.se(out)
-
-        if self.downsample is not None:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-
-        return out
-
-
-class BottleneckGl(nn.Module):
-    expansion = 4
-
-    def __init__(self, inplanes, planes, stride=1, downsample=None,
-                 cardinality=1, base_width=64, use_se=False,
-                 reduce_first=1, dilation=1, previous_dilation=1, norm_layer=nn.BatchNorm2d):
-        super(BottleneckGl, self).__init__()
-
-        width = int(math.floor(planes * (base_width / 64)) * cardinality)
-        first_planes = width // reduce_first
-        outplanes = planes * self.expansion
-
-        self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False)
-        self.bn1 = norm_layer(first_planes)
-        self.conv2 = nn.Conv2d(
-            first_planes, width, kernel_size=3, stride=stride,
-            padding=dilation, dilation=dilation, groups=cardinality, bias=False)
-        self.bn2 = norm_layer(width)
-        self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False)
-        self.bn3 = norm_layer(outplanes)
-        self.se = SEModule(outplanes, planes // 4) if use_se else None
-        self.relu = nn.ReLU()
-        self.downsample = downsample
-        self.stride = stride
-        self.dilation = dilation
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-        out = self.relu(out)
-
-        out = self.conv3(out)
-        out = self.bn3(out)
-
-        if self.se is not None:
-            out = self.se(out)
-
-        if self.downsample is not None:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-
-        return out
-
-
-class GluonResNet(nn.Module):
-    """ Gluon ResNet (https://gluon-cv.mxnet.io/model_zoo/classification.html)
-    This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet found in the gluon model zoo that
-      * have stride in 3x3 conv layer of bottleneck
-      * have conv-bn-act ordering
-
-    Included ResNet variants are:
-      * v1b - 7x7 stem, stem_width=64, same as torchvision ResNet (checkpoint compatible), or NVIDIA ResNet 'v1.5'
-      * v1c - 3 layer deep 3x3 stem, stem_width = 32
-      * v1d - 3 layer deep 3x3 stem, stem_width = 32, average pool in downsample
-      * v1e - 3 layer deep 3x3 stem, stem_width = 64, average pool in downsample  *no pretrained weights available
-      * v1s - 3 layer deep 3x3 stem, stem_width = 64
-
-    ResNeXt is standard and checkpoint compatible with torchvision pretrained models. 7x7 stem,
-        stem_width = 64, standard cardinality and base width calcs
-
-    SE-ResNeXt is standard. 7x7 stem, stem_width = 64,
-        checkpoints are not compatible with Cadene pretrained, but could be with key mapping
-
-    SENet-154 is standard. 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64,
-        reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block
-
-    Original ResNet-V1, ResNet-V2 (bn-act-conv), and SE-ResNet (stride in first bottleneck conv) are NOT supported.
-    They do have Gluon pretrained weights but are, at best, comparable (or inferior) to the supported models.
-
-    Parameters
-    ----------
-    block : Block
-        Class for the residual block. Options are BasicBlockGl, BottleneckGl.
-    layers : list of int
-        Numbers of layers in each block
-    num_classes : int, default 1000
-        Number of classification classes.
-    deep_stem : bool, default False
-        Whether to replace the 7x7 conv1 with 3 3x3 convolution layers.
-    block_reduce_first: int, default 1
-        Reduction factor for first convolution output width of residual blocks,
-        1 for all archs except senets, where 2
-    down_kernel_size: int, default 1
-        Kernel size of residual block downsampling path, 1x1 for most archs, 3x3 for senets
-    avg_down : bool, default False
-        Whether to use average pooling for projection skip connection between stages/downsample.
-    dilated : bool, default False
-        Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
-        typically used in Semantic Segmentation.
-    """
-    def __init__(self, block, layers, num_classes=1000, in_chans=3, use_se=False,
-                 cardinality=1, base_width=64, stem_width=64, deep_stem=False,
-                 block_reduce_first=1, down_kernel_size=1, avg_down=False, dilated=False,
-                 norm_layer=nn.BatchNorm2d, drop_rate=0.0, global_pool='avg'):
-        self.num_classes = num_classes
-        self.inplanes = stem_width * 2 if deep_stem else 64
-        self.cardinality = cardinality
-        self.base_width = base_width
-        self.drop_rate = drop_rate
-        self.expansion = block.expansion
-        self.dilated = dilated
-        super(GluonResNet, self).__init__()
-
-        if not deep_stem:
-            self.conv1 = nn.Conv2d(in_chans, stem_width, kernel_size=7, stride=2, padding=3, bias=False)
-        else:
-            conv1_modules = [
-                nn.Conv2d(in_chans, stem_width, 3, stride=2, padding=1, bias=False),
-                norm_layer(stem_width),
-                nn.ReLU(),
-                nn.Conv2d(stem_width, stem_width, 3, stride=1, padding=1, bias=False),
-                norm_layer(stem_width),
-                nn.ReLU(),
-                nn.Conv2d(stem_width, self.inplanes, 3, stride=1, padding=1, bias=False),
-            ]
-            self.conv1 = nn.Sequential(*conv1_modules)
-        self.bn1 = norm_layer(self.inplanes)
-        self.relu = nn.ReLU()
-        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
-        stride_3_4 = 1 if self.dilated else 2
-        dilation_3 = 2 if self.dilated else 1
-        dilation_4 = 4 if self.dilated else 1
-        self.layer1 = self._make_layer(
-            block, 64, layers[0], stride=1, reduce_first=block_reduce_first,
-            use_se=use_se, avg_down=avg_down, down_kernel_size=1, norm_layer=norm_layer)
-        self.layer2 = self._make_layer(
-            block, 128, layers[1], stride=2, reduce_first=block_reduce_first,
-            use_se=use_se, avg_down=avg_down, down_kernel_size=down_kernel_size, norm_layer=norm_layer)
-        self.layer3 = self._make_layer(
-            block, 256, layers[2], stride=stride_3_4, dilation=dilation_3, reduce_first=block_reduce_first,
-            use_se=use_se, avg_down=avg_down, down_kernel_size=down_kernel_size, norm_layer=norm_layer)
-        self.layer4 = self._make_layer(
-            block, 512, layers[3], stride=stride_3_4, dilation=dilation_4, reduce_first=block_reduce_first,
-            use_se=use_se, avg_down=avg_down, down_kernel_size=down_kernel_size, norm_layer=norm_layer)
-        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-        self.num_features = 512 * block.expansion
-        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
-
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-            elif isinstance(m, nn.BatchNorm2d):
-                nn.init.constant_(m.weight, 1.)
-                nn.init.constant_(m.bias, 0.)
-
-    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, reduce_first=1,
-                    use_se=False, avg_down=False, down_kernel_size=1, norm_layer=nn.BatchNorm2d):
-        downsample = None
-        if stride != 1 or self.inplanes != planes * block.expansion:
-            downsample_padding = _get_padding(down_kernel_size, stride)
-            if avg_down:
-                avg_stride = stride if dilation == 1 else 1
-                downsample_layers = [
-                    nn.AvgPool2d(avg_stride, avg_stride, ceil_mode=True, count_include_pad=False),
-                    nn.Conv2d(self.inplanes, planes * block.expansion, down_kernel_size,
-                              stride=1, padding=downsample_padding, bias=False),
-                    norm_layer(planes * block.expansion),
-                ]
-            else:
-                downsample_layers = [
-                    nn.Conv2d(self.inplanes, planes * block.expansion, down_kernel_size,
-                              stride=stride, padding=downsample_padding, bias=False),
-                    norm_layer(planes * block.expansion),
-                ]
-            downsample = nn.Sequential(*downsample_layers)
-
-        first_dilation = 1 if dilation in (1, 2) else 2
-        layers = [block(
-            self.inplanes, planes, stride, downsample,
-            cardinality=self.cardinality, base_width=self.base_width, reduce_first=reduce_first,
-            use_se=use_se, dilation=first_dilation, previous_dilation=dilation, norm_layer=norm_layer)]
-        self.inplanes = planes * block.expansion
-        for i in range(1, blocks):
-            layers.append(block(
-                self.inplanes, planes,
-                cardinality=self.cardinality, base_width=self.base_width, reduce_first=reduce_first,
-                use_se=use_se, dilation=dilation, previous_dilation=dilation, norm_layer=norm_layer))
-
-        return nn.Sequential(*layers)
-
-    def get_classifier(self):
-        return self.fc
-
-    def reset_classifier(self, num_classes, global_pool='avg'):
-        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-        self.num_classes = num_classes
-        del self.fc
-        if num_classes:
-            self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
-        else:
-            self.fc = None
-
-    def forward_features(self, x, pool=True):
-        x = self.conv1(x)
-        x = self.bn1(x)
-        x = self.relu(x)
-        x = self.maxpool(x)
-
-        x = self.layer1(x)
-        x = self.layer2(x)
-        x = self.layer3(x)
-        x = self.layer4(x)
-
-        if pool:
-            x = self.global_pool(x)
-            x = x.view(x.size(0), -1)
-        return x
-
-    def forward(self, x):
-        x = self.forward_features(x)
-        if self.drop_rate > 0.:
-            x = F.dropout(x, p=self.drop_rate, training=self.training)
-        x = self.fc(x)
-        return x
-
-
 @register_model
 def gluon_resnet18_v1b(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
     """Constructs a ResNet-18 model.
     """
     default_cfg = default_cfgs['gluon_resnet18_v1b']
-    model = GluonResNet(BasicBlockGl, [2, 2, 2, 2], num_classes=num_classes, in_chans=in_chans, **kwargs)
+    model = ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -374,7 +72,7 @@ def gluon_resnet34_v1b(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """Constructs a ResNet-34 model.
     """
     default_cfg = default_cfgs['gluon_resnet34_v1b']
-    model = GluonResNet(BasicBlockGl, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans, **kwargs)
+    model = ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -386,7 +84,7 @@ def gluon_resnet50_v1b(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """Constructs a ResNet-50 model.
     """
     default_cfg = default_cfgs['gluon_resnet50_v1b']
-    model = GluonResNet(BottleneckGl, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -398,7 +96,7 @@ def gluon_resnet101_v1b(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-101 model.
     """
     default_cfg = default_cfgs['gluon_resnet101_v1b']
-    model = GluonResNet(BottleneckGl, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -410,7 +108,7 @@ def gluon_resnet152_v1b(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-152 model.
     """
     default_cfg = default_cfgs['gluon_resnet152_v1b']
-    model = GluonResNet(BottleneckGl, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans, **kwargs)
+    model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -422,8 +120,8 @@ def gluon_resnet50_v1c(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """Constructs a ResNet-50 model.
     """
     default_cfg = default_cfgs['gluon_resnet50_v1c']
-    model = GluonResNet(BottleneckGl, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=32, deep_stem=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=32, deep_stem=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -435,8 +133,8 @@ def gluon_resnet101_v1c(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-101 model.
     """
     default_cfg = default_cfgs['gluon_resnet101_v1c']
-    model = GluonResNet(BottleneckGl, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=32, deep_stem=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=32, deep_stem=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -448,8 +146,8 @@ def gluon_resnet152_v1c(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-152 model.
     """
     default_cfg = default_cfgs['gluon_resnet152_v1c']
-    model = GluonResNet(BottleneckGl, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=32, deep_stem=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=32, deep_stem=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -461,8 +159,8 @@ def gluon_resnet50_v1d(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """Constructs a ResNet-50 model.
     """
     default_cfg = default_cfgs['gluon_resnet50_v1d']
-    model = GluonResNet(BottleneckGl, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=32, deep_stem=True, avg_down=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=32, deep_stem=True, avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -474,8 +172,8 @@ def gluon_resnet101_v1d(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-101 model.
     """
     default_cfg = default_cfgs['gluon_resnet101_v1d']
-    model = GluonResNet(BottleneckGl, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=32, deep_stem=True, avg_down=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=32, deep_stem=True, avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -487,8 +185,8 @@ def gluon_resnet152_v1d(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-152 model.
     """
     default_cfg = default_cfgs['gluon_resnet152_v1d']
-    model = GluonResNet(BottleneckGl, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=32, deep_stem=True, avg_down=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=32, deep_stem=True, avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -500,8 +198,8 @@ def gluon_resnet50_v1e(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """Constructs a ResNet-50-V1e model. No pretrained weights for any 'e' variants
     """
     default_cfg = default_cfgs['gluon_resnet50_v1e']
-    model = GluonResNet(BottleneckGl, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=64, deep_stem=True, avg_down=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=64, deep_stem=True, avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     #if pretrained:
     #    load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -513,8 +211,8 @@ def gluon_resnet101_v1e(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-101 model.
     """
     default_cfg = default_cfgs['gluon_resnet101_v1e']
-    model = GluonResNet(BottleneckGl, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=64, deep_stem=True, avg_down=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=64, deep_stem=True, avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -526,8 +224,8 @@ def gluon_resnet152_v1e(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-152 model.
     """
     default_cfg = default_cfgs['gluon_resnet152_v1e']
-    model = GluonResNet(BottleneckGl, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=64, deep_stem=True, avg_down=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=64, deep_stem=True, avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -539,8 +237,8 @@ def gluon_resnet50_v1s(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """Constructs a ResNet-50 model.
     """
     default_cfg = default_cfgs['gluon_resnet50_v1s']
-    model = GluonResNet(BottleneckGl, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=64, deep_stem=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=64, deep_stem=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -552,8 +250,8 @@ def gluon_resnet101_v1s(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-101 model.
     """
     default_cfg = default_cfgs['gluon_resnet101_v1s']
-    model = GluonResNet(BottleneckGl, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=64, deep_stem=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=64, deep_stem=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -565,8 +263,8 @@ def gluon_resnet152_v1s(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """Constructs a ResNet-152 model.
     """
     default_cfg = default_cfgs['gluon_resnet152_v1s']
-    model = GluonResNet(BottleneckGl, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
-                        stem_width=64, deep_stem=True, **kwargs)
+    model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
+                   stem_width=64, deep_stem=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -578,8 +276,8 @@ def gluon_resnext50_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwar
     """Constructs a ResNeXt50-32x4d model.
     """
     default_cfg = default_cfgs['gluon_resnext50_32x4d']
-    model = GluonResNet(
-        BottleneckGl, [3, 4, 6, 3], cardinality=32, base_width=4,
+    model = ResNet(
+        Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -592,8 +290,8 @@ def gluon_resnext101_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwa
     """Constructs a ResNeXt-101 model.
     """
     default_cfg = default_cfgs['gluon_resnext101_32x4d']
-    model = GluonResNet(
-        BottleneckGl, [3, 4, 23, 3], cardinality=32, base_width=4,
+    model = ResNet(
+        Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -606,8 +304,8 @@ def gluon_resnext101_64x4d(pretrained=False, num_classes=1000, in_chans=3, **kwa
     """Constructs a ResNeXt-101 model.
     """
     default_cfg = default_cfgs['gluon_resnext101_64x4d']
-    model = GluonResNet(
-        BottleneckGl, [3, 4, 23, 3], cardinality=64, base_width=4,
+    model = ResNet(
+        Bottleneck, [3, 4, 23, 3], cardinality=64, base_width=4,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -620,8 +318,8 @@ def gluon_seresnext50_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kw
     """Constructs a SEResNeXt50-32x4d model.
     """
     default_cfg = default_cfgs['gluon_seresnext50_32x4d']
-    model = GluonResNet(
-        BottleneckGl, [3, 4, 6, 3], cardinality=32, base_width=4, use_se=True,
+    model = ResNet(
+        Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4, use_se=True,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -634,8 +332,8 @@ def gluon_seresnext101_32x4d(pretrained=False, num_classes=1000, in_chans=3, **k
     """Constructs a SEResNeXt-101-32x4d model.
     """
     default_cfg = default_cfgs['gluon_seresnext101_32x4d']
-    model = GluonResNet(
-        BottleneckGl, [3, 4, 23, 3], cardinality=32, base_width=4, use_se=True,
+    model = ResNet(
+        Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4, use_se=True,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -648,8 +346,8 @@ def gluon_seresnext101_64x4d(pretrained=False, num_classes=1000, in_chans=3, **k
     """Constructs a SEResNeXt-101-64x4d model.
     """
     default_cfg = default_cfgs['gluon_seresnext101_64x4d']
-    model = GluonResNet(
-        BottleneckGl, [3, 4, 23, 3], cardinality=64, base_width=4, use_se=True,
+    model = ResNet(
+        Bottleneck, [3, 4, 23, 3], cardinality=64, base_width=4, use_se=True,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -662,8 +360,8 @@ def gluon_senet154(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
     """Constructs an SENet-154 model.
     """
     default_cfg = default_cfgs['gluon_senet154']
-    model = GluonResNet(
-        BottleneckGl, [3, 8, 36, 3], cardinality=64, base_width=4, use_se=True,
+    model = ResNet(
+        Bottleneck, [3, 8, 36, 3], cardinality=64, base_width=4, use_se=True,
         deep_stem=True, down_kernel_size=3, block_reduce_first=2,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
diff --git a/timm/models/resnet.py b/timm/models/resnet.py
index bedd303d..c7d80dba 100644
--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@@ -103,7 +103,7 @@ class SEModule(nn.Module):
 
     def __init__(self, channels, reduction_channels):
         super(SEModule, self).__init__()
-        #self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
         self.fc1 = nn.Conv2d(
             channels, reduction_channels, kernel_size=1, padding=0, bias=True)
         self.relu = nn.ReLU(inplace=True)
@@ -111,8 +111,7 @@ class SEModule(nn.Module):
             reduction_channels, channels, kernel_size=1, padding=0, bias=True)
 
     def forward(self, x):
-        #x_se = self.avg_pool(x)
-        x_se = x.view(x.size(0), x.size(1), -1).mean(-1).view(x.size(0), x.size(1), 1, 1)
+        x_se = self.avg_pool(x)
         x_se = self.fc1(x_se)
         x_se = self.relu(x_se)
         x_se = self.fc2(x_se)
@@ -287,7 +286,8 @@ class ResNet(nn.Module):
                  cardinality=1, base_width=64, stem_width=64, deep_stem=False,
                  block_reduce_first=1, down_kernel_size=1, avg_down=False, dilated=False,
                  norm_layer=nn.BatchNorm2d, drop_rate=0.0, global_pool='avg',
-                 zero_init_last_bn=True, block_args=dict()):
+                 zero_init_last_bn=True, block_args=None):
+        block_args = block_args or dict()
         self.num_classes = num_classes
         self.inplanes = stem_width * 2 if deep_stem else 64
         self.cardinality = cardinality
diff --git a/timm/models/senet.py b/timm/models/senet.py
index 7ec1c453..0fbcfb86 100644
--- a/timm/models/senet.py
+++ b/timm/models/senet.py
@@ -68,7 +68,7 @@ class SEModule(nn.Module):
 
     def __init__(self, channels, reduction):
         super(SEModule, self).__init__()
-        #self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
         self.fc1 = nn.Conv2d(
             channels, channels // reduction, kernel_size=1, padding=0)
         self.relu = nn.ReLU(inplace=True)
@@ -78,8 +78,7 @@ class SEModule(nn.Module):
 
     def forward(self, x):
         module_input = x
-        #x = self.avg_pool(x)
-        x = x.view(x.size(0), x.size(1), -1).mean(-1).view(x.size(0), x.size(1), 1, 1)
+        x = self.avg_pool(x)
         x = self.fc1(x)
         x = self.relu(x)
         x = self.fc2(x)

From 902d32fb1693a1e0756e5dc7b088e06ee0ddcb4b Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 28 Nov 2019 16:55:36 -0800
Subject: [PATCH 20/35] Renamed gen_efficientnet.py -> efficientnet.py

---
 timm/models/__init__.py                              | 2 +-
 timm/models/{gen_efficientnet.py => efficientnet.py} | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)
 rename timm/models/{gen_efficientnet.py => efficientnet.py} (99%)

diff --git a/timm/models/__init__.py b/timm/models/__init__.py
index 4ef966ea..7119c4f5 100644
--- a/timm/models/__init__.py
+++ b/timm/models/__init__.py
@@ -7,7 +7,7 @@ from .senet import *
 from .xception import *
 from .nasnet import *
 from .pnasnet import *
-from .gen_efficientnet import *
+from .efficientnet import *
 from .mobilenetv3 import *
 from .inception_v3 import *
 from .gluon_resnet import *
diff --git a/timm/models/gen_efficientnet.py b/timm/models/efficientnet.py
similarity index 99%
rename from timm/models/gen_efficientnet.py
rename to timm/models/efficientnet.py
index fe20ff13..2ed2a14e 100644
--- a/timm/models/gen_efficientnet.py
+++ b/timm/models/efficientnet.py
@@ -1294,6 +1294,3 @@ def tf_mixnet_l(pretrained=False, **kwargs):
         'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
     return model
 
-
-def gen_efficientnet_model_names():
-    return set(_models)

From eccbadca740579857f4a76bd67b3d6f29290913d Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 28 Nov 2019 17:11:53 -0800
Subject: [PATCH 21/35] Update EfficientNet comments

---
 timm/models/efficientnet.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py
index 2ed2a14e..a4f89055 100644
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@@ -1,17 +1,25 @@
-""" Generic EfficientNets
+""" PyTorch EfficientNet Family
+
+An implementation of EfficientNet that covers a variety of related models with efficient architectures:
+
+* EfficientNet (B0-B8 + Tensorflow pretrained AutoAug/RandAug/AdvProp weight ports)
+  - EfficientNet: Rethinking Model Scaling for CNNs - https://arxiv.org/abs/1905.11946
+  - CondConv: Conditionally Parameterized Convolutions for Efficient Inference - https://arxiv.org/abs/1904.04971
+  - Adversarial Examples Improve Image Recognition - https://arxiv.org/abs/1911.09665
 
-A generic class with building blocks to support a variety of models with efficient architectures:
-* EfficientNet (B0-B7)
-* EfficientNet-EdgeTPU
-* EfficientNet-CondConv
 * MixNet (Small, Medium, and Large)
-* MnasNet B1, A1 (SE), Small
-* MobileNet V1, V2, and V3
-* FBNet-C
-* Single-Path NAS Pixel1
-* And likely more...
+  - MixConv: Mixed Depthwise Convolutional Kernels - https://arxiv.org/abs/1907.09595
 
-TODO not all combinations and variations have been tested. Currently working on training hyper-params...
+* MnasNet B1, A1 (SE), Small
+  - MnasNet: Platform-Aware Neural Architecture Search for Mobile - https://arxiv.org/abs/1807.11626
+
+* FBNet-C
+  - FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable NAS - https://arxiv.org/abs/1812.03443
+
+* Single-Path NAS Pixel1
+  - Single-Path NAS: Designing Hardware-Efficient ConvNets - https://arxiv.org/abs/1904.02877
+
+* And likely more...
 
 Hacked together by Ross Wightman
 """
@@ -183,8 +191,6 @@ default_cfgs = {
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_l-6c92e0c8.pth'),
 }
 
-
-
 _DEBUG = False
 
 

From 6ca0828166a1a904a96e1c4bb1b0740d40b6ffa5 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 28 Nov 2019 17:11:53 -0800
Subject: [PATCH 22/35] Update EfficientNet comments, MobileNetV3 non-TF create
 fns, fix factory arg checks, bump PyTorch version req to 1.2

---
 requirements.txt            |  4 ++--
 timm/models/efficientnet.py | 47 +++++++++++++------------------------
 timm/models/factory.py      |  4 ++--
 timm/models/mobilenetv3.py  | 32 ++++++++++++++++++++++++-
 4 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 88ce152f..f05f9812 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-torch>=1.1.0
-torchvision>=0.3.0
+torch>=1.2.0
+torchvision>=0.4.0
 pyyaml
diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py
index a4f89055..9163a023 100644
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@@ -845,8 +845,7 @@ def spnasnet_100(pretrained=False, **kwargs):
 @register_model
 def efficientnet_b0(pretrained=False, **kwargs):
     """ EfficientNet-B0 """
-    # NOTE for train, drop_rate should be 0.2
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
     model = _gen_efficientnet(
         'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
@@ -855,8 +854,7 @@ def efficientnet_b0(pretrained=False, **kwargs):
 @register_model
 def efficientnet_b1(pretrained=False, **kwargs):
     """ EfficientNet-B1 """
-    # NOTE for train, drop_rate should be 0.2
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
     model = _gen_efficientnet(
         'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
     return model
@@ -865,8 +863,7 @@ def efficientnet_b1(pretrained=False, **kwargs):
 @register_model
 def efficientnet_b2(pretrained=False, **kwargs):
     """ EfficientNet-B2 """
-    # NOTE for train, drop_rate should be 0.3
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
     model = _gen_efficientnet(
         'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
     return model
@@ -875,8 +872,7 @@ def efficientnet_b2(pretrained=False, **kwargs):
 @register_model
 def efficientnet_b3(pretrained=False, **kwargs):
     """ EfficientNet-B3 """
-    # NOTE for train, drop_rate should be 0.3
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
     model = _gen_efficientnet(
         'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
     return model
@@ -885,8 +881,7 @@ def efficientnet_b3(pretrained=False, **kwargs):
 @register_model
 def efficientnet_b4(pretrained=False, **kwargs):
     """ EfficientNet-B4 """
-    # NOTE for train, drop_rate should be 0.4
-    #kwargs['drop_connect_rate'] = 0.2  #  set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.4, drop_connect_rate should be 0.2
     model = _gen_efficientnet(
         'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
     return model
@@ -895,8 +890,7 @@ def efficientnet_b4(pretrained=False, **kwargs):
 @register_model
 def efficientnet_b5(pretrained=False, **kwargs):
     """ EfficientNet-B5 """
-    # NOTE for train, drop_rate should be 0.4
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.4, drop_connect_rate should be 0.2
     model = _gen_efficientnet(
         'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
     return model
@@ -905,8 +899,7 @@ def efficientnet_b5(pretrained=False, **kwargs):
 @register_model
 def efficientnet_b6(pretrained=False, **kwargs):
     """ EfficientNet-B6 """
-    # NOTE for train, drop_rate should be 0.5
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
     model = _gen_efficientnet(
         'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
     return model
@@ -915,8 +908,7 @@ def efficientnet_b6(pretrained=False, **kwargs):
 @register_model
 def efficientnet_b7(pretrained=False, **kwargs):
     """ EfficientNet-B7 """
-    # NOTE for train, drop_rate should be 0.5
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
     model = _gen_efficientnet(
         'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
     return model
@@ -949,8 +941,7 @@ def efficientnet_el(pretrained=False, **kwargs):
 @register_model
 def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B0 w/ 8 Experts """
-    # NOTE for train, drop_rate should be 0.2
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
     model = _gen_efficientnet_condconv(
         'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
     return model
@@ -959,8 +950,7 @@ def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
 @register_model
 def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B0 w/ 8 Experts """
-    # NOTE for train, drop_rate should be 0.2
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
     model = _gen_efficientnet_condconv(
         'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
         pretrained=pretrained, **kwargs)
@@ -969,8 +959,7 @@ def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
 @register_model
 def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B1 w/ 8 Experts """
-    # NOTE for train, drop_rate should be 0.2
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
     model = _gen_efficientnet_condconv(
         'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
         pretrained=pretrained, **kwargs)
@@ -1008,7 +997,7 @@ def tf_efficientnet_b2(pretrained=False, **kwargs):
 
 
 @register_model
-def tf_efficientnet_b3(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b3(pretrained=False, **kwargs):
     """ EfficientNet-B3. Tensorflow compatible variant """
     kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
@@ -1090,7 +1079,7 @@ def tf_efficientnet_b2_ap(pretrained=False, **kwargs):
 
 
 @register_model
-def tf_efficientnet_b3_ap(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+def tf_efficientnet_b3_ap(pretrained=False, **kwargs):
     """ EfficientNet-B3. Tensorflow compatible variant """
     kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
@@ -1186,8 +1175,7 @@ def tf_efficientnet_el(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
     kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_condconv(
@@ -1198,8 +1186,7 @@ def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
     kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_condconv(
@@ -1210,8 +1197,7 @@ def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
 @register_model
 def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
     """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2
-    #kwargs['drop_connect_rate'] = 0.2  # set when training, TODO add as cmd arg
+    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
     kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
     kwargs['pad_type'] = 'same'
     model = _gen_efficientnet_condconv(
@@ -1262,7 +1248,6 @@ def mixnet_xxl(pretrained=False, **kwargs):
     """Creates a MixNet Double Extra Large model.
     Not a paper spec, experimental def by RW w/ depth scaling.
     """
-    # kwargs['drop_connect_rate'] = 0.2
     model = _gen_mixnet_m(
         'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs)
     return model
diff --git a/timm/models/factory.py b/timm/models/factory.py
index 3c051e75..7b9f7a07 100644
--- a/timm/models/factory.py
+++ b/timm/models/factory.py
@@ -25,8 +25,8 @@ def create_model(
     """
     margs = dict(pretrained=pretrained, num_classes=num_classes, in_chans=in_chans)
 
-    # Only gen_efficientnet models have support for batchnorm params or drop_connect_rate passed as args
-    is_efficientnet = is_model_in_modules(model_name, ['gen_efficientnet'])
+    # Only EfficientNet and MobileNetV3 models have support for batchnorm params or drop_connect_rate passed as args
+    is_efficientnet = is_model_in_modules(model_name, ['efficientnet', 'mobilenetv3'])
     if not is_efficientnet:
         kwargs.pop('bn_tf', None)
         kwargs.pop('bn_momentum', None)
diff --git a/timm/models/mobilenetv3.py b/timm/models/mobilenetv3.py
index 13fd16e6..a89adea4 100644
--- a/timm/models/mobilenetv3.py
+++ b/timm/models/mobilenetv3.py
@@ -35,6 +35,8 @@ def _cfg(url='', **kwargs):
 default_cfgs = {
     'mobilenetv3_large_075': _cfg(url=''),
     'mobilenetv3_large_100': _cfg(url=''),
+    'mobilenetv3_small_075': _cfg(url=''),
+    'mobilenetv3_small_100': _cfg(url=''),
     'mobilenetv3_rw': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth',
         interpolation='bicubic'),
@@ -374,6 +376,35 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
     return model
 
 
+@register_model
+def mobilenetv3_large_075(pretrained=False, **kwargs):
+    """ MobileNet V3-Large, 0.75 channel multiplier """
+    model = _gen_mobilenet_v3('mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_large_100(pretrained=False, **kwargs):
+    """ MobileNet V3-Large, 1.0 channel multiplier """
+    model = _gen_mobilenet_v3('mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_small_075(pretrained=False, **kwargs):
+    """ MobileNet V3-Small, 0.75 channel multiplier """
+    model = _gen_mobilenet_v3('mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def mobilenetv3_small_100(pretrained=False, **kwargs):
+    """ MobileNet V3-Small, 1.0 channel multiplier """
+    # 1.0 is the channel multiplier passed to the shared MobileNetV3 generator
+    model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
 @register_model
 def mobilenetv3_rw(pretrained=False, **kwargs):
     """ MobileNet V3 """
@@ -384,7 +415,6 @@ def mobilenetv3_rw(pretrained=False, **kwargs):
     return model
 
 
-
 @register_model
 def tf_mobilenetv3_large_075(pretrained=False, **kwargs):
     """ MobileNet V3 """

From 7b3c235ccc6dc9e006047ea4f4efe7119c0f85bf Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 28 Nov 2019 17:56:59 -0800
Subject: [PATCH 23/35] Update sotabench with new models

---
 sotabench.py | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/sotabench.py b/sotabench.py
index 5b61a93f..cd25412f 100644
--- a/sotabench.py
+++ b/sotabench.py
@@ -78,7 +78,7 @@ model_list = [
     _entry('mixnet_m', 'MixNet-M', '1907.09595'),
     _entry('mixnet_s', 'MixNet-S', '1907.09595'),
     _entry('mnasnet_100', 'MnasNet-B1', '1807.11626'),
-    _entry('mobilenetv3_100', 'MobileNet V3-Large 1.0', '1905.02244',
+    _entry('mobilenetv3_rw', 'MobileNet V3-Large 1.0', '1905.02244',
            model_desc='Trained in PyTorch with RMSProp, exponential LR decay, and hyper-params matching '
                       'paper as closely as possible.'),
     _entry('resnet18', 'ResNet-18', '1812.01187'),
@@ -114,6 +114,30 @@ model_list = [
            model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_efficientnet_b7', 'EfficientNet-B7 (RandAugment)', '1905.11946', batch_size=BATCH_SIZE//8,
            model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b0_ap', 'EfficientNet-B0 (AdvProp)', '1911.09665',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b1_ap', 'EfficientNet-B1 (AdvProp)', '1911.09665',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b2_ap', 'EfficientNet-B2 (AdvProp)', '1911.09665',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b3_ap', 'EfficientNet-B3 (AdvProp)', '1911.09665', batch_size=BATCH_SIZE // 2,
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b4_ap', 'EfficientNet-B4 (AdvProp)', '1911.09665', batch_size=BATCH_SIZE // 2,
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b5_ap', 'EfficientNet-B5 (AdvProp)', '1911.09665', batch_size=BATCH_SIZE // 4,
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b6_ap', 'EfficientNet-B6 (AdvProp)', '1911.09665', batch_size=BATCH_SIZE // 8,
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b7_ap', 'EfficientNet-B7 (AdvProp)', '1911.09665', batch_size=BATCH_SIZE // 8,
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_b8_ap', 'EfficientNet-B8 (AdvProp)', '1911.09665', batch_size=BATCH_SIZE // 8,
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_cc_b0_4e', 'EfficientNet-CondConv-B0 4 experts', '1904.04971',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_cc_b0_8e', 'EfficientNet-CondConv-B0 8 experts', '1904.04971',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_efficientnet_cc_b1_8e', 'EfficientNet-CondConv-B1 8 experts', '1904.04971',
+           model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_efficientnet_es', 'EfficientNet-EdgeTPU-S', '1905.11946',
            model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_efficientnet_em', 'EfficientNet-EdgeTPU-M', '1905.11946',
@@ -124,6 +148,18 @@ model_list = [
     _entry('tf_mixnet_l', 'MixNet-L', '1907.09595', model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_mixnet_m', 'MixNet-M', '1907.09595', model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_mixnet_s', 'MixNet-S', '1907.09595', model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_mobilenetv3_large_100', 'MobileNet V3-Large 1.0', '1905.02244',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_mobilenetv3_large_075', 'MobileNet V3-Large 0.75', '1905.02244',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_mobilenetv3_large_minimal_100', 'MobileNet V3-Large Minimal 1.0', '1905.02244',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_mobilenetv3_small_100', 'MobileNet V3-Small 1.0', '1905.02244',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_mobilenetv3_small_075', 'MobileNet V3-Small 0.75', '1905.02244',
+           model_desc='Ported from official Google AI Tensorflow weights'),
+    _entry('tf_mobilenetv3_small_minimal_100', 'MobileNet V3-Small Minimal 1.0', '1905.02244',
+           model_desc='Ported from official Google AI Tensorflow weights'),
 
     ## Cadene ported weights (to remove if Cadene adds sotabench)
     _entry('inception_resnet_v2', 'Inception ResNet V2', '1602.07261'),

From 3bef524f9c45a713eabce808124c9fc5ac0971a0 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Fri, 29 Nov 2019 17:56:36 -0800
Subject: [PATCH 24/35] Finish with HRNet, weights and models updated. Improve
 consistency in model classifier/global pool treatment.

---
 clean_checkpoint.py                        |  18 +-
 results/results-inv2-matched-frequency.csv | 246 +++---
 sotabench.py                               |  11 +
 timm/models/densenet.py                    |  29 +-
 timm/models/dla.py                         |  16 +-
 timm/models/dpn.py                         |  23 +-
 timm/models/efficientnet.py                |  28 +-
 timm/models/efficientnet_builder.py        |  18 +-
 timm/models/gluon_xception.py              |  40 +-
 timm/models/hrnet.py                       | 901 ++++++++++-----------
 timm/models/inception_resnet_v2.py         |  24 +-
 timm/models/inception_v4.py                |  22 +-
 timm/models/mobilenetv3.py                 |  26 +-
 timm/models/nasnet.py                      |  22 +-
 timm/models/pnasnet.py                     |  10 +-
 timm/models/resnet.py                      |  14 +-
 timm/models/senet.py                       |  13 +-
 timm/models/test_time_pool.py              |   2 +-
 timm/models/xception.py                    |  25 +-
 19 files changed, 724 insertions(+), 764 deletions(-)

diff --git a/clean_checkpoint.py b/clean_checkpoint.py
index d51e0d96..b088aa8f 100644
--- a/clean_checkpoint.py
+++ b/clean_checkpoint.py
@@ -8,12 +8,15 @@ from collections import OrderedDict
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation')
 parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
                     help='path to latest checkpoint (default: none)')
-parser.add_argument('--output', default='./cleaned.pth', type=str, metavar='PATH',
+parser.add_argument('--output', default='', type=str, metavar='PATH',
                     help='output path')
 parser.add_argument('--use-ema', dest='use_ema', action='store_true',
                     help='use ema version of weights if present')
 
 
+_TEMP_NAME = './_checkpoint.pth'
+
+
 def main():
     args = parser.parse_args()
 
@@ -40,13 +43,18 @@ def main():
             new_state_dict[name] = v
         print("=> Loaded state_dict from '{}'".format(args.checkpoint))
 
-        torch.save(new_state_dict, args.output)
-        with open(args.output, 'rb') as f:
+        torch.save(new_state_dict, _TEMP_NAME)
+        with open(_TEMP_NAME, 'rb') as f:
             sha_hash = hashlib.sha256(f.read()).hexdigest()
 
-        checkpoint_base = os.path.splitext(args.checkpoint)[0]
+        if args.output:
+            checkpoint_root, checkpoint_base = os.path.split(args.output)
+            checkpoint_base = os.path.splitext(checkpoint_base)[0]
+        else:
+            checkpoint_root = ''
+            checkpoint_base = os.path.splitext(args.checkpoint)[0]
         final_filename = '-'.join([checkpoint_base, sha_hash[:8]]) + '.pth'
-        shutil.move(args.output, final_filename)
+        shutil.move(_TEMP_NAME, os.path.join(checkpoint_root, final_filename))
         print("=> Saved state_dict to '{}, SHA256: {}'".format(final_filename, sha_hash))
     else:
         print("Error: Checkpoint ({}) doesn't exist".format(args.checkpoint))
diff --git a/results/results-inv2-matched-frequency.csv b/results/results-inv2-matched-frequency.csv
index 4b42fdd9..3f791d6e 100644
--- a/results/results-inv2-matched-frequency.csv
+++ b/results/results-inv2-matched-frequency.csv
@@ -1,97 +1,155 @@
 model,top1,top1_err,top5,top5_err,param_count,img_size,cropt_pct,interpolation
-resnet18,57.18,42.82,80.19,19.81,11.69,224,0.875,bilinear
-gluon_resnet18_v1b,58.32,41.68,80.96,19.04,11.69,224,0.875,bicubic
-seresnet18,59.81,40.19,81.68,18.32,11.78,224,0.875,bicubic
-tv_resnet34,61.2,38.8,82.72,17.28,21.8,224,0.875,bilinear
-spnasnet_100,61.21,38.79,82.77,17.23,4.42,224,0.875,bilinear
-mnasnet_100,61.91,38.09,83.71,16.29,4.38,224,0.875,bicubic
-fbnetc_100,62.43,37.57,83.39,16.61,5.57,224,0.875,bilinear
-gluon_resnet34_v1b,62.56,37.44,84,16,21.8,224,0.875,bicubic
-resnet34,62.82,37.18,84.12,15.88,21.8,224,0.875,bilinear
-seresnet34,62.89,37.11,84.22,15.78,21.96,224,0.875,bilinear
-densenet121,62.94,37.06,84.26,15.74,7.98,224,0.875,bicubic
-semnasnet_100,63.12,36.88,84.53,15.47,3.89,224,0.875,bicubic
-mobilenetv3_100,63.23,36.77,84.52,15.48,5.48,224,0.875,bicubic
-tv_resnet50,63.33,36.67,84.65,15.35,25.56,224,0.875,bilinear
-mixnet_s,63.38,36.62,84.71,15.29,4.13,224,0.875,bicubic
-resnet26,63.45,36.55,84.27,15.73,16,224,0.875,bicubic
-tf_mixnet_s,63.59,36.41,84.27,15.73,4.13,224,0.875,bicubic
-dpn68,64.22,35.78,85.18,14.82,12.61,224,0.875,bicubic
-tf_mixnet_m,64.27,35.73,85.09,14.91,5.01,224,0.875,bicubic
-tf_efficientnet_b0,64.29,35.71,85.25,14.75,5.29,224,0.875,bicubic
-efficientnet_b0,64.58,35.42,85.89,14.11,5.29,224,0.875,bicubic
-resnet26d,64.63,35.37,85.12,14.88,16.01,224,0.875,bicubic
-mixnet_m,64.69,35.31,85.47,14.53,5.01,224,0.875,bicubic
-densenet169,64.78,35.22,85.25,14.75,14.15,224,0.875,bicubic
-seresnext26_32x4d,65.04,34.96,85.65,14.35,16.79,224,0.875,bicubic
-tf_efficientnet_es,65.24,34.76,85.54,14.46,5.44,224,0.875,bicubic
-densenet201,65.28,34.72,85.67,14.33,20.01,224,0.875,bicubic
-dpn68b,65.6,34.4,85.94,14.06,12.61,224,0.875,bicubic
-resnet101,65.68,34.32,85.98,14.02,44.55,224,0.875,bilinear
-densenet161,65.85,34.15,86.46,13.54,28.68,224,0.875,bicubic
-gluon_resnet50_v1b,66.04,33.96,86.27,13.73,25.56,224,0.875,bicubic
-inception_v3,66.12,33.88,86.34,13.66,27.16,299,0.875,bicubic
-tv_resnext50_32x4d,66.18,33.82,86.04,13.96,25.03,224,0.875,bilinear
-seresnet50,66.24,33.76,86.33,13.67,28.09,224,0.875,bilinear
-tf_inception_v3,66.41,33.59,86.68,13.32,23.83,299,0.875,bicubic
-gluon_resnet50_v1c,66.54,33.46,86.16,13.84,25.58,224,0.875,bicubic
-adv_inception_v3,66.6,33.4,86.56,13.44,23.83,299,0.875,bicubic
-wide_resnet50_2,66.65,33.35,86.81,13.19,68.88,224,0.875,bilinear
-wide_resnet101_2,66.68,33.32,87.04,12.96,126.89,224,0.875,bilinear
-tf_mixnet_l,66.78,33.22,86.46,13.54,7.33,224,0.875,bicubic
-resnet50,66.81,33.19,87,13,25.56,224,0.875,bicubic
-tf_efficientnet_em,66.87,33.13,86.98,13.02,6.9,240,0.882,bicubic
-resnext50_32x4d,66.88,33.12,86.36,13.64,25.03,224,0.875,bicubic
-tf_efficientnet_b1,66.89,33.11,87.04,12.96,7.79,240,0.882,bicubic
-mixnet_l,66.97,33.03,86.94,13.06,7.33,224,0.875,bicubic
-resnet152,67.02,32.98,87.57,12.43,60.19,224,0.875,bilinear
-gluon_resnet50_v1s,67.1,32.9,86.86,13.14,25.68,224,0.875,bicubic
-seresnet101,67.15,32.85,87.05,12.95,49.33,224,0.875,bilinear
-gluon_resnet101_v1b,67.45,32.55,87.23,12.77,44.55,224,0.875,bicubic
-efficientnet_b1,67.55,32.45,87.29,12.71,7.79,240,0.882,bicubic
-seresnet152,67.55,32.45,87.39,12.61,66.82,224,0.875,bilinear
-gluon_resnet101_v1c,67.56,32.44,87.16,12.84,44.57,224,0.875,bicubic
-gluon_inception_v3,67.59,32.41,87.46,12.54,23.83,299,0.875,bicubic
-xception,67.67,32.33,87.57,12.43,22.86,299,0.8975,bicubic
-efficientnet_b2,67.8,32.2,88.2,11.8,9.11,260,0.89,bicubic
-resnext101_32x8d,67.85,32.15,87.48,12.52,88.79,224,0.875,bilinear
-seresnext50_32x4d,67.87,32.13,87.62,12.38,27.56,224,0.875,bilinear
-gluon_resnet50_v1d,67.91,32.09,87.12,12.88,25.58,224,0.875,bicubic
-dpn92,68.01,31.99,87.59,12.41,37.67,224,0.875,bicubic
-tf_efficientnet_el,68.18,31.82,88.35,11.65,10.59,300,0.904,bicubic
-gluon_resnext50_32x4d,68.28,31.72,87.32,12.68,25.03,224,0.875,bicubic
-dpn98,68.58,31.42,87.66,12.34,61.57,224,0.875,bicubic
-gluon_seresnext50_32x4d,68.67,31.33,88.32,11.68,27.56,224,0.875,bicubic
-dpn107,68.71,31.29,88.13,11.87,86.92,224,0.875,bicubic
-gluon_resnet101_v1s,68.72,31.28,87.9,12.1,44.67,224,0.875,bicubic
+ig_resnext101_32x48d,76.87,23.13,93.32,6.68,828.41,224,0.875,bilinear
+ig_resnext101_32x32d,76.84,23.16,93.19,6.81,468.53,224,0.875,bilinear
+tf_efficientnet_b7_ap,76.09,23.91,92.97,7.03,66.35,600,0.949,bicubic
+tf_efficientnet_b8_ap,76.09,23.91,92.73,7.27,87.41,672,0.954,bicubic
+ig_resnext101_32x16d,75.71,24.29,92.9,7.1,194.03,224,0.875,bilinear
+swsl_resnext101_32x8d,75.45,24.55,92.75,7.25,88.79,224,0.875,bilinear
+tf_efficientnet_b6_ap,75.38,24.62,92.44,7.56,43.04,528,0.942,bicubic
+tf_efficientnet_b7,74.72,25.28,92.22,7.78,66.35,600,0.949,bicubic
+tf_efficientnet_b5_ap,74.59,25.41,91.99,8.01,30.39,456,0.934,bicubic
+swsl_resnext101_32x4d,74.15,25.85,91.99,8.01,44.18,224,0.875,bilinear
+swsl_resnext101_32x16d,74.01,25.99,92.17,7.83,194.03,224,0.875,bilinear
+tf_efficientnet_b6,73.9,26.1,91.75,8.25,43.04,528,0.942,bicubic
+ig_resnext101_32x8d,73.66,26.34,92.15,7.85,88.79,224,0.875,bilinear
+tf_efficientnet_b5,73.54,26.46,91.46,8.54,30.39,456,0.934,bicubic
+tf_efficientnet_b4_ap,72.89,27.11,90.98,9.02,19.34,380,0.922,bicubic
+swsl_resnext50_32x4d,72.58,27.42,90.84,9.16,25.03,224,0.875,bilinear
+pnasnet5large,72.37,27.63,90.26,9.74,86.06,331,0.875,bicubic
+nasnetalarge,72.31,27.69,90.51,9.49,88.75,331,0.875,bicubic
+tf_efficientnet_b4,72.28,27.72,90.6,9.4,19.34,380,0.922,bicubic
+swsl_resnet50,71.69,28.31,90.51,9.49,25.56,224,0.875,bilinear
+ssl_resnext101_32x8d,71.49,28.51,90.47,9.53,88.79,224,0.875,bilinear
+ssl_resnext101_32x16d,71.4,28.6,90.55,9.45,194.03,224,0.875,bilinear
+tf_efficientnet_b3_ap,70.92,29.08,89.43,10.57,12.23,300,0.904,bicubic
+tf_efficientnet_b3,70.62,29.38,89.44,10.56,12.23,300,0.904,bicubic
+gluon_senet154,70.6,29.4,88.92,11.08,115.09,224,0.875,bicubic
+ssl_resnext101_32x4d,70.5,29.5,89.76,10.24,44.18,224,0.875,bilinear
+senet154,70.48,29.52,88.99,11.01,115.09,224,0.875,bilinear
+gluon_seresnext101_64x4d,70.44,29.56,89.35,10.65,88.23,224,0.875,bicubic
+gluon_resnet152_v1s,70.32,29.68,88.87,11.13,60.32,224,0.875,bicubic
+inception_resnet_v2,70.12,29.88,88.68,11.32,55.84,299,0.8975,bicubic
+gluon_seresnext101_32x4d,70.01,29.99,88.91,11.09,48.96,224,0.875,bicubic
+gluon_resnet152_v1d,69.95,30.05,88.47,11.53,60.21,224,0.875,bicubic
+gluon_resnext101_64x4d,69.69,30.31,88.26,11.74,83.46,224,0.875,bicubic
+ssl_resnext50_32x4d,69.69,30.31,89.42,10.58,25.03,224,0.875,bilinear
+ens_adv_inception_resnet_v2,69.52,30.48,88.5,11.5,55.84,299,0.8975,bicubic
+inception_v4,69.35,30.65,88.78,11.22,42.68,299,0.875,bicubic
+seresnext101_32x4d,69.34,30.66,88.05,11.95,48.96,224,0.875,bilinear
+gluon_resnet152_v1c,69.13,30.87,87.89,12.11,60.21,224,0.875,bicubic
+mixnet_xl,69,31,88.19,11.81,11.9,224,0.875,bicubic
+gluon_resnet101_v1d,68.99,31.01,88.08,11.92,44.57,224,0.875,bicubic
+gluon_xception65,68.98,31.02,88.32,11.68,39.92,299,0.875,bicubic
+gluon_resnext101_32x4d,68.96,31.04,88.34,11.66,44.18,224,0.875,bicubic
+tf_efficientnet_b2_ap,68.93,31.07,88.34,11.66,9.11,260,0.89,bicubic
+gluon_resnet152_v1b,68.81,31.19,87.71,12.29,60.19,224,0.875,bicubic
+dpn131,68.76,31.24,87.48,12.52,79.25,224,0.875,bicubic
 resnext50d_32x4d,68.75,31.25,88.31,11.69,25.05,224,0.875,bicubic
 tf_efficientnet_b2,68.75,31.25,87.95,12.05,9.11,260,0.89,bicubic
-dpn131,68.76,31.24,87.48,12.52,79.25,224,0.875,bicubic
-gluon_resnet152_v1b,68.81,31.19,87.71,12.29,60.19,224,0.875,bicubic
-gluon_resnext101_32x4d,68.96,31.04,88.34,11.66,44.18,224,0.875,bicubic
-gluon_xception65,68.98,31.02,88.32,11.68,39.92,299,0.875,bicubic
-gluon_resnet101_v1d,68.99,31.01,88.08,11.92,44.57,224,0.875,bicubic
-mixnet_xl,69,31,88.19,11.81,11.9,224,0.875,bicubic
-gluon_resnet152_v1c,69.13,30.87,87.89,12.11,60.21,224,0.875,bicubic
-seresnext101_32x4d,69.34,30.66,88.05,11.95,48.96,224,0.875,bilinear
-inception_v4,69.35,30.65,88.78,11.22,42.68,299,0.875,bicubic
-ens_adv_inception_resnet_v2,69.52,30.48,88.5,11.5,55.84,299,0.8975,bicubic
-gluon_resnext101_64x4d,69.69,30.31,88.26,11.74,83.46,224,0.875,bicubic
-gluon_resnet152_v1d,69.95,30.05,88.47,11.53,60.21,224,0.875,bicubic
-gluon_seresnext101_32x4d,70.01,29.99,88.91,11.09,48.96,224,0.875,bicubic
-inception_resnet_v2,70.12,29.88,88.68,11.32,55.84,299,0.8975,bicubic
-gluon_resnet152_v1s,70.32,29.68,88.87,11.13,60.32,224,0.875,bicubic
-gluon_seresnext101_64x4d,70.44,29.56,89.35,10.65,88.23,224,0.875,bicubic
-senet154,70.48,29.52,88.99,11.01,115.09,224,0.875,bilinear
-gluon_senet154,70.6,29.4,88.92,11.08,115.09,224,0.875,bicubic
-tf_efficientnet_b3,70.62,29.38,89.44,10.56,12.23,300,0.904,bicubic
-tf_efficientnet_b4,72.28,27.72,90.6,9.4,19.34,380,0.922,bicubic
-nasnetalarge,72.31,27.69,90.51,9.49,88.75,331,0.875,bicubic
-pnasnet5large,72.37,27.63,90.26,9.74,86.06,331,0.875,bicubic
-tf_efficientnet_b5,73.37,26.63,91.21,8.79,30.39,456,0.934,bicubic
-ig_resnext101_32x8d,73.66,26.34,92.15,7.85,88.79,224,0.875,bilinear
-tf_efficientnet_b6,73.9,26.1,91.75,8.25,43.04,528,0.942,bicubic
-tf_efficientnet_b7,74.04,25.96,91.86,8.14,66.35,600,0.949,bicubic
-ig_resnext101_32x16d,75.71,24.29,92.9,7.1,194.03,224,0.875,bilinear
-ig_resnext101_32x32d,76.84,23.16,93.19,6.81,468.53,224,0.875,bilinear
-ig_resnext101_32x48d,76.87,23.13,93.32,6.68,828.41,224,0.875,bilinear
+gluon_resnet101_v1s,68.72,31.28,87.9,12.1,44.67,224,0.875,bicubic
+dpn107,68.71,31.29,88.13,11.87,86.92,224,0.875,bicubic
+gluon_seresnext50_32x4d,68.67,31.33,88.32,11.68,27.56,224,0.875,bicubic
+hrnet_w64,68.63,31.37,88.07,11.93,128.06,224,0.875,bilinear
+dpn98,68.58,31.42,87.66,12.34,61.57,224,0.875,bicubic
+ssl_resnet50,68.42,31.58,88.58,11.42,25.56,224,0.875,bilinear
+dla102x2,68.34,31.66,87.87,12.13,41.75,224,0.875,bilinear
+gluon_resnext50_32x4d,68.28,31.72,87.32,12.68,25.03,224,0.875,bicubic
+tf_efficientnet_el,68.18,31.82,88.35,11.65,10.59,300,0.904,bicubic
+dpn92,68.01,31.99,87.59,12.41,37.67,224,0.875,bicubic
+gluon_resnet50_v1d,67.91,32.09,87.12,12.88,25.58,224,0.875,bicubic
+seresnext50_32x4d,67.87,32.13,87.62,12.38,27.56,224,0.875,bilinear
+resnext101_32x8d,67.85,32.15,87.48,12.52,88.79,224,0.875,bilinear
+efficientnet_b2,67.8,32.2,88.2,11.8,9.11,260,0.89,bicubic
+hrnet_w44,67.77,32.23,87.53,12.47,67.06,224,0.875,bilinear
+hrnet_w48,67.77,32.23,87.42,12.58,77.47,224,0.875,bilinear
+xception,67.67,32.33,87.57,12.43,22.86,299,0.8975,bicubic
+dla169,67.61,32.39,87.56,12.44,53.99,224,0.875,bilinear
+gluon_inception_v3,67.59,32.41,87.46,12.54,23.83,299,0.875,bicubic
+hrnet_w40,67.59,32.41,87.13,12.87,57.56,224,0.875,bilinear
+gluon_resnet101_v1c,67.56,32.44,87.16,12.84,44.57,224,0.875,bicubic
+efficientnet_b1,67.55,32.45,87.29,12.71,7.79,240,0.882,bicubic
+seresnet152,67.55,32.45,87.39,12.61,66.82,224,0.875,bilinear
+res2net50_26w_8s,67.53,32.47,87.27,12.73,48.4,224,0.875,bilinear
+tf_efficientnet_b1_ap,67.52,32.48,87.77,12.23,7.79,240,0.882,bicubic
+tf_efficientnet_cc_b1_8e,67.48,32.52,87.31,12.69,39.72,240,0.882,bicubic
+gluon_resnet101_v1b,67.45,32.55,87.23,12.77,44.55,224,0.875,bicubic
+res2net101_26w_4s,67.45,32.55,87.01,12.99,45.21,224,0.875,bilinear
+seresnet101,67.15,32.85,87.05,12.95,49.33,224,0.875,bilinear
+gluon_resnet50_v1s,67.1,32.9,86.86,13.14,25.68,224,0.875,bicubic
+dla60x,67.08,32.92,87.17,12.83,17.65,224,0.875,bilinear
+dla60_res2net,67.03,32.97,87.14,12.86,21.15,224,0.875,bilinear
+resnet152,67.02,32.98,87.57,12.43,60.19,224,0.875,bilinear
+dla102x,67,33,86.77,13.23,26.77,224,0.875,bilinear
+mixnet_l,66.97,33.03,86.94,13.06,7.33,224,0.875,bicubic
+res2net50_26w_6s,66.91,33.09,86.9,13.1,37.05,224,0.875,bilinear
+tf_efficientnet_b1,66.89,33.11,87.04,12.96,7.79,240,0.882,bicubic
+resnext50_32x4d,66.88,33.12,86.36,13.64,25.03,224,0.875,bicubic
+tf_efficientnet_em,66.87,33.13,86.98,13.02,6.9,240,0.882,bicubic
+resnet50,66.81,33.19,87,13,25.56,224,0.875,bicubic
+hrnet_w32,66.79,33.21,87.29,12.71,41.23,224,0.875,bilinear
+tf_mixnet_l,66.78,33.22,86.46,13.54,7.33,224,0.875,bicubic
+hrnet_w30,66.76,33.24,86.79,13.21,37.71,224,0.875,bilinear
+wide_resnet101_2,66.68,33.32,87.04,12.96,126.89,224,0.875,bilinear
+wide_resnet50_2,66.65,33.35,86.81,13.19,68.88,224,0.875,bilinear
+dla60_res2next,66.64,33.36,87.02,12.98,17.33,224,0.875,bilinear
+adv_inception_v3,66.6,33.4,86.56,13.44,23.83,299,0.875,bicubic
+dla102,66.55,33.45,86.91,13.09,33.73,224,0.875,bilinear
+gluon_resnet50_v1c,66.54,33.46,86.16,13.84,25.58,224,0.875,bicubic
+tf_inception_v3,66.42,33.58,86.68,13.32,23.83,299,0.875,bicubic
+seresnet50,66.24,33.76,86.33,13.67,28.09,224,0.875,bilinear
+tf_efficientnet_cc_b0_8e,66.21,33.79,86.22,13.78,24.01,224,0.875,bicubic
+tv_resnext50_32x4d,66.18,33.82,86.04,13.96,25.03,224,0.875,bilinear
+res2net50_26w_4s,66.17,33.83,86.6,13.4,25.7,224,0.875,bilinear
+inception_v3,66.12,33.88,86.34,13.66,27.16,299,0.875,bicubic
+gluon_resnet50_v1b,66.04,33.96,86.27,13.73,25.56,224,0.875,bicubic
+res2net50_14w_8s,66.02,33.98,86.24,13.76,25.06,224,0.875,bilinear
+densenet161,65.85,34.15,86.46,13.54,28.68,224,0.875,bicubic
+res2next50,65.85,34.15,85.83,14.17,24.67,224,0.875,bilinear
+resnet101,65.68,34.32,85.98,14.02,44.55,224,0.875,bilinear
+dpn68b,65.6,34.4,85.94,14.06,12.61,224,0.875,bicubic
+tf_efficientnet_b0_ap,65.49,34.51,85.55,14.45,5.29,224,0.875,bicubic
+res2net50_48w_2s,65.32,34.68,85.96,14.04,25.29,224,0.875,bilinear
+densenet201,65.28,34.72,85.67,14.33,20.01,224,0.875,bicubic
+tf_efficientnet_es,65.24,34.76,85.54,14.46,5.44,224,0.875,bicubic
+dla60,65.22,34.78,85.75,14.25,22.33,224,0.875,bilinear
+tf_efficientnet_cc_b0_4e,65.13,34.87,85.13,14.87,13.31,224,0.875,bicubic
+seresnext26_32x4d,65.04,34.96,85.65,14.35,16.79,224,0.875,bicubic
+hrnet_w18,64.91,35.09,85.75,14.25,21.3,224,0.875,bilinear
+densenet169,64.78,35.22,85.25,14.75,14.15,224,0.875,bicubic
+mixnet_m,64.69,35.31,85.47,14.53,5.01,224,0.875,bicubic
+resnet26d,64.63,35.37,85.12,14.88,16.01,224,0.875,bicubic
+efficientnet_b0,64.58,35.42,85.89,14.11,5.29,224,0.875,bicubic
+tf_efficientnet_b0,64.29,35.71,85.25,14.75,5.29,224,0.875,bicubic
+tf_mixnet_m,64.27,35.73,85.09,14.91,5.01,224,0.875,bicubic
+dpn68,64.22,35.78,85.18,14.82,12.61,224,0.875,bicubic
+tf_mixnet_s,63.59,36.41,84.27,15.73,4.13,224,0.875,bicubic
+resnet26,63.45,36.55,84.27,15.73,16,224,0.875,bicubic
+mixnet_s,63.38,36.62,84.71,15.29,4.13,224,0.875,bicubic
+tv_resnet50,63.33,36.67,84.65,15.35,25.56,224,0.875,bilinear
+mobilenetv3_rw,63.23,36.77,84.52,15.48,5.48,224,0.875,bicubic
+semnasnet_100,63.12,36.88,84.53,15.47,3.89,224,0.875,bicubic
+densenet121,62.94,37.06,84.26,15.74,7.98,224,0.875,bicubic
+seresnet34,62.89,37.11,84.22,15.78,21.96,224,0.875,bilinear
+hrnet_w18_small_v2,62.83,37.17,83.97,16.03,15.6,224,0.875,bilinear
+resnet34,62.82,37.18,84.12,15.88,21.8,224,0.875,bilinear
+swsl_resnet18,62.73,37.27,84.3,15.7,11.69,224,0.875,bilinear
+gluon_resnet34_v1b,62.56,37.44,84,16,21.8,224,0.875,bicubic
+dla34,62.51,37.49,83.92,16.08,15.78,224,0.875,bilinear
+tf_mobilenetv3_large_100,62.47,37.53,83.96,16.04,5.48,224,0.875,bilinear
+fbnetc_100,62.43,37.57,83.39,16.61,5.57,224,0.875,bilinear
+mnasnet_100,61.91,38.09,83.71,16.29,4.38,224,0.875,bicubic
+ssl_resnet18,61.49,38.51,83.33,16.67,11.69,224,0.875,bilinear
+spnasnet_100,61.21,38.79,82.77,17.23,4.42,224,0.875,bilinear
+tv_resnet34,61.2,38.8,82.72,17.28,21.8,224,0.875,bilinear
+tf_mobilenetv3_large_075,60.38,39.62,81.96,18.04,3.99,224,0.875,bilinear
+seresnet18,59.81,40.19,81.68,18.32,11.78,224,0.875,bicubic
+tf_mobilenetv3_large_minimal_100,59.07,40.93,81.14,18.86,3.92,224,0.875,bilinear
+hrnet_w18_small,58.97,41.03,81.34,18.66,13.19,224,0.875,bilinear
+gluon_resnet18_v1b,58.32,41.68,80.96,19.04,11.69,224,0.875,bicubic
+resnet18,57.18,42.82,80.19,19.81,11.69,224,0.875,bilinear
+dla60x_c,56.02,43.98,78.96,21.04,1.34,224,0.875,bilinear
+tf_mobilenetv3_small_100,54.51,45.49,77.08,22.92,2.54,224,0.875,bilinear
+dla46x_c,53.08,46.92,76.84,23.16,1.08,224,0.875,bilinear
+dla46_c,52.2,47.8,75.68,24.32,1.31,224,0.875,bilinear
+tf_mobilenetv3_small_075,52.15,47.85,75.46,24.54,2.04,224,0.875,bilinear
+tf_mobilenetv3_small_minimal_100,49.53,50.47,73.05,26.95,2.04,224,0.875,bilinear
diff --git a/sotabench.py b/sotabench.py
index cd25412f..5f6345f5 100644
--- a/sotabench.py
+++ b/sotabench.py
@@ -294,6 +294,17 @@ model_list = [
     _entry('res2next50', 'Res2NeXt-50', '1904.01169'),
     _entry('dla60_res2net', 'Res2Net-DLA-60', '1904.01169'),
     _entry('dla60_res2next', 'Res2NeXt-DLA-60', '1904.01169'),
+
+    ## HRNet official impl weights
+    _entry('hrnet_w18_small', 'HRNet-W18-C-Small-V1', '1908.07919'),
+    _entry('hrnet_w18_small_v2', 'HRNet-W18-C-Small-V2', '1908.07919'),
+    _entry('hrnet_w18', 'HRNet-W18-C', '1908.07919'),
+    _entry('hrnet_w30', 'HRNet-W30-C', '1908.07919'),
+    _entry('hrnet_w32', 'HRNet-W32-C', '1908.07919'),
+    _entry('hrnet_w40', 'HRNet-W40-C', '1908.07919'),
+    _entry('hrnet_w44', 'HRNet-W44-C', '1908.07919'),
+    _entry('hrnet_w48', 'HRNet-W48-C', '1908.07919'),
+    _entry('hrnet_w64', 'HRNet-W64-C', '1908.07919'),
 ]
 
 for m in model_list:
diff --git a/timm/models/densenet.py b/timm/models/densenet.py
index 1e49f6df..d1ac5857 100644
--- a/timm/models/densenet.py
+++ b/timm/models/densenet.py
@@ -10,7 +10,7 @@ import torch.nn.functional as F
 
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import select_adaptive_pool2d
+from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 import re
 
@@ -88,8 +88,8 @@ class DenseNet(nn.Module):
     def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                  num_init_features=64, bn_size=4, drop_rate=0,
                  num_classes=1000, in_chans=3, global_pool='avg'):
-        self.global_pool = global_pool
         self.num_classes = num_classes
+        self.drop_rate = drop_rate
         super(DenseNet, self).__init__()
 
         # First convolution
@@ -117,32 +117,31 @@ class DenseNet(nn.Module):
         self.features.add_module('norm5', nn.BatchNorm2d(num_features))
 
         # Linear layer
-        self.classifier = nn.Linear(num_features, num_classes)
-
         self.num_features = num_features
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.classifier = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
     def get_classifier(self):
         return self.classifier
 
     def reset_classifier(self, num_classes, global_pool='avg'):
-        self.global_pool = global_pool
         self.num_classes = num_classes
-        del self.classifier
-        if num_classes:
-            self.classifier = nn.Linear(self.num_features, num_classes)
-        else:
-            self.classifier = None
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.classifier = nn.Linear(
+            self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
 
-    def forward_features(self, x, pool=True):
+    def forward_features(self, x):
         x = self.features(x)
         x = F.relu(x, inplace=True)
-        if pool:
-            x = select_adaptive_pool2d(x, self.global_pool)
-            x = x.view(x.size(0), -1)
         return x
 
     def forward(self, x):
-        return self.classifier(self.forward_features(x, pool=True))
+        x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        x = self.classifier(x)
+        return x
 
 
 def _filter_pretrained(state_dict):
diff --git a/timm/models/dla.py b/timm/models/dla.py
index 255a389d..cd560f44 100644
--- a/timm/models/dla.py
+++ b/timm/models/dla.py
@@ -276,8 +276,7 @@ class DLA(nn.Module):
 
         self.num_features = channels[-1]
         self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-        self.fc = nn.Conv2d(self.num_features * self.global_pool.feat_mult(), num_classes,
-                            kernel_size=1, stride=1, padding=0, bias=True)
+        self.fc = nn.Conv2d(self.num_features * self.global_pool.feat_mult(), num_classes, 1, bias=True)
 
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
@@ -302,15 +301,14 @@ class DLA(nn.Module):
         return self.fc
 
     def reset_classifier(self, num_classes, global_pool='avg'):
-        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         self.num_classes = num_classes
-        del self.fc
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         if num_classes:
-            self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
+            self.fc = nn.Conv2d(self.num_features * self.global_pool.feat_mult(), num_classes, 1, bias=True)
         else:
             self.fc = None
 
-    def forward_features(self, x, pool=True):
+    def forward_features(self, x):
         x = self.base_layer(x)
         x = self.level0(x)
         x = self.level1(x)
@@ -318,17 +316,15 @@ class DLA(nn.Module):
         x = self.level3(x)
         x = self.level4(x)
         x = self.level5(x)
-        if pool:
-            x = self.global_pool(x)
         return x
 
     def forward(self, x):
         x = self.forward_features(x)
+        x = self.global_pool(x)
         if self.drop_rate > 0.:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         x = self.fc(x)
-        x = x.flatten(1)
-        return x
+        return x.flatten(1)
 
 
 @register_model
diff --git a/timm/models/dpn.py b/timm/models/dpn.py
index 1496a067..7f46e8e0 100644
--- a/timm/models/dpn.py
+++ b/timm/models/dpn.py
@@ -16,7 +16,7 @@ from collections import OrderedDict
 
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import select_adaptive_pool2d
+from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD
 
 
@@ -160,7 +160,6 @@ class DPN(nn.Module):
         super(DPN, self).__init__()
         self.num_classes = num_classes
         self.drop_rate = drop_rate
-        self.global_pool = global_pool
         self.b = b
         bw_factor = 1 if small else 4
 
@@ -218,32 +217,32 @@ class DPN(nn.Module):
         self.features = nn.Sequential(blocks)
 
         # Using 1x1 conv for the FC layer to allow the extra pooling scheme
-        self.classifier = nn.Conv2d(in_chs, num_classes, kernel_size=1, bias=True)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.classifier = nn.Conv2d(
+            self.num_features * self.global_pool.feat_mult(), num_classes, kernel_size=1, bias=True)
 
     def get_classifier(self):
         return self.classifier
 
     def reset_classifier(self, num_classes, global_pool='avg'):
         self.num_classes = num_classes
-        self.global_pool = global_pool
-        del self.classifier
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         if num_classes:
-            self.classifier = nn.Conv2d(self.num_features, num_classes, kernel_size=1, bias=True)
+            self.classifier = nn.Conv2d(
+                self.num_features * self.global_pool.feat_mult(), num_classes, kernel_size=1, bias=True)
         else:
             self.classifier = None
 
-    def forward_features(self, x, pool=True):
-        x = self.features(x)
-        if pool:
-            x = select_adaptive_pool2d(x, pool_type=self.global_pool)
-        return x
+    def forward_features(self, x):
+        return self.features(x)
 
     def forward(self, x):
         x = self.forward_features(x)
+        x = self.global_pool(x)
         if self.drop_rate > 0.:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         out = self.classifier(x)
-        return out.view(out.size(0), -1)
+        return out.flatten(1)
 
 
 @register_model
diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py
index 9163a023..ac3c244c 100644
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@@ -211,8 +211,7 @@ class EfficientNet(nn.Module):
     def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32,
                  channel_multiplier=1.0, channel_divisor=8, channel_min=None,
                  pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
-                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
-                 global_pool='avg', weight_init='goog'):
+                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
         super(EfficientNet, self).__init__()
         norm_kwargs = norm_kwargs or {}
 
@@ -245,11 +244,7 @@ class EfficientNet(nn.Module):
         # Classifier
         self.classifier = nn.Linear(self.num_features * self.global_pool.feat_mult(), self.num_classes)
 
-        for m in self.modules():
-            if weight_init == 'goog':
-                efficientnet_init_goog(m)
-            else:
-                efficientnet_init_default(m)
+        efficientnet_init_weights(self)
 
     def as_sequential(self):
         layers = [self.conv_stem, self.bn1, self.act1]
@@ -262,14 +257,10 @@ class EfficientNet(nn.Module):
         return self.classifier
 
     def reset_classifier(self, num_classes, global_pool='avg'):
-        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         self.num_classes = num_classes
-        del self.classifier
-        if num_classes:
-            self.classifier = nn.Linear(
-                self.num_features * self.global_pool.feat_mult(), num_classes)
-        else:
-            self.classifier = None
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.classifier = nn.Linear(
+            self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
 
     def forward_features(self, x):
         x = self.conv_stem(x)
@@ -300,7 +291,7 @@ class EfficientNetFeatures(nn.Module):
     def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                  in_chans=3, stem_size=32, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
                  output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
-                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, weight_init='goog'):
+                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None):
         super(EfficientNetFeatures, self).__init__()
         norm_kwargs = norm_kwargs or {}
 
@@ -326,12 +317,7 @@ class EfficientNetFeatures(nn.Module):
         self.feature_info = builder.features  # builder provides info about feature channels for each block
         self._in_chs = builder.in_chs
 
-        for m in self.modules():
-            if weight_init == 'goog':
-                efficientnet_init_goog(m)
-            else:
-                efficientnet_init_default(m)
-
+        efficientnet_init_weights(self)
         if _DEBUG:
             for k, v in self.feature_info.items():
                 print('Feature idx: {}: Name: {}, Channels: {}'.format(k, v['name'], v['num_chs']))
diff --git a/timm/models/efficientnet_builder.py b/timm/models/efficientnet_builder.py
index c2b3a801..db6f54f9 100644
--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@@ -358,9 +358,13 @@ class EfficientNetBuilder:
         return stages
 
 
-def efficientnet_init_goog(m, n=''):
-    # weight init as per Tensorflow Official impl
-    # https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
+def _init_weight_goog(m, n=''):
+    """ Weight initialization as per Tensorflow official implementations.
+
+    Handles layers in EfficientNet, EfficientNet-CondConv, MixNet, MnasNet, MobileNetV3, etc:
+    * https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
+    * https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
+    """
     if isinstance(m, CondConv2d):
         fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
         init_weight_fn = get_condconv_initializer(
@@ -386,7 +390,8 @@ def efficientnet_init_goog(m, n=''):
         m.bias.data.zero_()
 
 
-def efficientnet_init_default(m, n=''):
+def _init_weight_default(m, n=''):
+    """ Basic ResNet (Kaiming) style weight init"""
     if isinstance(m, CondConv2d):
         init_fn = get_condconv_initializer(partial(
             nn.init.kaiming_normal_, mode='fan_out', nonlinearity='relu'), m.num_experts, m.weight_shape)
@@ -400,3 +405,8 @@ def efficientnet_init_default(m, n=''):
         nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='linear')
 
 
+def efficientnet_init_weights(model: nn.Module, init_fn=None):
+    init_fn = init_fn or _init_weight_goog
+    for n, m in model.named_modules():
+        init_fn(m, n)
+
diff --git a/timm/models/gluon_xception.py b/timm/models/gluon_xception.py
index 9393e5ba..5a35d226 100644
--- a/timm/models/gluon_xception.py
+++ b/timm/models/gluon_xception.py
@@ -13,7 +13,7 @@ from collections import OrderedDict
 
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import select_adaptive_pool2d
+from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 
 __all__ = ['Xception65', 'Xception71']
@@ -185,7 +185,6 @@ class Xception65(nn.Module):
         super(Xception65, self).__init__()
         self.num_classes = num_classes
         self.drop_rate = drop_rate
-        self.global_pool = global_pool
         norm_kwargs = norm_kwargs if norm_kwargs is not None else {}
         if output_stride == 32:
             entry_block3_stride = 2
@@ -249,21 +248,18 @@ class Xception65(nn.Module):
             1536, self.num_features, 3, stride=1, dilation=exit_block_dilations[1],
             norm_layer=norm_layer, norm_kwargs=norm_kwargs)
         self.bn5 = norm_layer(num_features=self.num_features, **norm_kwargs)
-        self.fc = nn.Linear(in_features=self.num_features, out_features=num_classes)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
     def get_classifier(self):
         return self.fc
 
     def reset_classifier(self, num_classes, global_pool='avg'):
         self.num_classes = num_classes
-        self.global_pool = global_pool
-        del self.fc
-        if num_classes:
-            self.fc = nn.Linear(self.num_features, num_classes)
-        else:
-            self.fc = None
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
 
-    def forward_features(self, x, pool=True):
+    def forward_features(self, x):
         # Entry flow
         x = self.conv1(x)
         x = self.bn1(x)
@@ -299,14 +295,11 @@ class Xception65(nn.Module):
         x = self.conv5(x)
         x = self.bn5(x)
         x = self.relu(x)
-
-        if pool:
-            x = select_adaptive_pool2d(x, pool_type=self.global_pool)
-            x = x.view(x.size(0), -1)
         return x
 
     def forward(self, x):
         x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
         if self.drop_rate:
             F.dropout(x, self.drop_rate, training=self.training)
         x = self.fc(x)
@@ -322,7 +315,6 @@ class Xception71(nn.Module):
         super(Xception71, self).__init__()
         self.num_classes = num_classes
         self.drop_rate = drop_rate
-        self.global_pool = global_pool
         norm_kwargs = norm_kwargs if norm_kwargs is not None else {}
         if output_stride == 32:
             entry_block3_stride = 2
@@ -393,21 +385,18 @@ class Xception71(nn.Module):
             1536, self.num_features, 3, stride=1, dilation=exit_block_dilations[1],
             norm_layer=norm_layer, norm_kwargs=norm_kwargs)
         self.bn5 = norm_layer(num_features=self.num_features, **norm_kwargs)
-        self.fc = nn.Linear(in_features=self.num_features, out_features=num_classes)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
     def get_classifier(self):
         return self.fc
 
     def reset_classifier(self, num_classes, global_pool='avg'):
         self.num_classes = num_classes
-        self.global_pool = global_pool
-        del self.fc
-        if num_classes:
-            self.fc = nn.Linear(self.num_features, num_classes)
-        else:
-            self.fc = None
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
 
-    def forward_features(self, x, pool=True):
+    def forward_features(self, x):
         # Entry flow
         x = self.conv1(x)
         x = self.bn1(x)
@@ -443,14 +432,11 @@ class Xception71(nn.Module):
         x = self.conv5(x)
         x = self.bn5(x)
         x = self.relu(x)
-
-        if pool:
-            x = select_adaptive_pool2d(x, pool_type=self.global_pool)
-            x = x.view(x.size(0), -1)
         return x
 
     def forward(self, x):
         x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
         if self.drop_rate:
-            F.dropout(x, self.drop_rate, training=self.training)
+            x = F.dropout(x, self.drop_rate, training=self.training)  # not in-place: keep the result
         x = self.fc(x)
diff --git a/timm/models/hrnet.py b/timm/models/hrnet.py
index 59ded4ab..99a2bd91 100644
--- a/timm/models/hrnet.py
+++ b/timm/models/hrnet.py
@@ -17,20 +17,18 @@ import os
 import logging
 import functools
 
-import numpy as np
-
 import torch
 import torch.nn as nn
 import torch._utils
 import torch.nn.functional as F
 
+from .resnet import BasicBlock, Bottleneck  # leveraging ResNet blocks w/ additional features like SE
 from .registry import register_model
 from .helpers import load_pretrained
-from .helpers import load_pretrained
 from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 
-BN_MOMENTUM = 0.1
+_BN_MOMENTUM = 0.1
 logger = logging.getLogger(__name__)
 
 
@@ -46,380 +44,353 @@ def _cfg(url='', **kwargs):
 
 
 default_cfgs = {
-    'hrnet_w18_small': _cfg(url=''),
-    'hrnet_w18_small_v2': _cfg(url=''),
-    'hrnet_w18': _cfg(url=''),
-    'hrnet_w30': _cfg(url=''),
-    'hrnet_w32': _cfg(url=''),
-    'hrnet_w40': _cfg(url=''),
-    'hrnet_w44': _cfg(url=''),
-    'hrnet_w48': _cfg(url=''),
+    'hrnet_w18_small': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnet_w18_small_v1-f460c6bc.pth'),
+    'hrnet_w18_small_v2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnet_w18_small_v2-4c50a8cb.pth'),
+    'hrnet_w18': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w18-8cb57bb9.pth'),
+    'hrnet_w30': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w30-8d7f8dab.pth'),
+    'hrnet_w32': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w32-90d8c5fb.pth'),
+    'hrnet_w40': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w40-7cd397a4.pth'),
+    'hrnet_w44': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w44-c9ac8c18.pth'),
+    'hrnet_w48': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w48-abd2e6ab.pth'),
+    'hrnet_w64': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w64-b47cc881.pth'),
 }
 
-cfg_cls_hrnet_w18_small = dict(
-    STAGE1=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=1,
-        BLOCK='BOTTLENECK',
-        NUM_BLOCKS=(1,),
-        NUM_CHANNELS=(32,),
-        FUSE_METHOD='SUM',
+cfg_cls = dict(
+    hrnet_w18_small=dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(1,),
+            NUM_CHANNELS=(32,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(2, 2),
+            NUM_CHANNELS=(16, 32),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(2, 2, 2),
+            NUM_CHANNELS=(16, 32, 64),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(2, 2, 2, 2),
+            NUM_CHANNELS=(16, 32, 64, 128),
+            FUSE_METHOD='SUM',
+        ),
     ),
-    STAGE2=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=2,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(2, 2),
-        NUM_CHANNELS=(16, 32),
-        FUSE_METHOD='SUM'
+
+    hrnet_w18_small_v2 = dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(2,),
+            NUM_CHANNELS=(64,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(2, 2),
+            NUM_CHANNELS=(18, 36),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=3,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(2, 2, 2),
+            NUM_CHANNELS=(18, 36, 72),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=2,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(2, 2, 2, 2),
+            NUM_CHANNELS=(18, 36, 72, 144),
+            FUSE_METHOD='SUM',
+        ),
     ),
-    STAGE3=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=3,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(2, 2, 2),
-        NUM_CHANNELS=(16, 32, 64),
-        FUSE_METHOD='SUM'
+
+    hrnet_w18 = dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(4,),
+            NUM_CHANNELS=(64,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4),
+            NUM_CHANNELS=(18, 36),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=4,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4),
+            NUM_CHANNELS=(18, 36, 72),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=3,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4, 4),
+            NUM_CHANNELS=(18, 36, 72, 144),
+            FUSE_METHOD='SUM',
+        ),
     ),
-    STAGE4=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=4,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(2, 2, 2, 2),
-        NUM_CHANNELS=(16, 32, 64, 128),
-        FUSE_METHOD='SUM',
+
+    hrnet_w30 = dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(4,),
+            NUM_CHANNELS=(64,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4),
+            NUM_CHANNELS=(30, 60),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=4,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4),
+            NUM_CHANNELS=(30, 60, 120),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=3,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4, 4),
+            NUM_CHANNELS=(30, 60, 120, 240),
+            FUSE_METHOD='SUM',
+        ),
     ),
+
+    hrnet_w32 = dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(4,),
+            NUM_CHANNELS=(64,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4),
+            NUM_CHANNELS=(32, 64),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=4,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4),
+            NUM_CHANNELS=(32, 64, 128),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=3,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4, 4),
+            NUM_CHANNELS=(32, 64, 128, 256),
+            FUSE_METHOD='SUM',
+        ),
+    ),
+
+    hrnet_w40 = dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(4,),
+            NUM_CHANNELS=(64,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4),
+            NUM_CHANNELS=(40, 80),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=4,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4),
+            NUM_CHANNELS=(40, 80, 160),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=3,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4, 4),
+            NUM_CHANNELS=(40, 80, 160, 320),
+            FUSE_METHOD='SUM',
+        ),
+    ),
+
+    hrnet_w44 = dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(4,),
+            NUM_CHANNELS=(64,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4),
+            NUM_CHANNELS=(44, 88),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=4,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4),
+            NUM_CHANNELS=(44, 88, 176),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=3,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4, 4),
+            NUM_CHANNELS=(44, 88, 176, 352),
+            FUSE_METHOD='SUM',
+        ),
+    ),
+
+    hrnet_w48 = dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(4,),
+            NUM_CHANNELS=(64,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4),
+            NUM_CHANNELS=(48, 96),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=4,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4),
+            NUM_CHANNELS=(48, 96, 192),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=3,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4, 4),
+            NUM_CHANNELS=(48, 96, 192, 384),
+            FUSE_METHOD='SUM',
+        ),
+    ),
+
+    hrnet_w64 = dict(
+        STEM_WIDTH=64,
+        STAGE1=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=1,
+            BLOCK='BOTTLENECK',
+            NUM_BLOCKS=(4,),
+            NUM_CHANNELS=(64,),
+            FUSE_METHOD='SUM',
+        ),
+        STAGE2=dict(
+            NUM_MODULES=1,
+            NUM_BRANCHES=2,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4),
+            NUM_CHANNELS=(64, 128),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE3=dict(
+            NUM_MODULES=4,
+            NUM_BRANCHES=3,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4),
+            NUM_CHANNELS=(64, 128, 256),
+            FUSE_METHOD='SUM'
+        ),
+        STAGE4=dict(
+            NUM_MODULES=3,
+            NUM_BRANCHES=4,
+            BLOCK='BASIC',
+            NUM_BLOCKS=(4, 4, 4, 4),
+            NUM_CHANNELS=(64, 128, 256, 512),
+            FUSE_METHOD='SUM',
+        ),
+    )
 )
 
 
-cfg_cls_hrnet_w18_small_v2 = dict(
-    STAGE1=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=1,
-        BLOCK='BOTTLENECK',
-        NUM_BLOCKS=(2,),
-        NUM_CHANNELS=(64,),
-        FUSE_METHOD='SUM',
-    ),
-    STAGE2=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=2,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(2, 2),
-        NUM_CHANNELS=(18, 36),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE3=dict(
-        NUM_MODULES=3,
-        NUM_BRANCHES=3,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(2, 2, 2),
-        NUM_CHANNELS=(18, 36, 72),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE4=dict(
-        NUM_MODULES=2,
-        NUM_BRANCHES=4,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(2, 2, 2, 2),
-        NUM_CHANNELS=(18, 36, 72, 144),
-        FUSE_METHOD='SUM',
-    ),
-)
-
-cfg_cls_hrnet_w18 = dict(
-    STAGE1=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=1,
-        BLOCK='BOTTLENECK',
-        NUM_BLOCKS=(4,),
-        NUM_CHANNELS=(64,),
-        FUSE_METHOD='SUM',
-    ),
-    STAGE2=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=2,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4),
-        NUM_CHANNELS=(18, 36),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE3=dict(
-        NUM_MODULES=4,
-        NUM_BRANCHES=3,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4),
-        NUM_CHANNELS=(18, 36, 72),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE4=dict(
-        NUM_MODULES=3,
-        NUM_BRANCHES=4,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4, 4),
-        NUM_CHANNELS=(18, 36, 72, 144),
-        FUSE_METHOD='SUM',
-    ),
-)
-
-
-cfg_cls_hrnet_w30 = dict(
-    STAGE1=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=1,
-        BLOCK='BOTTLENECK',
-        NUM_BLOCKS=(4,),
-        NUM_CHANNELS=(64,),
-        FUSE_METHOD='SUM',
-    ),
-    STAGE2=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=2,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4),
-        NUM_CHANNELS=(30, 60),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE3=dict(
-        NUM_MODULES=4,
-        NUM_BRANCHES=3,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4),
-        NUM_CHANNELS=(30, 60, 120),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE4=dict(
-        NUM_MODULES=3,
-        NUM_BRANCHES=4,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4, 4),
-        NUM_CHANNELS=(30, 60, 120, 240),
-        FUSE_METHOD='SUM',
-    ),
-)
-
-
-cfg_cls_hrnet_w32 = dict(
-    STAGE1=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=1,
-        BLOCK='BOTTLENECK',
-        NUM_BLOCKS=(4,),
-        NUM_CHANNELS=(64,),
-        FUSE_METHOD='SUM',
-    ),
-    STAGE2=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=2,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4),
-        NUM_CHANNELS=(32, 64),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE3=dict(
-        NUM_MODULES=4,
-        NUM_BRANCHES=3,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4),
-        NUM_CHANNELS=(32, 64, 128),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE4=dict(
-        NUM_MODULES=3,
-        NUM_BRANCHES=4,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4, 4),
-        NUM_CHANNELS=(32, 64, 128, 256),
-        FUSE_METHOD='SUM',
-    ),
-)
-
-cfg_cls_hrnet_w40 = dict(
-    STAGE1=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=1,
-        BLOCK='BOTTLENECK',
-        NUM_BLOCKS=(4,),
-        NUM_CHANNELS=(64,),
-        FUSE_METHOD='SUM',
-    ),
-    STAGE2=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=2,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4),
-        NUM_CHANNELS=(40, 80),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE3=dict(
-        NUM_MODULES=4,
-        NUM_BRANCHES=3,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4),
-        NUM_CHANNELS=(40, 80, 160),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE4=dict(
-        NUM_MODULES=3,
-        NUM_BRANCHES=4,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4, 4),
-        NUM_CHANNELS=(40, 80, 160, 320),
-        FUSE_METHOD='SUM',
-    ),
-)
-
-
-cfg_cls_hrnet_w44 = dict(
-    STAGE1=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=1,
-        BLOCK='BOTTLENECK',
-        NUM_BLOCKS=(4,),
-        NUM_CHANNELS=(64,),
-        FUSE_METHOD='SUM',
-    ),
-    STAGE2=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=2,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4),
-        NUM_CHANNELS=(44, 88),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE3=dict(
-        NUM_MODULES=4,
-        NUM_BRANCHES=3,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4),
-        NUM_CHANNELS=(44, 88, 176),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE4=dict(
-        NUM_MODULES=3,
-        NUM_BRANCHES=4,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4, 4),
-        NUM_CHANNELS=(44, 88, 176, 352),
-        FUSE_METHOD='SUM',
-    ),
-)
-
-
-cfg_cls_hrnet_w48 = dict(
-    STAGE1=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=1,
-        BLOCK='BOTTLENECK',
-        NUM_BLOCKS=(4,),
-        NUM_CHANNELS=(64,),
-        FUSE_METHOD='SUM',
-    ),
-    STAGE2=dict(
-        NUM_MODULES=1,
-        NUM_BRANCHES=2,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4),
-        NUM_CHANNELS=(48, 96),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE3=dict(
-        NUM_MODULES=4,
-        NUM_BRANCHES=3,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4),
-        NUM_CHANNELS=(48, 96, 192),
-        FUSE_METHOD='SUM'
-    ),
-    STAGE4=dict(
-        NUM_MODULES=3,
-        NUM_BRANCHES=4,
-        BLOCK='BASIC',
-        NUM_BLOCKS=(4, 4, 4, 4),
-        NUM_CHANNELS=(48, 96, 192, 384),
-        FUSE_METHOD='SUM',
-    ),
-)
-
-
-def conv3x3(in_planes, out_planes, stride=1):
-    """3x3 convolution with padding"""
-    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
-
-
-class BasicBlock(nn.Module):
-    expansion = 1
-
-    def __init__(self, inplanes, planes, stride=1, downsample=None):
-        super(BasicBlock, self).__init__()
-        self.conv1 = conv3x3(inplanes, planes, stride)
-        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
-        self.relu = nn.ReLU(inplace=True)
-        self.conv2 = conv3x3(planes, planes)
-        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
-        self.downsample = downsample
-        self.stride = stride
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-
-        if self.downsample is not None:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-
-        return out
-
-
-class Bottleneck(nn.Module):
-    expansion = 4
-
-    def __init__(self, inplanes, planes, stride=1, downsample=None):
-        super(Bottleneck, self).__init__()
-        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
-        self.conv2 = nn.Conv2d(
-            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
-        self.conv3 = nn.Conv2d(
-            planes, planes * self.expansion, kernel_size=1, bias=False)
-        self.bn3 = nn.BatchNorm2d(
-            planes * self.expansion, momentum=BN_MOMENTUM)
-        self.relu = nn.ReLU(inplace=True)
-        self.downsample = downsample
-        self.stride = stride
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-        out = self.relu(out)
-
-        out = self.conv3(out)
-        out = self.bn3(out)
-
-        if self.downsample is not None:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-
-        return out
-
-
 class HighResolutionModule(nn.Module):
     def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
                  num_channels, fuse_method, multi_scale_output=True):
@@ -466,11 +437,10 @@ class HighResolutionModule(nn.Module):
                 nn.Conv2d(
                     self.num_inchannels[branch_index], num_channels[branch_index] * block.expansion,
                     kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM),
+                nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=_BN_MOMENTUM),
             )
 
-        layers = []
-        layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample))
+        layers = [block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample)]
         self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
         for i in range(1, num_blocks[branch_index]):
             layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index]))
@@ -479,7 +449,6 @@ class HighResolutionModule(nn.Module):
 
     def _make_branches(self, num_branches, block, num_blocks, num_channels):
         branches = []
-
         for i in range(num_branches):
             branches.append(self._make_one_branch(i, block, num_blocks, num_channels))
 
@@ -498,7 +467,7 @@ class HighResolutionModule(nn.Module):
                 if j > i:
                     fuse_layer.append(nn.Sequential(
                         nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False),
-                        nn.BatchNorm2d(num_inchannels[i], momentum=BN_MOMENTUM),
+                        nn.BatchNorm2d(num_inchannels[i], momentum=_BN_MOMENTUM),
                         nn.Upsample(scale_factor=2 ** (j - i), mode='nearest')))
                 elif j == i:
                     fuse_layer.append(None)
@@ -509,12 +478,12 @@ class HighResolutionModule(nn.Module):
                             num_outchannels_conv3x3 = num_inchannels[i]
                             conv3x3s.append(nn.Sequential(
                                 nn.Conv2d(num_inchannels[j], num_outchannels_conv3x3, 3, 2, 1, bias=False),
-                                nn.BatchNorm2d(num_outchannels_conv3x3, momentum=BN_MOMENTUM)))
+                                nn.BatchNorm2d(num_outchannels_conv3x3, momentum=_BN_MOMENTUM)))
                         else:
                             num_outchannels_conv3x3 = num_inchannels[j]
                             conv3x3s.append(nn.Sequential(
                                 nn.Conv2d(num_inchannels[j], num_outchannels_conv3x3, 3, 2, 1, bias=False),
-                                nn.BatchNorm2d(num_outchannels_conv3x3, momentum=BN_MOMENTUM),
+                                nn.BatchNorm2d(num_outchannels_conv3x3, momentum=_BN_MOMENTUM),
                                 nn.ReLU(False)))
                     fuse_layer.append(nn.Sequential(*conv3x3s))
             fuse_layers.append(nn.ModuleList(fuse_layer))
@@ -552,13 +521,16 @@ blocks_dict = {
 
 class HighResolutionNet(nn.Module):
 
-    def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg'):
+    def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0.0):
         super(HighResolutionNet, self).__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
 
-        self.conv1 = nn.Conv2d(in_chans, 64, kernel_size=3, stride=2, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
-        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        stem_width = cfg['STEM_WIDTH']
+        self.conv1 = nn.Conv2d(in_chans, stem_width, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(stem_width, momentum=_BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(stem_width, 64, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(64, momentum=_BN_MOMENTUM)
         self.relu = nn.ReLU(inplace=True)
 
         self.stage1_cfg = cfg['STAGE1']
@@ -590,9 +562,10 @@ class HighResolutionNet(nn.Module):
         self.stage4, pre_stage_channels = self._make_stage(self.stage4_cfg, num_channels, multi_scale_output=True)
 
         # Classification Head
+        self.num_features = 2048
         self.incre_modules, self.downsamp_modules, self.final_layer = self._make_head(pre_stage_channels)
-
-        self.classifier = nn.Linear(2048, num_classes)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.classifier = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
         self.init_weights()
 
@@ -616,7 +589,7 @@ class HighResolutionNet(nn.Module):
             downsamp_module = nn.Sequential(
                 nn.Conv2d(
                     in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2, padding=1),
-                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM),
+                nn.BatchNorm2d(out_channels, momentum=_BN_MOMENTUM),
                 nn.ReLU(inplace=True)
             )
             downsamp_modules.append(downsamp_module)
@@ -625,9 +598,9 @@ class HighResolutionNet(nn.Module):
         final_layer = nn.Sequential(
             nn.Conv2d(
                 in_channels=head_channels[3] * head_block.expansion,
-                out_channels=2048, kernel_size=1, stride=1, padding=0
+                out_channels=self.num_features, kernel_size=1, stride=1, padding=0
             ),
-            nn.BatchNorm2d(2048, momentum=BN_MOMENTUM),
+            nn.BatchNorm2d(self.num_features, momentum=_BN_MOMENTUM),
             nn.ReLU(inplace=True)
         )
 
@@ -643,7 +616,7 @@ class HighResolutionNet(nn.Module):
                 if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                     transition_layers.append(nn.Sequential(
                         nn.Conv2d(num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False),
-                        nn.BatchNorm2d(num_channels_cur_layer[i], momentum=BN_MOMENTUM),
+                        nn.BatchNorm2d(num_channels_cur_layer[i], momentum=_BN_MOMENTUM),
                         nn.ReLU(inplace=True)))
                 else:
                     transition_layers.append(None)
@@ -654,7 +627,7 @@ class HighResolutionNet(nn.Module):
                     outchannels = num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels
                     conv3x3s.append(nn.Sequential(
                         nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False),
-                        nn.BatchNorm2d(outchannels, momentum=BN_MOMENTUM),
+                        nn.BatchNorm2d(outchannels, momentum=_BN_MOMENTUM),
                         nn.ReLU(inplace=True)))
                 transition_layers.append(nn.Sequential(*conv3x3s))
 
@@ -665,11 +638,10 @@ class HighResolutionNet(nn.Module):
         if stride != 1 or inplanes != planes * block.expansion:
             downsample = nn.Sequential(
                 nn.Conv2d(inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
+                nn.BatchNorm2d(planes * block.expansion, momentum=_BN_MOMENTUM),
             )
 
-        layers = []
-        layers.append(block(inplanes, planes, stride, downsample))
+        layers = [block(inplanes, planes, stride, downsample)]
         inplanes = planes * block.expansion
         for i in range(1, blocks):
             layers.append(block(inplanes, planes))
@@ -699,8 +671,7 @@ class HighResolutionNet(nn.Module):
 
         return nn.Sequential(*modules), num_inchannels
 
-    def init_weights(self, pretrained='', ):
-        logger.info('=> init weights from normal distribution')
+    def init_weights(self):
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
                 nn.init.kaiming_normal_(
@@ -709,7 +680,16 @@ class HighResolutionNet(nn.Module):
                 nn.init.constant_(m.weight, 1)
                 nn.init.constant_(m.bias, 0)
 
-    def forward(self, x):
+    def get_classifier(self):
+        return self.classifier
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        """Install a fresh pooling layer and classifier; the classifier is None if num_classes is falsy."""
+        self.num_classes = num_classes
+        self.global_pool = pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.classifier = nn.Linear(self.num_features * pool.feat_mult(), num_classes) if num_classes else None
+
+    def forward_features(self, x):
         x = self.conv1(x)
         x = self.bn1(x)
         x = self.relu(x)
@@ -746,124 +726,79 @@ class HighResolutionNet(nn.Module):
         y = self.incre_modules[0](y_list[0])
         for i in range(len(self.downsamp_modules)):
             y = self.incre_modules[i + 1](y_list[i + 1]) + self.downsamp_modules[i](y)
-
         y = self.final_layer(y)
-
-        if torch._C._get_tracing_state():
-            y = y.flatten(start_dim=2).mean(dim=2)
-        else:
-            y = F.avg_pool2d(y, kernel_size=y.size()[2:]).view(y.size(0), -1)
-
-        y = self.classifier(y)
-
         return y
 
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
+        if self.drop_rate > 0.:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        x = self.classifier(x)
+        return x
+
+
+def _create_model(variant, pretrained, model_kwargs):
+    if model_kwargs.pop('features_only', False):
+        assert False, 'Not Implemented'  # TODO
+        load_strict = False
+        model_kwargs.pop('num_classes', 0)
+        model_class = HighResolutionNet
+    else:
+        load_strict = True
+        model_class = HighResolutionNet
+
+    model = model_class(cfg_cls[variant], **model_kwargs)
+    model.default_cfg = default_cfgs[variant]
+    if pretrained:
+        load_pretrained(
+            model,
+            num_classes=model_kwargs.get('num_classes', 0),
+            in_chans=model_kwargs.get('in_chans', 3),
+            strict=load_strict)
+    return model
 
 
 @register_model
 def hrnet_w18_small(pretrained=True, **kwargs):
-    default_cfg = default_cfgs['hrnet_w18_small']
-    model = HighResolutionNet(cfg_cls_hrnet_w18_small, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(
-            model,
-            default_cfg,
-            num_classes=kwargs.get('num_classes', 0),
-            in_chans=kwargs.get('in_chans', 3))
-    return model
+    return _create_model('hrnet_w18_small', pretrained, kwargs)
 
 
 @register_model
 def hrnet_w18_small_v2(pretrained=True, **kwargs):
-    default_cfg = default_cfgs['hrnet_w18_small_v2']
-    model = HighResolutionNet(cfg_cls_hrnet_w18_small_v2, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(
-            model,
-            default_cfg,
-            num_classes=kwargs.get('num_classes', 0),
-            in_chans=kwargs.get('in_chans', 3))
-    return model
+    return _create_model('hrnet_w18_small_v2', pretrained, kwargs)
+
 
 @register_model
 def hrnet_w18(pretrained=True, **kwargs):
-    default_cfg = default_cfgs['hrnet_w18']
-    model = HighResolutionNet(cfg_cls_hrnet_w18, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(
-            model,
-            default_cfg,
-            num_classes=kwargs.get('num_classes', 0),
-            in_chans=kwargs.get('in_chans', 3))
-    return model
+    return _create_model('hrnet_w18', pretrained, kwargs)
 
 
 @register_model
 def hrnet_w30(pretrained=True, **kwargs):
-    default_cfg = default_cfgs['hrnet_w30']
-    model = HighResolutionNet(cfg_cls_hrnet_w30, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(
-            model,
-            default_cfg,
-            num_classes=kwargs.get('num_classes', 0),
-            in_chans=kwargs.get('in_chans', 3))
-    return model
+    return _create_model('hrnet_w30', pretrained, kwargs)
+
 
 @register_model
 def hrnet_w32(pretrained=True, **kwargs):
-    default_cfg = default_cfgs['hrnet_w32']
-    model = HighResolutionNet(cfg_cls_hrnet_w32, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(
-            model,
-            default_cfg,
-            num_classes=kwargs.get('num_classes', 0),
-            in_chans=kwargs.get('in_chans', 3))
-    return model
+    return _create_model('hrnet_w32', pretrained, kwargs)
+
 
 @register_model
 def hrnet_w40(pretrained=True, **kwargs):
-    default_cfg = default_cfgs['hrnet_w40']
-    model = HighResolutionNet(cfg_cls_hrnet_w40, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(
-            model,
-            default_cfg,
-            num_classes=kwargs.get('num_classes', 0),
-            in_chans=kwargs.get('in_chans', 3))
-    return model
+    return _create_model('hrnet_w40', pretrained, kwargs)
 
 
 @register_model
 def hrnet_w44(pretrained=True, **kwargs):
-    default_cfg = default_cfgs['hrnet_w44']
-    model = HighResolutionNet(cfg_cls_hrnet_w44, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(
-            model,
-            default_cfg,
-            num_classes=kwargs.get('num_classes', 0),
-            in_chans=kwargs.get('in_chans', 3))
-    return model
+    return _create_model('hrnet_w44', pretrained, kwargs)
 
 
 @register_model
 def hrnet_w48(pretrained=True, **kwargs):
-    default_cfg = default_cfgs['hrnet_w48']
-    model = HighResolutionNet(cfg_cls_hrnet_w48, **kwargs)
-    model.default_cfg = default_cfg
-    if pretrained:
-        load_pretrained(
-            model,
-            default_cfg,
-            num_classes=kwargs.get('num_classes', 0),
-            in_chans=kwargs.get('in_chans', 3))
-    return model
+    return _create_model('hrnet_w48', pretrained, kwargs)
+
+
+@register_model
+def hrnet_w64(pretrained=True, **kwargs):
+    return _create_model('hrnet_w64', pretrained, kwargs)
diff --git a/timm/models/inception_resnet_v2.py b/timm/models/inception_resnet_v2.py
index fe5679fe..da019075 100644
--- a/timm/models/inception_resnet_v2.py
+++ b/timm/models/inception_resnet_v2.py
@@ -8,7 +8,7 @@ import torch.nn.functional as F
 
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import select_adaptive_pool2d
+from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 
 __all__ = ['InceptionResnetV2']
@@ -226,7 +226,6 @@ class InceptionResnetV2(nn.Module):
     def __init__(self, num_classes=1001, in_chans=3, drop_rate=0., global_pool='avg'):
         super(InceptionResnetV2, self).__init__()
         self.drop_rate = drop_rate
-        self.global_pool = global_pool
         self.num_classes = num_classes
         self.num_features = 1536
 
@@ -287,22 +286,20 @@ class InceptionResnetV2(nn.Module):
         )
         self.block8 = Block8(noReLU=True)
         self.conv2d_7b = BasicConv2d(2080, self.num_features, kernel_size=1, stride=1)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         # NOTE some variants/checkpoints for this model may have 'last_linear' as the name for the FC
-        self.classif = nn.Linear(self.num_features, num_classes)
+        self.classif = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
     def get_classifier(self):
         return self.classif
 
     def reset_classifier(self, num_classes, global_pool='avg'):
-        self.global_pool = global_pool
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         self.num_classes = num_classes
-        del self.classif
-        if num_classes:
-            self.classif = torch.nn.Linear(self.num_features, num_classes)
-        else:
-            self.classif = None
+        self.classif = nn.Linear(
+            self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
 
-    def forward_features(self, x, pool=True):
+    def forward_features(self, x):
         x = self.conv2d_1a(x)
         x = self.conv2d_2a(x)
         x = self.conv2d_2b(x)
@@ -318,14 +315,11 @@ class InceptionResnetV2(nn.Module):
         x = self.repeat_2(x)
         x = self.block8(x)
         x = self.conv2d_7b(x)
-        if pool:
-            x = select_adaptive_pool2d(x, self.global_pool)
-            #x = F.avg_pool2d(x, 8, count_include_pad=False)
-            x = x.view(x.size(0), -1)
         return x
 
     def forward(self, x):
-        x = self.forward_features(x, pool=True)
+        x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
         if self.drop_rate > 0:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         x = self.classif(x)
diff --git a/timm/models/inception_v4.py b/timm/models/inception_v4.py
index e389eb88..8c3dee86 100644
--- a/timm/models/inception_v4.py
+++ b/timm/models/inception_v4.py
@@ -8,7 +8,7 @@ import torch.nn.functional as F
 
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import select_adaptive_pool2d
+from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 
 __all__ = ['InceptionV4']
@@ -244,7 +244,6 @@ class InceptionV4(nn.Module):
     def __init__(self, num_classes=1001, in_chans=3, drop_rate=0., global_pool='avg'):
         super(InceptionV4, self).__init__()
         self.drop_rate = drop_rate
-        self.global_pool = global_pool
         self.num_classes = num_classes
         self.num_features = 1536
 
@@ -272,25 +271,24 @@ class InceptionV4(nn.Module):
             Inception_C(),
             Inception_C(),
         )
-        self.last_linear = nn.Linear(self.num_features, num_classes)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.last_linear = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
     def get_classifier(self):
-        return self.classif
+        return self.last_linear
 
     def reset_classifier(self, num_classes, global_pool='avg'):
-        self.global_pool = global_pool
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         self.num_classes = num_classes
-        self.classif = nn.Linear(self.num_features, num_classes)
+        self.last_linear = nn.Linear(
+            self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
 
-    def forward_features(self, x, pool=True):
-        x = self.features(x)
-        if pool:
-            x = select_adaptive_pool2d(x, self.global_pool)
-            x = x.view(x.size(0), -1)
-        return x
+    def forward_features(self, x):
+        return self.features(x)
 
     def forward(self, x):
         x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
         if self.drop_rate > 0:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         x = self.last_linear(x)
diff --git a/timm/models/mobilenetv3.py b/timm/models/mobilenetv3.py
index a89adea4..a6b67532 100644
--- a/timm/models/mobilenetv3.py
+++ b/timm/models/mobilenetv3.py
@@ -75,8 +75,7 @@ class MobileNetV3(nn.Module):
 
     def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True,
                  channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
-                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
-                 global_pool='avg', weight_init='goog'):
+                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
         super(MobileNetV3, self).__init__()
         
         self.num_classes = num_classes
@@ -107,11 +106,7 @@ class MobileNetV3(nn.Module):
         # Classifier
         self.classifier = nn.Linear(self.num_features * self.global_pool.feat_mult(), self.num_classes)
 
-        for m in self.modules():
-            if weight_init == 'goog':
-                efficientnet_init_goog(m)
-            else:
-                efficientnet_init_default(m)
+        efficientnet_init_weights(self)
 
     def as_sequential(self):
         layers = [self.conv_stem, self.bn1, self.act1]
@@ -126,12 +121,8 @@ class MobileNetV3(nn.Module):
     def reset_classifier(self, num_classes, global_pool='avg'):
         self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         self.num_classes = num_classes
-        del self.classifier
-        if num_classes:
-            self.classifier = nn.Linear(
-                self.num_features * self.global_pool.feat_mult(), num_classes)
-        else:
-            self.classifier = None
+        self.classifier = nn.Linear(
+            self.num_features * self.global_pool.feat_mult(), num_classes) if self.num_classes else None
 
     def forward_features(self, x):
         x = self.conv_stem(x)
@@ -161,7 +152,7 @@ class MobileNetV3Features(nn.Module):
     def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                  in_chans=3, stem_size=16, channel_multiplier=1.0, output_stride=32, pad_type='',
                  act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0., se_kwargs=None,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, weight_init='goog'):
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None):
         super(MobileNetV3Features, self).__init__()
         norm_kwargs = norm_kwargs or {}
 
@@ -187,12 +178,7 @@ class MobileNetV3Features(nn.Module):
         self.feature_info = builder.features  # builder provides info about feature channels for each block
         self._in_chs = builder.in_chs
 
-        for m in self.modules():
-            if weight_init == 'goog':
-                efficientnet_init_goog(m)
-            else:
-                efficientnet_init_default(m)
-
+        efficientnet_init_weights(self)
         if _DEBUG:
             for k, v in self.feature_info.items():
                 print('Feature idx: {}: Name: {}, Channels: {}'.format(k, v['name'], v['num_chs']))
diff --git a/timm/models/nasnet.py b/timm/models/nasnet.py
index 9caee809..009c62d3 100644
--- a/timm/models/nasnet.py
+++ b/timm/models/nasnet.py
@@ -556,8 +556,18 @@ class NASNetALarge(nn.Module):
         self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         self.last_linear = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
-    def forward_features(self, input, pool=True):
-        x_conv0 = self.conv0(input)
+    def get_classifier(self):
+        return self.last_linear
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        del self.last_linear
+        self.last_linear = nn.Linear(
+            self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
+
+    def forward_features(self, x):
+        x_conv0 = self.conv0(x)
         x_stem_0 = self.cell_stem_0(x_conv0)
         x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)
 
@@ -586,13 +596,11 @@ class NASNetALarge(nn.Module):
         x_cell_16 = self.cell_16(x_cell_15, x_cell_14)
         x_cell_17 = self.cell_17(x_cell_16, x_cell_15)
         x = self.relu(x_cell_17)
-        if pool:
-            x = self.global_pool(x)
-            x = x.view(x.size(0), -1)
         return x
 
-    def forward(self, input):
-        x = self.forward_features(input)
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
         if self.drop_rate > 0:
             x = F.dropout(x, self.drop_rate, training=self.training)
         x = self.last_linear(x)
diff --git a/timm/models/pnasnet.py b/timm/models/pnasnet.py
index e04a2b1f..396e6157 100644
--- a/timm/models/pnasnet.py
+++ b/timm/models/pnasnet.py
@@ -355,7 +355,7 @@ class PNASNet5Large(nn.Module):
         else:
             self.last_linear = None
 
-    def forward_features(self, x, pool=True):
+    def forward_features(self, x):
         x_conv_0 = self.conv_0(x)
         x_stem_0 = self.cell_stem_0(x_conv_0)
         x_stem_1 = self.cell_stem_1(x_conv_0, x_stem_0)
@@ -372,13 +372,11 @@ class PNASNet5Large(nn.Module):
         x_cell_10 = self.cell_10(x_cell_8, x_cell_9)
         x_cell_11 = self.cell_11(x_cell_9, x_cell_10)
         x = self.relu(x_cell_11)
-        if pool:
-            x = self.global_pool(x)
-            x = x.view(x.size(0), -1)
         return x
 
-    def forward(self, input):
-        x = self.forward_features(input)
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
         if self.drop_rate > 0:
             x = F.dropout(x, self.drop_rate, training=self.training)
         x = self.last_linear(x)
diff --git a/timm/models/resnet.py b/timm/models/resnet.py
index c7d80dba..b90bb9d5 100644
--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@@ -17,7 +17,7 @@ from .adaptive_avgmax_pool import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 
 
-__all__ = ['ResNet']  # model_registry will add each entrypoint fn to this
+__all__ = ['ResNet', 'BasicBlock', 'Bottleneck']  # model_registry will add each entrypoint fn to this
 
 
 def _cfg(url='', **kwargs):
@@ -374,12 +374,9 @@ class ResNet(nn.Module):
         self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         self.num_classes = num_classes
         del self.fc
-        if num_classes:
-            self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
-        else:
-            self.fc = None
+        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
 
-    def forward_features(self, x, pool=True):
+    def forward_features(self, x):
         x = self.conv1(x)
         x = self.bn1(x)
         x = self.relu(x)
@@ -389,14 +386,11 @@ class ResNet(nn.Module):
         x = self.layer2(x)
         x = self.layer3(x)
         x = self.layer4(x)
-
-        if pool:
-            x = self.global_pool(x)
-            x = x.view(x.size(0), -1)
         return x
 
     def forward(self, x):
         x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
         if self.drop_rate > 0.:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         x = self.fc(x)
diff --git a/timm/models/senet.py b/timm/models/senet.py
index 0fbcfb86..90ef5ae1 100644
--- a/timm/models/senet.py
+++ b/timm/models/senet.py
@@ -274,6 +274,7 @@ class SENet(nn.Module):
         super(SENet, self).__init__()
         self.inplanes = inplanes
         self.num_classes = num_classes
+        self.drop_rate = drop_rate
         if input_3x3:
             layer0_modules = [
                 ('conv1', nn.Conv2d(in_chans, 64, 3, stride=2, padding=1, bias=False)),
@@ -337,7 +338,6 @@ class SENet(nn.Module):
             downsample_padding=downsample_padding
         )
         self.avg_pool = SelectAdaptivePool2d(pool_type=global_pool)
-        self.drop_rate = drop_rate
         self.num_features = 512 * block.expansion
         self.last_linear = nn.Linear(self.num_features, num_classes)
 
@@ -366,26 +366,25 @@ class SENet(nn.Module):
     def get_classifier(self):
         return self.last_linear
 
-    def reset_classifier(self, num_classes):
+    def reset_classifier(self, num_classes, global_pool='avg'):
         self.num_classes = num_classes
+        self.avg_pool = SelectAdaptivePool2d(pool_type=global_pool)
         del self.last_linear
         if num_classes:
-            self.last_linear = nn.Linear(self.num_features, num_classes)
+            self.last_linear = nn.Linear(self.num_features * self.avg_pool.feat_mult(), num_classes)
         else:
             self.last_linear = None
 
-    def forward_features(self, x, pool=True):
+    def forward_features(self, x):
         x = self.layer0(x)
         x = self.layer1(x)
         x = self.layer2(x)
         x = self.layer3(x)
         x = self.layer4(x)
-        if pool:
-            x = self.avg_pool(x)
-            x = x.view(x.size(0), -1)
         return x
 
     def logits(self, x):
+        x = self.avg_pool(x).flatten(1)
         if self.drop_rate > 0.:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         x = self.last_linear(x)
diff --git a/timm/models/test_time_pool.py b/timm/models/test_time_pool.py
index 7d5bb571..ce6ddf07 100644
--- a/timm/models/test_time_pool.py
+++ b/timm/models/test_time_pool.py
@@ -20,7 +20,7 @@ class TestTimePoolHead(nn.Module):
         self.base.reset_classifier(0)  # delete original fc layer
 
     def forward(self, x):
-        x = self.base.forward_features(x, pool=False)
+        x = self.base.forward_features(x)
         x = F.avg_pool2d(x, kernel_size=self.original_pool, stride=1)
         x = self.fc(x)
         x = adaptive_avgmax_pool2d(x, 1)
diff --git a/timm/models/xception.py b/timm/models/xception.py
index e76ed9ff..2dc334fa 100644
--- a/timm/models/xception.py
+++ b/timm/models/xception.py
@@ -29,7 +29,7 @@ import torch.nn.functional as F
 
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import select_adaptive_pool2d
+from .adaptive_avgmax_pool import SelectAdaptivePool2d
 
 __all__ = ['Xception']
 
@@ -163,7 +163,8 @@ class Xception(nn.Module):
         self.conv4 = SeparableConv2d(1536, self.num_features, 3, 1, 1)
         self.bn4 = nn.BatchNorm2d(self.num_features)
 
-        self.fc = nn.Linear(self.num_features, num_classes)
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
         # #------- init weights --------
         for m in self.modules():
@@ -178,15 +179,12 @@ class Xception(nn.Module):
 
     def reset_classifier(self, num_classes, global_pool='avg'):
         self.num_classes = num_classes
-        self.global_pool = global_pool
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         del self.fc
-        if num_classes:
-            self.fc = nn.Linear(self.num_features, num_classes)
-        else:
-            self.fc = None
+        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
 
-    def forward_features(self, input, pool=True):
-        x = self.conv1(input)
+    def forward_features(self, x):
+        x = self.conv1(x)
         x = self.bn1(x)
         x = self.relu(x)
 
@@ -214,14 +212,11 @@ class Xception(nn.Module):
         x = self.conv4(x)
         x = self.bn4(x)
         x = self.relu(x)
-
-        if pool:
-            x = select_adaptive_pool2d(x, pool_type=self.global_pool)
-            x = x.view(x.size(0), -1)
         return x
 
-    def forward(self, input):
-        x = self.forward_features(input)
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.global_pool(x).flatten(1)
         if self.drop_rate:
             F.dropout(x, self.drop_rate, training=self.training)
         x = self.fc(x)

From 5259dbcbb4465595c20fdfeb1ae311000298c11c Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Fri, 29 Nov 2019 17:45:35 -0800
Subject: [PATCH 25/35] Update README.md with latest changes

---
 README.md | 194 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 128 insertions(+), 66 deletions(-)

diff --git a/README.md b/README.md
index bb7f4206..f8e4d3b0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,21 @@
 # PyTorch Image Models, etc
 
+## What's New
+
+### Nov 29, 2019
+* Brought EfficientNet and MobileNetV3 up to date with my https://github.com/rwightman/gen-efficientnet-pytorch code. Torchscript and ONNX export compat excluded.
+  * AdvProp weights added
+  * Official TF MobileNetv3 weights added
+* EfficientNet and MobileNetV3 hook based 'feature extraction' classes added. Will serve as basis for using models as backbones in obj detection/segmentation tasks. Lots more to be done here...
+* HRNet classification models and weights added from https://github.com/HRNet/HRNet-Image-Classification
+* Consistency in global pooling, `reset_classifier`, and `forward_features` across models
+  * `forward_features` always returns unpooled feature maps now
+* Reasonable chance I broke something... let me know
+
+### Nov 22, 2019
+* Add ImageNet training RandAugment implementation alongside AutoAugment. PyTorch Transform compatible format, using PIL. Currently training two EfficientNet models from scratch with promising results... will update.
+* `drop-connect` cmd line arg finally added to `train.py`, no need to hack model fns. Works for efficientnet/mobilenetv3 based models, ignored otherwise.
+
 ## Introduction 
 
 For each competition, personal, or freelance project involving images + Convolution Neural Networks, I build on top of an evolving collection of code and models. This repo contains a (somewhat) cleaned up and paired down iteration of that code. Hopefully it'll be of use to others.
@@ -34,21 +50,24 @@ I've included a few of my favourite models, but this is not an exhaustive collec
 * Squeeze-and-Excitation ResNet/ResNeXt (from [Cadene](https://github.com/Cadene/pretrained-models.pytorch) with some pretrained weight additions by myself)
     * SENet-154, SE-ResNet-18, SE-ResNet-34, SE-ResNet-50, SE-ResNet-101, SE-ResNet-152, SE-ResNeXt-26 (32x4d), SE-ResNeXt50 (32x4d), SE-ResNeXt101 (32x4d)
 * Inception-ResNet-V2 and Inception-V4 (from [Cadene](https://github.com/Cadene/pretrained-models.pytorch) )
-* Xception (from [Cadene](https://github.com/Cadene/pretrained-models.pytorch))
+* Xception
+    * Original variant from [Cadene](https://github.com/Cadene/pretrained-models.pytorch)
+    * MXNet Gluon 'modified aligned' Xception-65 and 71 models from [Gluon ModelZoo](https://github.com/dmlc/gluon-cv/tree/master/gluoncv/model_zoo)
 * PNasNet & NASNet-A (from [Cadene](https://github.com/Cadene/pretrained-models.pytorch))
 * DPN (from [me](https://github.com/rwightman/pytorch-dpn-pretrained), weights hosted by Cadene)
     * DPN-68, DPN-68b, DPN-92, DPN-98, DPN-131, DPN-107
-* Generic EfficientNet (from my standalone [GenMobileNet](https://github.com/rwightman/genmobilenet-pytorch)) - A generic model that implements many of the efficient models that utilize similar DepthwiseSeparable and InvertedResidual blocks
-    * EfficientNet (B0-B7) (https://arxiv.org/abs/1905.11946) -- validated, compat with TF weights
-    * EfficientNet-EdgeTPU (S, M, L) (https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html) --validated w/ TF weights
-    * MixNet (https://arxiv.org/abs/1907.09595) -- validated, compat with TF weights
-    * MNASNet B1, A1 (Squeeze-Excite), and Small (https://arxiv.org/abs/1807.11626)
-    * MobileNet-V1 (https://arxiv.org/abs/1704.04861)
-    * MobileNet-V2 (https://arxiv.org/abs/1801.04381)
-    * MobileNet-V3 (https://arxiv.org/abs/1905.02244) -- pretrained model good, still no official impl to verify against
-    * ChamNet (https://arxiv.org/abs/1812.08934) -- specific arch details hard to find, currently an educated guess
-    * FBNet-C (https://arxiv.org/abs/1812.03443) -- TODO A/B variants
+* EfficientNet (from my standalone [GenMobileNet](https://github.com/rwightman/genmobilenet-pytorch)) - A generic model that implements many of the efficient models that utilize similar DepthwiseSeparable and InvertedResidual blocks
+    * EfficientNet AdvProp (B0-B8) (https://arxiv.org/abs/1911.09665) -- TF weights ported
+    * EfficientNet (B0-B7) (https://arxiv.org/abs/1905.11946) -- TF weights ported, B0-B2 finetuned PyTorch
+    * EfficientNet-EdgeTPU (S, M, L) (https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html) -- TF weights ported
+    * MixNet (https://arxiv.org/abs/1907.09595) -- TF weights ported, PyTorch finetuned (S, M, L) or trained models (XL)
+    * MNASNet B1, A1 (Squeeze-Excite), and Small (https://arxiv.org/abs/1807.11626) -- trained in PyTorch
+    * MobileNet-V2 (https://arxiv.org/abs/1801.04381)    
+    * FBNet-C (https://arxiv.org/abs/1812.03443) -- trained in PyTorch
     * Single-Path NAS (https://arxiv.org/abs/1904.02877) -- pixel1 variant
+* MobileNet-V3 (https://arxiv.org/abs/1905.02244) -- pretrained PyTorch model, official TF weights ported
+* HRNet
+    * code from https://github.com/HRNet/HRNet-Image-Classification, paper https://arxiv.org/abs/1908.07919
 
 Use the  `--model` arg to specify model for train, validation, inference scripts. Match the all lowercase
 creation fn for the model you'd like.
@@ -103,67 +122,110 @@ I've leveraged the training scripts in this repository to train a few of the mod
 | seresnet18 | 71.742 (28.258) | 90.334 (9.666) | 11.8M | bicubic | 224 |
 
 ### Ported Weights
+For the models below, the model code and weight porting from TensorFlow or MXNet Gluon to PyTorch was done by me. There are also weights/models ported by others included in this repository; they are not listed below.
 
-| Model | Prec@1 (Err) | Prec@5 (Err) | Param # | Image Scaling | Image Size | Source |
-|---|---|---|---|---|---|---|
-| tf_efficientnet_b7 *tfp  | 84.940 (15.060) | 97.214 (2.786) | 66.35  | bicubic | 600 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b7       | 84.932 (15.068) | 97.208 (2.792) | 66.35  | bicubic | 600 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b6 *tfp  | 84.140 (15.860) | 96.852 (3.148) | 43.04  | bicubic | 528 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b6       | 84.110 (15.890) | 96.886 (3.114) | 43.04  | bicubic | 528 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b5 *tfp  | 83.822 (16.178) | 96.756 (3.244) | 30.39  | bicubic | 456 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b5       | 83.812 (16.188) | 96.748 (3.252) | 30.39  | bicubic | 456 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b4       | 83.022 (16.978) | 96.300 (3.700) | 19.34  | bicubic | 380 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b4 *tfp  | 82.948 (17.052) | 96.308 (3.692) | 19.34  | bicubic | 380 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b3 *tfp  | 81.576 (18.424) | 95.662 (4.338) | 12.23  | bicubic | 300 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b3       | 81.636 (18.364) | 95.718 (4.282) | 12.23  | bicubic | 300 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| gluon_senet154           | 81.224 (18.776) | 95.356 (4.644) | 115.09 | bicubic | 224 | |
-| gluon_resnet152_v1s      | 81.012 (18.988) | 95.416 (4.584) | 60.32  | bicubic | 224 | |
-| gluon_seresnext101_32x4d | 80.902 (19.098) | 95.294 (4.706) | 48.96  | bicubic | 224 | |
-| gluon_seresnext101_64x4d | 80.890 (19.110) | 95.304 (4.696) | 88.23  | bicubic | 224 | |
-| gluon_resnext101_64x4d   | 80.602 (19.398) | 94.994 (5.006) | 83.46  | bicubic | 224 | |
-| tf_efficientnet_el       | 80.534 (19.466) | 95.190 (4.810) | 10.59 | bicubic | 300 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu) |
-| tf_efficientnet_el *tfp  | 80.476 (19.524) | 95.200 (4.800) | 10.59 | bicubic | 300 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu) |
-| gluon_resnet152_v1d      | 80.470 (19.530) | 95.206 (4.794) | 60.21  | bicubic | 224 | |
-| gluon_resnet101_v1d      | 80.424 (19.576) | 95.020 (4.980) | 44.57  | bicubic | 224 | |
-| gluon_resnext101_32x4d   | 80.334 (19.666) | 94.926 (5.074) | 44.18  | bicubic | 224 | |
-| gluon_resnet101_v1s      | 80.300 (19.700) | 95.150 (4.850) | 44.67  | bicubic | 224 | |
-| tf_efficientnet_b2 *tfp  | 80.188 (19.812) | 94.974 (5.026) | 9.11  | bicubic | 260 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_efficientnet_b2       | 80.086 (19.914) | 94.908 (5.092) | 9.11  | bicubic | 260 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| gluon_resnet152_v1c      | 79.916 (20.084) | 94.842 (5.158) | 60.21  | bicubic | 224 | |
-| gluon_seresnext50_32x4d  | 79.912 (20.088) | 94.818 (5.182) | 27.56  | bicubic | 224 | |
-| gluon_resnet152_v1b      | 79.692 (20.308) | 94.738 (5.262) | 60.19  | bicubic | 224 | |
-| gluon_xception65         | 79.604 (20.396) | 94.748 (5.252) | 39.92  | bicubic | 299 | |
-| gluon_resnet101_v1c      | 79.544 (20.456) | 94.586 (5.414) | 44.57  | bicubic | 224 | |
-| gluon_resnext50_32x4d    | 79.356 (20.644) | 94.424 (5.576) | 25.03  | bicubic | 224 | |
-| gluon_resnet101_v1b      | 79.304 (20.696) | 94.524 (5.476) | 44.55  | bicubic | 224 | |
-| tf_efficientnet_b1 *tfp  | 79.172 (20.828) | 94.450 (5.550) | 7.79  | bicubic | 240 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| gluon_resnet50_v1d       | 79.074 (20.926) | 94.476 (5.524) | 25.58  | bicubic | 224 | |
-| tf_efficientnet_em *tfp  | 78.958 (21.042) | 94.458 (5.542) | 6.90 | bicubic | 240 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu) |
-| tf_mixnet_l *tfp         | 78.846 (21.154) | 94.212 (5.788) | 7.33  | bilinear | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
-| tf_efficientnet_b1       | 78.826 (21.174) | 94.198 (5.802) | 7.79  | bicubic | 240 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| gluon_inception_v3       | 78.804 (21.196) | 94.380 (5.620) | 27.16M | bicubic | 299 | [MxNet Gluon](https://gluon-cv.mxnet.io/model_zoo/classification.html) |
-| tf_mixnet_l              | 78.770 (21.230) | 94.004 (5.996) | 7.33  | bicubic | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
-| tf_efficientnet_em       | 78.742 (21.258) | 94.332 (5.668) | 6.90 | bicubic | 240 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu) |
-| gluon_resnet50_v1s       | 78.712 (21.288) | 94.242 (5.758) | 25.68  | bicubic | 224 | |
-| gluon_resnet50_v1c       | 78.010 (21.990) | 93.988 (6.012) | 25.58  | bicubic | 224 | |
-| tf_inception_v3          | 77.856 (22.144) | 93.644 (6.356) | 27.16M | bicubic | 299 | [Tensorflow Slim](https://github.com/tensorflow/models/tree/master/research/slim) |
-| tf_efficientnet_es *tfp  | 77.616 (22.384) | 93.750 (6.250) | 5.44 | bicubic | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu) |
-| gluon_resnet50_v1b       | 77.578 (22.422) | 93.718 (6.282) | 25.56  | bicubic | 224 | |
-| adv_inception_v3         | 77.576 (22.424) | 93.724 (6.276) | 27.16M | bicubic | 299 | [Tensorflow Adv models](https://github.com/tensorflow/models/tree/master/research/adv_imagenet_models) |
-| tf_efficientnet_es       | 77.264 (22.736) | 93.600 (6.400) | 5.44 | bicubic | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu) |
-| tf_efficientnet_b0 *tfp  | 77.258 (22.742) | 93.478 (6.522) | 5.29  | bicubic | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_mixnet_m *tfp         | 77.072 (22.928) | 93.368 (6.632) | 5.01  | bilinear | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
-| tf_mixnet_m              | 76.950 (23.050) | 93.156 (6.844) | 5.01  | bicubic | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
-| tf_efficientnet_b0       | 76.848 (23.152) | 93.228 (6.772) | 5.29  | bicubic | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
-| tf_mixnet_s *tfp         | 75.800 (24.200) | 92.788 (7.212) | 4.13  | bilinear | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
-| tf_mixnet_s              | 75.648 (24.352) | 92.636 (7.364) | 4.13  | bicubic | 224 | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
-| gluon_resnet34_v1b       | 74.580 (25.420) | 91.988 (8.012) | 21.80 | bicubic | 224 | |
-| gluon_resnet18_v1b       | 70.830 (29.170) | 89.756 (10.244) | 11.69 | bicubic | 224 | |
+| Model | Prec@1 (Err) | Prec@5 (Err) | Param # | Image Scaling | Image Size |
+|---|---|---|---|---|---|
+| tf_efficientnet_b8_ap *tfp | 85.436 (14.564) | 97.272 (2.728) | 87.4 | bicubic | 672 |
+| tf_efficientnet_b8_ap      | 85.368 (14.632) | 97.294 (2.706) | 87.4 | bicubic | 672 |
+| tf_efficientnet_b7_ap *tfp | 85.154 (14.846) | 97.244 (2.756) | 66.35 | bicubic | 600 |
+| tf_efficientnet_b7_ap      | 85.118 (14.882) | 97.252 (2.748) | 66.35 | bicubic | 600 |
+| tf_efficientnet_b7 *tfp  | 84.940 (15.060) | 97.214 (2.786) | 66.35  | bicubic | 600 |
+| tf_efficientnet_b7       | 84.932 (15.068) | 97.208 (2.792) | 66.35  | bicubic | 600 |
+| tf_efficientnet_b6_ap      | 84.786 (15.214) | 97.138 (2.862) | 43.04 | bicubic | 528 |
+| tf_efficientnet_b6_ap *tfp | 84.760 (15.240) | 97.124 (2.876) | 43.04 | bicubic | 528 |
+| tf_efficientnet_b5_ap *tfp | 84.276 (15.724) | 96.932 (3.068) | 30.39 | bicubic | 456 |
+| tf_efficientnet_b5_ap      | 84.254 (15.746) | 96.976 (3.024) | 30.39 | bicubic | 456 |
+| tf_efficientnet_b6 *tfp  | 84.140 (15.860) | 96.852 (3.148) | 43.04  | bicubic | 528 |
+| tf_efficientnet_b6       | 84.110 (15.890) | 96.886 (3.114) | 43.04  | bicubic | 528 |
+| tf_efficientnet_b5 *tfp  | 83.822 (16.178) | 96.756 (3.244) | 30.39  | bicubic | 456 |
+| tf_efficientnet_b5       | 83.812 (16.188) | 96.748 (3.252) | 30.39  | bicubic | 456 |
+| tf_efficientnet_b4_ap *tfp | 83.278 (16.722) | 96.376 (3.624) | 19.34 | bicubic | 380 |
+| tf_efficientnet_b4_ap      | 83.248 (16.752) | 96.388 (3.612) | 19.34 | bicubic | 380 |
+| tf_efficientnet_b4       | 83.022 (16.978) | 96.300 (3.700) | 19.34  | bicubic | 380 |
+| tf_efficientnet_b4 *tfp  | 82.948 (17.052) | 96.308 (3.692) | 19.34  | bicubic | 380 |
+| tf_efficientnet_b3_ap *tfp | 81.882 (18.118) | 95.662 (4.338) | 12.23 | bicubic | 300 |
+| tf_efficientnet_b3_ap      | 81.828 (18.172) | 95.624 (4.376) | 12.23 | bicubic | 300 |
+| tf_efficientnet_b3       | 81.636 (18.364) | 95.718 (4.282) | 12.23  | bicubic | 300 |
+| tf_efficientnet_b3 *tfp  | 81.576 (18.424) | 95.662 (4.338) | 12.23  | bicubic | 300 |
+| gluon_senet154           | 81.224 (18.776) | 95.356 (4.644) | 115.09 | bicubic | 224 |
+| gluon_resnet152_v1s      | 81.012 (18.988) | 95.416 (4.584) | 60.32  | bicubic | 224 |
+| gluon_seresnext101_32x4d | 80.902 (19.098) | 95.294 (4.706) | 48.96  | bicubic | 224 |
+| gluon_seresnext101_64x4d | 80.890 (19.110) | 95.304 (4.696) | 88.23  | bicubic | 224 |
+| gluon_resnext101_64x4d   | 80.602 (19.398) | 94.994 (5.006) | 83.46  | bicubic | 224 |
+| tf_efficientnet_el       | 80.534 (19.466) | 95.190 (4.810) | 10.59 | bicubic | 300 |
+| tf_efficientnet_el *tfp  | 80.476 (19.524) | 95.200 (4.800) | 10.59 | bicubic | 300 |
+| gluon_resnet152_v1d      | 80.470 (19.530) | 95.206 (4.794) | 60.21  | bicubic | 224 |
+| gluon_resnet101_v1d      | 80.424 (19.576) | 95.020 (4.980) | 44.57  | bicubic | 224 |
+| tf_efficientnet_b2_ap *tfp | 80.420 (19.580) | 95.040 (4.960) | 9.11 | bicubic | 260 |
+| gluon_resnext101_32x4d   | 80.334 (19.666) | 94.926 (5.074) | 44.18  | bicubic | 224 |
+| tf_efficientnet_b2_ap    | 80.306 (19.694) | 95.028 (4.972) | 9.11 | bicubic | 260 |
+| gluon_resnet101_v1s      | 80.300 (19.700) | 95.150 (4.850) | 44.67  | bicubic | 224 |
+| tf_efficientnet_b2 *tfp  | 80.188 (19.812) | 94.974 (5.026) | 9.11  | bicubic | 260 |
+| tf_efficientnet_b2       | 80.086 (19.914) | 94.908 (5.092) | 9.11  | bicubic | 260 |
+| gluon_resnet152_v1c      | 79.916 (20.084) | 94.842 (5.158) | 60.21  | bicubic | 224 |
+| gluon_seresnext50_32x4d  | 79.912 (20.088) | 94.818 (5.182) | 27.56  | bicubic | 224 |
+| gluon_resnet152_v1b      | 79.692 (20.308) | 94.738 (5.262) | 60.19  | bicubic | 224 |
+| gluon_xception65         | 79.604 (20.396) | 94.748 (5.252) | 39.92  | bicubic | 299 |
+| gluon_resnet101_v1c      | 79.544 (20.456) | 94.586 (5.414) | 44.57  | bicubic | 224 |
+| tf_efficientnet_b1_ap *tfp | 79.532 (20.468) | 94.378 (5.622) | 7.79 | bicubic | 240 |
+| tf_efficientnet_cc_b1_8e *tfp | 79.464 (20.536) | 94.492 (5.508) | 39.7 | bicubic | 240 |
+| gluon_resnext50_32x4d    | 79.356 (20.644) | 94.424 (5.576) | 25.03  | bicubic | 224 |
+| gluon_resnet101_v1b      | 79.304 (20.696) | 94.524 (5.476) | 44.55  | bicubic | 224 |
+| tf_efficientnet_cc_b1_8e | 79.298 (20.702) | 94.364 (5.636) | 39.7 | bicubic | 240 |
+| tf_efficientnet_b1_ap    | 79.278 (20.722) | 94.308 (5.692) | 7.79 | bicubic | 240 |
+| tf_efficientnet_b1 *tfp  | 79.172 (20.828) | 94.450 (5.550) | 7.79  | bicubic | 240 |
+| gluon_resnet50_v1d       | 79.074 (20.926) | 94.476 (5.524) | 25.58  | bicubic | 224 |
+| tf_efficientnet_em *tfp  | 78.958 (21.042) | 94.458 (5.542) | 6.90 | bicubic | 240 |
+| tf_mixnet_l *tfp         | 78.846 (21.154) | 94.212 (5.788) | 7.33  | bilinear | 224 |
+| tf_efficientnet_b1       | 78.826 (21.174) | 94.198 (5.802) | 7.79  | bicubic | 240 |
+| gluon_inception_v3       | 78.804 (21.196) | 94.380 (5.620) | 27.16M | bicubic | 299 |
+| tf_mixnet_l              | 78.770 (21.230) | 94.004 (5.996) | 7.33  | bicubic | 224 |
+| tf_efficientnet_em       | 78.742 (21.258) | 94.332 (5.668) | 6.90 | bicubic | 240 |
+| gluon_resnet50_v1s       | 78.712 (21.288) | 94.242 (5.758) | 25.68  | bicubic | 224 |
+| tf_efficientnet_cc_b0_8e *tfp | 78.314 (21.686) | 93.790 (6.210) | 24.0 | bicubic | 224 |
+| gluon_resnet50_v1c       | 78.010 (21.990) | 93.988 (6.012) | 25.58  | bicubic | 224 |
+| tf_efficientnet_cc_b0_8e | 77.908 (22.092) | 93.656 (6.344) | 24.0 | bicubic | 224 |
+| tf_inception_v3          | 77.856 (22.144) | 93.644 (6.356) | 27.16M | bicubic | 299 |
+| tf_efficientnet_cc_b0_4e *tfp | 77.746 (22.254) | 93.552 (6.448) | 13.3 | bicubic | 224 |
+| tf_efficientnet_es *tfp  | 77.616 (22.384) | 93.750 (6.250) | 5.44 | bicubic | 224 |
+| gluon_resnet50_v1b       | 77.578 (22.422) | 93.718 (6.282) | 25.56  | bicubic | 224 |
+| adv_inception_v3         | 77.576 (22.424) | 93.724 (6.276) | 27.16M | bicubic | 299 |
+| tf_efficientnet_b0_ap *tfp | 77.514 (22.486) | 93.576 (6.424) | 5.29  | bicubic | 224 |
+| tf_efficientnet_cc_b0_4e | 77.304 (22.696) | 93.332 (6.668) | 13.3 | bicubic | 224 |
+| tf_efficientnet_es       | 77.264 (22.736) | 93.600 (6.400) | 5.44 | bicubic | 224 |
+| tf_efficientnet_b0 *tfp  | 77.258 (22.742) | 93.478 (6.522) | 5.29  | bicubic | 224 |
+| tf_efficientnet_b0_ap    | 77.084 (22.916) | 93.254 (6.746) | 5.29  | bicubic | 224 |
+| tf_mixnet_m *tfp         | 77.072 (22.928) | 93.368 (6.632) | 5.01  | bilinear | 224 |
+| tf_mixnet_m              | 76.950 (23.050) | 93.156 (6.844) | 5.01  | bicubic | 224 |
+| tf_efficientnet_b0       | 76.848 (23.152) | 93.228 (6.772) | 5.29  | bicubic | 224 |
+| tf_mixnet_s *tfp         | 75.800 (24.200) | 92.788 (7.212) | 4.13  | bilinear | 224 |
+| tf_mobilenetv3_large_100 *tfp | 75.768 (24.232) | 92.710 (7.290) | 5.48 | bilinear | 224 |
+| tf_mixnet_s              | 75.648 (24.352) | 92.636 (7.364) | 4.13  | bicubic | 224 |
+| tf_mobilenetv3_large_100 | 75.516 (24.484) | 92.600 (7.400) | 5.48 | bilinear | 224 |
+| gluon_resnet34_v1b       | 74.580 (25.420) | 91.988 (8.012) | 21.80 | bicubic | 224 |
+| tf_mobilenetv3_large_075 *tfp | 73.730 (26.270) | 91.616 (8.384) | 3.99 | bilinear | 224 |
+| tf_mobilenetv3_large_075 | 73.442 (26.558) | 91.352 (8.648) | 3.99 | bilinear | 224 |
+| tf_mobilenetv3_large_minimal_100 *tfp | 72.678 (27.322) | 90.860 (9.140) | 3.92 | bilinear | 224 |
+| tf_mobilenetv3_large_minimal_100 | 72.244 (27.756) | 90.636 (9.364) | 3.92 | bilinear | 224 |
+| tf_mobilenetv3_small_100 *tfp | 67.918 (32.082) | 87.958 (12.042) | 2.54 | bilinear | 224 |
+| tf_mobilenetv3_small_100 | 67.918 (32.082) | 87.662 (12.338) | 2.54 | bilinear | 224 |
+| tf_mobilenetv3_small_075 *tfp | 66.142 (33.858) | 86.498 (13.502) | 2.04 | bilinear | 224 |
+| tf_mobilenetv3_small_075 | 65.718 (34.282) | 86.136 (13.864) | 2.04 | bilinear | 224 |
+| tf_mobilenetv3_small_minimal_100 *tfp | 63.378 (36.622) | 84.802 (15.198) | 2.04 | bilinear | 224 |
+| tf_mobilenetv3_small_minimal_100 | 62.898 (37.102) | 84.230 (15.770) | 2.04 | bilinear | 224 |
 
 Models with `*tfp` next to them were scored with `--tf-preprocessing` flag. 
 
 The `tf_efficientnet`, `tf_mixnet` models require an equivalent for 'SAME' padding as their arch results in asymmetric padding. I've added this in the model creation wrapper, but it does come with a performance penalty. 
 
+Sources for original weights:
+* `tf_efficientnet*`: [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet)
+* `tf_efficientnet_e*`: [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu)
+* `tf_mixnet*`: [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet)
+* `tf_inception*`: [Tensorflow Slim](https://github.com/tensorflow/models/tree/master/research/slim)
+* `gluon_*`: [MxNet Gluon](https://gluon-cv.mxnet.io/model_zoo/classification.html)
+
 ## Usage
 
 ### Environment

From 00b93407c7637951d16ef4414beedebac5299d89 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Fri, 29 Nov 2019 23:25:53 -0800
Subject: [PATCH 26/35] Update results-all.csv with latest models/weights

---
 results/results-all.csv | 248 +++++++++++++++++++++++++---------------
 1 file changed, 153 insertions(+), 95 deletions(-)

diff --git a/results/results-all.csv b/results/results-all.csv
index 5b1fea82..e9ba9fb7 100644
--- a/results/results-all.csv
+++ b/results/results-all.csv
@@ -1,97 +1,155 @@
 model,top1,top1_err,top5,top5_err,param_count,img_size,cropt_pct,interpolation
-resnet18,69.758,30.242,89.078,10.922,11.69,224,0.875,bilinear
-gluon_resnet18_v1b,70.83,29.17,89.756,10.244,11.69,224,0.875,bicubic
-seresnet18,71.758,28.242,90.334,9.666,11.78,224,0.875,bicubic
-tv_resnet34,73.314,26.686,91.42,8.58,21.8,224,0.875,bilinear
-spnasnet_100,74.08,25.92,91.832,8.168,4.42,224,0.875,bilinear
-gluon_resnet34_v1b,74.58,25.42,91.988,8.012,21.8,224,0.875,bicubic
-mnasnet_100,74.656,25.344,92.126,7.874,4.38,224,0.875,bicubic
-densenet121,74.752,25.248,92.152,7.848,7.98,224,0.875,bicubic
-seresnet34,74.808,25.192,92.126,7.874,21.96,224,0.875,bilinear
-resnet34,75.112,24.888,92.288,7.712,21.8,224,0.875,bilinear
-fbnetc_100,75.12,24.88,92.386,7.614,5.57,224,0.875,bilinear
-resnet26,75.292,24.708,92.57,7.43,16,224,0.875,bicubic
-semnasnet_100,75.456,24.544,92.592,7.408,3.89,224,0.875,bicubic
-mobilenetv3_100,75.628,24.372,92.708,7.292,5.48,224,0.875,bicubic
-tf_mixnet_s,75.648,24.352,92.636,7.364,4.13,224,0.875,bicubic
-densenet169,75.912,24.088,93.024,6.976,14.15,224,0.875,bicubic
-mixnet_s,75.988,24.012,92.794,7.206,4.13,224,0.875,bicubic
-tv_resnet50,76.13,23.87,92.862,7.138,25.56,224,0.875,bilinear
-dpn68,76.306,23.694,92.97,7.03,12.61,224,0.875,bicubic
-resnet26d,76.68,23.32,93.166,6.834,16.01,224,0.875,bicubic
-tf_efficientnet_b0,76.84,23.16,93.226,6.774,5.29,224,0.875,bicubic
-efficientnet_b0,76.914,23.086,93.206,6.794,5.29,224,0.875,bicubic
-tf_mixnet_m,76.95,23.05,93.156,6.844,5.01,224,0.875,bicubic
-seresnext26_32x4d,77.1,22.9,93.31,6.69,16.79,224,0.875,bicubic
-mixnet_m,77.256,22.744,93.418,6.582,5.01,224,0.875,bicubic
-tf_efficientnet_es,77.264,22.736,93.6,6.4,5.44,224,0.875,bicubic
-densenet201,77.29,22.71,93.478,6.522,20.01,224,0.875,bicubic
-densenet161,77.348,22.652,93.648,6.352,28.68,224,0.875,bicubic
-resnet101,77.374,22.626,93.546,6.454,44.55,224,0.875,bilinear
-inception_v3,77.434,22.566,93.478,6.522,27.16,299,0.875,bicubic
-dpn68b,77.514,22.486,93.822,6.178,12.61,224,0.875,bicubic
-adv_inception_v3,77.576,22.424,93.724,6.276,23.83,299,0.875,bicubic
-gluon_resnet50_v1b,77.578,22.422,93.718,6.282,25.56,224,0.875,bicubic
-tv_resnext50_32x4d,77.618,22.382,93.698,6.302,25.03,224,0.875,bilinear
-seresnet50,77.636,22.364,93.752,6.248,28.09,224,0.875,bilinear
-tf_inception_v3,77.856,22.144,93.644,6.356,23.83,299,0.875,bicubic
-gluon_resnet50_v1c,78.012,21.988,93.988,6.012,25.58,224,0.875,bicubic
-resnet152,78.312,21.688,94.046,5.954,60.19,224,0.875,bilinear
-seresnet101,78.396,21.604,94.258,5.742,49.33,224,0.875,bilinear
-wide_resnet50_2,78.468,21.532,94.086,5.914,68.88,224,0.875,bilinear
-resnet50,78.47,21.53,94.266,5.734,25.56,224,0.875,bicubic
-resnext50_32x4d,78.51,21.49,94.054,5.946,25.03,224,0.875,bicubic
-seresnet152,78.658,21.342,94.374,5.626,66.82,224,0.875,bilinear
-efficientnet_b1,78.692,21.308,94.086,5.914,7.79,240,0.882,bicubic
-tf_efficientnet_em,78.698,21.302,94.32,5.68,6.9,240,0.882,bicubic
-gluon_resnet50_v1s,78.712,21.288,94.242,5.758,25.68,224,0.875,bicubic
-tf_mixnet_l,78.77,21.23,94.004,5.996,7.33,224,0.875,bicubic
-gluon_inception_v3,78.804,21.196,94.38,5.62,23.83,299,0.875,bicubic
-tf_efficientnet_b1,78.832,21.168,94.196,5.804,7.79,240,0.882,bicubic
-wide_resnet101_2,78.846,21.154,94.284,5.716,126.89,224,0.875,bilinear
-mixnet_l,78.976,21.024,94.184,5.816,7.33,224,0.875,bicubic
-xception,79.048,20.952,94.392,5.608,22.86,299,0.8975,bicubic
-gluon_resnet50_v1d,79.074,20.926,94.476,5.524,25.58,224,0.875,bicubic
-seresnext50_32x4d,79.076,20.924,94.434,5.566,27.56,224,0.875,bilinear
-gluon_resnet101_v1b,79.304,20.696,94.524,5.476,44.55,224,0.875,bicubic
-resnext101_32x8d,79.312,20.688,94.526,5.474,88.79,224,0.875,bilinear
-gluon_resnext50_32x4d,79.356,20.644,94.424,5.576,25.03,224,0.875,bicubic
-gluon_resnet101_v1c,79.544,20.456,94.586,5.414,44.57,224,0.875,bicubic
-gluon_xception65,79.604,20.396,94.748,5.252,39.92,299,0.875,bicubic
-dpn98,79.636,20.364,94.594,5.406,61.57,224,0.875,bicubic
-resnext50d_32x4d,79.674,20.326,94.868,5.132,25.05,224,0.875,bicubic
-gluon_resnet152_v1b,79.692,20.308,94.738,5.262,60.19,224,0.875,bicubic
-efficientnet_b2,79.752,20.248,94.71,5.29,9.11,260,0.89,bicubic
-dpn131,79.828,20.172,94.704,5.296,79.25,224,0.875,bicubic
-gluon_seresnext50_32x4d,79.912,20.088,94.818,5.182,27.56,224,0.875,bicubic
-gluon_resnet152_v1c,79.916,20.084,94.842,5.158,60.21,224,0.875,bicubic
-ens_adv_inception_resnet_v2,79.976,20.024,94.946,5.054,55.84,299,0.8975,bicubic
-dpn92,80.016,19.984,94.838,5.162,37.67,224,0.875,bicubic
-tf_efficientnet_b2,80.09,19.91,94.906,5.094,9.11,260,0.89,bicubic
-mixnet_xl,80.12,19.88,95.022,4.978,11.9,224,0.875,bicubic
-inception_v4,80.156,19.844,94.974,5.026,42.68,299,0.875,bicubic
-dpn107,80.164,19.836,94.912,5.088,86.92,224,0.875,bicubic
-seresnext101_32x4d,80.236,19.764,95.028,4.972,48.96,224,0.875,bilinear
-gluon_resnet101_v1s,80.3,19.7,95.15,4.85,44.67,224,0.875,bicubic
-gluon_resnext101_32x4d,80.334,19.666,94.926,5.074,44.18,224,0.875,bicubic
-gluon_resnet101_v1d,80.424,19.576,95.02,4.98,44.57,224,0.875,bicubic
-tf_efficientnet_el,80.448,19.552,95.16,4.84,10.59,300,0.904,bicubic
-inception_resnet_v2,80.46,19.54,95.31,4.69,55.84,299,0.8975,bicubic
-gluon_resnet152_v1d,80.47,19.53,95.206,4.794,60.21,224,0.875,bicubic
-gluon_resnext101_64x4d,80.602,19.398,94.994,5.006,83.46,224,0.875,bicubic
-gluon_seresnext101_64x4d,80.89,19.11,95.304,4.696,88.23,224,0.875,bicubic
-gluon_seresnext101_32x4d,80.902,19.098,95.294,4.706,48.96,224,0.875,bicubic
-gluon_resnet152_v1s,81.012,18.988,95.416,4.584,60.32,224,0.875,bicubic
-gluon_senet154,81.224,18.776,95.356,4.644,115.09,224,0.875,bicubic
-senet154,81.304,18.696,95.498,4.502,115.09,224,0.875,bilinear
-tf_efficientnet_b3,81.64,18.36,95.722,4.278,12.23,300,0.904,bicubic
-nasnetalarge,82.558,17.442,96.036,3.964,88.75,331,0.875,bicubic
-ig_resnext101_32x8d,82.688,17.312,96.632,3.368,88.79,224,0.875,bilinear
-pnasnet5large,82.74,17.26,96.04,3.96,86.06,331,0.875,bicubic
-tf_efficientnet_b4,83.016,16.984,96.298,3.702,19.34,380,0.922,bicubic
-tf_efficientnet_b5,83.686,16.314,96.712,3.288,30.39,456,0.934,bicubic
-tf_efficientnet_b6,84.112,15.888,96.884,3.116,43.04,528,0.942,bicubic
-ig_resnext101_32x16d,84.176,15.824,97.196,2.804,194.03,224,0.875,bilinear
-tf_efficientnet_b7,84.42,15.58,96.908,3.092,66.35,600,0.949,bicubic
-ig_resnext101_32x32d,85.092,14.908,97.436,2.564,468.53,224,0.875,bilinear
 ig_resnext101_32x48d,85.442,14.558,97.572,2.428,828.41,224,0.875,bilinear
+tf_efficientnet_b8_ap,85.368,14.632,97.294,2.706,87.41,672,0.954,bicubic
+tf_efficientnet_b7_ap,85.118,14.882,97.252,2.748,66.35,600,0.949,bicubic
+ig_resnext101_32x32d,85.092,14.908,97.436,2.564,468.53,224,0.875,bilinear
+tf_efficientnet_b7,84.932,15.068,97.208,2.792,66.35,600,0.949,bicubic
+tf_efficientnet_b6_ap,84.786,15.214,97.138,2.862,43.04,528,0.942,bicubic
+swsl_resnext101_32x8d,84.294,15.706,97.174,2.826,88.79,224,0.875,bilinear
+tf_efficientnet_b5_ap,84.254,15.746,96.976,3.024,30.39,456,0.934,bicubic
+ig_resnext101_32x16d,84.176,15.824,97.196,2.804,194.03,224,0.875,bilinear
+tf_efficientnet_b6,84.112,15.888,96.884,3.116,43.04,528,0.942,bicubic
+tf_efficientnet_b5,83.816,16.184,96.75,3.25,30.39,456,0.934,bicubic
+swsl_resnext101_32x16d,83.338,16.662,96.852,3.148,194.03,224,0.875,bilinear
+tf_efficientnet_b4_ap,83.248,16.752,96.388,3.612,19.34,380,0.922,bicubic
+swsl_resnext101_32x4d,83.234,16.766,96.756,3.244,44.18,224,0.875,bilinear
+tf_efficientnet_b4,83.016,16.984,96.298,3.702,19.34,380,0.922,bicubic
+pnasnet5large,82.74,17.26,96.04,3.96,86.06,331,0.875,bicubic
+ig_resnext101_32x8d,82.688,17.312,96.632,3.368,88.79,224,0.875,bilinear
+nasnetalarge,82.558,17.442,96.036,3.964,88.75,331,0.875,bicubic
+swsl_resnext50_32x4d,82.18,17.82,96.228,3.772,25.03,224,0.875,bilinear
+ssl_resnext101_32x16d,81.836,18.164,96.094,3.906,194.03,224,0.875,bilinear
+tf_efficientnet_b3_ap,81.828,18.172,95.624,4.376,12.23,300,0.904,bicubic
+tf_efficientnet_b3,81.64,18.36,95.722,4.278,12.23,300,0.904,bicubic
+ssl_resnext101_32x8d,81.626,18.374,96.038,3.962,88.79,224,0.875,bilinear
+senet154,81.304,18.696,95.498,4.502,115.09,224,0.875,bilinear
+gluon_senet154,81.224,18.776,95.356,4.644,115.09,224,0.875,bicubic
+swsl_resnet50,81.18,18.82,95.986,4.014,25.56,224,0.875,bilinear
+gluon_resnet152_v1s,81.012,18.988,95.416,4.584,60.32,224,0.875,bicubic
+ssl_resnext101_32x4d,80.928,19.072,95.728,4.272,44.18,224,0.875,bilinear
+gluon_seresnext101_32x4d,80.902,19.098,95.294,4.706,48.96,224,0.875,bicubic
+gluon_seresnext101_64x4d,80.89,19.11,95.304,4.696,88.23,224,0.875,bicubic
+gluon_resnext101_64x4d,80.602,19.398,94.994,5.006,83.46,224,0.875,bicubic
+gluon_resnet152_v1d,80.47,19.53,95.206,4.794,60.21,224,0.875,bicubic
+inception_resnet_v2,80.46,19.54,95.31,4.69,55.84,299,0.8975,bicubic
+tf_efficientnet_el,80.448,19.552,95.16,4.84,10.59,300,0.904,bicubic
+gluon_resnet101_v1d,80.424,19.576,95.02,4.98,44.57,224,0.875,bicubic
+gluon_resnext101_32x4d,80.334,19.666,94.926,5.074,44.18,224,0.875,bicubic
+ssl_resnext50_32x4d,80.328,19.672,95.404,4.596,25.03,224,0.875,bilinear
+tf_efficientnet_b2_ap,80.306,19.694,95.028,4.972,9.11,260,0.89,bicubic
+gluon_resnet101_v1s,80.3,19.7,95.15,4.85,44.67,224,0.875,bicubic
+seresnext101_32x4d,80.236,19.764,95.028,4.972,48.96,224,0.875,bilinear
+dpn107,80.164,19.836,94.912,5.088,86.92,224,0.875,bicubic
+inception_v4,80.156,19.844,94.974,5.026,42.68,299,0.875,bicubic
+mixnet_xl,80.12,19.88,95.022,4.978,11.9,224,0.875,bicubic
+tf_efficientnet_b2,80.09,19.91,94.906,5.094,9.11,260,0.89,bicubic
+dpn92,80.016,19.984,94.838,5.162,37.67,224,0.875,bicubic
+ens_adv_inception_resnet_v2,79.976,20.024,94.946,5.054,55.84,299,0.8975,bicubic
+gluon_resnet152_v1c,79.916,20.084,94.842,5.158,60.21,224,0.875,bicubic
+gluon_seresnext50_32x4d,79.912,20.088,94.818,5.182,27.56,224,0.875,bicubic
+dpn131,79.828,20.172,94.704,5.296,79.25,224,0.875,bicubic
+efficientnet_b2,79.752,20.248,94.71,5.29,9.11,260,0.89,bicubic
+gluon_resnet152_v1b,79.692,20.308,94.738,5.262,60.19,224,0.875,bicubic
+resnext50d_32x4d,79.674,20.326,94.868,5.132,25.05,224,0.875,bicubic
+dpn98,79.636,20.364,94.594,5.406,61.57,224,0.875,bicubic
+gluon_xception65,79.604,20.396,94.748,5.252,39.92,299,0.875,bicubic
+gluon_resnet101_v1c,79.544,20.456,94.586,5.414,44.57,224,0.875,bicubic
+hrnet_w64,79.472,20.528,94.65,5.35,128.06,224,0.875,bilinear
+dla102x2,79.452,20.548,94.644,5.356,41.75,224,0.875,bilinear
+gluon_resnext50_32x4d,79.356,20.644,94.424,5.576,25.03,224,0.875,bicubic
+resnext101_32x8d,79.312,20.688,94.526,5.474,88.79,224,0.875,bilinear
+hrnet_w48,79.31,20.69,94.52,5.48,77.47,224,0.875,bilinear
+gluon_resnet101_v1b,79.304,20.696,94.524,5.476,44.55,224,0.875,bicubic
+tf_efficientnet_cc_b1_8e,79.298,20.702,94.364,5.636,39.72,240,0.882,bicubic
+tf_efficientnet_b1_ap,79.278,20.722,94.308,5.692,7.79,240,0.882,bicubic
+ssl_resnet50,79.228,20.772,94.832,5.168,25.56,224,0.875,bilinear
+res2net50_26w_8s,79.21,20.79,94.362,5.638,48.4,224,0.875,bilinear
+res2net101_26w_4s,79.196,20.804,94.44,5.56,45.21,224,0.875,bilinear
+seresnext50_32x4d,79.076,20.924,94.434,5.566,27.56,224,0.875,bilinear
+gluon_resnet50_v1d,79.074,20.926,94.476,5.524,25.58,224,0.875,bicubic
+xception,79.048,20.952,94.392,5.608,22.86,299,0.8975,bicubic
+mixnet_l,78.976,21.024,94.184,5.816,7.33,224,0.875,bicubic
+hrnet_w40,78.934,21.066,94.466,5.534,57.56,224,0.875,bilinear
+hrnet_w44,78.894,21.106,94.37,5.63,67.06,224,0.875,bilinear
+wide_resnet101_2,78.846,21.154,94.284,5.716,126.89,224,0.875,bilinear
+tf_efficientnet_b1,78.832,21.168,94.196,5.804,7.79,240,0.882,bicubic
+gluon_inception_v3,78.804,21.196,94.38,5.62,23.83,299,0.875,bicubic
+tf_mixnet_l,78.77,21.23,94.004,5.996,7.33,224,0.875,bicubic
+gluon_resnet50_v1s,78.712,21.288,94.242,5.758,25.68,224,0.875,bicubic
+dla169,78.71,21.29,94.338,5.662,53.99,224,0.875,bilinear
+tf_efficientnet_em,78.698,21.302,94.32,5.68,6.9,240,0.882,bicubic
+efficientnet_b1,78.692,21.308,94.086,5.914,7.79,240,0.882,bicubic
+seresnet152,78.658,21.342,94.374,5.626,66.82,224,0.875,bilinear
+res2net50_26w_6s,78.574,21.426,94.126,5.874,37.05,224,0.875,bilinear
+resnext50_32x4d,78.51,21.49,94.054,5.946,25.03,224,0.875,bicubic
+dla102x,78.508,21.492,94.234,5.766,26.77,224,0.875,bilinear
+dla60_res2net,78.472,21.528,94.204,5.796,21.15,224,0.875,bilinear
+resnet50,78.47,21.53,94.266,5.734,25.56,224,0.875,bicubic
+wide_resnet50_2,78.468,21.532,94.086,5.914,68.88,224,0.875,bilinear
+dla60_res2next,78.448,21.552,94.144,5.856,17.33,224,0.875,bilinear
+hrnet_w32,78.448,21.552,94.188,5.812,41.23,224,0.875,bilinear
+seresnet101,78.396,21.604,94.258,5.742,49.33,224,0.875,bilinear
+resnet152,78.312,21.688,94.046,5.954,60.19,224,0.875,bilinear
+dla60x,78.242,21.758,94.022,5.978,17.65,224,0.875,bilinear
+res2next50,78.242,21.758,93.892,6.108,24.67,224,0.875,bilinear
+hrnet_w30,78.196,21.804,94.22,5.78,37.71,224,0.875,bilinear
+res2net50_14w_8s,78.152,21.848,93.842,6.158,25.06,224,0.875,bilinear
+dla102,78.026,21.974,93.95,6.05,33.73,224,0.875,bilinear
+gluon_resnet50_v1c,78.012,21.988,93.988,6.012,25.58,224,0.875,bicubic
+res2net50_26w_4s,77.946,22.054,93.852,6.148,25.7,224,0.875,bilinear
+tf_efficientnet_cc_b0_8e,77.908,22.092,93.656,6.344,24.01,224,0.875,bicubic
+tf_inception_v3,77.854,22.146,93.644,6.356,23.83,299,0.875,bicubic
+seresnet50,77.636,22.364,93.752,6.248,28.09,224,0.875,bilinear
+tv_resnext50_32x4d,77.618,22.382,93.698,6.302,25.03,224,0.875,bilinear
+adv_inception_v3,77.58,22.42,93.724,6.276,23.83,299,0.875,bicubic
+gluon_resnet50_v1b,77.578,22.422,93.718,6.282,25.56,224,0.875,bicubic
+dpn68b,77.514,22.486,93.822,6.178,12.61,224,0.875,bicubic
+res2net50_48w_2s,77.514,22.486,93.548,6.452,25.29,224,0.875,bilinear
+inception_v3,77.436,22.564,93.476,6.524,27.16,299,0.875,bicubic
+resnet101,77.374,22.626,93.546,6.454,44.55,224,0.875,bilinear
+densenet161,77.348,22.652,93.648,6.352,28.68,224,0.875,bicubic
+tf_efficientnet_cc_b0_4e,77.304,22.696,93.332,6.668,13.31,224,0.875,bicubic
+densenet201,77.29,22.71,93.478,6.522,20.01,224,0.875,bicubic
+tf_efficientnet_es,77.264,22.736,93.6,6.4,5.44,224,0.875,bicubic
+mixnet_m,77.256,22.744,93.418,6.582,5.01,224,0.875,bicubic
+seresnext26_32x4d,77.1,22.9,93.31,6.69,16.79,224,0.875,bicubic
+tf_efficientnet_b0_ap,77.084,22.916,93.254,6.746,5.29,224,0.875,bicubic
+dla60,77.022,22.978,93.308,6.692,22.33,224,0.875,bilinear
+tf_mixnet_m,76.95,23.05,93.156,6.844,5.01,224,0.875,bicubic
+efficientnet_b0,76.914,23.086,93.206,6.794,5.29,224,0.875,bicubic
+tf_efficientnet_b0,76.84,23.16,93.226,6.774,5.29,224,0.875,bicubic
+hrnet_w18,76.756,23.244,93.442,6.558,21.3,224,0.875,bilinear
+resnet26d,76.68,23.32,93.166,6.834,16.01,224,0.875,bicubic
+dpn68,76.306,23.694,92.97,7.03,12.61,224,0.875,bicubic
+tv_resnet50,76.13,23.87,92.862,7.138,25.56,224,0.875,bilinear
+mixnet_s,75.988,24.012,92.794,7.206,4.13,224,0.875,bicubic
+densenet169,75.912,24.088,93.024,6.976,14.15,224,0.875,bicubic
+tf_mixnet_s,75.648,24.352,92.636,7.364,4.13,224,0.875,bicubic
+mobilenetv3_rw,75.628,24.372,92.708,7.292,5.48,224,0.875,bicubic
+tf_mobilenetv3_large_100,75.516,24.484,92.6,7.4,5.48,224,0.875,bilinear
+semnasnet_100,75.456,24.544,92.592,7.408,3.89,224,0.875,bicubic
+resnet26,75.292,24.708,92.57,7.43,16,224,0.875,bicubic
+hrnet_w18_small_v2,75.126,24.874,92.416,7.584,15.6,224,0.875,bilinear
+fbnetc_100,75.12,24.88,92.386,7.614,5.57,224,0.875,bilinear
+resnet34,75.112,24.888,92.288,7.712,21.8,224,0.875,bilinear
+seresnet34,74.808,25.192,92.126,7.874,21.96,224,0.875,bilinear
+densenet121,74.752,25.248,92.152,7.848,7.98,224,0.875,bicubic
+mnasnet_100,74.656,25.344,92.126,7.874,4.38,224,0.875,bicubic
+dla34,74.636,25.364,92.064,7.936,15.78,224,0.875,bilinear
+gluon_resnet34_v1b,74.58,25.42,91.988,8.012,21.8,224,0.875,bicubic
+spnasnet_100,74.08,25.92,91.832,8.168,4.42,224,0.875,bilinear
+tf_mobilenetv3_large_075,73.442,26.558,91.352,8.648,3.99,224,0.875,bilinear
+tv_resnet34,73.314,26.686,91.42,8.58,21.8,224,0.875,bilinear
+swsl_resnet18,73.286,26.714,91.732,8.268,11.69,224,0.875,bilinear
+ssl_resnet18,72.6,27.4,91.416,8.584,11.69,224,0.875,bilinear
+hrnet_w18_small,72.342,27.658,90.672,9.328,13.19,224,0.875,bilinear
+tf_mobilenetv3_large_minimal_100,72.244,27.756,90.636,9.364,3.92,224,0.875,bilinear
+seresnet18,71.758,28.242,90.334,9.666,11.78,224,0.875,bicubic
+gluon_resnet18_v1b,70.83,29.17,89.756,10.244,11.69,224,0.875,bicubic
+resnet18,69.758,30.242,89.078,10.922,11.69,224,0.875,bilinear
+tf_mobilenetv3_small_100,67.918,32.082,87.662,12.338,2.54,224,0.875,bilinear
+dla60x_c,67.906,32.094,88.434,11.566,1.34,224,0.875,bilinear
+dla46x_c,65.98,34.02,86.98,13.02,1.08,224,0.875,bilinear
+tf_mobilenetv3_small_075,65.718,34.282,86.136,13.864,2.04,224,0.875,bilinear
+dla46_c,64.878,35.122,86.286,13.714,1.31,224,0.875,bilinear
+tf_mobilenetv3_small_minimal_100,62.898,37.102,84.23,15.77,2.04,224,0.875,bilinear

From ff421e5e099148715b711a6d358c4bf722315fa7 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Wed, 4 Dec 2019 11:09:47 -0800
Subject: [PATCH 27/35] New PyTorch trained EfficientNet-B2 weights with my
 RandAugment impl

---
 README.md                   | 7 ++++++-
 timm/models/efficientnet.py | 6 +++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f8e4d3b0..bd734537 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,11 @@
 
 ## What's New
 
+
+### Dec 4, 2019
+* Added weights from the first training from scratch of an EfficientNet (B2) with my new RandAugment implementation. Much better than my previous B2 and very close to the official AdvProp ones (80.4 top-1, 95.08 top-5).
+  * For those interested in hparams, I trained with the following: `./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --aa rand-m9-noise0.5 --remode pixel --reprob 0.2 --amp --lr .016`
+
 ### Nov 29, 2019
 * Brought EfficientNet and MobileNetV3 up to date with my https://github.com/rwightman/gen-efficientnet-pytorch code. Torchscript and ONNX export compat excluded.
   * AdvProp weights added
@@ -99,8 +104,8 @@ I've leveraged the training scripts in this repository to train a few of the mod
 
 |Model | Prec@1 (Err) | Prec@5 (Err) | Param # | Image Scaling  | Image Size |
 |---|---|---|---|---|---|
+| efficientnet_b2 | 80.402 (19.598) | 95.076 (4.924) | 9.11M | bicubic | 260 |
 | mixnet_xl | 80.120 (19.880) | 95.022 (4.978) | 11.90M | bicubic | 224 |
-| efficientnet_b2 | 79.760 (20.240) | 94.714 (5.286) | 9.11M | bicubic | 260 |
 | resnext50d_32x4d | 79.674 (20.326) | 94.868 (5.132) | 25.1M | bicubic | 224 |
 | mixnet_l | 78.976 (21.024 | 94.184 (5.816) | 7.33M | bicubic | 224 |
 | efficientnet_b1 | 78.692 (21.308) | 94.086 (5.914) | 7.79M | bicubic | 240 |
diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py
index ac3c244c..b93098dd 100644
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@@ -68,10 +68,10 @@ default_cfgs = {
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0-d6904d92.pth'),
     'efficientnet_b1': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth',
-        input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882),
+        input_size=(3, 240, 240), pool_size=(8, 8)),
     'efficientnet_b2': _cfg(
-        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2-cf78dc4d.pth',
-        input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890),
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth',
+        input_size=(3, 260, 260), pool_size=(9, 9)),
     'efficientnet_b3': _cfg(
         url='', input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904),
     'efficientnet_b4': _cfg(

From 3129bdb2c107e8490ef2e4aa9338787fdd27813e Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@users.noreply.github.com>
Date: Wed, 4 Dec 2019 11:13:25 -0800
Subject: [PATCH 28/35] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bd734537..6f3f4410 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 
 ### Dec 4, 2019
 * Added weights from the first training from scratch of an EfficientNet (B2) with my new RandAugment implementation. Much better than my previous B2 and very close to the official AdvProp ones (80.4 top-1, 95.08 top-5).
-  * For those interested in hparams, I trained with the following: `./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --aa rand-m9-noise0.5 --remode pixel --reprob 0.2 --amp --lr .016`
+  * For those interested in hparams, I trained with the following: `./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-noise0.5 --remode pixel --reprob 0.2 --amp --lr .016`
 
 ### Nov 29, 2019
 * Brought EfficientNet and MobileNetV3 up to date with my https://github.com/rwightman/gen-efficientnet-pytorch code. Torchscript and ONNX export compat excluded.

From 5d7af97981ecc794fc1988c4916462fcc7b1aa22 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@users.noreply.github.com>
Date: Thu, 5 Dec 2019 11:47:57 -0800
Subject: [PATCH 29/35] Update README.md

Update latest training hparam/command line with correct RandAugment config string (`noise` -> `mstd`)
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6f3f4410..33b61779 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 
 ### Dec 4, 2019
 * Added weights from the first training from scratch of an EfficientNet (B2) with my new RandAugment implementation. Much better than my previous B2 and very close to the official AdvProp ones (80.4 top-1, 95.08 top-5).
-  * For those interested in hparams, I trained with the following: `./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-noise0.5 --remode pixel --reprob 0.2 --amp --lr .016`
+  * For those interested in hparams, I trained with the following: `./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016`
 
 ### Nov 29, 2019
 * Brought EfficientNet and MobileNetV3 up to date with my https://github.com/rwightman/gen-efficientnet-pytorch code. Torchscript and ONNX export compat excluded.

From 0161de01276100810f8aef5fac270164f4079de4 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 5 Dec 2019 22:35:08 -0800
Subject: [PATCH 30/35] Switch RandoErasing back to on GPU normal sampling

---
 timm/data/random_erasing.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/timm/data/random_erasing.py b/timm/data/random_erasing.py
index e944f22c..5eed1387 100644
--- a/timm/data/random_erasing.py
+++ b/timm/data/random_erasing.py
@@ -7,12 +7,10 @@ def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='
     # NOTE I've seen CUDA illegal memory access errors being caused by the normal_()
     # paths, flip the order so normal is run on CPU if this becomes a problem
     # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508
-    # will revert back to doing normal_() on GPU when it's in next release
     if per_pixel:
-        return torch.empty(
-            patch_size, dtype=dtype).normal_().to(device=device)
+        return torch.empty(patch_size, dtype=dtype, device=device).normal_()
     elif rand_color:
-        return torch.empty((patch_size[0], 1, 1), dtype=dtype).normal_().to(device=device)
+        return torch.empty((patch_size[0], 1, 1), dtype=dtype, device=device).normal_()
     else:
         return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device)
 

From 3bff2b21dcd0fab2177a5a7ccfa17609d16ec5aa Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 5 Dec 2019 22:35:40 -0800
Subject: [PATCH 31/35] Add support for keeping running bn stats the same
 across distributed training nodes before eval/save

---
 timm/utils.py | 18 +++++++++++++++---
 train.py      | 12 +++++++++++-
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/timm/utils.py b/timm/utils.py
index 8ed8f195..ee258aed 100644
--- a/timm/utils.py
+++ b/timm/utils.py
@@ -21,11 +21,15 @@ except ImportError:
 from torch import distributed as dist
 
 
-def get_state_dict(model):
+def unwrap_model(model):
     if isinstance(model, ModelEma):
-        return get_state_dict(model.ema)
+        return unwrap_model(model.ema)
     else:
-        return model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
+        return model.module if hasattr(model, 'module') else model
+
+
+def get_state_dict(model):
+    return unwrap_model(model).state_dict()
 
 
 class CheckpointSaver:
@@ -206,6 +210,14 @@ def reduce_tensor(tensor, n):
     return rt
 
 
+def reduce_bn(model, world_size):
+    # ensure every node has the same running bn stats
+    for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True):
+        if ('running_mean' in bn_name) or ('running_var' in bn_name):
+            torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
+            bn_buf /= float(world_size)
+
+
 class ModelEma:
     """ Model Exponential Moving Average
     Keep a moving average of everything in the model state_dict (parameters and buffers).
diff --git a/train.py b/train.py
index b79a342e..41af8fb6 100644
--- a/train.py
+++ b/train.py
@@ -145,6 +145,8 @@ parser.add_argument('--amp', action='store_true', default=False,
                     help='use NVIDIA amp for mixed precision training')
 parser.add_argument('--sync-bn', action='store_true',
                     help='enabling apex sync BN.')
+parser.add_argument('--reduce-bn', action='store_true',
+                    help='average BN running stats across all distributed nodes between train and validation.')
 parser.add_argument('--no-prefetcher', action='store_true', default=False,
                     help='disable fast prefetcher')
 parser.add_argument('--output', default='', type=str, metavar='PATH',
@@ -256,7 +258,7 @@ def main():
             if args.local_rank == 0:
                 logging.info('Restoring NVIDIA AMP state from checkpoint')
             amp.load_state_dict(resume_state['amp'])
-    resume_state = None  # clear it
+    del resume_state
 
     model_ema = None
     if args.model_ema:
@@ -388,9 +390,17 @@ def main():
                 lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
                 use_amp=use_amp, model_ema=model_ema)
 
+            if args.distributed and args.reduce_bn:
+                if args.local_rank == 0:
+                    logging.info("Averaging bn running means and vars")
+                reduce_bn(model, args.world_size)
+
             eval_metrics = validate(model, loader_eval, validate_loss_fn, args)
 
             if model_ema is not None and not args.model_ema_force_cpu:
+                if args.distributed and args.reduce_bn:
+                    reduce_bn(model_ema, args.world_size)
+
                 ema_eval_metrics = validate(
                     model_ema.ema, loader_eval, validate_loss_fn, args, log_suffix=' (EMA)')
                 eval_metrics = ema_eval_metrics

From a435ea132721a388f16a63fe581057090620bd99 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 19 Dec 2019 22:56:54 -0800
Subject: [PATCH 32/35] Change reduce_bn to distribute_bn, add ability to
 choose between broadcast and reduce (mean). Add crop_pct arg to allow
 selecting validation crop while training.

---
 timm/utils.py | 11 ++++++++---
 train.py      | 19 +++++++++++--------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/timm/utils.py b/timm/utils.py
index ee258aed..59d2bcd0 100644
--- a/timm/utils.py
+++ b/timm/utils.py
@@ -210,12 +210,17 @@ def reduce_tensor(tensor, n):
     return rt
 
 
-def reduce_bn(model, world_size):
+def distribute_bn(model, world_size, reduce=False):
     # ensure every node has the same running bn stats
     for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True):
         if ('running_mean' in bn_name) or ('running_var' in bn_name):
-            torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
-            bn_buf /= float(world_size)
+            if reduce:
+                # average bn stats across whole group
+                torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
+                bn_buf /= float(world_size)
+            else:
+                # broadcast bn stats from rank 0 to whole group
+                torch.distributed.broadcast(bn_buf, 0)
 
 
 class ModelEma:
diff --git a/train.py b/train.py
index 41af8fb6..c55cbfb3 100644
--- a/train.py
+++ b/train.py
@@ -55,6 +55,8 @@ parser.add_argument('--gp', default='avg', type=str, metavar='POOL',
                     help='Type of global pool, "avg", "max", "avgmax", "avgmaxc" (default: "avg")')
 parser.add_argument('--img-size', type=int, default=None, metavar='N',
                     help='Image patch size (default: None => model default)')
+parser.add_argument('--crop-pct', default=None, type=float,
+                    metavar='N', help='Input image center crop percent (for validation only)')
 parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
                     help='Override mean pixel value of dataset')
 parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
@@ -121,6 +123,10 @@ parser.add_argument('--bn-momentum', type=float, default=None,
                     help='BatchNorm momentum override (if not None)')
 parser.add_argument('--bn-eps', type=float, default=None,
                     help='BatchNorm epsilon override (if not None)')
+parser.add_argument('--sync-bn', action='store_true',
+                    help='Enable NVIDIA Apex or Torch synchronized BatchNorm.')
+parser.add_argument('--dist-bn', type=str, default='',
+                    help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")')
 # Model Exponential Moving Average
 parser.add_argument('--model-ema', action='store_true', default=False,
                     help='Enable tracking moving average of model weights')
@@ -143,10 +149,6 @@ parser.add_argument('--save-images', action='store_true', default=False,
                     help='save images of input bathes every log interval for debugging')
 parser.add_argument('--amp', action='store_true', default=False,
                     help='use NVIDIA amp for mixed precision training')
-parser.add_argument('--sync-bn', action='store_true',
-                    help='enabling apex sync BN.')
-parser.add_argument('--reduce-bn', action='store_true',
-                    help='average BN running stats across all distributed nodes between train and validation.')
 parser.add_argument('--no-prefetcher', action='store_true', default=False,
                     help='disable fast prefetcher')
 parser.add_argument('--output', default='', type=str, metavar='PATH',
@@ -349,6 +351,7 @@ def main():
         std=data_config['std'],
         num_workers=args.workers,
         distributed=args.distributed,
+        crop_pct=data_config['crop_pct'],
     )
 
     if args.mixup > 0.:
@@ -390,16 +393,16 @@ def main():
                 lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
                 use_amp=use_amp, model_ema=model_ema)
 
-            if args.distributed and args.reduce_bn:
+            if args.distributed and args.dist_bn and args.dist_bn in ('broadcast', 'reduce'):
                 if args.local_rank == 0:
-                    logging.info("Averaging bn running means and vars")
-                reduce_bn(model, args.world_size)
+                    logging.info("Distributing BatchNorm running means and vars")
+                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')
 
             eval_metrics = validate(model, loader_eval, validate_loss_fn, args)
 
             if model_ema is not None and not args.model_ema_force_cpu:
                 if args.distributed and args.reduce_bn:
-                    reduce_bn(model_ema, args.world_size)
+                    distribute_bn(model_ema, args.world_size)
 
                 ema_eval_metrics = validate(
                     model_ema.ema, loader_eval, validate_loss_fn, args, log_suffix=' (EMA)')

From 5719b493adb4b1f42844bcbdbc58af44c9f3056b Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Thu, 19 Dec 2019 23:03:04 -0800
Subject: [PATCH 33/35] Missed update dist-bn logic for EMA model

---
 train.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/train.py b/train.py
index c55cbfb3..a47f1b4d 100644
--- a/train.py
+++ b/train.py
@@ -393,7 +393,7 @@ def main():
                 lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
                 use_amp=use_amp, model_ema=model_ema)
 
-            if args.distributed and args.dist_bn and args.dist_bn in ('broadcast', 'reduce'):
+            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                 if args.local_rank == 0:
                     logging.info("Distributing BatchNorm running means and vars")
                 distribute_bn(model, args.world_size, args.dist_bn == 'reduce')
@@ -401,8 +401,8 @@ def main():
             eval_metrics = validate(model, loader_eval, validate_loss_fn, args)
 
             if model_ema is not None and not args.model_ema_force_cpu:
-                if args.distributed and args.reduce_bn:
-                    distribute_bn(model_ema, args.world_size)
+                if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
+                    distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')
 
                 ema_eval_metrics = validate(
                     model_ema.ema, loader_eval, validate_loss_fn, args, log_suffix=' (EMA)')

From 73b78459dc4dd5c8bb6b2a33477172cd253e2272 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Tue, 24 Dec 2019 10:08:24 -0800
Subject: [PATCH 34/35] Add update RandAugment MixNet-XL weights

---
 README.md                   | 18 ++++++++++++++++--
 timm/models/efficientnet.py |  2 +-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 33b61779..d3bc1573 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,12 @@
 
 ## What's New
 
+### Dec 23, 2019
+* Add RandAugment trained MixNet-XL weights with 80.48 top-1.
+* `--dist-bn` argument added to train.py, will distribute BN stats between nodes after each train epoch, before eval
 
 ### Dec 4, 2019
 * Added weights from the first training from scratch of an EfficientNet (B2) with my new RandAugment implementation. Much better than my previous B2 and very close to the official AdvProp ones (80.4 top-1, 95.08 top-5).
-  * For those interested in hparams, I trained with the following: `./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016`
 
 ### Nov 29, 2019
 * Brought EfficientNet and MobileNetV3 up to date with my https://github.com/rwightman/gen-efficientnet-pytorch code. Torchscript and ONNX export compat excluded.
@@ -38,6 +40,7 @@ The work of many others is present here. I've tried to make sure all source mate
     * RAdam by [Liyuan Liu](https://github.com/LiyuanLucasLiu/RAdam) (https://arxiv.org/abs/1908.03265)
     * NovoGrad by [Masashi Kimura](https://github.com/convergence-lab/novograd) (https://arxiv.org/abs/1905.11286)
     * Lookahead adapted from impl by [Liam](https://github.com/alphadl/lookahead.pytorch) (https://arxiv.org/abs/1907.08610)
+
 ## Models
 
 I've included a few of my favourite models, but this is not an exhaustive collection. You can't do better than Cadene's collection in that regard. Most models do have pretrained weights from their respective sources or original authors. 
@@ -104,8 +107,8 @@ I've leveraged the training scripts in this repository to train a few of the mod
 
 |Model | Prec@1 (Err) | Prec@5 (Err) | Param # | Image Scaling  | Image Size |
 |---|---|---|---|---|---|
+| mixnet_xl | 80.478 (19.522) | 94.932 (5.068) | 11.90M | bicubic | 224 |
 | efficientnet_b2 | 80.402 (19.598) | 95.076 (4.924) | 9.11M | bicubic | 260 |
-| mixnet_xl | 80.120 (19.880) | 95.022 (4.978) | 11.90M | bicubic | 224 |
 | resnext50d_32x4d | 79.674 (20.326) | 94.868 (5.132) | 25.1M | bicubic | 224 |
 | mixnet_l | 78.976 (21.024 | 94.184 (5.816) | 7.33M | bicubic | 224 |
 | efficientnet_b1 | 78.692 (21.308) | 94.086 (5.914) | 7.79M | bicubic | 240 |
@@ -231,6 +234,17 @@ Sources for original weights:
 * `tf_inception*`: [Tensorflow Slim](https://github.com/tensorflow/models/tree/master/research/slim)
 * `gluon_*`: [MxNet Gluon](https://gluon-cv.mxnet.io/model_zoo/classification.html)
 
+## Training Hyperparameters
+
+### EfficientNet-B2 with RandAugment - 80.4 top-1, 95.1 top-5
+`./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016`
+
+### MixNet-XL with RandAugment - 80.5 top-1, 94.9 top-5
+`./distributed_train.sh 2 /imagenet/ --model mixnet_xl -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .969 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.3 --amp --lr .016 --dist-bn reduce`
+
+**TODO dig up some more**
+
+
 ## Usage
 
 ### Environment
diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py
index b93098dd..fd12dd07 100644
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@@ -181,7 +181,7 @@ default_cfgs = {
     'mixnet_l': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_l-5a9a2ed8.pth'),
     'mixnet_xl': _cfg(
-        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl-ac5fbe8d.pth'),
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl_ra-aac3c00c.pth'),
     'mixnet_xxl': _cfg(),
     'tf_mixnet_s': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_s-89d3354b.pth'),

From 1f4498f2175557dc8601b1fb865e9b41a5704106 Mon Sep 17 00:00:00 2001
From: rwightman <rwightman@gmail.com>
Date: Sat, 28 Dec 2019 11:42:36 -0800
Subject: [PATCH 35/35] Add ResNet deep tiered stem and model weights for
 seresnext26t_32x4d and seresnext26d_32x4d

---
 README.md                   | 20 ++++++++++
 timm/models/gluon_resnet.py | 26 ++++++------
 timm/models/resnet.py       | 80 +++++++++++++++++++++++++++++--------
 3 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index d3bc1573..0a6c1d7c 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,15 @@
 
 ## What's New
 
+### Dec 28, 2019
+* Add new model weights and training hparams (see Training Hyperparameters section)
+  * `seresnext26d_32x4d`- 77.6 top-1, 93.6 top-5
+     * deep stem (32, 32, 64), avgpool downsample
+     * stem/downsample from bag-of-tricks paper
+  * `seresnext26t_32x4d`- 78.0 top-1, 93.7 top-5
+     * deep tiered stem (24, 48, 64), avgpool downsample (a modified 'D' variant)
+     * stem sizing mods from Jeremy Howard and fastai devs discussing ResNet architecture experiments
+
 ### Dec 23, 2019
 * Add RandAugment trained MixNet-XL weights with 80.48 top-1.
 * `--dist-bn` argument added to train.py, will distribute BN stats between nodes after each train epoch, before eval
@@ -114,6 +123,8 @@ I've leveraged the training scripts in this repository to train a few of the mod
 | efficientnet_b1 | 78.692 (21.308) | 94.086 (5.914) | 7.79M | bicubic | 240 |
 | resnext50_32x4d | 78.512 (21.488) | 94.042 (5.958) | 25M | bicubic | 224 |
 | resnet50 | 78.470 (21.530) | 94.266 (5.734) | 25.6M | bicubic | 224 |
+| seresnext26t_32x4d | 77.998 (22.002) | 93.708 (6.292) | 16.8M | bicubic | 224 |
+| seresnext26d_32x4d | 77.602 (22.398) | 93.608 (6.392) | 16.8M | bicubic | 224 |
 | mixnet_m | 77.256 (22.744) | 93.418 (6.582) | 5.01M | bicubic | 224 |
 | seresnext26_32x4d | 77.104 (22.896) | 93.316 (6.684) | 16.8M | bicubic | 224 |
 | efficientnet_b0 | 76.912 (23.088) | 93.210 (6.790) | 5.29M | bicubic | 224 |
@@ -237,11 +248,20 @@ Sources for original weights:
 ## Training Hyperparameters
 
 ### EfficientNet-B2 with RandAugment - 80.4 top-1, 95.1 top-5
+These params are for dual Titan RTX cards with NVIDIA Apex installed:
+
 `./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016`
 
 ### MixNet-XL with RandAugment - 80.5 top-1, 94.9 top-5
+These params are for dual Titan RTX cards with NVIDIA Apex installed:
+
 `./distributed_train.sh 2 /imagenet/ --model mixnet_xl -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .969 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.3 --amp --lr .016 --dist-bn reduce`
 
+### SE-ResNeXt-26-D and SE-ResNeXt-26-T
+These hparams (or similar) work well for a wide range of ResNet architectures. It is generally a good idea to increase the number of epochs as the model size increases, i.e. approx 180-200 for ResNe(X)t50, and 220+ for larger. Increase batch size and LR proportionally for better GPUs or with AMP enabled. These params were for 2 1080Ti cards:
+
+`./distributed_train.sh 2 /imagenet/ --model seresnext26t_32x4d --lr 0.1 --warmup-epochs 5 --epochs 160 --weight-decay 1e-4 --sched cosine --reprob 0.4 --remode pixel -b 112`
+
 **TODO dig up some more**
 
 
diff --git a/timm/models/gluon_resnet.py b/timm/models/gluon_resnet.py
index 3d0f926f..f835a485 100644
--- a/timm/models/gluon_resnet.py
+++ b/timm/models/gluon_resnet.py
@@ -121,7 +121,7 @@ def gluon_resnet50_v1c(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """
     default_cfg = default_cfgs['gluon_resnet50_v1c']
     model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=32, deep_stem=True, **kwargs)
+                   stem_width=32, stem_type='deep', **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -134,7 +134,7 @@ def gluon_resnet101_v1c(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """
     default_cfg = default_cfgs['gluon_resnet101_v1c']
     model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=32, deep_stem=True, **kwargs)
+                   stem_width=32, stem_type='deep', **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -147,7 +147,7 @@ def gluon_resnet152_v1c(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """
     default_cfg = default_cfgs['gluon_resnet152_v1c']
     model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=32, deep_stem=True, **kwargs)
+                   stem_width=32, stem_type='deep', **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -160,7 +160,7 @@ def gluon_resnet50_v1d(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """
     default_cfg = default_cfgs['gluon_resnet50_v1d']
     model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=32, deep_stem=True, avg_down=True, **kwargs)
+                   stem_width=32, stem_type='deep', avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -173,7 +173,7 @@ def gluon_resnet101_v1d(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """
     default_cfg = default_cfgs['gluon_resnet101_v1d']
     model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=32, deep_stem=True, avg_down=True, **kwargs)
+                   stem_width=32, stem_type='deep', avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -186,7 +186,7 @@ def gluon_resnet152_v1d(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """
     default_cfg = default_cfgs['gluon_resnet152_v1d']
     model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=32, deep_stem=True, avg_down=True, **kwargs)
+                   stem_width=32, stem_type='deep', avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -199,7 +199,7 @@ def gluon_resnet50_v1e(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """
     default_cfg = default_cfgs['gluon_resnet50_v1e']
     model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=64, deep_stem=True, avg_down=True, **kwargs)
+                   stem_width=64, stem_type='deep', avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     #if pretrained:
     #    load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -212,7 +212,7 @@ def gluon_resnet101_v1e(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """
     default_cfg = default_cfgs['gluon_resnet101_v1e']
     model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=64, deep_stem=True, avg_down=True, **kwargs)
+                   stem_width=64, stem_type='deep', avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -225,7 +225,7 @@ def gluon_resnet152_v1e(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """
     default_cfg = default_cfgs['gluon_resnet152_v1e']
     model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=64, deep_stem=True, avg_down=True, **kwargs)
+                   stem_width=64, stem_type='deep', avg_down=True, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -238,7 +238,7 @@ def gluon_resnet50_v1s(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
     """
     default_cfg = default_cfgs['gluon_resnet50_v1s']
     model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=64, deep_stem=True, **kwargs)
+                   stem_width=64, stem_type='deep', **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -251,7 +251,7 @@ def gluon_resnet101_v1s(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """
     default_cfg = default_cfgs['gluon_resnet101_v1s']
     model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=64, deep_stem=True, **kwargs)
+                   stem_width=64, stem_type='deep', **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -264,7 +264,7 @@ def gluon_resnet152_v1s(pretrained=False, num_classes=1000, in_chans=3, **kwargs
     """
     default_cfg = default_cfgs['gluon_resnet152_v1s']
     model = ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes, in_chans=in_chans,
-                   stem_width=64, deep_stem=True, **kwargs)
+                   stem_width=64, stem_type='deep', **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
         load_pretrained(model, default_cfg, num_classes, in_chans)
@@ -362,7 +362,7 @@ def gluon_senet154(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
     default_cfg = default_cfgs['gluon_senet154']
     model = ResNet(
         Bottleneck, [3, 8, 36, 3], cardinality=64, base_width=4, use_se=True,
-        deep_stem=True, down_kernel_size=3, block_reduce_first=2,
+        stem_type='deep', down_kernel_size=3, block_reduce_first=2,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
diff --git a/timm/models/resnet.py b/timm/models/resnet.py
index b90bb9d5..9196cc0c 100644
--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@@ -91,6 +91,12 @@ default_cfgs = {
         url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x8-b4712904.pth'),
     'swsl_resnext101_32x16d': _cfg(
         url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x16-f3559a9c.pth'),
+    'seresnext26d_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26d_32x4d-80fa48a3.pth',
+        interpolation='bicubic'),
+    'seresnext26t_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26t_32x4d-361bc1c4.pth',
+        interpolation='bicubic'),
 }
 
 
@@ -231,10 +237,11 @@ class ResNet(nn.Module):
 
     ResNet variants:
       * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b
-      * c - 3 layer deep 3x3 stem, stem_width = 32
-      * d - 3 layer deep 3x3 stem, stem_width = 32, average pool in downsample
-      * e - 3 layer deep 3x3 stem, stem_width = 64, average pool in downsample
-      * s - 3 layer deep 3x3 stem, stem_width = 64
+      * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64)
+      * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample
+      * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample
+      * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128)
+      * t - 3 layer deep 3x3 stem, stem_width = 32 (24, 48, 64), average pool in downsample
 
     ResNeXt
       * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths
@@ -263,10 +270,13 @@ class ResNet(nn.Module):
         Number of convolution groups for 3x3 conv in Bottleneck.
     base_width : int, default 64
         Factor determining bottleneck channels. `planes * base_width / 64 * cardinality`
-    deep_stem : bool, default False
-        Whether to replace the 7x7 conv1 with 3 3x3 convolution layers.
     stem_width : int, default 64
         Number of channels in stem convolutions
+    stem_type : str, default ''
+        The type of stem:
+          * '', default - a single 7x7 conv with a fixed width of 64 (stem_width is not used)
+          * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2
+          * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width//4 * 6, stem_width * 2
     block_reduce_first: int, default 1
         Reduction factor for first convolution output width of residual blocks,
         1 for all archs except senets, where 2
@@ -283,12 +293,13 @@ class ResNet(nn.Module):
         Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax'
     """
     def __init__(self, block, layers, num_classes=1000, in_chans=3, use_se=False,
-                 cardinality=1, base_width=64, stem_width=64, deep_stem=False,
+                 cardinality=1, base_width=64, stem_width=64, stem_type='',
                  block_reduce_first=1, down_kernel_size=1, avg_down=False, dilated=False,
                  norm_layer=nn.BatchNorm2d, drop_rate=0.0, global_pool='avg',
                  zero_init_last_bn=True, block_args=None):
         block_args = block_args or dict()
         self.num_classes = num_classes
+        deep_stem = 'deep' in stem_type
         self.inplanes = stem_width * 2 if deep_stem else 64
         self.cardinality = cardinality
         self.base_width = base_width
@@ -298,16 +309,20 @@ class ResNet(nn.Module):
         super(ResNet, self).__init__()
 
         if deep_stem:
+            stem_chs_1 = stem_chs_2 = stem_width
+            if 'tiered' in stem_type:
+                stem_chs_1 = 3 * (stem_width // 4)
+                stem_chs_2 = 6 * (stem_width // 4)
             self.conv1 = nn.Sequential(*[
-                nn.Conv2d(in_chans, stem_width, 3, stride=2, padding=1, bias=False),
-                norm_layer(stem_width),
+                nn.Conv2d(in_chans, stem_chs_1, 3, stride=2, padding=1, bias=False),
+                norm_layer(stem_chs_1),
                 nn.ReLU(inplace=True),
-                nn.Conv2d(stem_width, stem_width, 3, stride=1, padding=1, bias=False),
-                norm_layer(stem_width),
+                nn.Conv2d(stem_chs_1, stem_chs_2, 3, stride=1, padding=1, bias=False),
+                norm_layer(stem_chs_2),
                 nn.ReLU(inplace=True),
-                nn.Conv2d(stem_width, self.inplanes, 3, stride=1, padding=1, bias=False)])
+                nn.Conv2d(stem_chs_2, self.inplanes, 3, stride=1, padding=1, bias=False)])
         else:
-            self.conv1 = nn.Conv2d(in_chans, stem_width, kernel_size=7, stride=2, padding=3, bias=False)
+            self.conv1 = nn.Conv2d(in_chans, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
         self.bn1 = norm_layer(self.inplanes)
         self.relu = nn.ReLU(inplace=True)
         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
@@ -324,7 +339,7 @@ class ResNet(nn.Module):
         self.num_features = 512 * block.expansion
         self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
 
-        last_bn_name = 'bn3' if 'Bottleneck' in block.__name__ else 'bn2'
+        last_bn_name = 'bn3' if 'Bottle' in block.__name__ else 'bn2'
         for n, m in self.named_modules():
             if isinstance(m, nn.Conv2d):
                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
@@ -440,7 +455,7 @@ def resnet26d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
     """
     default_cfg = default_cfgs['resnet26d']
     model = ResNet(
-        Bottleneck, [2, 2, 2, 2], stem_width=32, deep_stem=True, avg_down=True,
+        Bottleneck, [2, 2, 2, 2], stem_width=32, stem_type='deep', avg_down=True,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -466,7 +481,7 @@ def resnet50d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
     """
     default_cfg = default_cfgs['resnet50d']
     model = ResNet(
-        Bottleneck, [3, 4, 6, 3], stem_width=32, deep_stem=True, avg_down=True,
+        Bottleneck, [3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -574,7 +589,7 @@ def resnext50d_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
     default_cfg = default_cfgs['resnext50d_32x4d']
     model = ResNet(
         Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4,
-        stem_width=32, deep_stem=True, avg_down=True,
+        stem_width=32, stem_type='deep', avg_down=True,
         num_classes=num_classes, in_chans=in_chans, **kwargs)
     model.default_cfg = default_cfg
     if pretrained:
@@ -854,3 +869,34 @@ def swsl_resnext101_32x16d(pretrained=True, **kwargs):
     if pretrained:
         load_pretrained(model, num_classes=kwargs.get('num_classes', 0), in_chans=kwargs.get('in_chans', 3))
     return model
+
+
+@register_model
+def seresnext26d_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Constructs a SE-ResNeXt-26-D model (32x4d, deep 3x3 stem, average pool in downsample).
+    This is technically a 28 layer ResNet, sticking with 'd' modifier from Gluon for now.
+    """
+    default_cfg = default_cfgs['seresnext26d_32x4d']
+    model = ResNet(
+        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
+        stem_width=32, stem_type='deep', avg_down=True, use_se=True,
+        num_classes=num_classes, in_chans=in_chans, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes, in_chans)
+    return model
+
+
+@register_model
+def seresnext26t_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Constructs a SE-ResNeXt-26-T model (32x4d, deep tiered 3x3 stem, average pool in downsample).
+    """
+    default_cfg = default_cfgs['seresnext26t_32x4d']
+    model = ResNet(
+        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
+        stem_width=32, stem_type='deep_tiered', avg_down=True, use_se=True,
+        num_classes=num_classes, in_chans=in_chans, **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes, in_chans)
+    return model