From 38c474e3de0347e0e5545fb809c4b70de6a69977 Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Fri, 25 Aug 2023 11:04:34 -0700
Subject: [PATCH] Update README, add --reparam to onnx_export

---
 README.md       | 195 ++++------------------------------------------
 docs/changes.md | 174 ++++++++++++++++++++++++++++++++++++++++++
 onnx_export.py  |  13 +++-
 3 files changed, 208 insertions(+), 174 deletions(-)

diff --git a/README.md b/README.md
index d254f321..25f9e092 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,19 @@ And a big thanks to all GitHub sponsors who helped with some of my costs before
 * The Hugging Face Hub (https://huggingface.co/timm) is now the primary source for `timm` weights. Model cards include links to papers, original source, and license.
 * Previous 0.6.x can be cloned from the [0.6.x](https://github.com/rwightman/pytorch-image-models/tree/0.6.x) branch or installed via pip with a pinned version.
 
+### Aug 25, 2023
+* Many new models since the last release
+  * FastViT - https://arxiv.org/abs/2303.14189
+  * MobileOne - https://arxiv.org/abs/2206.04040
+  * InceptionNeXt - https://arxiv.org/abs/2303.16900
+  * RepGhostNet - https://arxiv.org/abs/2211.06088 (thanks https://github.com/ChengpengChen)
+  * GhostNetV2 - https://arxiv.org/abs/2211.12905 (thanks https://github.com/yehuitang)
+  * EfficientViT (MSRA) - https://arxiv.org/abs/2305.07027 (thanks https://github.com/seefun)
+  * EfficientViT (MIT) - https://arxiv.org/abs/2205.14756 (thanks https://github.com/seefun)
+* Add `--reparam` arg to `benchmark.py`, `onnx_export.py`, and `validate.py` to trigger layer reparameterization / fusion for models with any one of `reparameterize()`, `switch_to_deploy()`, or `fuse()` (see the usage sketch below)
+  * Includes FastViT, MobileOne, RepGhostNet, EfficientViT (MSRA), RepViT, RepVGG, and LeViT
+* Preparing 0.9.6 'back to school' release
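+
+A minimal sketch of what `--reparam` triggers (assuming a `timm` build that ships `timm.utils.model.reparameterize_model`; `fastvit_t8` is just an example of a reparameterizable model):
+
+```python
+import torch
+import timm
+from timm.utils.model import reparameterize_model
+
+model = timm.create_model('fastvit_t8', pretrained=False).eval()
+# Folds train-time branches into single layers wherever a module exposes
+# reparameterize(), switch_to_deploy(), or fuse().
+model = reparameterize_model(model)
+
+with torch.no_grad():
+    out = model(torch.randn(1, 3, 256, 256))  # FastViT default input size
+print(out.shape)  # torch.Size([1, 1000])
+```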
 
 ### Aug 3, 2023
 * Add GluonCV weights for HRNet w18_small and w18_small_v2. Converted by [SeeFun](https://github.com/seefun)
 * Fix `selecsls*` model naming regression
@@ -380,179 +393,6 @@ And a big thanks to all GitHub sponsors who helped with some of my costs before
   * `maxvit_tiny_rw_224` - 83.5 @ 224 (G)
   * `maxvit_rmlp_tiny_rw_256` - 84.2 @ 256, 84.8 @ 320 (T)
 
-### Aug 29, 2022
-* MaxVit window size scales with img_size by default. Add new RelPosMlp MaxViT weight that leverages this:
-  * `maxvit_rmlp_nano_rw_256` - 83.0 @ 256, 83.6 @ 320 (T)
-
-### Aug 26, 2022
-* CoAtNet (https://arxiv.org/abs/2106.04803) and MaxVit (https://arxiv.org/abs/2204.01697) `timm` original models
-  * both are defined in [`maxxvit.py`](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/maxxvit.py), which contains numerous experiments outside the scope of the original papers
-  * an unfinished TensorFlow version from the MaxVit authors can be found at https://github.com/google-research/maxvit
-* Initial CoAtNet and MaxVit timm pretrained weights (working on more):
-  * `coatnet_nano_rw_224` - 81.7 @ 224 (T)
-  * `coatnet_rmlp_nano_rw_224` - 82.0 @ 224, 82.8 @ 320 (T)
-  * `coatnet_0_rw_224` - 82.4 (T) -- NOTE timm '0' coatnets have 2 more 3rd stage blocks
-  * `coatnet_bn_0_rw_224` - 82.4 (T)
-  * `maxvit_nano_rw_256` - 82.9 @ 256 (T)
-  * `coatnet_rmlp_1_rw_224` - 83.4 @ 224, 84 @ 320 (T)
-  * `coatnet_1_rw_224` - 83.6 @ 224 (G)
-  * (T) = TPU trained with `bits_and_tpu` branch training code, (G) = GPU trained
-* GCVit (weights adapted from https://github.com/NVlabs/GCVit, code 100% `timm` re-write for license purposes)
-* MViT-V2 (multi-scale vit, adapted from https://github.com/facebookresearch/mvit)
-* EfficientFormer (adapted from https://github.com/snap-research/EfficientFormer)
-* PyramidVisionTransformer-V2 (adapted from https://github.com/whai362/PVT)
-* 'Fast Norm' support for LayerNorm and GroupNorm that avoids float32 upcast w/ AMP (uses APEX LN if available for a further boost)
-
-### Aug 15, 2022
-* ConvNeXt atto weights added
-  * `convnext_atto` - 75.7 @ 224, 77.0 @ 288
-  * `convnext_atto_ols` - 75.9 @ 224, 77.2 @ 288
-
-### Aug 5, 2022
-* More custom ConvNeXt smaller model defs with weights
-  * `convnext_femto` - 77.5 @ 224, 78.7 @ 288
-  * `convnext_femto_ols` - 77.9 @ 224, 78.9 @ 288
-  * `convnext_pico` - 79.5 @ 224, 80.4 @ 288
-  * `convnext_pico_ols` - 79.5 @ 224, 80.5 @ 288
-  * `convnext_nano_ols` - 80.9 @ 224, 81.6 @ 288
-* Updated EdgeNeXt to improve ONNX export, add new base variant and weights from the original repo (https://github.com/mmaaz60/EdgeNeXt)
-
-### July 28, 2022
-* Add freshly minted DeiT-III Medium (width=512, depth=12, num_heads=8) model weights. Thanks [Hugo Touvron](https://github.com/TouvronHugo)!
-
-### July 27, 2022
-* All runtime benchmark and validation result csv files are finally up-to-date!
-* A few more weights & model defs added:
-  * `darknetaa53` - 79.8 @ 256, 80.5 @ 288
-  * `convnext_nano` - 80.8 @ 224, 81.5 @ 288
-  * `cs3sedarknet_l` - 81.2 @ 256, 81.8 @ 288
-  * `cs3darknet_x` - 81.8 @ 256, 82.2 @ 288
-  * `cs3sedarknet_x` - 82.2 @ 256, 82.7 @ 288
-  * `cs3edgenet_x` - 82.2 @ 256, 82.7 @ 288
-  * `cs3se_edgenet_x` - 82.8 @ 256, 83.5 @ 320
-* `cs3*` weights above all trained on TPU w/ `bits_and_tpu` branch. Thanks to the TRC program!
-* Add output_stride=8 and 16 support to ConvNeXt (dilation)
-* Fixed deit3 models not being able to resize pos_emb
-* Version 0.6.7 PyPi release (w/ above bug fixes and new weights since 0.6.5)
-
-### July 8, 2022
-More models, more fixes
-* Official research models (w/ weights) added:
-  * EdgeNeXt from (https://github.com/mmaaz60/EdgeNeXt)
-  * MobileViT-V2 from (https://github.com/apple/ml-cvnets)
-  * DeiT III (Revenge of the ViT) from (https://github.com/facebookresearch/deit)
-* My own models:
-  * Small `ResNet` defs added by request with 1 block repeats for both basic and bottleneck (resnet10 and resnet14)
-  * `CspNet` refactored with dataclass config, simplified CrossStage3 (`cs3`) option. These are closer to YOLO-v5+ backbone defs.
-  * More relative position vit fiddling. Two `srelpos` (shared relative position) models trained, and a medium w/ class token.
-  * Add an alternate downsample mode to EdgeNeXt and train a `small` model. Better than the original small, but not their new USI trained weights.
-* My own model weight results (all ImageNet-1k training)
-  * `resnet10t` - 66.5 @ 176, 68.3 @ 224
-  * `resnet14t` - 71.3 @ 176, 72.3 @ 224
-  * `resnetaa50` - 80.6 @ 224, 81.6 @ 288
-  * `darknet53` - 80.0 @ 256, 80.5 @ 288
-  * `cs3darknet_m` - 77.0 @ 256, 77.6 @ 288
-  * `cs3darknet_focus_m` - 76.7 @ 256, 77.3 @ 288
-  * `cs3darknet_l` - 80.4 @ 256, 80.9 @ 288
-  * `cs3darknet_focus_l` - 80.3 @ 256, 80.9 @ 288
-  * `vit_srelpos_small_patch16_224` - 81.1 @ 224, 82.1 @ 320
-  * `vit_srelpos_medium_patch16_224` - 82.3 @ 224, 83.1 @ 320
-  * `vit_relpos_small_patch16_cls_224` - 82.6 @ 224, 83.6 @ 320
-  * `edgenext_small_rw` - 79.6 @ 224, 80.4 @ 320
-* `cs3`, `darknet`, and `vit_*relpos` weights above all trained on TPU thanks to the TRC program! The rest were trained on overheating GPUs.
-* Hugging Face Hub support fixes verified, demo notebook TBA
-* Pretrained weights / configs can be loaded externally (ie from local disk) w/ support for head adaptation.
-* Add support to change image extensions scanned by `timm` datasets/readers. See (https://github.com/rwightman/pytorch-image-models/pull/1274#issuecomment-1178303103)
-* Default ConvNeXt LayerNorm impl to use `F.layer_norm(x.permute(0, 2, 3, 1), ...).permute(0, 3, 1, 2)` via `LayerNorm2d` in all cases.
-  * a bit slower than the previous custom impl on some hardware (ie Ampere w/ CL), but overall fewer regressions across wider HW / PyTorch version ranges
-  * previous impl exists as `LayerNormExp2d` in `models/layers/norm.py`
-* Numerous bug fixes
-* Currently testing for imminent PyPi 0.6.x release
-* LeViT pretraining of larger models is still a WIP; they don't train well / easily without distillation. Time to add distill support (finally)?
-* ImageNet-22k weight training + finetune ongoing, work on multi-weight support (slowly) chugging along (there are a LOT of weights, sigh) ...
-
-### May 13, 2022
-* Official Swin-V2 models and weights added from (https://github.com/microsoft/Swin-Transformer). Cleaned up to support torchscript.
-* Some refactoring for the existing `timm` Swin-V2-CR impl, will likely do a bit more to bring parts closer to the official version and decide whether to merge some aspects.
-* More Vision Transformer relative position / residual post-norm experiments (all trained on TPU thanks to the TRC program)
-  * `vit_relpos_small_patch16_224` - 81.5 @ 224, 82.5 @ 320 -- rel pos, layer scale, no class token, avg pool
-  * `vit_relpos_medium_patch16_rpn_224` - 82.3 @ 224, 83.1 @ 320 -- rel pos + res-post-norm, no class token, avg pool
-  * `vit_relpos_medium_patch16_224` - 82.5 @ 224, 83.3 @ 320 -- rel pos, layer scale, no class token, avg pool
-  * `vit_relpos_base_patch16_gapcls_224` - 82.8 @ 224, 83.9 @ 320 -- rel pos, layer scale, class token, avg pool (by mistake)
-* Bring the 512-dim, 8-head 'medium' ViT model variant back to life (after using it in a pre-DeiT 'small' model for the first ViT impl back in 2020)
-* Add ViT relative position support for switching btw existing impl and some additions in the official Swin-V2 impl for future trials
-* Sequencer2D impl (https://arxiv.org/abs/2205.01972), added via PR from the author (https://github.com/okojoalg)
-
-### May 2, 2022
-* Vision Transformer experiments adding Relative Position (Swin-V2 log-coord) (`vision_transformer_relpos.py`) and Residual Post-Norm branches (from Swin-V2) (`vision_transformer*.py`)
-  * `vit_relpos_base_patch32_plus_rpn_256` - 79.5 @ 256, 80.6 @ 320 -- rel pos + extended width + res-post-norm, no class token, avg pool
-  * `vit_relpos_base_patch16_224` - 82.5 @ 224, 83.6 @ 320 -- rel pos, layer scale, no class token, avg pool
-  * `vit_base_patch16_rpn_224` - 82.3 @ 224 -- rel pos + res-post-norm, no class token, avg pool
-* Vision Transformer refactor to remove the representation layer that was only used in the initial vit and rarely used since with newer pretrain (ie `How to Train Your ViT`)
-* `vit_*` models support removal of class token, use of global average pool, use of fc_norm (ala beit, mae).
-
-### April 22, 2022
-* `timm` models are now officially supported in [fast.ai](https://www.fast.ai/)! Just in time for the new Practical Deep Learning course. `timmdocs` documentation link updated to [timm.fast.ai](http://timm.fast.ai/).
-* Two more model weights added in the TPU trained [series](https://github.com/rwightman/pytorch-image-models/releases/tag/v0.1-tpu-weights). Some In22k pretrain still in progress.
-  * `seresnext101d_32x8d` - 83.69 @ 224, 84.35 @ 288
-  * `seresnextaa101d_32x8d` (anti-aliased w/ AvgPool2d) - 83.85 @ 224, 84.57 @ 288
-
-### March 23, 2022
-* Add `ParallelBlock` and `LayerScale` option to base vit models to support model configs in [Three things everyone should know about ViT](https://arxiv.org/abs/2203.09795)
-* `convnext_tiny_hnf` (head norm first) weights trained with (close to) A2 recipe, 82.2% top-1, could do better with more epochs.
-
-### March 21, 2022
-* Merge `norm_norm_norm`. **IMPORTANT** this update for the coming 0.6.x release will likely de-stabilize the master branch for a while. The [`0.5.x`](https://github.com/rwightman/pytorch-image-models/tree/0.5.x) branch or a previous 0.5.x release can be used if stability is required.
-* Significant weights update (all TPU trained) as described in this [release](https://github.com/rwightman/pytorch-image-models/releases/tag/v0.1-tpu-weights)
-  * `regnety_040` - 82.3 @ 224, 82.96 @ 288
-  * `regnety_064` - 83.0 @ 224, 83.65 @ 288
-  * `regnety_080` - 83.17 @ 224, 83.86 @ 288
-  * `regnetv_040` - 82.44 @ 224, 83.18 @ 288 (timm pre-act)
-  * `regnetv_064` - 83.1 @ 224, 83.71 @ 288 (timm pre-act)
-  * `regnetz_040` - 83.67 @ 256, 84.25 @ 320
-  * `regnetz_040h` - 83.77 @ 256, 84.5 @ 320 (w/ extra fc in head)
-  * `resnetv2_50d_gn` - 80.8 @ 224, 81.96 @ 288 (pre-act GroupNorm)
-  * `resnetv2_50d_evos` - 80.77 @ 224, 82.04 @ 288 (pre-act EvoNormS)
-  * `regnetz_c16_evos` - 81.9 @ 256, 82.64 @ 320 (EvoNormS)
-  * `regnetz_d8_evos` - 83.42 @ 256, 84.04 @ 320 (EvoNormS)
-  * `xception41p` - 82 @ 299 (timm pre-act)
-  * `xception65` - 83.17 @ 299
-  * `xception65p` - 83.14 @ 299 (timm pre-act)
-  * `resnext101_64x4d` - 82.46 @ 224, 83.16 @ 288
-  * `seresnext101_32x8d` - 83.57 @ 224, 84.270 @ 288
-  * `resnetrs200` - 83.85 @ 256, 84.44 @ 320
-* HuggingFace hub support fixed w/ initial groundwork for allowing alternative 'config sources' for pretrained model definitions and weights (generic local file / remote url support soon)
-* SwinTransformer-V2 implementation added. Submitted by [Christoph Reich](https://github.com/ChristophReich1996). Training experiments and model changes by myself are ongoing, so expect compat breaks.
-* Swin-S3 (AutoFormerV2) models / weights added from https://github.com/microsoft/Cream/tree/main/AutoFormerV2
-* MobileViT models w/ weights adapted from https://github.com/apple/ml-cvnets
-* PoolFormer models w/ weights adapted from https://github.com/sail-sg/poolformer
-* VOLO models w/ weights adapted from https://github.com/sail-sg/volo
-* Significant work experimenting with non-BatchNorm norm layers such as EvoNorm, FilterResponseNorm, GroupNorm, etc
-* Enhanced alternate norm + act ('NormAct') layer support added to a number of models, esp EfficientNet/MobileNetV3, RegNet, and aligned Xception
-* Grouped conv support added to EfficientNet family
-* Add 'group matching' API to all models to allow grouping model parameters for application of 'layer-wise' LR decay, lr scale added to LR scheduler
-* Gradient checkpointing support added to many models
-* `forward_head(x, pre_logits=False)` fn added to all models to allow separate calls of `forward_features` + `forward_head`
-* All vision transformer and vision MLP models updated to return non-pooled / non-token selected features from `forward_features`, for consistency with CNN models; token selection or pooling is now applied in `forward_head`
-
-### Feb 2, 2022
-* [Chris Hughes](https://github.com/Chris-hughes10) posted an exhaustive run-through of `timm` on his blog yesterday. Well worth a read. [Getting Started with PyTorch Image Models (timm): A Practitioner's Guide](https://towardsdatascience.com/getting-started-with-pytorch-image-models-timm-a-practitioners-guide-4e77b4bf9055)
-* I'm currently prepping to merge the `norm_norm_norm` branch back to master (ver 0.6.x) in the next week or so.
-  * The changes are more extensive than usual and may destabilize and break some model API use (aiming for full backwards compat). So, beware `pip install git+https://github.com/rwightman/pytorch-image-models` installs!
-  * `0.5.x` releases and a `0.5.x` branch will remain stable with a cherry pick or two until the dust clears. Recommend sticking to the pypi install for a bit if you want stable.
-### Jan 14, 2022
-* Version 0.5.4 release to be pushed to pypi. It's been a while since the last pypi update and riskier changes will be merged to the main branch soon....
-* Add ConvNeXt models w/ weights from the official impl (https://github.com/facebookresearch/ConvNeXt), a few perf tweaks, compatible with timm features
-* Tried training a few small (~1.8-3M param) / mobile optimized models, a few are good so far, more on the way...
-  * `mnasnet_small` - 65.6 top-1
-  * `mobilenetv2_050` - 65.9
-  * `lcnet_100/075/050` - 72.1 / 68.8 / 63.1
-  * `semnasnet_075` - 73
-  * `fbnetv3_b/d/g` - 79.1 / 79.7 / 82.0
-* TinyNet models added by [rsomani95](https://github.com/rsomani95)
-* LCNet added via MobileNetV3 architecture
 
 ## Introduction
 
@@ -594,26 +434,33 @@ All model architecture families include variants with pretrained weights. There
 * MobileNet-V2 - https://arxiv.org/abs/1801.04381
 * Single-Path NAS - https://arxiv.org/abs/1904.02877
 * TinyNet - https://arxiv.org/abs/2010.14819
+* EfficientViT (MIT) - https://arxiv.org/abs/2205.14756
+* EfficientViT (MSRA) - https://arxiv.org/abs/2305.07027
 * EVA - https://arxiv.org/abs/2211.07636
 * EVA-02 - https://arxiv.org/abs/2303.11331
+* FastViT - https://arxiv.org/abs/2303.14189
 * FlexiViT - https://arxiv.org/abs/2212.08013
 * FocalNet (Focal Modulation Networks) - https://arxiv.org/abs/2203.11926
 * GCViT (Global Context Vision Transformer) - https://arxiv.org/abs/2206.09959
 * GhostNet - https://arxiv.org/abs/1911.11907
+* GhostNet-V2 - https://arxiv.org/abs/2211.12905
 * gMLP - https://arxiv.org/abs/2105.08050
 * GPU-Efficient Networks - https://arxiv.org/abs/2006.14090
 * Halo Nets - https://arxiv.org/abs/2103.12731
 * HRNet - https://arxiv.org/abs/1908.07919
+* InceptionNeXt - https://arxiv.org/abs/2303.16900
 * Inception-V3 - https://arxiv.org/abs/1512.00567
 * Inception-ResNet-V2 and Inception-V4 - https://arxiv.org/abs/1602.07261
 * Lambda Networks - https://arxiv.org/abs/2102.08602
 * LeViT (Vision Transformer in ConvNet's Clothing) - https://arxiv.org/abs/2104.01136
 * MaxViT (Multi-Axis Vision Transformer) - https://arxiv.org/abs/2204.01697
+* MetaFormer (PoolFormer-v2, ConvFormer, CAFormer) - https://arxiv.org/abs/2210.13452
 * MLP-Mixer - https://arxiv.org/abs/2105.01601
 * MobileNet-V3 (MBConvNet w/ Efficient Head) - https://arxiv.org/abs/1905.02244
  * FBNet-V3 - https://arxiv.org/abs/2006.02049
  * HardCoRe-NAS - https://arxiv.org/abs/2102.11646
  * LCNet - https://arxiv.org/abs/2109.15099
+* MobileOne - https://arxiv.org/abs/2206.04040
 * MobileViT - https://arxiv.org/abs/2110.02178
 * MobileViT-V2 - https://arxiv.org/abs/2206.02680
 * MViT-V2 (Improved Multiscale Vision Transformer) - https://arxiv.org/abs/2112.01526
@@ -628,6 +475,8 @@ All model architecture families include variants with pretrained weights. There
 * RegNet - https://arxiv.org/abs/2003.13678
 * RegNetZ - https://arxiv.org/abs/2103.06877
 * RepVGG - https://arxiv.org/abs/2101.03697
+* RepGhostNet - https://arxiv.org/abs/2211.06088
+* RepViT - https://arxiv.org/abs/2307.09283
 * ResMLP - https://arxiv.org/abs/2105.03404
 * ResNet/ResNeXt
  * ResNet (v1b/v1.5) - https://arxiv.org/abs/1512.03385
diff --git a/docs/changes.md b/docs/changes.md
index edf88c62..e28a4ff3 100644
--- a/docs/changes.md
+++ b/docs/changes.md
@@ -1,4 +1,178 @@
 # Recent Changes
+
+### Aug 29, 2022
+* MaxVit window size scales with img_size by default. Add new RelPosMlp MaxViT weight that leverages this:
+  * `maxvit_rmlp_nano_rw_256` - 83.0 @ 256, 83.6 @ 320 (T)
+
+### Aug 26, 2022
+* CoAtNet (https://arxiv.org/abs/2106.04803) and MaxVit (https://arxiv.org/abs/2204.01697) `timm` original models
+  * both are defined in [`maxxvit.py`](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/maxxvit.py), which contains numerous experiments outside the scope of the original papers
+  * an unfinished TensorFlow version from the MaxVit authors can be found at https://github.com/google-research/maxvit
+* Initial CoAtNet and MaxVit timm pretrained weights (working on more):
+  * `coatnet_nano_rw_224` - 81.7 @ 224 (T)
+  * `coatnet_rmlp_nano_rw_224` - 82.0 @ 224, 82.8 @ 320 (T)
+  * `coatnet_0_rw_224` - 82.4 (T) -- NOTE timm '0' coatnets have 2 more 3rd stage blocks
+  * `coatnet_bn_0_rw_224` - 82.4 (T)
+  * `maxvit_nano_rw_256` - 82.9 @ 256 (T)
+  * `coatnet_rmlp_1_rw_224` - 83.4 @ 224, 84 @ 320 (T)
+  * `coatnet_1_rw_224` - 83.6 @ 224 (G)
+  * (T) = TPU trained with `bits_and_tpu` branch training code, (G) = GPU trained
+* GCVit (weights adapted from https://github.com/NVlabs/GCVit, code 100% `timm` re-write for license purposes)
+* MViT-V2 (multi-scale vit, adapted from https://github.com/facebookresearch/mvit)
+* EfficientFormer (adapted from https://github.com/snap-research/EfficientFormer)
+* PyramidVisionTransformer-V2 (adapted from https://github.com/whai362/PVT)
+* 'Fast Norm' support for LayerNorm and GroupNorm that avoids float32 upcast w/ AMP (uses APEX LN if available for a further boost)
+
+### Aug 15, 2022
+* ConvNeXt atto weights added
+  * `convnext_atto` - 75.7 @ 224, 77.0 @ 288
+  * `convnext_atto_ols` - 75.9 @ 224, 77.2 @ 288
+
+### Aug 5, 2022
+* More custom ConvNeXt smaller model defs with weights
+  * `convnext_femto` - 77.5 @ 224, 78.7 @ 288
+  * `convnext_femto_ols` - 77.9 @ 224, 78.9 @ 288
+  * `convnext_pico` - 79.5 @ 224, 80.4 @ 288
+  * `convnext_pico_ols` - 79.5 @ 224, 80.5 @ 288
+  * `convnext_nano_ols` - 80.9 @ 224, 81.6 @ 288
+* Updated EdgeNeXt to improve ONNX export, add new base variant and weights from the original repo (https://github.com/mmaaz60/EdgeNeXt)
+
+### July 28, 2022
+* Add freshly minted DeiT-III Medium (width=512, depth=12, num_heads=8) model weights. Thanks [Hugo Touvron](https://github.com/TouvronHugo)!
+
+### July 27, 2022
+* All runtime benchmark and validation result csv files are finally up-to-date!
+* A few more weights & model defs added:
+  * `darknetaa53` - 79.8 @ 256, 80.5 @ 288
+  * `convnext_nano` - 80.8 @ 224, 81.5 @ 288
+  * `cs3sedarknet_l` - 81.2 @ 256, 81.8 @ 288
+  * `cs3darknet_x` - 81.8 @ 256, 82.2 @ 288
+  * `cs3sedarknet_x` - 82.2 @ 256, 82.7 @ 288
+  * `cs3edgenet_x` - 82.2 @ 256, 82.7 @ 288
+  * `cs3se_edgenet_x` - 82.8 @ 256, 83.5 @ 320
+* `cs3*` weights above all trained on TPU w/ `bits_and_tpu` branch. Thanks to the TRC program!
+* Add output_stride=8 and 16 support to ConvNeXt (dilation)
+* Fixed deit3 models not being able to resize pos_emb
+* Version 0.6.7 PyPi release (w/ above bug fixes and new weights since 0.6.5)
+
+### July 8, 2022
+More models, more fixes
+* Official research models (w/ weights) added:
+  * EdgeNeXt from (https://github.com/mmaaz60/EdgeNeXt)
+  * MobileViT-V2 from (https://github.com/apple/ml-cvnets)
+  * DeiT III (Revenge of the ViT) from (https://github.com/facebookresearch/deit)
+* My own models:
+  * Small `ResNet` defs added by request with 1 block repeats for both basic and bottleneck (resnet10 and resnet14)
+  * `CspNet` refactored with dataclass config, simplified CrossStage3 (`cs3`) option. These are closer to YOLO-v5+ backbone defs.
+  * More relative position vit fiddling. Two `srelpos` (shared relative position) models trained, and a medium w/ class token.
+  * Add an alternate downsample mode to EdgeNeXt and train a `small` model. Better than the original small, but not their new USI trained weights.
+* My own model weight results (all ImageNet-1k training)
+  * `resnet10t` - 66.5 @ 176, 68.3 @ 224
+  * `resnet14t` - 71.3 @ 176, 72.3 @ 224
+  * `resnetaa50` - 80.6 @ 224, 81.6 @ 288
+  * `darknet53` - 80.0 @ 256, 80.5 @ 288
+  * `cs3darknet_m` - 77.0 @ 256, 77.6 @ 288
+  * `cs3darknet_focus_m` - 76.7 @ 256, 77.3 @ 288
+  * `cs3darknet_l` - 80.4 @ 256, 80.9 @ 288
+  * `cs3darknet_focus_l` - 80.3 @ 256, 80.9 @ 288
+  * `vit_srelpos_small_patch16_224` - 81.1 @ 224, 82.1 @ 320
+  * `vit_srelpos_medium_patch16_224` - 82.3 @ 224, 83.1 @ 320
+  * `vit_relpos_small_patch16_cls_224` - 82.6 @ 224, 83.6 @ 320
+  * `edgenext_small_rw` - 79.6 @ 224, 80.4 @ 320
+* `cs3`, `darknet`, and `vit_*relpos` weights above all trained on TPU thanks to the TRC program! The rest were trained on overheating GPUs.
+* Hugging Face Hub support fixes verified, demo notebook TBA
+* Pretrained weights / configs can be loaded externally (ie from local disk) w/ support for head adaptation.
+* Add support to change image extensions scanned by `timm` datasets/readers. See (https://github.com/rwightman/pytorch-image-models/pull/1274#issuecomment-1178303103)
+* Default ConvNeXt LayerNorm impl to use `F.layer_norm(x.permute(0, 2, 3, 1), ...).permute(0, 3, 1, 2)` via `LayerNorm2d` in all cases (a minimal sketch follows this section)
+  * a bit slower than the previous custom impl on some hardware (ie Ampere w/ CL), but overall fewer regressions across wider HW / PyTorch version ranges
+  * previous impl exists as `LayerNormExp2d` in `models/layers/norm.py`
+* Numerous bug fixes
+* Currently testing for imminent PyPi 0.6.x release
+* LeViT pretraining of larger models is still a WIP; they don't train well / easily without distillation. Time to add distill support (finally)?
+* ImageNet-22k weight training + finetune ongoing, work on multi-weight support (slowly) chugging along (there are a LOT of weights, sigh) ...
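+
+A minimal sketch of the permute-based `LayerNorm2d` approach described above (illustrative, not the exact `timm` source):
+
+```python
+import torch
+import torch.nn.functional as F
+
+class LayerNorm2d(torch.nn.LayerNorm):
+    """LayerNorm for NCHW tensors: permute to NHWC, normalize, permute back."""
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.permute(0, 2, 3, 1)  # NCHW -> NHWC
+        x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        return x.permute(0, 3, 1, 2)  # NHWC -> NCHW
+```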
+
+### May 13, 2022
+* Official Swin-V2 models and weights added from (https://github.com/microsoft/Swin-Transformer). Cleaned up to support torchscript.
+* Some refactoring for the existing `timm` Swin-V2-CR impl, will likely do a bit more to bring parts closer to the official version and decide whether to merge some aspects.
+* More Vision Transformer relative position / residual post-norm experiments (all trained on TPU thanks to the TRC program)
+  * `vit_relpos_small_patch16_224` - 81.5 @ 224, 82.5 @ 320 -- rel pos, layer scale, no class token, avg pool
+  * `vit_relpos_medium_patch16_rpn_224` - 82.3 @ 224, 83.1 @ 320 -- rel pos + res-post-norm, no class token, avg pool
+  * `vit_relpos_medium_patch16_224` - 82.5 @ 224, 83.3 @ 320 -- rel pos, layer scale, no class token, avg pool
+  * `vit_relpos_base_patch16_gapcls_224` - 82.8 @ 224, 83.9 @ 320 -- rel pos, layer scale, class token, avg pool (by mistake)
+* Bring the 512-dim, 8-head 'medium' ViT model variant back to life (after using it in a pre-DeiT 'small' model for the first ViT impl back in 2020)
+* Add ViT relative position support for switching btw existing impl and some additions in the official Swin-V2 impl for future trials
+* Sequencer2D impl (https://arxiv.org/abs/2205.01972), added via PR from the author (https://github.com/okojoalg)
+
+### May 2, 2022
+* Vision Transformer experiments adding Relative Position (Swin-V2 log-coord) (`vision_transformer_relpos.py`) and Residual Post-Norm branches (from Swin-V2) (`vision_transformer*.py`)
+  * `vit_relpos_base_patch32_plus_rpn_256` - 79.5 @ 256, 80.6 @ 320 -- rel pos + extended width + res-post-norm, no class token, avg pool
+  * `vit_relpos_base_patch16_224` - 82.5 @ 224, 83.6 @ 320 -- rel pos, layer scale, no class token, avg pool
+  * `vit_base_patch16_rpn_224` - 82.3 @ 224 -- rel pos + res-post-norm, no class token, avg pool
+* Vision Transformer refactor to remove the representation layer that was only used in the initial vit and rarely used since with newer pretrain (ie `How to Train Your ViT`)
+* `vit_*` models support removal of class token, use of global average pool, use of fc_norm (ala beit, mae).
+
+### April 22, 2022
+* `timm` models are now officially supported in [fast.ai](https://www.fast.ai/)! Just in time for the new Practical Deep Learning course. `timmdocs` documentation link updated to [timm.fast.ai](http://timm.fast.ai/).
+* Two more model weights added in the TPU trained [series](https://github.com/rwightman/pytorch-image-models/releases/tag/v0.1-tpu-weights). Some In22k pretrain still in progress.
+  * `seresnext101d_32x8d` - 83.69 @ 224, 84.35 @ 288
+  * `seresnextaa101d_32x8d` (anti-aliased w/ AvgPool2d) - 83.85 @ 224, 84.57 @ 288
+
+### March 23, 2022
+* Add `ParallelBlock` and `LayerScale` option to base vit models to support model configs in [Three things everyone should know about ViT](https://arxiv.org/abs/2203.09795)
+* `convnext_tiny_hnf` (head norm first) weights trained with (close to) A2 recipe, 82.2% top-1, could do better with more epochs.
+
+### March 21, 2022
+* Merge `norm_norm_norm`. **IMPORTANT** this update for the coming 0.6.x release will likely de-stabilize the master branch for a while. The [`0.5.x`](https://github.com/rwightman/pytorch-image-models/tree/0.5.x) branch or a previous 0.5.x release can be used if stability is required.
+* Significant weights update (all TPU trained) as described in this [release](https://github.com/rwightman/pytorch-image-models/releases/tag/v0.1-tpu-weights)
+  * `regnety_040` - 82.3 @ 224, 82.96 @ 288
+  * `regnety_064` - 83.0 @ 224, 83.65 @ 288
+  * `regnety_080` - 83.17 @ 224, 83.86 @ 288
+  * `regnetv_040` - 82.44 @ 224, 83.18 @ 288 (timm pre-act)
+  * `regnetv_064` - 83.1 @ 224, 83.71 @ 288 (timm pre-act)
+  * `regnetz_040` - 83.67 @ 256, 84.25 @ 320
+  * `regnetz_040h` - 83.77 @ 256, 84.5 @ 320 (w/ extra fc in head)
+  * `resnetv2_50d_gn` - 80.8 @ 224, 81.96 @ 288 (pre-act GroupNorm)
+  * `resnetv2_50d_evos` - 80.77 @ 224, 82.04 @ 288 (pre-act EvoNormS)
+  * `regnetz_c16_evos` - 81.9 @ 256, 82.64 @ 320 (EvoNormS)
+  * `regnetz_d8_evos` - 83.42 @ 256, 84.04 @ 320 (EvoNormS)
+  * `xception41p` - 82 @ 299 (timm pre-act)
+  * `xception65` - 83.17 @ 299
+  * `xception65p` - 83.14 @ 299 (timm pre-act)
+  * `resnext101_64x4d` - 82.46 @ 224, 83.16 @ 288
+  * `seresnext101_32x8d` - 83.57 @ 224, 84.270 @ 288
+  * `resnetrs200` - 83.85 @ 256, 84.44 @ 320
+* HuggingFace hub support fixed w/ initial groundwork for allowing alternative 'config sources' for pretrained model definitions and weights (generic local file / remote url support soon)
+* SwinTransformer-V2 implementation added. Submitted by [Christoph Reich](https://github.com/ChristophReich1996). Training experiments and model changes by myself are ongoing, so expect compat breaks.
+* Swin-S3 (AutoFormerV2) models / weights added from https://github.com/microsoft/Cream/tree/main/AutoFormerV2
+* MobileViT models w/ weights adapted from https://github.com/apple/ml-cvnets
+* PoolFormer models w/ weights adapted from https://github.com/sail-sg/poolformer
+* VOLO models w/ weights adapted from https://github.com/sail-sg/volo
+* Significant work experimenting with non-BatchNorm norm layers such as EvoNorm, FilterResponseNorm, GroupNorm, etc
+* Enhanced alternate norm + act ('NormAct') layer support added to a number of models, esp EfficientNet/MobileNetV3, RegNet, and aligned Xception
+* Grouped conv support added to EfficientNet family
+* Add 'group matching' API to all models to allow grouping model parameters for application of 'layer-wise' LR decay, lr scale added to LR scheduler
+* Gradient checkpointing support added to many models
+* `forward_head(x, pre_logits=False)` fn added to all models to allow separate calls of `forward_features` + `forward_head`
+* All vision transformer and vision MLP models updated to return non-pooled / non-token selected features from `forward_features`, for consistency with CNN models; token selection or pooling is now applied in `forward_head` (see the example below)
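+
+An example of the split forward described above (shapes assume `vit_base_patch16_224`; exact token counts and dims vary by model):
+
+```python
+import torch
+import timm
+
+model = timm.create_model('vit_base_patch16_224', pretrained=False).eval()
+x = torch.randn(1, 3, 224, 224)
+
+feats = model.forward_features(x)                    # unpooled tokens, e.g. (1, 197, 768)
+pooled = model.forward_head(feats, pre_logits=True)  # pooled pre-classifier features, (1, 768)
+logits = model.forward_head(feats)                   # classification logits, (1, 1000)
+```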
+
+### Feb 2, 2022
+* [Chris Hughes](https://github.com/Chris-hughes10) posted an exhaustive run-through of `timm` on his blog yesterday. Well worth a read. [Getting Started with PyTorch Image Models (timm): A Practitioner's Guide](https://towardsdatascience.com/getting-started-with-pytorch-image-models-timm-a-practitioners-guide-4e77b4bf9055)
+* I'm currently prepping to merge the `norm_norm_norm` branch back to master (ver 0.6.x) in the next week or so.
+  * The changes are more extensive than usual and may destabilize and break some model API use (aiming for full backwards compat). So, beware `pip install git+https://github.com/rwightman/pytorch-image-models` installs!
+  * `0.5.x` releases and a `0.5.x` branch will remain stable with a cherry pick or two until the dust clears. Recommend sticking to the pypi install for a bit if you want stable.
+
+### Jan 14, 2022
+* Version 0.5.4 release to be pushed to pypi. It's been a while since the last pypi update and riskier changes will be merged to the main branch soon....
+* Add ConvNeXt models w/ weights from the official impl (https://github.com/facebookresearch/ConvNeXt), a few perf tweaks, compatible with timm features
+* Tried training a few small (~1.8-3M param) / mobile optimized models, a few are good so far, more on the way...
+  * `mnasnet_small` - 65.6 top-1
+  * `mobilenetv2_050` - 65.9
+  * `lcnet_100/075/050` - 72.1 / 68.8 / 63.1
+  * `semnasnet_075` - 73
+  * `fbnetv3_b/d/g` - 79.1 / 79.7 / 82.0
+* TinyNet models added by [rsomani95](https://github.com/rsomani95)
+* LCNet added via MobileNetV3 architecture
+
 ### Jan 5, 2023
 * ConvNeXt-V2 models and weights added to existing `convnext.py`
   * Paper: [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](http://arxiv.org/abs/2301.00808)
diff --git a/onnx_export.py b/onnx_export.py
index 54f8f352..3baab369 100644
--- a/onnx_export.py
+++ b/onnx_export.py
@@ -21,6 +21,7 @@ Copyright 2020 Ross Wightman
 import argparse
 
 import timm
+from timm.utils.model import reparameterize_model
 from timm.utils.onnx import onnx_export
 
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation')
@@ -50,7 +51,12 @@ parser.add_argument('--num-classes', type=int, default=1000,
                     help='Number classes in dataset')
 parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
                     help='path to checkpoint (default: none)')
-
+parser.add_argument('--reparam', default=False, action='store_true',
+                    help='Reparameterize model')
+parser.add_argument('--training', default=False, action='store_true',
+                    help='Export in training mode (default is eval)')
+parser.add_argument('--verbose', default=False, action='store_true',
+                    help='Extra stdout output')
 
 def main():
     args = parser.parse_args()
@@ -71,6 +77,9 @@ def main():
         exportable=True,
     )
 
+    if args.reparam:
+        model = reparameterize_model(model)
+
     onnx_export(
         model,
         args.output,
@@ -79,6 +88,8 @@
         aten_fallback=args.aten_fallback,
         keep_initializers=args.keep_init,
         check_forward=args.check_forward,
+        training=args.training,
+        verbose=args.verbose,
     )
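For quick reference, a sketch of the equivalent export flow driven from Python rather than the CLI (a minimal sketch mirroring `main()` above; `mobileone_s1` and `out.onnx` are placeholder choices):

```python
# Equivalent of: python onnx_export.py out.onnx --model mobileone_s1 --reparam
import timm
from timm.utils.model import reparameterize_model
from timm.utils.onnx import onnx_export

# exportable=True requests ONNX-friendly ops in model configs that support it
model = timm.create_model('mobileone_s1', pretrained=True, exportable=True)
model = reparameterize_model(model)  # what the new --reparam flag triggers

onnx_export(
    model,
    'out.onnx',
    check_forward=False,  # --check-forward: compare exported vs torch outputs
    training=False,       # new --training flag exports in train mode instead
    verbose=False,        # new --verbose flag: extra torch.onnx export output
)
```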