From 4731e4efc4691f3f4e57057667a62e5be78d4117 Mon Sep 17 00:00:00 2001 From: Cheng-Ling Lai Date: Sat, 16 Mar 2024 23:05:26 +0800 Subject: [PATCH] Modified ViT get_intermediate_layers() to support dynamic image size --- timm/models/vision_transformer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index 70f91d58..1ab2e736 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -667,9 +667,12 @@ class VisionTransformer(nn.Module): outputs = [out[:, self.num_prefix_tokens:] for out in outputs] if reshape: - grid_size = self.patch_embed.grid_size + patch_size = self.patch_embed.patch_size + batch, _, height, width = x.size() outputs = [ - out.reshape(x.shape[0], grid_size[0], grid_size[1], -1).permute(0, 3, 1, 2).contiguous() + out.reshape(batch, int(math.ceil(height / patch_size[0])), int(math.ceil(width / patch_size[1])), -1) + .permute(0, 3, 1, 2) + .contiguous() for out in outputs ]