Remove sdpa context mgrs

2025-06-03 15:01:08 +08:00 · 2023-09-25 23:30:56 -07:00 · 2023-09-25 23:30:56 -07:00 · 379780bb6c
commit 379780bb6c
parent 2734bb76ce
1 changed files with 16 additions and 18 deletions
--- a/timm/models/vision_transformer_packed.py
+++ b/timm/models/vision_transformer_packed.py
@@ -124,7 +124,7 @@ def pack_images(
 ):
    max_seq_len = max_grid_size[0] * max_grid_size[1]

-    # patchify if needed, generate position indices, apply patch drop, record seq lengths
+    # patchify, generate position indices, apply patch drop, record seq lengths
    img_tokens = []
    img_pos_indices = []
    img_seq_lens = []
@@ -144,6 +144,7 @@ def pack_images(
                indexing='ij'),
            dim=-1,
        )
+        # FIXME patch drop here
        img_tokens.append(patches.flatten(0, 1))
        img_pos_indices.append(pos_indices.flatten(0, 1))
        img_seq_lens.append(seq_len)
@@ -221,7 +222,6 @@ class Attention(nn.Module):
                attn_mask = attn_mask.expand((-1, self.num_heads, -1, -1))

        if self.fused_attn:
-            with torch.backends.cuda.sdp_kernel(enable_mem_efficient=False):
            x = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=attn_mask,
@@ -374,7 +374,6 @@ class ParallelScalingBlock(nn.Module):
        k = self.k_norm(k.view(B, N, self.num_heads, self.head_dim)).transpose(1, 2)
        v = v.view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        if self.fused_attn:
-            with torch.backends.cuda.sdp_kernel(enable_mem_efficient=False):
            x_attn = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=attn_mask,
@@ -507,7 +506,6 @@ class AttentionPoolLatent(nn.Module):
        q = self.q_norm(q)
        k = self.k_norm(k)
        if False:
-            with torch.backends.cuda.sdp_kernel(enable_mem_efficient=False):
            x = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=attn_mask,