pytorch-image-models/timm/layers/attention_pool2d.py

""" Attention Pool 2D

Implementations of 2D spatial feature pooling using multi-head attention instead of average pool.

Based on idea in CLIP by OpenAI, licensed Apache 2.0
https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

Hacked together by / Copyright 2021 Ross Wightman
"""
from typing import Optional, Union, Tuple

import torch
import torch.nn as nn

from. config import use_fused_attn
from .helpers import to_2tuple
from .pos_embed import resample_abs_pos_embed
from .pos_embed_sincos import apply_rot_embed, RotaryEmbedding
from .weight_init import trunc_normal_


class RotAttentionPool2d(nn.Module):
    """ Attention based 2D feature pooling w/ rotary (relative) pos embedding.
    This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.

    Adapted from the AttentionPool2d in CLIP w/ rotary embedding instead of learned embed.
    https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

    The spatial mean of the feature map is prepended as an extra token; the attention output at
    that token position (after the output projection) is the pooled result.

    NOTE: While this impl does not require a fixed feature size, performance at differing resolutions from
    train varies widely and falls off dramatically. I'm not sure if there is a way around this... -RW
    """
    fused_attn: torch.jit.Final[bool]

    def __init__(
            self,
            in_features: int,
            out_features: Optional[int] = None,
            ref_feat_size: Union[int, Tuple[int, int]] = 7,
            embed_dim: Optional[int] = None,
            head_dim: Optional[int] = 64,
            num_heads: Optional[int] = None,
            qkv_bias: bool = True,
            qkv_separate: bool = False,
    ):
        """
        Args:
            in_features: Channel count of the input feature map.
            out_features: Width of the pooled output, defaults to `in_features`.
            ref_feat_size: Reference feature size forwarded to the rotary embedding
                (an int is expanded with `to_2tuple`).
            embed_dim: Attention width, defaults to `in_features`.
            head_dim: Per-head width, only used when `num_heads` is None.
            num_heads: Number of heads; when given, `head_dim` is derived from it.
            qkv_bias: Add a bias term to the q/k/v projection(s).
            qkv_separate: Use three separate q/k/v linear layers instead of one packed layer.
        """
        super().__init__()
        embed_dim = embed_dim or in_features
        self.in_features = in_features
        self.out_features = out_features or in_features
        ref_feat_size = to_2tuple(ref_feat_size)
        if num_heads is not None:
            # An explicit head count takes priority over head_dim.
            assert embed_dim % num_heads == 0, \
                f'embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})'
            head_dim = embed_dim // num_heads
        else:
            assert embed_dim % head_dim == 0, \
                f'embed_dim ({embed_dim}) must be divisible by head_dim ({head_dim})'
            num_heads = embed_dim // head_dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scale = self.head_dim ** -0.5
        self.fused_attn = use_fused_attn()

        if qkv_separate:
            self.q = nn.Linear(in_features, embed_dim, bias=qkv_bias)
            self.k = nn.Linear(in_features, embed_dim, bias=qkv_bias)
            self.v = nn.Linear(in_features, embed_dim, bias=qkv_bias)
            self.qkv = None
        else:
            # Always define q/k/v so both layouts expose the same attributes
            # (mirrors AttentionPool2d in this file).
            self.q = self.k = self.v = None
            self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(embed_dim, self.out_features)
        self.pos_embed = RotaryEmbedding(self.head_dim, in_pixels=False, ref_feat_shape=ref_feat_size)

    def init_weights(self, zero_init_last: bool = False):
        """ Re-initialize the q/k/v projection weights (trunc normal) and zero their biases.

        NOTE: not called from `__init__` here; the layers keep the nn.Linear default init
        until this is invoked explicitly. `zero_init_last` is accepted but unused.
        """
        projections = (self.q, self.k, self.v) if self.qkv is None else (self.qkv,)
        for proj in projections:
            trunc_normal_(proj.weight, std=proj.in_features ** -0.5)
            # bias is None when the layer was built with qkv_bias=False
            if proj.bias is not None:
                nn.init.zeros_(proj.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """ Pool a (B, C, H, W) feature map to (B, out_features). """
        B, _, H, W = x.shape
        N = H * W
        # (B, C, H, W) -> (B, N, C), then prepend the spatial mean as the pooling token
        x = x.flatten(2).transpose(1, 2)
        x = torch.cat([x.mean(1, keepdim=True), x], dim=1)
        if self.qkv is None:
            q = self.q(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)
            k = self.k(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)
            v = self.v(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)
        else:
            # packed projection: (B, N + 1, 3 * embed_dim) -> 3 x (B, heads, N + 1, head_dim)
            x = self.qkv(x).reshape(B, N + 1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
            q, k, v = x.unbind(0)

        # Rotary embedding is applied to the N spatial tokens only; the mean token at
        # index 0 is passed through untouched. (rse / rce are presumably the sin / cos
        # tables from RotaryEmbedding.get_embed -- confirm against pos_embed_sincos.)
        rse, rce = self.pos_embed.get_embed((H, W))
        q = torch.cat([q[:, :, :1, :], apply_rot_embed(q[:, :, 1:, :], rse, rce)], dim=2).type_as(v)
        k = torch.cat([k[:, :, :1, :], apply_rot_embed(k[:, :, 1:, :], rse, rce)], dim=2).type_as(v)

        if self.fused_attn:
            x = nn.functional.scaled_dot_product_attention(q, k, v)
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            x = attn @ v
        # merge heads back: (B, heads, N + 1, head_dim) -> (B, N + 1, embed_dim)
        x = x.transpose(1, 2).reshape(B, N + 1, -1)
        x = self.proj(x)
        # the output at the mean-token position is the pooled feature
        return x[:, 0]


class AttentionPool2d(nn.Module):
    """ Attention based 2D feature pooling w/ learned (absolute) pos embedding.
    This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.

    It was based on impl in CLIP by OpenAI
    https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

    The spatial mean of the feature map is prepended as an extra token; the attention output at
    that token position (after the output projection) is the pooled result.

    NOTE: This requires feature size upon construction. Inputs whose number of spatial positions
    differs from `feat_size` have the learned position embedding resampled at forward time.
    """
    fused_attn: torch.jit.Final[bool]

    def __init__(
            self,
            in_features: int,
            feat_size: Union[int, Tuple[int, int]] = 7,
            out_features: Optional[int] = None,
            embed_dim: Optional[int] = None,
            head_dim: Optional[int] = 64,
            num_heads: Optional[int] = None,
            qkv_bias: bool = True,
            qkv_separate: bool = False,
    ):
        """
        Args:
            in_features: Channel count of the input feature map.
            feat_size: Spatial size the position embedding is created for
                (an int is expanded with `to_2tuple`).
            out_features: Width of the pooled output, defaults to `in_features`.
            embed_dim: Attention width, defaults to `in_features`.
            head_dim: Per-head width, only used when `num_heads` is None.
            num_heads: Number of heads; when given, `head_dim` is derived from it.
            qkv_bias: Add a bias term to the q/k/v projection(s).
            qkv_separate: Use three separate q/k/v linear layers instead of one packed layer.
        """
        super().__init__()
        embed_dim = embed_dim or in_features
        self.in_features = in_features
        self.out_features = out_features or in_features
        if num_heads is not None:
            # An explicit head count takes priority over head_dim.
            assert embed_dim % num_heads == 0, \
                f'embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})'
            head_dim = embed_dim // num_heads
        else:
            assert embed_dim % head_dim == 0, \
                f'embed_dim ({embed_dim}) must be divisible by head_dim ({head_dim})'
            num_heads = embed_dim // head_dim
        self.feat_size = to_2tuple(feat_size)
        self.seq_len = self.feat_size[0] * self.feat_size[1]
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scale = self.head_dim ** -0.5
        self.fused_attn = use_fused_attn()

        if qkv_separate:
            self.q = nn.Linear(in_features, embed_dim, bias=qkv_bias)
            self.k = nn.Linear(in_features, embed_dim, bias=qkv_bias)
            self.v = nn.Linear(in_features, embed_dim, bias=qkv_bias)
            self.qkv = None
        else:
            self.q = self.k = self.v = None
            self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(embed_dim, self.out_features)
        # one learned embedding per spatial position plus one for the prepended mean token
        self.pos_embed = nn.Parameter(torch.zeros(self.seq_len + 1, in_features))

        self.init_weights()

    def init_weights(self, zero_init_last: bool = False):
        """ Initialize q/k/v projection weights and the position embedding (trunc normal),
        and zero the projection biases when present. `zero_init_last` is accepted but unused.
        """
        projections = (self.q, self.k, self.v) if self.qkv is None else (self.qkv,)
        in_features = projections[0].in_features
        for proj in projections:
            trunc_normal_(proj.weight, std=in_features ** -0.5)
            # bias is None when the layer was built with qkv_bias=False; since this method
            # runs from __init__, an unguarded zeros_ would make construction fail.
            if proj.bias is not None:
                nn.init.zeros_(proj.bias)
        trunc_normal_(self.pos_embed, std=in_features ** -0.5)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """ Pool a (B, C, H, W) feature map to (B, out_features). """
        B, _, H, W = x.shape
        N = H * W
        # (B, C, H, W) -> (B, N, C), then prepend the spatial mean as the pooling token
        x = x.flatten(2).transpose(1, 2)
        x = torch.cat([x.mean(1, keepdim=True), x], dim=1)
        if self.seq_len != N:
            # NOTE(review): the stored feat_size is not passed along, so the helper presumably
            # infers the source grid from the token count -- confirm for non-square feat_size.
            pos_embed = resample_abs_pos_embed(self.pos_embed.unsqueeze(0), (H, W), num_prefix_tokens=1)
        else:
            pos_embed = self.pos_embed.unsqueeze(0).to(x.dtype)
        x = x + pos_embed

        if self.qkv is None:
            q = self.q(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)
            k = self.k(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)
            v = self.v(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)
        else:
            # packed projection: (B, N + 1, 3 * embed_dim) -> 3 x (B, heads, N + 1, head_dim)
            x = self.qkv(x).reshape(B, N + 1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
            q, k, v = x.unbind(0)

        if self.fused_attn:
            x = nn.functional.scaled_dot_product_attention(q, k, v)
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            x = attn @ v
        # merge heads back: (B, heads, N + 1, head_dim) -> (B, N + 1, embed_dim)
        x = x.transpose(1, 2).reshape(B, N + 1, -1)
        x = self.proj(x)
        # the output at the mean-token position is the pooled feature
        return x[:, 0]
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`""" Attention Pool 2D`

			`Implementations of 2D spatial feature pooling using multi-head attention instead of average pool.`

			`Based on idea in CLIP by OpenAI, licensed Apache 2.0`
			`https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py`

			`Hacked together by / Copyright 2021 Ross Wightman`
			`"""`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`from typing import Optional, Union, Tuple`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00
			`import torch`
			`import torch.nn as nn`

Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`from. config import use_fused_attn`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`from .helpers import to_2tuple`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`from .pos_embed import resample_abs_pos_embed`
Add FlexiViT models and weights, refactoring, push more weights * push all vision_transformer.py weights to HF hub finalize more pretrained tags for pushed weights * refactor pos_embed files and module locations, move some pos embed modules to layers * tweak hf hub helpers to aid bulk uploading and updating 2022-12-22 17:19:45 -08:00			`from .pos_embed_sincos import apply_rot_embed, RotaryEmbedding`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`from .weight_init import trunc_normal_`


			`class RotAttentionPool2d(nn.Module):`
			`""" Attention based 2D feature pooling w/ rotary (relative) pos embedding.`
			`This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.`

			`Adapted from the AttentionPool2d in CLIP w/ rotary embedding instead of learned embed.`
			`https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py`

			`NOTE: While this impl does not require a fixed feature size, performance at differeing resolutions from`
			`train varies widely and falls off dramatically. I'm not sure if there is a way around this... -RW`
			`"""`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`fused_attn: torch.jit.Final[bool]`

Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`def __init__(`
			`self,`
			`in_features: int,`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`out_features: Optional[int] = None,`
			`ref_feat_size: Union[int, Tuple[int, int]] = 7,`
			`embed_dim: Optional[int] = None,`
			`head_dim: Optional[int] = 64,`
			`num_heads: Optional[int] = None,`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`qkv_bias: bool = True,`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`qkv_separate: bool = False,`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`):`
			`super().__init__()`
			`embed_dim = embed_dim or in_features`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`self.in_features = in_features`
			`self.out_features = out_features or in_features`
			`ref_feat_size = to_2tuple(ref_feat_size)`
			`if num_heads is not None:`
			`assert embed_dim % num_heads == 0`
			`head_dim = embed_dim // num_heads`
			`else:`
			`assert embed_dim % head_dim == 0`
			`num_heads = embed_dim // head_dim`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`self.num_heads = num_heads`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`self.head_dim = head_dim`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`self.scale = self.head_dim ** -0.5`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`self.fused_attn = use_fused_attn()`

			`if qkv_separate:`
			`self.q = nn.Linear(in_features, embed_dim, bias=qkv_bias)`
			`self.k = nn.Linear(in_features, embed_dim, bias=qkv_bias)`
			`self.v = nn.Linear(in_features, embed_dim, bias=qkv_bias)`
			`self.qkv = None`
			`else:`
			`self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)`
			`self.proj = nn.Linear(embed_dim, self.out_features)`
			`self.pos_embed = RotaryEmbedding(self.head_dim, in_pixels=False, ref_feat_shape=ref_feat_size)`

			`def init_weights(self, zero_init_last: bool = False):`
			`if self.qkv is None:`
			`in_features = self.q.in_features`
			`trunc_normal_(self.q.weight, std=in_features ** -0.5)`
			`nn.init.zeros_(self.q.bias)`
			`trunc_normal_(self.k.weight, std=in_features ** -0.5)`
			`nn.init.zeros_(self.k.bias)`
			`trunc_normal_(self.v.weight, std=in_features ** -0.5)`
			`nn.init.zeros_(self.v.bias)`
			`else:`
			`in_features = self.qkv.in_features`
			`trunc_normal_(self.qkv.weight, std=in_features ** -0.5)`
			`nn.init.zeros_(self.qkv.bias)`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00
			`def forward(self, x):`
			`B, _, H, W = x.shape`
			`N = H * W`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`x = x.flatten(2).transpose(1, 2)`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`x = torch.cat([x.mean(1, keepdim=True), x], dim=1)`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`if self.qkv is None:`
			`q = self.q(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)`
			`k = self.k(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)`
			`v = self.v(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)`
			`else:`
			`x = self.qkv(x).reshape(B, N + 1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)`
			`q, k, v = x.unbind(0)`

			`rse, rce = self.pos_embed.get_embed((H, W))`
			`q = torch.cat([q[:, :, :1, :], apply_rot_embed(q[:, :, 1:, :], rse, rce)], dim=2).type_as(v)`
			`k = torch.cat([k[:, :, :1, :], apply_rot_embed(k[:, :, 1:, :], rse, rce)], dim=2).type_as(v)`

			`if self.fused_attn:`
			`x = nn.functional.scaled_dot_product_attention(q, k, v)`
			`else:`
			`q = q * self.scale`
			`attn = q @ k.transpose(-2, -1)`
			`attn = attn.softmax(dim=-1)`
			`x = attn @ v`
			`x = x.transpose(1, 2).reshape(B, N + 1, -1)`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`x = self.proj(x)`
			`return x[:, 0]`


			`class AttentionPool2d(nn.Module):`
			`""" Attention based 2D feature pooling w/ learned (absolute) pos embedding.`
			`This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.`

			`It was based on impl in CLIP by OpenAI`
			`https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py`

			`NOTE: This requires feature size upon construction and well prevent adaptive sizing of the network.`
			`"""`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`fused_attn: torch.jit.Final[bool]`

Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`def __init__(`
			`self,`
			`in_features: int,`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`feat_size: Union[int, Tuple[int, int]] = 7,`
			`out_features: Optional[int] = None,`
			`embed_dim: Optional[int] = None,`
			`head_dim: Optional[int] = 64,`
			`num_heads: Optional[int] = None,`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`qkv_bias: bool = True,`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`qkv_separate: bool = False,`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`):`
			`super().__init__()`
			`embed_dim = embed_dim or in_features`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`self.in_features = in_features`
			`self.out_features = out_features or in_features`
			`if num_heads is not None:`
			`assert embed_dim % num_heads == 0`
			`head_dim = embed_dim // num_heads`
			`else:`
			`assert embed_dim % head_dim == 0`
			`num_heads = embed_dim // head_dim`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`self.feat_size = to_2tuple(feat_size)`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`self.seq_len = self.feat_size[0] * self.feat_size[1]`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`self.num_heads = num_heads`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`self.head_dim = head_dim`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`self.scale = self.head_dim ** -0.5`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`self.fused_attn = use_fused_attn()`

			`if qkv_separate:`
			`self.q = nn.Linear(in_features, embed_dim, bias=qkv_bias)`
			`self.k = nn.Linear(in_features, embed_dim, bias=qkv_bias)`
			`self.v = nn.Linear(in_features, embed_dim, bias=qkv_bias)`
			`self.qkv = None`
			`else:`
			`self.q = self.k = self.v = None`
			`self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)`
			`self.proj = nn.Linear(embed_dim, self.out_features)`
			`self.pos_embed = nn.Parameter(torch.zeros(self.seq_len + 1, in_features))`

			`self.init_weights()`

			`def init_weights(self, zero_init_last: bool = False):`
			`if self.qkv is None:`
			`in_features = self.q.in_features`
			`trunc_normal_(self.q.weight, std=in_features ** -0.5)`
			`nn.init.zeros_(self.q.bias)`
			`trunc_normal_(self.k.weight, std=in_features ** -0.5)`
			`nn.init.zeros_(self.k.bias)`
			`trunc_normal_(self.v.weight, std=in_features ** -0.5)`
			`nn.init.zeros_(self.v.bias)`
			`else:`
			`in_features = self.qkv.in_features`
			`trunc_normal_(self.qkv.weight, std=in_features ** -0.5)`
			`nn.init.zeros_(self.qkv.bias)`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`trunc_normal_(self.pos_embed, std=in_features ** -0.5)`

			`def forward(self, x):`
			`B, _, H, W = x.shape`
			`N = H * W`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`x = x.flatten(2).transpose(1, 2)`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`x = torch.cat([x.mean(1, keepdim=True), x], dim=1)`
Mapping OpenAI CLIP Modified ResNet weights -> ByobNet. Improve AttentionPool2d layers. Fix #1731 2024-06-09 16:54:48 -07:00			`if self.seq_len != N:`
			`pos_embed = resample_abs_pos_embed(self.pos_embed.unsqueeze(0), (H, W), num_prefix_tokens=1)`
			`else:`
			`pos_embed = self.pos_embed.unsqueeze(0).to(x.dtype)`
			`x = x + pos_embed`

			`if self.qkv is None:`
			`q = self.q(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)`
			`k = self.k(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)`
			`v = self.v(x).reshape(B, N + 1, self.num_heads, self.head_dim).transpose(1, 2)`
			`else:`
			`x = self.qkv(x).reshape(B, -1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)`
			`q, k, v = x.unbind(0)`

			`if self.fused_attn:`
			`x = nn.functional.scaled_dot_product_attention(q, k, v)`
			`else:`
			`q = q * self.scale`
			`attn = q @ k.transpose(-2, -1)`
			`attn = attn.softmax(dim=-1)`
			`x = attn @ v`
			`x = x.transpose(1, 2).reshape(B, N + 1, -1)`
Add initial AttentionPool2d that's being trialed. Fix comment and still trying to improve reliability of sgd test. 2021-09-05 12:29:36 -07:00			`x = self.proj(x)`
			`return x[:, 0]`