# Copyright (c) OpenMMLab. All rights reserved.
import itertools
import warnings
from functools import partial
from typing import List, Optional, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn.bricks.drop import build_dropout
from mmengine.model import BaseModule
from mmengine.model.weight_init import trunc_normal_
from mmengine.utils import digit_version

from mmpretrain.registry import MODELS
from .helpers import to_2tuple
from .layer_scale import LayerScale

# After PyTorch v1.10.0, calling ``torch.meshgrid`` without the ``indexing``
# argument raises an extra warning. For more details, refer to
# https://github.com/pytorch/pytorch/issues/50276
if digit_version(torch.__version__) >= digit_version('1.10.0'):
    torch_meshgrid = partial(torch.meshgrid, indexing='ij')
else:
    torch_meshgrid = torch.meshgrid


def scaled_dot_product_attention_pyimpl(query,
                                        key,
                                        value,
                                        attn_mask=None,
                                        dropout_p=0.,
                                        scale=None,
                                        is_causal=False):
    # ``scale`` is the multiplicative factor applied to the attention logits,
    # matching the semantics of ``F.scaled_dot_product_attention``.
    scale = scale or query.size(-1)**-0.5
    if is_causal:
        # Build a lower-triangular (causal) boolean mask. Any explicit
        # ``attn_mask`` is replaced in this case.
        attn_mask = torch.ones(
            query.size(-2),
            key.size(-2),
            dtype=torch.bool,
            device=query.device).tril(diagonal=0)
    if attn_mask is not None and attn_mask.dtype == torch.bool:
        # Convert the boolean mask (True = keep) into an additive float mask.
        attn_mask = torch.zeros_like(
            attn_mask, dtype=query.dtype).masked_fill(~attn_mask,
                                                      float('-inf'))

    attn_weight = query @ key.transpose(-2, -1) * scale
    if attn_mask is not None:
        attn_weight += attn_mask
    attn_weight = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_weight, dropout_p, True)
    return attn_weight @ value


if digit_version(torch.__version__) >= digit_version('2.0.0'):
    scaled_dot_product_attention = F.scaled_dot_product_attention
else:
    scaled_dot_product_attention = scaled_dot_product_attention_pyimpl
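
# Usage note (illustrative sketch, not part of the public API): both branches
# above expose the same call signature, e.g.
#   q = k = v = torch.rand(2, 4, 49, 32)   # (B, num_heads, N, head_dims)
#   out = scaled_dot_product_attention(q, k, v, dropout_p=0.1)
# where the shapes are assumptions chosen only for this example.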


class WindowMSA(BaseModule):
    """Window based multi-head self-attention (W-MSA) module with relative
    position bias.

    Args:
        embed_dims (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Defaults to True.
        qk_scale (float, optional): Override default qk scale of
            ``head_dim ** -0.5`` if set. Defaults to None.
        attn_drop (float, optional): Dropout ratio of attention weight.
            Defaults to 0.
        proj_drop (float, optional): Dropout ratio of output. Defaults to 0.
        init_cfg (dict, optional): The extra config for initialization.
            Defaults to None.
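
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = WindowMSA(embed_dims=96, window_size=(7, 7), num_heads=3)
        >>> x = torch.rand(8, 49, 96)  # (num_windows*B, Wh*Ww, C)
        >>> attn(x).shape
        torch.Size([8, 49, 96])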
    """

    def __init__(self,
                 embed_dims,
                 window_size,
                 num_heads,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 init_cfg=None):

        super().__init__(init_cfg)
        self.embed_dims = embed_dims
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_embed_dims = embed_dims // num_heads
        self.scale = qk_scale or head_embed_dims**-0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                        num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # About 2x faster than original impl
        Wh, Ww = self.window_size
        rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww)
        rel_position_index = rel_index_coords + rel_index_coords.T
        rel_position_index = rel_position_index.flip(1).contiguous()
        self.register_buffer('relative_position_index', rel_position_index)

        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(embed_dims, embed_dims)
        self.proj_drop = nn.Dropout(proj_drop)

        self.softmax = nn.Softmax(dim=-1)

    def init_weights(self):
        super(WindowMSA, self).init_weights()

        trunc_normal_(self.relative_position_bias_table, std=0.02)

    def forward(self, x, mask=None):
        """
        Args:
            x (tensor): input features with shape of (num_windows*B, N, C)
            mask (tensor, Optional): mask with shape of (num_windows, Wh*Ww,
                Wh*Ww), value should be between (-inf, 0].
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[
            2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)].view(
                self.window_size[0] * self.window_size[1],
                self.window_size[0] * self.window_size[1],
                -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N,
                             N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    @staticmethod
    def double_step_seq(step1, len1, step2, len2):
        seq1 = torch.arange(0, step1 * len1, step1)
        seq2 = torch.arange(0, step2 * len2, step2)
        return (seq1[:, None] + seq2[None, :]).reshape(1, -1)


class WindowMSAV2(BaseModule):
    """Window based multi-head self-attention (W-MSA) module with relative
    position bias.

    Based on the implementation in the Swin Transformer V2 original repo.
    Refer to
    https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer_v2.py
    for more details.

    Args:
        embed_dims (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool): If True, add a learnable bias to q, k, v.
            Defaults to True.
        attn_drop (float): Dropout ratio of attention weight.
            Defaults to 0.
        proj_drop (float): Dropout ratio of output. Defaults to 0.
        cpb_mlp_hidden_dims (int): The hidden dimensions of the continuous
            relative position bias network. Defaults to 512.
        pretrained_window_size (tuple(int)): The height and width of the
            window in pre-training. Defaults to (0, 0), which means the
            pretrained window size is not used.
        init_cfg (dict, optional): The extra config for initialization.
            Defaults to None.
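
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = WindowMSAV2(embed_dims=96, window_size=(7, 7), num_heads=3)
        >>> x = torch.rand(8, 49, 96)  # (num_windows*B, Wh*Ww, C)
        >>> attn(x).shape
        torch.Size([8, 49, 96])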
    """

    def __init__(self,
                 embed_dims,
                 window_size,
                 num_heads,
                 qkv_bias=True,
                 attn_drop=0.,
                 proj_drop=0.,
                 cpb_mlp_hidden_dims=512,
                 pretrained_window_size=(0, 0),
                 init_cfg=None):

        super().__init__(init_cfg)
        self.embed_dims = embed_dims
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads

        # Use small network for continuous relative position bias
        self.cpb_mlp = nn.Sequential(
            nn.Linear(
                in_features=2, out_features=cpb_mlp_hidden_dims, bias=True),
            nn.ReLU(inplace=True),
            nn.Linear(
                in_features=cpb_mlp_hidden_dims,
                out_features=num_heads,
                bias=False))

        # Add learnable scalar for cosine attention
        self.logit_scale = nn.Parameter(
            torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True)

        # get relative_coords_table
        relative_coords_h = torch.arange(
            -(self.window_size[0] - 1),
            self.window_size[0],
            dtype=torch.float32)
        relative_coords_w = torch.arange(
            -(self.window_size[1] - 1),
            self.window_size[1],
            dtype=torch.float32)
        relative_coords_table = torch.stack(
            torch_meshgrid([relative_coords_h, relative_coords_w])).permute(
                1, 2, 0).contiguous().unsqueeze(0)  # 1, 2*Wh-1, 2*Ww-1, 2
        if pretrained_window_size[0] > 0:
            relative_coords_table[:, :, :, 0] /= (
                pretrained_window_size[0] - 1)
            relative_coords_table[:, :, :, 1] /= (
                pretrained_window_size[1] - 1)
        else:
            relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1)
            relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1)
        relative_coords_table *= 8  # normalize to -8, 8
        relative_coords_table = torch.sign(relative_coords_table) * torch.log2(
            torch.abs(relative_coords_table) + 1.0) / np.log2(8)
        self.register_buffer('relative_coords_table', relative_coords_table)

        # get pair-wise relative position index
        # for each token inside the window
        indexes_h = torch.arange(self.window_size[0])
        indexes_w = torch.arange(self.window_size[1])
        coordinates = torch.stack(
            torch_meshgrid([indexes_h, indexes_w]), dim=0)  # 2, Wh, Ww
        coordinates = torch.flatten(coordinates, start_dim=1)  # 2, Wh*Ww
        # 2, Wh*Ww, Wh*Ww
        relative_coordinates = coordinates[:, :, None] - coordinates[:,
                                                                     None, :]
        relative_coordinates = relative_coordinates.permute(
            1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2

        relative_coordinates[:, :, 0] += self.window_size[
            0] - 1  # shift to start from 0
        relative_coordinates[:, :, 1] += self.window_size[1] - 1
        relative_coordinates[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coordinates.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer('relative_position_index',
                             relative_position_index)

        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(embed_dims))
            self.v_bias = nn.Parameter(torch.zeros(embed_dims))
        else:
            self.q_bias = None
            self.v_bias = None
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(embed_dims, embed_dims)
        self.proj_drop = nn.Dropout(proj_drop)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x (tensor): input features with shape of (num_windows*B, N, C)
            mask (tensor, Optional): mask with shape of (num_windows, Wh*Ww,
                Wh*Ww), value should be between (-inf, 0].
        """
        B_, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            qkv_bias = torch.cat(
                (self.q_bias,
                 torch.zeros_like(self.v_bias,
                                  requires_grad=False), self.v_bias))
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B_, N, 3, self.num_heads,
                          C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[
            2]  # make torchscript happy (cannot use tensor as tuple)

        # cosine attention
        attn = (
            F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
        logit_scale = torch.clamp(
            self.logit_scale, max=np.log(1. / 0.01)).exp()
        attn = attn * logit_scale

        relative_position_bias_table = self.cpb_mlp(
            self.relative_coords_table).view(-1, self.num_heads)
        relative_position_bias = relative_position_bias_table[
            self.relative_position_index.view(-1)].view(
                self.window_size[0] * self.window_size[1],
                self.window_size[0] * self.window_size[1],
                -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N,
                             N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


@MODELS.register_module()
class ShiftWindowMSA(BaseModule):
    """Shift Window Multihead Self-Attention Module.

    Args:
        embed_dims (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): The height and width of the window.
        shift_size (int, optional): The shift step of each window towards
            right-bottom. If zero, act as regular window-msa. Defaults to 0.
        dropout_layer (dict, optional): The dropout_layer used before output.
            Defaults to dict(type='DropPath', drop_prob=0.).
        pad_small_map (bool): If True, pad the small feature map to the window
            size, which is commonly used in detection and segmentation. If
            False, avoid shifting window and shrink the window size to the
            size of the feature map, which is commonly used in classification.
            Defaults to False.
        window_msa (Callable): To build a window multi-head attention module.
            Defaults to :class:`WindowMSA`.
        init_cfg (dict, optional): The extra config for initialization.
            Defaults to None.
        **kwargs: Other keyword arguments to build the window multi-head
            attention module.
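
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = ShiftWindowMSA(
        ...     embed_dims=96, num_heads=3, window_size=7, shift_size=3)
        >>> query = torch.rand(2, 14 * 14, 96)  # (B, H*W, C)
        >>> attn(query, hw_shape=(14, 14)).shape
        torch.Size([2, 196, 96])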
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 window_size,
                 shift_size=0,
                 dropout_layer=dict(type='DropPath', drop_prob=0.),
                 pad_small_map=False,
                 window_msa=WindowMSA,
                 init_cfg=None,
                 **kwargs):
        super().__init__(init_cfg)

        self.shift_size = shift_size
        self.window_size = window_size
        assert 0 <= self.shift_size < self.window_size

        self.w_msa = window_msa(
            embed_dims=embed_dims,
            num_heads=num_heads,
            window_size=to_2tuple(self.window_size),
            **kwargs,
        )

        self.drop = build_dropout(dropout_layer)
        self.pad_small_map = pad_small_map

    def forward(self, query, hw_shape):
        B, L, C = query.shape
        H, W = hw_shape
        assert L == H * W, f"The query length {L} doesn't match the input "\
            f'shape ({H}, {W}).'
        query = query.view(B, H, W, C)

        window_size = self.window_size
        shift_size = self.shift_size

        if min(H, W) == window_size:
            # If not pad small feature map, avoid shifting when the window
            # size is equal to the size of feature map. It's to align with
            # the behavior of the original implementation.
            shift_size = shift_size if self.pad_small_map else 0
        elif min(H, W) < window_size:
            # In the original implementation, the window size will be shrunk
            # to the size of feature map. The behavior is different with
            # swin-transformer for downstream tasks. To support dynamic input
            # shape, we don't allow this feature.
            assert self.pad_small_map, \
                f'The input shape ({H}, {W}) is smaller than the window ' \
                f'size ({window_size}). Please set `pad_small_map=True`, or ' \
                'decrease the `window_size`.'

        pad_r = (window_size - W % window_size) % window_size
        pad_b = (window_size - H % window_size) % window_size
        query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b))

        H_pad, W_pad = query.shape[1], query.shape[2]

        # cyclic shift
        if shift_size > 0:
            query = torch.roll(
                query, shifts=(-shift_size, -shift_size), dims=(1, 2))

        attn_mask = self.get_attn_mask((H_pad, W_pad),
                                       window_size=window_size,
                                       shift_size=shift_size,
                                       device=query.device)

        # nW*B, window_size, window_size, C
        query_windows = self.window_partition(query, window_size)
        # nW*B, window_size*window_size, C
        query_windows = query_windows.view(-1, window_size**2, C)

        # W-MSA/SW-MSA (nW*B, window_size*window_size, C)
        attn_windows = self.w_msa(query_windows, mask=attn_mask)

        # merge windows
        attn_windows = attn_windows.view(-1, window_size, window_size, C)

        # B H' W' C
        shifted_x = self.window_reverse(attn_windows, H_pad, W_pad,
                                        window_size)
        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(
                shifted_x, shifts=(shift_size, shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if H != H_pad or W != W_pad:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        x = self.drop(x)

        return x

    @staticmethod
    def window_reverse(windows, H, W, window_size):
        B = int(windows.shape[0] / (H * W / window_size / window_size))
        x = windows.view(B, H // window_size, W // window_size, window_size,
                         window_size, -1)
        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
        return x

    @staticmethod
    def window_partition(x, window_size):
        B, H, W, C = x.shape
        x = x.view(B, H // window_size, window_size, W // window_size,
                   window_size, C)
        windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
        windows = windows.view(-1, window_size, window_size, C)
        return windows

    @staticmethod
    def get_attn_mask(hw_shape, window_size, shift_size, device=None):
        if shift_size > 0:
            img_mask = torch.zeros(1, *hw_shape, 1, device=device)
            h_slices = (slice(0, -window_size), slice(-window_size,
                                                      -shift_size),
                        slice(-shift_size, None))
            w_slices = (slice(0, -window_size), slice(-window_size,
                                                      -shift_size),
                        slice(-shift_size, None))
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    img_mask[:, h, w, :] = cnt
                    cnt += 1

            # nW, window_size, window_size, 1
            mask_windows = ShiftWindowMSA.window_partition(
                img_mask, window_size)
            mask_windows = mask_windows.view(-1, window_size * window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0)
            attn_mask = attn_mask.masked_fill(attn_mask == 0, 0.0)
        else:
            attn_mask = None
        return attn_mask


class MultiheadAttention(BaseModule):
    """Multi-head Attention Module.

    This module implements multi-head attention that supports different input
    dims and embed dims. It also supports a shortcut from ``value``, which is
    useful when the input dims differ from the embed dims.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        input_dims (int, optional): The input dimension, and if None,
            use ``embed_dims``. Defaults to None.
        attn_drop (float): Dropout rate of the dropout layer after the
            attention calculation of query and key. Defaults to 0.
        proj_drop (float): Dropout rate of the dropout layer after the
            output projection. Defaults to 0.
        dropout_layer (dict): The dropout config before adding the shortcut.
            Defaults to ``dict(type='Dropout', drop_prob=0.)``.
        qkv_bias (bool): If True, add a learnable bias to q, k, v.
            Defaults to True.
        qk_scale (float, optional): Override default qk scale of
            ``head_dim ** -0.5`` if set. Defaults to None.
        proj_bias (bool): If True, add a learnable bias to output projection.
            Defaults to True.
        v_shortcut (bool): Add a shortcut from value to output. It's usually
            used if ``input_dims`` is different from ``embed_dims``.
            Defaults to False.
        use_layer_scale (bool): Whether to use layer scale. Defaults to False.
        layer_scale_init_value (float or torch.Tensor): Init value of layer
            scale. Defaults to 0.
        init_cfg (dict, optional): The Config for initialization.
            Defaults to None.
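
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = MultiheadAttention(embed_dims=96, num_heads=4)
        >>> x = torch.rand(2, 197, 96)  # (B, N, C)
        >>> attn(x).shape
        torch.Size([2, 197, 96])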
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 input_dims=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 dropout_layer=dict(type='Dropout', drop_prob=0.),
                 qkv_bias=True,
                 qk_scale=None,
                 proj_bias=True,
                 v_shortcut=False,
                 use_layer_scale=False,
                 layer_scale_init_value=0.,
                 init_cfg=None):
        super(MultiheadAttention, self).__init__(init_cfg=init_cfg)

        self.input_dims = input_dims or embed_dims
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.v_shortcut = v_shortcut

        self.head_dims = embed_dims // num_heads
        if qk_scale is not None:
            # Fall back to the Python implementation so that the custom
            # ``qk_scale`` can be applied to the attention logits.
            self.scaled_dot_product_attention = partial(
                scaled_dot_product_attention_pyimpl, scale=qk_scale)
        else:
            self.scaled_dot_product_attention = scaled_dot_product_attention

        self.qkv = nn.Linear(self.input_dims, embed_dims * 3, bias=qkv_bias)
        self.attn_drop = attn_drop
        self.proj = nn.Linear(embed_dims, embed_dims, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

        self.out_drop = build_dropout(dropout_layer)

        if use_layer_scale:
            warnings.warn('The `use_layer_scale` in `MultiheadAttention` will '
                          'be deprecated. Please use `layer_scale_init_value` '
                          'to control whether using layer scale or not.')

        if use_layer_scale or (layer_scale_init_value > 0):
            layer_scale_init_value = layer_scale_init_value or 1e-5
            self.gamma1 = LayerScale(
                embed_dims, layer_scale_init_value=layer_scale_init_value)
        else:
            self.gamma1 = nn.Identity()

    def forward(self, x):
        B, N, _ = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  self.head_dims).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn_drop = self.attn_drop if self.training else 0.
        x = self.scaled_dot_product_attention(q, k, v, dropout_p=attn_drop)
        x = x.transpose(1, 2).reshape(B, N, self.embed_dims)

        x = self.proj(x)
        x = self.out_drop(self.gamma1(self.proj_drop(x)))

        if self.v_shortcut:
            x = v.squeeze(1) + x
        return x


class BEiTAttention(BaseModule):
    """Window based multi-head self-attention (W-MSA) module with relative
    position bias.

    The initial implementation is in MMSegmentation.

    Args:
        embed_dims (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (tuple[int, int]): The height and width of the window.
        use_rel_pos_bias (bool): Whether to use unique relative position bias,
            if False, use shared relative position bias defined in backbone.
        bias (str): The option to add learnable bias for q, k, v. If bias is
            True, it will add a learnable bias. If bias is 'qv_bias', it will
            only add a learnable bias for q, v. If bias is False, it will not
            add bias for q, k, v. Defaults to 'qv_bias'.
        qk_scale (float | None, optional): Override default qk scale of
            ``head_dim ** -0.5`` if set. Defaults to None.
        attn_drop_rate (float): Dropout ratio of attention weight.
            Defaults to 0.
        proj_drop_rate (float): Dropout ratio of output. Defaults to 0.
        init_cfg (dict | None, optional): The Config for initialization.
            Defaults to None.
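
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = BEiTAttention(
        ...     embed_dims=96, num_heads=4, window_size=(14, 14),
        ...     use_rel_pos_bias=True)
        >>> x = torch.rand(2, 14 * 14 + 1, 96)  # (B, N, C) with a cls token
        >>> attn(x).shape
        torch.Size([2, 197, 96])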
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 window_size,
                 use_rel_pos_bias,
                 bias='qv_bias',
                 qk_scale=None,
                 attn_drop_rate=0.,
                 proj_drop_rate=0.,
                 init_cfg=None,
                 **kwargs):
        super().__init__(init_cfg=init_cfg)
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        head_embed_dims = embed_dims // num_heads
        self.bias = bias
        self.scale = qk_scale or head_embed_dims**-0.5

        qkv_bias = bias
        if bias == 'qv_bias':
            self._init_qv_bias()
            qkv_bias = False

        if window_size is None:
            assert not use_rel_pos_bias
        else:
            assert isinstance(window_size, tuple)
        self.window_size = window_size
        self.use_rel_pos_bias = use_rel_pos_bias
        self._init_rel_pos_embedding()

        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop_rate)
        self.proj = nn.Linear(embed_dims, embed_dims)
        self.proj_drop = nn.Dropout(proj_drop_rate)

    def _init_qv_bias(self):
        self.q_bias = nn.Parameter(torch.zeros(self.embed_dims))
        self.v_bias = nn.Parameter(torch.zeros(self.embed_dims))

    def _init_rel_pos_embedding(self):
        if self.use_rel_pos_bias:
            Wh, Ww = self.window_size
            # cls to token & token to cls & cls to cls
            self.num_relative_distance = (2 * Wh - 1) * (2 * Ww - 1) + 3
            # relative_position_bias_table shape is (2*Wh-1 * 2*Ww-1 + 3, nH)
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros(self.num_relative_distance, self.num_heads))

            # get pair-wise relative position index for
            # each token inside the window
            coords_h = torch.arange(Wh)
            coords_w = torch.arange(Ww)
            # coords shape is (2, Wh, Ww)
            coords = torch.stack(torch_meshgrid([coords_h, coords_w]))
            # coords_flatten shape is (2, Wh*Ww)
            coords_flatten = torch.flatten(coords, 1)
            relative_coords = (
                coords_flatten[:, :, None] - coords_flatten[:, None, :])
            # relative_coords shape is (Wh*Ww, Wh*Ww, 2)
            relative_coords = relative_coords.permute(1, 2, 0).contiguous()
            # shift to start from 0
            relative_coords[:, :, 0] += Wh - 1
            relative_coords[:, :, 1] += Ww - 1
            relative_coords[:, :, 0] *= 2 * Ww - 1
            relative_position_index = torch.zeros(
                size=(Wh * Ww + 1, ) * 2, dtype=relative_coords.dtype)
            # relative_position_index shape is (Wh*Ww + 1, Wh*Ww + 1)
            relative_position_index[1:, 1:] = relative_coords.sum(-1)
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            self.register_buffer('relative_position_index',
                                 relative_position_index)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

    def init_weights(self):
        super().init_weights()
        if self.use_rel_pos_bias:
            trunc_normal_(self.relative_position_bias_table, std=0.02)

    def forward(self, x, rel_pos_bias=None):
        """
        Args:
            x (tensor): input features with shape of (num_windows*B, N, C).
            rel_pos_bias (tensor): input relative position bias with shape of
                (num_heads, N, N).
        """
        B, N, C = x.shape

        if self.bias == 'qv_bias':
            k_bias = torch.zeros_like(self.v_bias, requires_grad=False)
            qkv_bias = torch.cat((self.q_bias, k_bias, self.v_bias))
            qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        else:
            qkv = self.qkv(x)

        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        if self.relative_position_bias_table is not None:
            Wh = self.window_size[0]
            Ww = self.window_size[1]
            relative_position_bias = self.relative_position_bias_table[
                self.relative_position_index.view(-1)].view(
                    Wh * Ww + 1, Wh * Ww + 1, -1)
            relative_position_bias = relative_position_bias.permute(
                2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
            attn = attn + relative_position_bias.unsqueeze(0)

        if rel_pos_bias is not None:
            # use shared relative position bias
            attn = attn + rel_pos_bias

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class ChannelMultiheadAttention(BaseModule):
    """Channel Multihead Self-attention Module.

    This module implements channel multi-head attention that supports
    different input dims and embed dims.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        input_dims (int, optional): The input dimension, and if None,
            use ``embed_dims``. Defaults to None.
        attn_drop (float): Dropout rate of the dropout layer after the
            attention calculation of query and key. Defaults to 0.
        proj_drop (float): Dropout rate of the dropout layer after the
            output projection. Defaults to 0.
        dropout_layer (dict): The dropout config before adding the shortcut.
            Defaults to ``dict(type='Dropout', drop_prob=0.)``.
        qkv_bias (bool): If True, add a learnable bias to q, k, v.
            Defaults to False.
        proj_bias (bool): If True, add a learnable bias to output projection.
            Defaults to True.
        qk_scale_type (str): The scale type of qk scale.
            Defaults to 'learnable'. It can be 'learnable', 'fixed' or 'none'.
        qk_scale (float, optional): If ``qk_scale_type`` is 'none', this
            should be specified with a valid float number. Defaults to None.
        v_shortcut (bool): Add a shortcut from value to output. It's usually
            used if ``input_dims`` is different from ``embed_dims``.
            Defaults to False.
        init_cfg (dict, optional): The Config for initialization.
            Defaults to None.
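
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = ChannelMultiheadAttention(embed_dims=96, num_heads=4)
        >>> x = torch.rand(2, 197, 96)  # (B, N, C)
        >>> attn(x).shape
        torch.Size([2, 197, 96])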
    """

    def __init__(self,
                 embed_dims,
                 num_heads=8,
                 input_dims=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 dropout_layer=dict(type='Dropout', drop_prob=0.),
                 qkv_bias=False,
                 proj_bias=True,
                 qk_scale_type='learnable',
                 qk_scale=None,
                 v_shortcut=False,
                 init_cfg=None):
        super().__init__(init_cfg)

        self.input_dims = input_dims or embed_dims
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.v_shortcut = v_shortcut

        self.head_dims = embed_dims // num_heads
        if qk_scale_type == 'learnable':
            self.scale = nn.Parameter(torch.ones(num_heads, 1, 1))
        elif qk_scale_type == 'fixed':
            self.scale = self.head_dims**-0.5
        elif qk_scale_type == 'none':
            assert qk_scale is not None
            self.scale = qk_scale

        self.qkv = nn.Linear(self.input_dims, embed_dims * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(embed_dims, embed_dims, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

        self.out_drop = build_dropout(dropout_layer)

    def forward(self, x):
        B, N, _ = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  self.head_dims).permute(2, 0, 3, 1, 4)

        q, k, v = [item.transpose(-2, -1) for item in [qkv[0], qkv[1], qkv[2]]]

        q, k = F.normalize(q, dim=-1), F.normalize(k, dim=-1)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        x = (attn @ v).permute(0, 3, 1, 2).reshape(B, N, self.embed_dims)
        x = self.proj(x)
        x = self.out_drop(self.proj_drop(x))

        if self.v_shortcut:
            x = qkv[2].squeeze(1) + x
        return x


class LeAttention(BaseModule):
    """LeViT Attention. Multi-head attention with attention bias, which is
    proposed in `LeViT: a Vision Transformer in ConvNet’s Clothing for Faster
    Inference <https://arxiv.org/abs/2104.01136>`_

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads. Default: 8.
        key_dim (int): Dimension of the key in each head.
        attn_ratio (int): Ratio of the dimension of the value to ``key_dim``
            in each head. Default: 4.
        resolution (tuple[int]): Input resolution. Default: (14, 14).
        init_cfg (dict, optional): The Config for initialization.
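
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = LeAttention(
        ...     dim=192, key_dim=16, num_heads=4, attn_ratio=2,
        ...     resolution=(7, 7))
        >>> x = torch.rand(2, 7 * 7, 192)  # (B, resolution[0]*resolution[1], C)
        >>> attn(x).shape
        torch.Size([2, 49, 192])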
    """

    def __init__(self,
                 dim,
                 key_dim,
                 num_heads=8,
                 attn_ratio=4,
                 resolution=(14, 14),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        # (h, w)
        assert isinstance(resolution, tuple) and len(resolution) == 2
        self.num_heads = num_heads
        self.scale = key_dim**-0.5
        self.key_dim = key_dim
        self.nh_kd = nh_kd = key_dim * num_heads
        self.d = int(attn_ratio * key_dim)
        self.dh = int(attn_ratio * key_dim) * num_heads
        self.attn_ratio = attn_ratio
        h = self.dh + nh_kd * 2

        self.norm = nn.LayerNorm(dim)
        self.qkv = nn.Linear(dim, h)
        self.proj = nn.Linear(self.dh, dim)

        points = list(
            itertools.product(range(resolution[0]), range(resolution[1])))
        N = len(points)
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(
            torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer(
            'attention_bias_idxs',
            torch.LongTensor(idxs).view(N, N),
            persistent=False)

    @torch.no_grad()
    def train(self, mode=True):
        super().train(mode)
        if mode and hasattr(self, 'ab'):
            del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]

    def forward(self, x):  # x (B,N,C)
        B, N, _ = x.shape

        # Normalization
        x = self.norm(x)

        qkv = self.qkv(x)
        # (B, N, num_heads, d)
        q, k, v = qkv.view(B, N, self.num_heads,
                           -1).split([self.key_dim, self.key_dim, self.d],
                                     dim=3)
        # (B, num_heads, N, d)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        attn = ((q @ k.transpose(-2, -1)) * self.scale +
                (self.attention_biases[:, self.attention_bias_idxs]
                 if self.training else self.ab))
        attn = attn.softmax(dim=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh)
        x = self.proj(x)
        return x


class CrossMultiheadAttention(BaseModule):
    """Cross attention between queries and the union of keys and values.

    This module is different from ``MultiheadAttention`` in that the attention
    is computed between the queries and the union of the keys and values.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        qkv_bias (bool): If True, add a learnable bias to q, k, v.
            Defaults to False.
        qk_scale (float, optional): Override default qk scale of
            ``head_dim ** -0.5`` if set. Defaults to None.
        attn_drop (float): Dropout rate of the dropout layer after the
            attention calculation of query and key. Defaults to 0.
        proj_drop (float): Dropout rate of the dropout layer after the
            output projection. Defaults to 0.
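
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = CrossMultiheadAttention(embed_dims=96, num_heads=4)
        >>> x = torch.rand(2, 10, 96)   # queries, (B, N_q, C)
        >>> kv = torch.rand(2, 50, 96)  # keys and values, (B, N_k, C)
        >>> attn(x, k=kv, v=kv).shape
        torch.Size([2, 10, 96])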
    """

    def __init__(self,
                 embed_dims: int,
                 num_heads: int = 8,
                 qkv_bias: bool = False,
                 qk_scale: float = None,
                 attn_drop: float = 0.,
                 proj_drop: float = 0.) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = embed_dims // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.q = nn.Linear(embed_dims, embed_dims, bias=False)
        self.k = nn.Linear(embed_dims, embed_dims, bias=False)
        self.v = nn.Linear(embed_dims, embed_dims, bias=False)

        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(embed_dims))
            self.v_bias = nn.Parameter(torch.zeros(embed_dims))
        else:
            self.q_bias = None
            self.k_bias = None
            self.v_bias = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(embed_dims, embed_dims)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self,
                x: torch.Tensor,
                k: torch.Tensor = None,
                v: torch.Tensor = None) -> torch.Tensor:
        """Forward function."""
        B, N, _ = x.shape

        N_k = k.shape[1]
        N_v = v.shape[1]

        q_bias, k_bias, v_bias = None, None, None
        if self.q_bias is not None:
            q_bias = self.q_bias
            k_bias = torch.zeros_like(self.v_bias, requires_grad=False)
            v_bias = self.v_bias

        q = F.linear(
            input=x, weight=self.q.weight, bias=q_bias)  # (B, N_q, dim)
        k = F.linear(
            input=k, weight=self.k.weight, bias=k_bias)  # (B, N_k, dim)
        v = F.linear(input=v, weight=self.v.weight, bias=v_bias)

        q = q.reshape(B, N, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1,
                                  4).squeeze(0)  # (B, num_heads, N_q, dim)
        k = k.reshape(B, N_k, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1,
                                  4).squeeze(0)  # (B, num_heads, N_k, dim)
        v = v.reshape(B, N_v, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1,
                                  4).squeeze(0)  # (B, num_heads, N_v, dim)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))  # (B, N_head, N_q, N_k)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class PromptMultiheadAttention(MultiheadAttention):
    """Prompt Multihead Attention for MILAN.

    This module is specific for the prompt encoder in MILAN. It will not
    update the visible tokens from the encoder.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        input_dims (int, optional): The input dimension, and if None,
            use ``embed_dims``. Defaults to None.
        attn_drop (float): Dropout rate of the dropout layer after the
            attention calculation of query and key. Defaults to 0.
        proj_drop (float): Dropout rate of the dropout layer after the
            output projection. Defaults to 0.
        dropout_layer (dict): The dropout config before adding the shortcut.
            Defaults to ``dict(type='Dropout', drop_prob=0.)``.
        qkv_bias (bool): If True, add a learnable bias to q, k, v.
            Defaults to True.
        qk_scale (float, optional): Override default qk scale of
            ``head_dim ** -0.5`` if set. Defaults to None.
        proj_bias (bool): If True, add a learnable bias to output projection.
            Defaults to True.
        v_shortcut (bool): Add a shortcut from value to output. It's usually
            used if ``input_dims`` is different from ``embed_dims``.
            Defaults to False.
        use_layer_scale (bool): Whether to use layer scale. Defaults to False.
        return_attention (bool): If True, return the attention map, computed
            by the cross attention between the class token and all other
            tokens. Defaults to False.
        init_cfg (Union[List[dict], dict], optional): The Config for
            initialization. Defaults to None.
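
    Examples:
        >>> # A minimal usage sketch; the sizes below are illustrative
        >>> # assumptions rather than values required by this module.
        >>> import torch
        >>> attn = PromptMultiheadAttention(embed_dims=96, num_heads=4)
        >>> mask_tokens = torch.rand(2, 147, 96)    # N x L_m x C
        >>> visible_tokens = torch.rand(2, 50, 96)  # N x L_v x C (cls + 49)
        >>> ids_restore = torch.argsort(torch.rand(2, 196), dim=1)  # N x L
        >>> attn(mask_tokens, visible_tokens, ids_restore).shape
        torch.Size([2, 147, 96])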
    """

    def __init__(self,
                 embed_dims: int,
                 num_heads: int,
                 input_dims: Optional[int] = None,
                 attn_drop: float = 0,
                 proj_drop: float = 0,
                 dropout_layer: dict = dict(type='Dropout', drop_prob=0.),
                 qkv_bias: bool = True,
                 qk_scale: Optional[float] = None,
                 proj_bias: bool = True,
                 v_shortcut: bool = False,
                 use_layer_scale: bool = False,
                 init_cfg: Optional[Union[List[dict], dict]] = None) -> None:
        super().__init__(
            embed_dims=embed_dims,
            num_heads=num_heads,
            input_dims=input_dims,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            dropout_layer=dropout_layer,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            proj_bias=proj_bias,
            v_shortcut=v_shortcut,
            use_layer_scale=use_layer_scale,
            init_cfg=init_cfg)
        # no longer need qkv
        del self.qkv

        # to project the mask tokens
        self.q = nn.Linear(embed_dims, embed_dims, bias=qkv_bias)
        # to project all the tokens
        self.kv = nn.Linear(embed_dims, embed_dims * 2, bias=qkv_bias)

    def forward(self, x: torch.Tensor, visible_tokens: torch.Tensor,
                ids_restore: torch.Tensor) -> torch.Tensor:
        """Forward function for `PromptMultiheadAttention`.

        Args:
            x (torch.Tensor): Mask token features with shape N x L_m x C.
            visible_tokens (torch.Tensor): The visible tokens features from
                encoder with shape N x L_v x C.
            ids_restore (torch.Tensor): The ids of all tokens in the original
                image with shape N x L.

        Returns:
            torch.Tensor: Output features with shape N x L_m x C.
        """
        x_ = torch.cat([visible_tokens[:, 1:, :], x], dim=1)
        assert x_.shape[1] == ids_restore.shape[1]
        x_ = torch.gather(
            x_,
            dim=1,
            index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[-1]))
        x_ = torch.cat([visible_tokens[:, :1, :], x_], dim=1)

        # full sequence shape
        B, _, _ = x_.shape
        q = self.q(x).reshape(B, x.shape[1], self.num_heads,
                              self.head_dims).permute(0, 2, 1, 3)
        kv = self.kv(x_).reshape(B, x_.shape[1], 2, self.num_heads,
                                 self.head_dims).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        attn_drop = self.attn_drop if self.training else 0.
        attn = self.scaled_dot_product_attention(q, k, v, dropout_p=attn_drop)
        x = attn.transpose(1, 2).reshape(B, x.shape[1], self.embed_dims)

        x = self.proj(x)
        x = self.out_drop(self.gamma1(self.proj_drop(x)))
        return x