mmsegmentation/mmseg/models/backbones/twins.py

# Copyright (c) OpenMMLab. All rights reserved.
import math
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_norm_layer
from mmcv.cnn.bricks.drop import build_dropout
from mmcv.cnn.bricks.transformer import FFN
from mmcv.cnn.utils.weight_init import (constant_init, normal_init,
trunc_normal_init)
from mmcv.runner import BaseModule, ModuleList
from torch.nn.modules.batchnorm import _BatchNorm
from mmseg.models.backbones.mit import EfficientMultiheadAttention
from mmseg.models.builder import BACKBONES
from ..utils.embed import PatchEmbed
class GlobalSubsampledAttention(EfficientMultiheadAttention):
"""Global Sub-sampled Attention (Spatial Reduction Attention)
    This module is derived from ``EfficientMultiheadAttention`` in
    mmseg.models.backbones.mit. Functionally the two are identical;
    ``GlobalSubsampledAttention`` is defined as a separate class only
    because the operation is named Global Sub-sampled Attention (GSA)
    in the Twins paper.
Args:
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads.
attn_drop (float): A Dropout layer on attn_output_weights.
Default: 0.0.
proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
Default: 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut. Default: None.
        batch_first (bool): If True, Key, Query and Value have shape
            (batch, n, embed_dims); otherwise (n, batch, embed_dims).
            Default: True.
qkv_bias (bool): enable bias for qkv if True. Default: True.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
        sr_ratio (int): The spatial reduction ratio of GSA, used as the
            kernel size and stride of its reduction convolution. Default: 1.
init_cfg (dict, optional): The Config for initialization.
Defaults to None.
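
    Example:
        A minimal forward-pass sketch; the input resolution and channel
        size below are illustrative only.

        >>> import torch
        >>> gsa = GlobalSubsampledAttention(
        ...     embed_dims=64, num_heads=2, sr_ratio=2)
        >>> x = torch.randn(1, 32 * 32, 64)  # (batch, h * w, embed_dims)
        >>> gsa(x, hw_shape=(32, 32)).shape
        torch.Size([1, 1024, 64])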
"""
def __init__(self,
embed_dims,
num_heads,
attn_drop=0.,
proj_drop=0.,
dropout_layer=None,
batch_first=True,
qkv_bias=True,
norm_cfg=dict(type='LN'),
sr_ratio=1,
init_cfg=None):
super(GlobalSubsampledAttention, self).__init__(
embed_dims,
num_heads,
attn_drop=attn_drop,
proj_drop=proj_drop,
dropout_layer=dropout_layer,
batch_first=batch_first,
qkv_bias=qkv_bias,
norm_cfg=norm_cfg,
sr_ratio=sr_ratio,
init_cfg=init_cfg)
class GSAEncoderLayer(BaseModule):
"""Implements one encoder layer with GSA.
Args:
embed_dims (int): The feature dimension.
num_heads (int): Parallel attention heads.
feedforward_channels (int): The hidden dimension for FFNs.
drop_rate (float): Probability of an element to be zeroed
after the feed forward layer. Default: 0.0.
attn_drop_rate (float): The drop out rate for attention layer.
Default: 0.0.
drop_path_rate (float): Stochastic depth rate. Default 0.0.
num_fcs (int): The number of fully-connected layers for FFNs.
Default: 2.
qkv_bias (bool): Enable bias for qkv if True. Default: True
act_cfg (dict): The activation config for FFNs.
Default: dict(type='GELU').
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
        sr_ratio (float): The spatial reduction ratio of the GSA module,
            used as the kernel size and stride of its reduction convolution.
            Default: 1.
init_cfg (dict, optional): The Config for initialization.
Defaults to None.
"""
def __init__(self,
embed_dims,
num_heads,
feedforward_channels,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
num_fcs=2,
qkv_bias=True,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN'),
sr_ratio=1.,
init_cfg=None):
super(GSAEncoderLayer, self).__init__(init_cfg=init_cfg)
self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1]
self.attn = GlobalSubsampledAttention(
embed_dims=embed_dims,
num_heads=num_heads,
attn_drop=attn_drop_rate,
proj_drop=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
qkv_bias=qkv_bias,
norm_cfg=norm_cfg,
sr_ratio=sr_ratio)
self.norm2 = build_norm_layer(norm_cfg, embed_dims, postfix=2)[1]
self.ffn = FFN(
embed_dims=embed_dims,
feedforward_channels=feedforward_channels,
num_fcs=num_fcs,
ffn_drop=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
act_cfg=act_cfg,
add_identity=False)
self.drop_path = build_dropout(
dict(type='DropPath', drop_prob=drop_path_rate)
) if drop_path_rate > 0. else nn.Identity()
def forward(self, x, hw_shape):
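        # Pre-norm residual blocks. Passing `identity=0.` makes the attention
        # module return only its output (its internal shortcut is disabled),
        # so the residual connection is added explicitly here via drop_path.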
x = x + self.drop_path(self.attn(self.norm1(x), hw_shape, identity=0.))
x = x + self.drop_path(self.ffn(self.norm2(x)))
return x
class LocallyGroupedSelfAttention(BaseModule):
"""Locally-grouped Self Attention (LSA) module.
Args:
embed_dims (int): Number of input channels.
num_heads (int): Number of attention heads. Default: 8
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
Default: False.
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
attn_drop_rate (float, optional): Dropout ratio of attention weight.
Default: 0.0
proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
        window_size (int): Window size of LSA. Default: 1.
init_cfg (dict, optional): The Config for initialization.
Defaults to None.
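
    Example:
        A minimal forward-pass sketch; the shapes are illustrative and
        ``h * w`` must equal the sequence length of ``x``.

        >>> import torch
        >>> lsa = LocallyGroupedSelfAttention(
        ...     embed_dims=64, num_heads=4, window_size=7)
        >>> x = torch.randn(2, 56 * 56, 64)  # (batch, h * w, embed_dims)
        >>> lsa(x, hw_shape=(56, 56)).shape
        torch.Size([2, 3136, 64])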
"""
def __init__(self,
embed_dims,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop_rate=0.,
proj_drop_rate=0.,
window_size=1,
init_cfg=None):
super(LocallyGroupedSelfAttention, self).__init__(init_cfg=init_cfg)
assert embed_dims % num_heads == 0, f'dim {embed_dims} should be ' \
f'divided by num_heads ' \
f'{num_heads}.'
self.embed_dims = embed_dims
self.num_heads = num_heads
head_dim = embed_dims // num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop_rate)
self.proj = nn.Linear(embed_dims, embed_dims)
self.proj_drop = nn.Dropout(proj_drop_rate)
self.window_size = window_size
def forward(self, x, hw_shape):
b, n, c = x.shape
h, w = hw_shape
x = x.view(b, h, w, c)
# pad feature maps to multiples of Local-groups
pad_l = pad_t = 0
pad_r = (self.window_size - w % self.window_size) % self.window_size
pad_b = (self.window_size - h % self.window_size) % self.window_size
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
# calculate attention mask for LSA
Hp, Wp = x.shape[1:-1]
_h, _w = Hp // self.window_size, Wp // self.window_size
        mask = torch.zeros((1, Hp, Wp), device=x.device)
        # Mark the padded rows/columns with 1. Guard against zero padding,
        # because `-0:` selects the whole axis and would mark every position.
        if pad_b > 0:
            mask[:, -pad_b:, :].fill_(1)
        if pad_r > 0:
            mask[:, :, -pad_r:].fill_(1)
# [B, _h, _w, window_size, window_size, C]
x = x.reshape(b, _h, self.window_size, _w, self.window_size,
c).transpose(2, 3)
mask = mask.reshape(1, _h, self.window_size, _w,
self.window_size).transpose(2, 3).reshape(
1, _h * _w,
self.window_size * self.window_size)
# [1, _h*_w, window_size*window_size, window_size*window_size]
attn_mask = mask.unsqueeze(2) - mask.unsqueeze(3)
attn_mask = attn_mask.masked_fill(attn_mask != 0,
float(-1000.0)).masked_fill(
attn_mask == 0, float(0.0))
# [3, B, _w*_h, nhead, window_size*window_size, dim]
qkv = self.qkv(x).reshape(b, _h * _w,
self.window_size * self.window_size, 3,
self.num_heads, c // self.num_heads).permute(
3, 0, 1, 4, 2, 5)
q, k, v = qkv[0], qkv[1], qkv[2]
# [B, _h*_w, n_head, window_size*window_size, window_size*window_size]
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn + attn_mask.unsqueeze(2)
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
attn = (attn @ v).transpose(2, 3).reshape(b, _h, _w, self.window_size,
self.window_size, c)
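        # Merge the windows back into the (padded) feature-map layout before
        # the padding is cropped off below.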
x = attn.transpose(2, 3).reshape(b, _h * self.window_size,
_w * self.window_size, c)
if pad_r > 0 or pad_b > 0:
x = x[:, :h, :w, :].contiguous()
x = x.reshape(b, n, c)
x = self.proj(x)
x = self.proj_drop(x)
return x
class LSAEncoderLayer(BaseModule):
"""Implements one encoder layer in Twins-SVT.
Args:
embed_dims (int): The feature dimension.
num_heads (int): Parallel attention heads.
feedforward_channels (int): The hidden dimension for FFNs.
drop_rate (float): Probability of an element to be zeroed
after the feed forward layer. Default: 0.0.
attn_drop_rate (float, optional): Dropout ratio of attention weight.
Default: 0.0
drop_path_rate (float): Stochastic depth rate. Default 0.0.
num_fcs (int): The number of fully-connected layers for FFNs.
Default: 2.
qkv_bias (bool): Enable bias for qkv if True. Default: True
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
act_cfg (dict): The activation config for FFNs.
Default: dict(type='GELU').
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
window_size (int): Window size of LSA. Default: 1.
init_cfg (dict, optional): The Config for initialization.
Defaults to None.
"""
def __init__(self,
embed_dims,
num_heads,
feedforward_channels,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
num_fcs=2,
qkv_bias=True,
qk_scale=None,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN'),
window_size=1,
init_cfg=None):
super(LSAEncoderLayer, self).__init__(init_cfg=init_cfg)
self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1]
self.attn = LocallyGroupedSelfAttention(embed_dims, num_heads,
qkv_bias, qk_scale,
attn_drop_rate, drop_rate,
window_size)
self.norm2 = build_norm_layer(norm_cfg, embed_dims, postfix=2)[1]
self.ffn = FFN(
embed_dims=embed_dims,
feedforward_channels=feedforward_channels,
num_fcs=num_fcs,
ffn_drop=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
act_cfg=act_cfg,
add_identity=False)
self.drop_path = build_dropout(
dict(type='DropPath', drop_prob=drop_path_rate)
) if drop_path_rate > 0. else nn.Identity()
def forward(self, x, hw_shape):
x = x + self.drop_path(self.attn(self.norm1(x), hw_shape))
x = x + self.drop_path(self.ffn(self.norm2(x)))
return x
class ConditionalPositionEncoding(BaseModule):
"""The Conditional Position Encoding (CPE) module.
    The CPE is the implementation of `Conditional Positional Encodings
    for Vision Transformers <https://arxiv.org/abs/2102.10882>`_.
Args:
in_channels (int): Number of input channels.
embed_dims (int): The feature dimension. Default: 768.
stride (int): Stride of conv layer. Default: 1.
"""
def __init__(self, in_channels, embed_dims=768, stride=1, init_cfg=None):
super(ConditionalPositionEncoding, self).__init__(init_cfg=init_cfg)
self.proj = nn.Conv2d(
in_channels,
embed_dims,
kernel_size=3,
stride=stride,
padding=1,
bias=True,
groups=embed_dims)
self.stride = stride
def forward(self, x, hw_shape):
b, n, c = x.shape
h, w = hw_shape
feat_token = x
cnn_feat = feat_token.transpose(1, 2).view(b, c, h, w)
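        # PEG-style encoding: a 3x3 depth-wise conv (groups=embed_dims)
        # produces position-aware features; with stride 1 the input is added
        # back as a residual.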
if self.stride == 1:
x = self.proj(cnn_feat) + cnn_feat
else:
x = self.proj(cnn_feat)
x = x.flatten(2).transpose(1, 2)
return x
@BACKBONES.register_module()
class PCPVT(BaseModule):
"""The backbone of Twins-PCPVT.
This backbone is the implementation of `Twins: Revisiting the Design
of Spatial Attention in Vision Transformers
    <https://arxiv.org/abs/2104.13840>`_.
Args:
in_channels (int): Number of input channels. Default: 3.
embed_dims (list): Embedding dimension. Default: [64, 128, 256, 512].
patch_sizes (list): The patch sizes. Default: [4, 2, 2, 2].
strides (list): The strides. Default: [4, 2, 2, 2].
        num_heads (list): Number of attention heads in each stage.
            Default: [1, 2, 4, 8].
        mlp_ratios (list): Ratio of mlp hidden dim to embedding dim in
            each stage. Default: [4, 4, 4, 4].
out_indices (tuple[int]): Output from which stages.
Default: (0, 1, 2, 3).
qkv_bias (bool): Enable bias for qkv if True. Default: False.
drop_rate (float): Probability of an element to be zeroed.
Default 0.
attn_drop_rate (float): The drop out rate for attention layer.
Default 0.0
drop_path_rate (float): Stochastic depth rate. Default 0.0
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN')
depths (list): Depths of each stage. Default [3, 4, 6, 3]
        sr_ratios (list): Spatial reduction ratio of the GSA module in each
            Transformer encoder layer. Default: [8, 4, 2, 1].
        norm_after_stage (bool): Add an extra norm after each stage.
            Default: False.
init_cfg (dict, optional): The Config for initialization.
Defaults to None.
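
    Example:
        A minimal usage sketch; the embedding dims, depths and input
        resolution below are illustrative, not the pretrained settings.

        >>> import torch
        >>> model = PCPVT(embed_dims=[32, 64, 128, 256], depths=[1, 1, 1, 1])
        >>> x = torch.randn(1, 3, 224, 224)
        >>> for out in model(x):
        ...     print(tuple(out.shape))
        (1, 32, 56, 56)
        (1, 64, 28, 28)
        (1, 128, 14, 14)
        (1, 256, 7, 7)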
"""
def __init__(self,
in_channels=3,
embed_dims=[64, 128, 256, 512],
patch_sizes=[4, 2, 2, 2],
strides=[4, 2, 2, 2],
num_heads=[1, 2, 4, 8],
mlp_ratios=[4, 4, 4, 4],
out_indices=(0, 1, 2, 3),
qkv_bias=False,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_cfg=dict(type='LN'),
depths=[3, 4, 6, 3],
sr_ratios=[8, 4, 2, 1],
norm_after_stage=False,
pretrained=None,
init_cfg=None):
super(PCPVT, self).__init__(init_cfg=init_cfg)
assert not (init_cfg and pretrained), \
'init_cfg and pretrained cannot be set at the same time'
if isinstance(pretrained, str):
warnings.warn('DeprecationWarning: pretrained is deprecated, '
'please use "init_cfg" instead')
self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
elif pretrained is not None:
raise TypeError('pretrained must be a str or None')
self.depths = depths
# patch_embed
self.patch_embeds = ModuleList()
self.position_encoding_drops = ModuleList()
self.layers = ModuleList()
for i in range(len(depths)):
self.patch_embeds.append(
PatchEmbed(
in_channels=in_channels if i == 0 else embed_dims[i - 1],
embed_dims=embed_dims[i],
conv_type='Conv2d',
kernel_size=patch_sizes[i],
stride=strides[i],
padding='corner',
norm_cfg=norm_cfg))
self.position_encoding_drops.append(nn.Dropout(p=drop_rate))
self.position_encodings = ModuleList([
ConditionalPositionEncoding(embed_dim, embed_dim)
for embed_dim in embed_dims
])
# transformer encoder
dpr = [
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
] # stochastic depth decay rule
cur = 0
for k in range(len(depths)):
_block = ModuleList([
GSAEncoderLayer(
embed_dims=embed_dims[k],
num_heads=num_heads[k],
feedforward_channels=mlp_ratios[k] * embed_dims[k],
attn_drop_rate=attn_drop_rate,
drop_rate=drop_rate,
drop_path_rate=dpr[cur + i],
num_fcs=2,
qkv_bias=qkv_bias,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN'),
sr_ratio=sr_ratios[k]) for i in range(depths[k])
])
self.layers.append(_block)
cur += depths[k]
self.norm_name, norm = build_norm_layer(
norm_cfg, embed_dims[-1], postfix=1)
self.out_indices = out_indices
self.norm_after_stage = norm_after_stage
if self.norm_after_stage:
self.norm_list = ModuleList()
for dim in embed_dims:
self.norm_list.append(build_norm_layer(norm_cfg, dim)[1])
def init_weights(self):
if self.init_cfg is not None:
super(PCPVT, self).init_weights()
else:
for m in self.modules():
if isinstance(m, nn.Linear):
trunc_normal_init(m, std=.02, bias=0.)
elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)):
constant_init(m, val=1.0, bias=0.)
elif isinstance(m, nn.Conv2d):
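                    # Kaiming-style fan-out initialization for conv layers,
                    # with fan_out scaled down by groups for depth-wise convs.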
fan_out = m.kernel_size[0] * m.kernel_size[
1] * m.out_channels
fan_out //= m.groups
normal_init(
m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0)
def forward(self, x):
outputs = list()
b = x.shape[0]
for i in range(len(self.depths)):
x, hw_shape = self.patch_embeds[i](x)
h, w = hw_shape
x = self.position_encoding_drops[i](x)
for j, blk in enumerate(self.layers[i]):
x = blk(x, hw_shape)
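                # Following CPVT, the conditional position encoding is applied
                # only once per stage, right after the first encoder block.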
if j == 0:
x = self.position_encodings[i](x, hw_shape)
if self.norm_after_stage:
x = self.norm_list[i](x)
x = x.reshape(b, h, w, -1).permute(0, 3, 1, 2).contiguous()
if i in self.out_indices:
outputs.append(x)
return tuple(outputs)
@BACKBONES.register_module()
class SVT(PCPVT):
"""The backbone of Twins-SVT.
This backbone is the implementation of `Twins: Revisiting the Design
of Spatial Attention in Vision Transformers
    <https://arxiv.org/abs/2104.13840>`_.
Args:
in_channels (int): Number of input channels. Default: 3.
        embed_dims (list): Embedding dimension. Default: [64, 128, 256].
patch_sizes (list): The patch sizes. Default: [4, 2, 2, 2].
strides (list): The strides. Default: [4, 2, 2, 2].
        num_heads (list): Number of attention heads in each stage.
            Default: [1, 2, 4].
        mlp_ratios (list): Ratio of mlp hidden dim to embedding dim in
            each stage. Default: [4, 4, 4].
out_indices (tuple[int]): Output from which stages.
Default: (0, 1, 2, 3).
qkv_bias (bool): Enable bias for qkv if True. Default: False.
drop_rate (float): Dropout rate. Default 0.
attn_drop_rate (float): Dropout ratio of attention weight.
Default 0.0
drop_path_rate (float): Stochastic depth rate. Default 0.2.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN')
        depths (list): Depths of each stage. Default: [4, 4, 4].
        sr_ratios (list): Spatial reduction ratio of the GSA module in each
            Transformer encoder layer. Default: [4, 2, 1].
        windiow_sizes (list): Window size of the LSA module in each stage.
            Default: [7, 7, 7].
        norm_after_stage (bool): Add an extra norm after each stage.
            Default: True.
init_cfg (dict, optional): The Config for initialization.
Defaults to None.
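
    Example:
        A minimal usage sketch; the settings below are illustrative, not
        the released Twins-SVT configurations.

        >>> import torch
        >>> model = SVT(embed_dims=[32, 64, 128], num_heads=[1, 2, 4],
        ...             mlp_ratios=[4, 4, 4], depths=[2, 2, 2],
        ...             windiow_sizes=[7, 7, 7])
        >>> x = torch.randn(1, 3, 224, 224)
        >>> for out in model(x):
        ...     print(tuple(out.shape))
        (1, 32, 56, 56)
        (1, 64, 28, 28)
        (1, 128, 14, 14)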
"""
def __init__(self,
in_channels=3,
embed_dims=[64, 128, 256],
patch_sizes=[4, 2, 2, 2],
strides=[4, 2, 2, 2],
num_heads=[1, 2, 4],
mlp_ratios=[4, 4, 4],
out_indices=(0, 1, 2, 3),
qkv_bias=False,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.2,
norm_cfg=dict(type='LN'),
depths=[4, 4, 4],
sr_ratios=[4, 2, 1],
windiow_sizes=[7, 7, 7],
norm_after_stage=True,
pretrained=None,
init_cfg=None):
super(SVT, self).__init__(in_channels, embed_dims, patch_sizes,
strides, num_heads, mlp_ratios, out_indices,
qkv_bias, drop_rate, attn_drop_rate,
drop_path_rate, norm_cfg, depths, sr_ratios,
norm_after_stage, pretrained, init_cfg)
# transformer encoder
dpr = [
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
] # stochastic depth decay rule
for k in range(len(depths)):
for i in range(depths[k]):
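                # Interleave attention types: even-indexed layers are replaced
                # with locally-grouped self-attention (LSA); odd-indexed layers
                # keep the global sub-sampled attention (GSA) built by PCPVT.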
if i % 2 == 0:
self.layers[k][i] = \
LSAEncoderLayer(
embed_dims=embed_dims[k],
num_heads=num_heads[k],
feedforward_channels=mlp_ratios[k] * embed_dims[k],
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=dpr[sum(depths[:k])+i],
qkv_bias=qkv_bias,
window_size=windiow_sizes[k])