mirror of https://github.com/open-mmlab/mmcv.git
682 lines
20 KiB
Python
682 lines
20 KiB
Python
# Copyright (c) OpenMMLab. All rights reserved.
|
|
import copy
|
|
|
|
import pytest
|
|
import torch
|
|
|
|
from mmcv.cnn.bricks.drop import DropPath
|
|
from mmcv.cnn.bricks.transformer import (FFN, AdaptivePadding,
|
|
BaseTransformerLayer,
|
|
MultiheadAttention, PatchEmbed,
|
|
PatchMerging,
|
|
TransformerLayerSequence)
|
|
from mmcv.runner import ModuleList
|
|
|
|
|
|
def test_adaptive_padding():
|
|
|
|
for padding in ('same', 'corner'):
|
|
kernel_size = 16
|
|
stride = 16
|
|
dilation = 1
|
|
input = torch.rand(1, 1, 15, 17)
|
|
adap_pad = AdaptivePadding(
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
dilation=dilation,
|
|
padding=padding)
|
|
out = adap_pad(input)
|
|
# padding to divisible by 16
|
|
assert (out.shape[2], out.shape[3]) == (16, 32)
|
|
input = torch.rand(1, 1, 16, 17)
|
|
out = adap_pad(input)
|
|
# padding to divisible by 16
|
|
assert (out.shape[2], out.shape[3]) == (16, 32)
|
|
|
|
kernel_size = (2, 2)
|
|
stride = (2, 2)
|
|
dilation = (1, 1)
|
|
|
|
adap_pad = AdaptivePadding(
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
dilation=dilation,
|
|
padding=padding)
|
|
input = torch.rand(1, 1, 11, 13)
|
|
out = adap_pad(input)
|
|
# padding to divisible by 2
|
|
assert (out.shape[2], out.shape[3]) == (12, 14)
|
|
|
|
kernel_size = (2, 2)
|
|
stride = (10, 10)
|
|
dilation = (1, 1)
|
|
|
|
adap_pad = AdaptivePadding(
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
dilation=dilation,
|
|
padding=padding)
|
|
input = torch.rand(1, 1, 10, 13)
|
|
out = adap_pad(input)
|
|
# no padding
|
|
assert (out.shape[2], out.shape[3]) == (10, 13)
|
|
|
|
kernel_size = (11, 11)
|
|
adap_pad = AdaptivePadding(
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
dilation=dilation,
|
|
padding=padding)
|
|
input = torch.rand(1, 1, 11, 13)
|
|
out = adap_pad(input)
|
|
# all padding
|
|
assert (out.shape[2], out.shape[3]) == (21, 21)
|
|
|
|
# test padding as kernel is (7,9)
|
|
input = torch.rand(1, 1, 11, 13)
|
|
stride = (3, 4)
|
|
kernel_size = (4, 5)
|
|
dilation = (2, 2)
|
|
# actually (7, 9)
|
|
adap_pad = AdaptivePadding(
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
dilation=dilation,
|
|
padding=padding)
|
|
dilation_out = adap_pad(input)
|
|
assert (dilation_out.shape[2], dilation_out.shape[3]) == (16, 21)
|
|
kernel_size = (7, 9)
|
|
dilation = (1, 1)
|
|
adap_pad = AdaptivePadding(
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
dilation=dilation,
|
|
padding=padding)
|
|
kernel79_out = adap_pad(input)
|
|
assert (kernel79_out.shape[2], kernel79_out.shape[3]) == (16, 21)
|
|
assert kernel79_out.shape == dilation_out.shape
|
|
|
|
# assert only support "same" "corner"
|
|
with pytest.raises(AssertionError):
|
|
AdaptivePadding(
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
dilation=dilation,
|
|
padding=1)
|
|
|
|
|
|
def test_patch_embed():
|
|
B = 2
|
|
H = 3
|
|
W = 4
|
|
C = 3
|
|
embed_dims = 10
|
|
kernel_size = 3
|
|
stride = 1
|
|
dummy_input = torch.rand(B, C, H, W)
|
|
patch_merge_1 = PatchEmbed(
|
|
in_channels=C,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=0,
|
|
dilation=1,
|
|
norm_cfg=None)
|
|
|
|
x1, shape = patch_merge_1(dummy_input)
|
|
# test out shape
|
|
assert x1.shape == (2, 2, 10)
|
|
# test outsize is correct
|
|
assert shape == (1, 2)
|
|
# test L = out_h * out_w
|
|
assert shape[0] * shape[1] == x1.shape[1]
|
|
|
|
B = 2
|
|
H = 10
|
|
W = 10
|
|
C = 3
|
|
embed_dims = 10
|
|
kernel_size = 5
|
|
stride = 2
|
|
dummy_input = torch.rand(B, C, H, W)
|
|
# test dilation
|
|
patch_merge_2 = PatchEmbed(
|
|
in_channels=C,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=0,
|
|
dilation=2,
|
|
norm_cfg=None,
|
|
)
|
|
|
|
x2, shape = patch_merge_2(dummy_input)
|
|
# test out shape
|
|
assert x2.shape == (2, 1, 10)
|
|
# test outsize is correct
|
|
assert shape == (1, 1)
|
|
# test L = out_h * out_w
|
|
assert shape[0] * shape[1] == x2.shape[1]
|
|
|
|
stride = 2
|
|
input_size = (10, 10)
|
|
|
|
dummy_input = torch.rand(B, C, H, W)
|
|
# test stride and norm
|
|
patch_merge_3 = PatchEmbed(
|
|
in_channels=C,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=0,
|
|
dilation=2,
|
|
norm_cfg=dict(type='LN'),
|
|
input_size=input_size)
|
|
|
|
x3, shape = patch_merge_3(dummy_input)
|
|
# test out shape
|
|
assert x3.shape == (2, 1, 10)
|
|
# test outsize is correct
|
|
assert shape == (1, 1)
|
|
# test L = out_h * out_w
|
|
assert shape[0] * shape[1] == x3.shape[1]
|
|
|
|
# test the init_out_size with nn.Unfold
|
|
assert patch_merge_3.init_out_size[1] == (input_size[0] - 2 * 4 -
|
|
1) // 2 + 1
|
|
assert patch_merge_3.init_out_size[0] == (input_size[0] - 2 * 4 -
|
|
1) // 2 + 1
|
|
H = 11
|
|
W = 12
|
|
input_size = (H, W)
|
|
dummy_input = torch.rand(B, C, H, W)
|
|
# test stride and norm
|
|
patch_merge_3 = PatchEmbed(
|
|
in_channels=C,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=0,
|
|
dilation=2,
|
|
norm_cfg=dict(type='LN'),
|
|
input_size=input_size)
|
|
|
|
_, shape = patch_merge_3(dummy_input)
|
|
# when input_size equal to real input
|
|
# the out_size should be equal to `init_out_size`
|
|
assert shape == patch_merge_3.init_out_size
|
|
|
|
input_size = (H, W)
|
|
dummy_input = torch.rand(B, C, H, W)
|
|
# test stride and norm
|
|
patch_merge_3 = PatchEmbed(
|
|
in_channels=C,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=0,
|
|
dilation=2,
|
|
norm_cfg=dict(type='LN'),
|
|
input_size=input_size)
|
|
|
|
_, shape = patch_merge_3(dummy_input)
|
|
# when input_size equal to real input
|
|
# the out_size should be equal to `init_out_size`
|
|
assert shape == patch_merge_3.init_out_size
|
|
|
|
# test adap padding
|
|
for padding in ('same', 'corner'):
|
|
in_c = 2
|
|
embed_dims = 3
|
|
B = 2
|
|
|
|
# test stride is 1
|
|
input_size = (5, 5)
|
|
kernel_size = (5, 5)
|
|
stride = (1, 1)
|
|
dilation = 1
|
|
bias = False
|
|
|
|
x = torch.rand(B, in_c, *input_size)
|
|
patch_embed = PatchEmbed(
|
|
in_channels=in_c,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
|
|
x_out, out_size = patch_embed(x)
|
|
assert x_out.size() == (B, 25, 3)
|
|
assert out_size == (5, 5)
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
# test kernel_size == stride
|
|
input_size = (5, 5)
|
|
kernel_size = (5, 5)
|
|
stride = (5, 5)
|
|
dilation = 1
|
|
bias = False
|
|
|
|
x = torch.rand(B, in_c, *input_size)
|
|
patch_embed = PatchEmbed(
|
|
in_channels=in_c,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
|
|
x_out, out_size = patch_embed(x)
|
|
assert x_out.size() == (B, 1, 3)
|
|
assert out_size == (1, 1)
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
# test kernel_size == stride
|
|
input_size = (6, 5)
|
|
kernel_size = (5, 5)
|
|
stride = (5, 5)
|
|
dilation = 1
|
|
bias = False
|
|
|
|
x = torch.rand(B, in_c, *input_size)
|
|
patch_embed = PatchEmbed(
|
|
in_channels=in_c,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
|
|
x_out, out_size = patch_embed(x)
|
|
assert x_out.size() == (B, 2, 3)
|
|
assert out_size == (2, 1)
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
# test different kernel_size with different stride
|
|
input_size = (6, 5)
|
|
kernel_size = (6, 2)
|
|
stride = (6, 2)
|
|
dilation = 1
|
|
bias = False
|
|
|
|
x = torch.rand(B, in_c, *input_size)
|
|
patch_embed = PatchEmbed(
|
|
in_channels=in_c,
|
|
embed_dims=embed_dims,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
|
|
x_out, out_size = patch_embed(x)
|
|
assert x_out.size() == (B, 3, 3)
|
|
assert out_size == (1, 3)
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
|
|
def test_patch_merging():
|
|
|
|
# Test the model with int padding
|
|
in_c = 3
|
|
out_c = 4
|
|
kernel_size = 3
|
|
stride = 3
|
|
padding = 1
|
|
dilation = 1
|
|
bias = False
|
|
# test the case `pad_to_stride` is False
|
|
patch_merge = PatchMerging(
|
|
in_channels=in_c,
|
|
out_channels=out_c,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
B, L, C = 1, 100, 3
|
|
input_size = (10, 10)
|
|
x = torch.rand(B, L, C)
|
|
x_out, out_size = patch_merge(x, input_size)
|
|
assert x_out.size() == (1, 16, 4)
|
|
assert out_size == (4, 4)
|
|
# assert out size is consistent with real output
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
in_c = 4
|
|
out_c = 5
|
|
kernel_size = 6
|
|
stride = 3
|
|
padding = 2
|
|
dilation = 2
|
|
bias = False
|
|
patch_merge = PatchMerging(
|
|
in_channels=in_c,
|
|
out_channels=out_c,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
B, L, C = 1, 100, 4
|
|
input_size = (10, 10)
|
|
x = torch.rand(B, L, C)
|
|
x_out, out_size = patch_merge(x, input_size)
|
|
assert x_out.size() == (1, 4, 5)
|
|
assert out_size == (2, 2)
|
|
# assert out size is consistent with real output
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
# Test with adaptive padding
|
|
for padding in ('same', 'corner'):
|
|
in_c = 2
|
|
out_c = 3
|
|
B = 2
|
|
|
|
# test stride is 1
|
|
input_size = (5, 5)
|
|
kernel_size = (5, 5)
|
|
stride = (1, 1)
|
|
dilation = 1
|
|
bias = False
|
|
L = input_size[0] * input_size[1]
|
|
|
|
x = torch.rand(B, L, in_c)
|
|
patch_merge = PatchMerging(
|
|
in_channels=in_c,
|
|
out_channels=out_c,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
|
|
x_out, out_size = patch_merge(x, input_size)
|
|
assert x_out.size() == (B, 25, 3)
|
|
assert out_size == (5, 5)
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
# test kernel_size == stride
|
|
input_size = (5, 5)
|
|
kernel_size = (5, 5)
|
|
stride = (5, 5)
|
|
dilation = 1
|
|
bias = False
|
|
L = input_size[0] * input_size[1]
|
|
|
|
x = torch.rand(B, L, in_c)
|
|
patch_merge = PatchMerging(
|
|
in_channels=in_c,
|
|
out_channels=out_c,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
|
|
x_out, out_size = patch_merge(x, input_size)
|
|
assert x_out.size() == (B, 1, 3)
|
|
assert out_size == (1, 1)
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
# test kernel_size == stride
|
|
input_size = (6, 5)
|
|
kernel_size = (5, 5)
|
|
stride = (5, 5)
|
|
dilation = 1
|
|
bias = False
|
|
L = input_size[0] * input_size[1]
|
|
|
|
x = torch.rand(B, L, in_c)
|
|
patch_merge = PatchMerging(
|
|
in_channels=in_c,
|
|
out_channels=out_c,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
|
|
x_out, out_size = patch_merge(x, input_size)
|
|
assert x_out.size() == (B, 2, 3)
|
|
assert out_size == (2, 1)
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
# test different kernel_size with different stride
|
|
input_size = (6, 5)
|
|
kernel_size = (6, 2)
|
|
stride = (6, 2)
|
|
dilation = 1
|
|
bias = False
|
|
L = input_size[0] * input_size[1]
|
|
|
|
x = torch.rand(B, L, in_c)
|
|
patch_merge = PatchMerging(
|
|
in_channels=in_c,
|
|
out_channels=out_c,
|
|
kernel_size=kernel_size,
|
|
stride=stride,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
bias=bias)
|
|
|
|
x_out, out_size = patch_merge(x, input_size)
|
|
assert x_out.size() == (B, 3, 3)
|
|
assert out_size == (1, 3)
|
|
assert x_out.size(1) == out_size[0] * out_size[1]
|
|
|
|
|
|
def test_multiheadattention():
|
|
MultiheadAttention(
|
|
embed_dims=5,
|
|
num_heads=5,
|
|
attn_drop=0,
|
|
proj_drop=0,
|
|
dropout_layer=dict(type='Dropout', drop_prob=0.),
|
|
batch_first=True)
|
|
batch_dim = 2
|
|
embed_dim = 5
|
|
num_query = 100
|
|
attn_batch_first = MultiheadAttention(
|
|
embed_dims=5,
|
|
num_heads=5,
|
|
attn_drop=0,
|
|
proj_drop=0,
|
|
dropout_layer=dict(type='DropPath', drop_prob=0.),
|
|
batch_first=True)
|
|
|
|
attn_query_first = MultiheadAttention(
|
|
embed_dims=5,
|
|
num_heads=5,
|
|
attn_drop=0,
|
|
proj_drop=0,
|
|
dropout_layer=dict(type='DropPath', drop_prob=0.),
|
|
batch_first=False)
|
|
|
|
param_dict = dict(attn_query_first.named_parameters())
|
|
for n, v in attn_batch_first.named_parameters():
|
|
param_dict[n].data = v.data
|
|
|
|
input_batch_first = torch.rand(batch_dim, num_query, embed_dim)
|
|
input_query_first = input_batch_first.transpose(0, 1)
|
|
|
|
assert torch.allclose(
|
|
attn_query_first(input_query_first).sum(),
|
|
attn_batch_first(input_batch_first).sum())
|
|
|
|
key_batch_first = torch.rand(batch_dim, num_query, embed_dim)
|
|
key_query_first = key_batch_first.transpose(0, 1)
|
|
|
|
assert torch.allclose(
|
|
attn_query_first(input_query_first, key_query_first).sum(),
|
|
attn_batch_first(input_batch_first, key_batch_first).sum())
|
|
|
|
identity = torch.ones_like(input_query_first)
|
|
|
|
# check deprecated arguments can be used normally
|
|
|
|
assert torch.allclose(
|
|
attn_query_first(
|
|
input_query_first, key_query_first, residual=identity).sum(),
|
|
attn_batch_first(input_batch_first, key_batch_first).sum() +
|
|
identity.sum() - input_batch_first.sum())
|
|
|
|
assert torch.allclose(
|
|
attn_query_first(
|
|
input_query_first, key_query_first, identity=identity).sum(),
|
|
attn_batch_first(input_batch_first, key_batch_first).sum() +
|
|
identity.sum() - input_batch_first.sum())
|
|
|
|
attn_query_first(
|
|
input_query_first, key_query_first, identity=identity).sum(),
|
|
|
|
|
|
def test_ffn():
|
|
with pytest.raises(AssertionError):
|
|
# num_fcs should be no less than 2
|
|
FFN(num_fcs=1)
|
|
FFN(dropout=0, add_residual=True)
|
|
ffn = FFN(dropout=0, add_identity=True)
|
|
|
|
input_tensor = torch.rand(2, 20, 256)
|
|
input_tensor_nbc = input_tensor.transpose(0, 1)
|
|
assert torch.allclose(ffn(input_tensor).sum(), ffn(input_tensor_nbc).sum())
|
|
residual = torch.rand_like(input_tensor)
|
|
torch.allclose(
|
|
ffn(input_tensor, residual=residual).sum(),
|
|
ffn(input_tensor).sum() + residual.sum() - input_tensor.sum())
|
|
|
|
torch.allclose(
|
|
ffn(input_tensor, identity=residual).sum(),
|
|
ffn(input_tensor).sum() + residual.sum() - input_tensor.sum())
|
|
|
|
|
|
@pytest.mark.skipif(not torch.cuda.is_available(), reason='Cuda not available')
|
|
def test_basetransformerlayer_cuda():
|
|
# To test if the BaseTransformerLayer's behaviour remains
|
|
# consistent after being deepcopied
|
|
operation_order = ('self_attn', 'ffn')
|
|
baselayer = BaseTransformerLayer(
|
|
operation_order=operation_order,
|
|
batch_first=True,
|
|
attn_cfgs=dict(
|
|
type='MultiheadAttention',
|
|
embed_dims=256,
|
|
num_heads=8,
|
|
),
|
|
)
|
|
baselayers = ModuleList([copy.deepcopy(baselayer) for _ in range(2)])
|
|
baselayers.to('cuda')
|
|
x = torch.rand(2, 10, 256).cuda()
|
|
for m in baselayers:
|
|
x = m(x)
|
|
assert x.shape == torch.Size([2, 10, 256])
|
|
|
|
|
|
@pytest.mark.parametrize('embed_dims', [False, 256])
|
|
def test_basetransformerlayer(embed_dims):
|
|
attn_cfgs = dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
|
|
if embed_dims:
|
|
ffn_cfgs = dict(
|
|
type='FFN',
|
|
embed_dims=embed_dims,
|
|
feedforward_channels=1024,
|
|
num_fcs=2,
|
|
ffn_drop=0.,
|
|
act_cfg=dict(type='ReLU', inplace=True),
|
|
)
|
|
else:
|
|
ffn_cfgs = dict(
|
|
type='FFN',
|
|
feedforward_channels=1024,
|
|
num_fcs=2,
|
|
ffn_drop=0.,
|
|
act_cfg=dict(type='ReLU', inplace=True),
|
|
)
|
|
|
|
feedforward_channels = 2048
|
|
ffn_dropout = 0.1
|
|
operation_order = ('self_attn', 'norm', 'ffn', 'norm')
|
|
|
|
# test deprecated_args
|
|
baselayer = BaseTransformerLayer(
|
|
attn_cfgs=attn_cfgs,
|
|
ffn_cfgs=ffn_cfgs,
|
|
feedforward_channels=feedforward_channels,
|
|
ffn_dropout=ffn_dropout,
|
|
operation_order=operation_order)
|
|
assert baselayer.batch_first is False
|
|
assert baselayer.ffns[0].feedforward_channels == feedforward_channels
|
|
|
|
attn_cfgs = dict(type='MultiheadAttention', num_heads=8, embed_dims=256),
|
|
feedforward_channels = 2048
|
|
ffn_dropout = 0.1
|
|
operation_order = ('self_attn', 'norm', 'ffn', 'norm')
|
|
baselayer = BaseTransformerLayer(
|
|
attn_cfgs=attn_cfgs,
|
|
feedforward_channels=feedforward_channels,
|
|
ffn_dropout=ffn_dropout,
|
|
operation_order=operation_order,
|
|
batch_first=True)
|
|
assert baselayer.attentions[0].batch_first
|
|
in_tensor = torch.rand(2, 10, 256)
|
|
baselayer(in_tensor)
|
|
|
|
|
|
def test_transformerlayersequence():
|
|
squeue = TransformerLayerSequence(
|
|
num_layers=6,
|
|
transformerlayers=dict(
|
|
type='BaseTransformerLayer',
|
|
attn_cfgs=[
|
|
dict(
|
|
type='MultiheadAttention',
|
|
embed_dims=256,
|
|
num_heads=8,
|
|
dropout=0.1),
|
|
dict(type='MultiheadAttention', embed_dims=256, num_heads=4)
|
|
],
|
|
feedforward_channels=1024,
|
|
ffn_dropout=0.1,
|
|
operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
|
|
'norm')))
|
|
assert len(squeue.layers) == 6
|
|
assert squeue.pre_norm is False
|
|
with pytest.raises(AssertionError):
|
|
# if transformerlayers is a list, len(transformerlayers)
|
|
# should be equal to num_layers
|
|
TransformerLayerSequence(
|
|
num_layers=6,
|
|
transformerlayers=[
|
|
dict(
|
|
type='BaseTransformerLayer',
|
|
attn_cfgs=[
|
|
dict(
|
|
type='MultiheadAttention',
|
|
embed_dims=256,
|
|
num_heads=8,
|
|
dropout=0.1),
|
|
dict(type='MultiheadAttention', embed_dims=256)
|
|
],
|
|
feedforward_channels=1024,
|
|
ffn_dropout=0.1,
|
|
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
|
|
'ffn', 'norm'))
|
|
])
|
|
|
|
|
|
def test_drop_path():
|
|
drop_path = DropPath(drop_prob=0)
|
|
test_in = torch.rand(2, 3, 4, 5)
|
|
assert test_in is drop_path(test_in)
|
|
|
|
drop_path = DropPath(drop_prob=0.1)
|
|
drop_path.training = False
|
|
test_in = torch.rand(2, 3, 4, 5)
|
|
assert test_in is drop_path(test_in)
|
|
drop_path.training = True
|
|
assert test_in is not drop_path(test_in)
|