# mmcv/tests/test_ops/test_ms_deformable_attn.py

import pytest
import torch

from mmcv.ops.multi_scale_deform_attn import (
    MultiScaleDeformableAttention, MultiScaleDeformableAttnFunction,
    multi_scale_deformable_attn_pytorch)

# Select the gradcheck implementation: use the one from parrots when that
# package is importable, otherwise fall back to torch.autograd.
# _USING_PARROTS records which one was picked so that tests can call
# gradcheck with the matching keyword arguments.
_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck
    _USING_PARROTS = False


def test_forward_multi_scale_deformable_attn_pytorch():
    """Run the pure-PyTorch multi-scale deformable attention on CPU.

    Builds small random double-precision inputs for two feature levels
    (6x4 and 3x2) and checks that the forward pass yields a finite output
    of the expected shape, instead of only checking that it does not raise.
    """
    N, M, D = 1, 2, 2
    Lq, L, P = 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
    # Total number of value positions across all levels (sum of H * W).
    S = sum([(H * W).item() for H, W in shapes])

    torch.manual_seed(3)
    value = torch.rand(N, S, M, D) * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2)
    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
    # Normalise the weights so they sum to one over the last two axes
    # (the L and P axes of the tensor created above).
    attention_weights /= attention_weights.sum(
        -1, keepdim=True).sum(
            -2, keepdim=True)

    output = multi_scale_deformable_attn_pytorch(
        value.double(), shapes, sampling_locations.double(),
        attention_weights.double()).detach()

    # The result used to be discarded, so the test could only fail on an
    # exception. Per the mmcv documentation the op returns a tensor of shape
    # (bs, num_queries, embed_dims), i.e. (N, Lq, M * D) here.
    assert output.shape == (N, Lq, M * D)
    assert torch.isfinite(output).all()


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_forward_equal_with_pytorch_double():
    """Compare MultiScaleDeformableAttnFunction with the PyTorch version.

    All floating-point inputs are cast to double before both
    implementations are called; the outputs must agree within very tight
    absolute and relative tolerances.
    """
    N, M, D = 1, 2, 2
    Lq, L, P = 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
    # Start offset of every level: a leading zero followed by the running
    # sum of the per-level H * W products, without the final total.
    start_zero = shapes.new_zeros((1, ))
    running_sizes = shapes.prod(1).cumsum(0)
    level_start_index = torch.cat((start_zero, running_sizes[:-1]))
    S = sum([(H * W).item() for H, W in shapes])

    # Random tensors are drawn on the CPU in a fixed order after seeding,
    # then moved to the GPU.
    torch.manual_seed(3)
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    weight_total = attention_weights.sum(
        -1, keepdim=True).sum(
            -2, keepdim=True)
    attention_weights /= weight_total
    im2col_step = 2

    value_d = value.double()
    locations_d = sampling_locations.double()
    weights_d = attention_weights.double()

    output_pytorch = multi_scale_deformable_attn_pytorch(
        value_d, shapes, locations_d, weights_d).detach().cpu()
    output_cuda = MultiScaleDeformableAttnFunction.apply(
        value_d, shapes, level_start_index, locations_d, weights_d,
        im2col_step).detach().cpu()

    assert torch.allclose(output_cuda, output_pytorch)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()
    assert max_abs_err < 1e-18
    assert max_rel_err < 1e-15


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_forward_equal_with_pytorch_float():
    """Compare MultiScaleDeformableAttnFunction with the PyTorch version.

    Same set-up as the double-precision test, but the tensors are passed
    without casting (the dtype produced by ``torch.rand``) and the
    tolerances are correspondingly looser.
    """
    N, M, D = 1, 2, 2
    Lq, L, P = 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
    # Start offset of every level: a leading zero followed by the running
    # sum of the per-level H * W products, without the final total.
    start_zero = shapes.new_zeros((1, ))
    running_sizes = shapes.prod(1).cumsum(0)
    level_start_index = torch.cat((start_zero, running_sizes[:-1]))
    S = sum([(H * W).item() for H, W in shapes])

    # Random tensors are drawn on the CPU in a fixed order after seeding,
    # then moved to the GPU.
    torch.manual_seed(3)
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    weight_total = attention_weights.sum(
        -1, keepdim=True).sum(
            -2, keepdim=True)
    attention_weights /= weight_total
    im2col_step = 2

    reference = multi_scale_deformable_attn_pytorch(
        value, shapes, sampling_locations, attention_weights)
    output_pytorch = reference.detach().cpu()

    result = MultiScaleDeformableAttnFunction.apply(
        value, shapes, level_start_index, sampling_locations,
        attention_weights, im2col_step)
    output_cuda = result.detach().cpu()

    assert torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
    abs_diff = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_diff.max()
    max_rel_err = (abs_diff / output_pytorch.abs()).max()
    assert max_abs_err < 1e-9
    assert max_rel_err < 1e-6


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
@pytest.mark.parametrize('channels', [
    4,
    30,
    32,
    64,
    71,
    1025,
])
def test_gradient_numerical(channels,
                            grad_value=True,
                            grad_sampling_loc=True,
                            grad_attn_weight=True):
    """Run gradcheck on MultiScaleDeformableAttnFunction.

    Args:
        channels (int): Size of the last dimension of ``value``.
        grad_value (bool): Whether ``value`` requires grad.
        grad_sampling_loc (bool): Whether ``sampling_locations`` requires
            grad.
        grad_attn_weight (bool): Whether ``attention_weights`` requires
            grad.
    """
    N, M = 1, 2
    Lq, L, P = 2, 2, 2
    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
    # Start offset of every level: a leading zero followed by the running
    # sum of the per-level H * W products, without the final total.
    start_zero = shapes.new_zeros((1, ))
    running_sizes = shapes.prod(1).cumsum(0)
    level_start_index = torch.cat((start_zero, running_sizes[:-1]))
    S = sum([(H * W).item() for H, W in shapes])

    value = torch.rand(N, S, M, channels).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    weight_total = attention_weights.sum(
        -1, keepdim=True).sum(
            -2, keepdim=True)
    attention_weights /= weight_total
    im2col_step = 2

    # The grad flags are set on the original tensors before they are cast
    # to double for gradcheck.
    value.requires_grad_(grad_value)
    sampling_locations.requires_grad_(grad_sampling_loc)
    attention_weights.requires_grad_(grad_attn_weight)

    inputs = (value.double(), shapes, level_start_index,
              sampling_locations.double(), attention_weights.double(),
              im2col_step)
    # Only the parrots gradcheck is given the list of inputs to exclude
    # from gradient checking.
    extra_kwargs = {}
    if _USING_PARROTS:
        extra_kwargs['no_grads'] = [shapes, level_start_index]

    assert gradcheck(MultiScaleDeformableAttnFunction.apply, inputs,
                     **extra_kwargs)


def test_multiscale_deformable_attention():
    """Check argument validation in MultiScaleDeformableAttention.

    The original test repeated the same ``pytest.raises`` block twice with
    identical arguments; the redundant copy is removed.
    """
    # embed_dims must be divisible by num_heads, so 256 with 7 heads is
    # expected to be rejected.
    with pytest.raises(ValueError):
        MultiScaleDeformableAttention(
            embed_dims=256,
            num_heads=7,
        )

    # A divisible combination must construct without raising.
    MultiScaleDeformableAttention(embed_dims=256, num_heads=8)
[Feature]: support Multi-Scale-DeformAttention in deformable-detr (#878) * add c++ ms_deform_atten * fix cpp lint * fix cpp lint * clang format * remove cmakefile * google style * clang-format precommit * use clang-format-lint-action * add transformer base class * add merge * add docstr * add pyargs * fix according to commments * resiger module * change to use basemodule * add _ between build function * split the name * fix according to comments * fix lint and fix unitest * fix cpp lint * fix bug of deformdetr_atten * fix drop out * fix residual * use CUDA_1D_KERNEL_LOOP 2021-04-23 16:35:15 +08:00			`import pytest`
			`import torch`

			`from mmcv.ops.multi_scale_deform_attn import (`
Refactor the baseclass related to transformer (#978) * minor changes * change to modulist * change to Sequential * replace dropout with attn_drop and proj_drop in MultiheadAttention * add operation_name for attn * add drop path and move all ffn args to ffncfgs * fix typo * fix a bug when use default value of ffn_cfgs * fix ffns * add deprecate warning * fix deprecate warning * change to pop kwargs * support register FFN of transformer * support batch first * fix batch first wapper * fix forward wapper * fix typo * fix lint * add unitest for transformer * fix unitest * fix equal * use allclose * fix comments * fix comments * change configdict to dict * move drop to a file * add comments for drop path * add noqa 501 * move bnc wapper to MultiheadAttention * move bnc wapper to MultiheadAttention * use dep warning * resolve comments * add unitest: * rename residual to identity * revert runner * msda residual to identity * rename inp_identity to identity * fix name * fix transformer * remove key in msda * remove assert for key Co-authored-by: HIT-cwh <2892770585@qq.com> Co-authored-by: bkhuang <congee524@gmail.com> Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> 2021-06-11 18:09:31 +08:00			`MultiScaleDeformableAttention, MultiScaleDeformableAttnFunction,`
			`multi_scale_deformable_attn_pytorch)`
[Feature]: support Multi-Scale-DeformAttention in deformable-detr (#878) * add c++ ms_deform_atten * fix cpp lint * fix cpp lint * clang format * remove cmakefile * google style * clang-format precommit * use clang-format-lint-action * add transformer base class * add merge * add docstr * add pyargs * fix according to commments * resiger module * change to use basemodule * add _ between build function * split the name * fix according to comments * fix lint and fix unitest * fix cpp lint * fix bug of deformdetr_atten * fix drop out * fix residual * use CUDA_1D_KERNEL_LOOP 2021-04-23 16:35:15 +08:00
Add ms_deformable_attn in parrots (#1042) 2021-05-25 13:13:05 +08:00			`_USING_PARROTS = True`
			`try:`
			`from parrots.autograd import gradcheck`
			`except ImportError:`
			`from torch.autograd import gradcheck`
			`_USING_PARROTS = False`

[Feature]: support Multi-Scale-DeformAttention in deformable-detr (#878) * add c++ ms_deform_atten * fix cpp lint * fix cpp lint * clang format * remove cmakefile * google style * clang-format precommit * use clang-format-lint-action * add transformer base class * add merge * add docstr * add pyargs * fix according to commments * resiger module * change to use basemodule * add _ between build function * split the name * fix according to comments * fix lint and fix unitest * fix cpp lint * fix bug of deformdetr_atten * fix drop out * fix residual * use CUDA_1D_KERNEL_LOOP 2021-04-23 16:35:15 +08:00
			`def test_forward_multi_scale_deformable_attn_pytorch():`
			`N, M, D = 1, 2, 2`
			`Lq, L, P = 2, 2, 2`
			`shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)`
			`S = sum([(H * W).item() for H, W in shapes])`

			`torch.manual_seed(3)`
			`value = torch.rand(N, S, M, D) * 0.01`
			`sampling_locations = torch.rand(N, Lq, M, L, P, 2)`
			`attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5`
			`attention_weights /= attention_weights.sum(`
			`-1, keepdim=True).sum(`
			`-2, keepdim=True)`

			`multi_scale_deformable_attn_pytorch(value.double(), shapes,`
			`sampling_locations.double(),`
			`attention_weights.double()).detach()`


			`@pytest.mark.skipif(`
			`not torch.cuda.is_available(), reason='requires CUDA support')`
			`def test_forward_equal_with_pytorch_double():`
			`N, M, D = 1, 2, 2`
			`Lq, L, P = 2, 2, 2`
			`shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()`
			`level_start_index = torch.cat((shapes.new_zeros(`
			`(1, )), shapes.prod(1).cumsum(0)[:-1]))`
			`S = sum([(H * W).item() for H, W in shapes])`

			`torch.manual_seed(3)`
			`value = torch.rand(N, S, M, D).cuda() * 0.01`
			`sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()`
			`attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5`
			`attention_weights /= attention_weights.sum(`
			`-1, keepdim=True).sum(`
			`-2, keepdim=True)`
			`im2col_step = 2`
			`output_pytorch = multi_scale_deformable_attn_pytorch(`
			`value.double(), shapes, sampling_locations.double(),`
			`attention_weights.double()).detach().cpu()`

			`output_cuda = MultiScaleDeformableAttnFunction.apply(`
			`value.double(), shapes, level_start_index, sampling_locations.double(),`
			`attention_weights.double(), im2col_step).detach().cpu()`
			`assert torch.allclose(output_cuda, output_pytorch)`
			`max_abs_err = (output_cuda - output_pytorch).abs().max()`
			`max_rel_err = ((output_cuda - output_pytorch).abs() /`
			`output_pytorch.abs()).max()`
			`assert max_abs_err < 1e-18`
			`assert max_rel_err < 1e-15`


			`@pytest.mark.skipif(`
			`not torch.cuda.is_available(), reason='requires CUDA support')`
			`def test_forward_equal_with_pytorch_float():`
			`N, M, D = 1, 2, 2`
			`Lq, L, P = 2, 2, 2`
			`shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()`
			`level_start_index = torch.cat((shapes.new_zeros(`
			`(1, )), shapes.prod(1).cumsum(0)[:-1]))`
			`S = sum([(H * W).item() for H, W in shapes])`

			`torch.manual_seed(3)`
			`value = torch.rand(N, S, M, D).cuda() * 0.01`
			`sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()`
			`attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5`
			`attention_weights /= attention_weights.sum(`
			`-1, keepdim=True).sum(`
			`-2, keepdim=True)`
			`im2col_step = 2`
			`output_pytorch = multi_scale_deformable_attn_pytorch(`
			`value, shapes, sampling_locations, attention_weights).detach().cpu()`

			`output_cuda = MultiScaleDeformableAttnFunction.apply(`
			`value, shapes, level_start_index, sampling_locations,`
			`attention_weights, im2col_step).detach().cpu()`
			`assert torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)`
			`max_abs_err = (output_cuda - output_pytorch).abs().max()`
			`max_rel_err = ((output_cuda - output_pytorch).abs() /`
			`output_pytorch.abs()).max()`
			`assert max_abs_err < 1e-9`
			`assert max_rel_err < 1e-6`


			`@pytest.mark.skipif(`
			`not torch.cuda.is_available(), reason='requires CUDA support')`
Refactor the baseclass related to transformer (#978) * minor changes * change to modulist * change to Sequential * replace dropout with attn_drop and proj_drop in MultiheadAttention * add operation_name for attn * add drop path and move all ffn args to ffncfgs * fix typo * fix a bug when use default value of ffn_cfgs * fix ffns * add deprecate warning * fix deprecate warning * change to pop kwargs * support register FFN of transformer * support batch first * fix batch first wapper * fix forward wapper * fix typo * fix lint * add unitest for transformer * fix unitest * fix equal * use allclose * fix comments * fix comments * change configdict to dict * move drop to a file * add comments for drop path * add noqa 501 * move bnc wapper to MultiheadAttention * move bnc wapper to MultiheadAttention * use dep warning * resolve comments * add unitest: * rename residual to identity * revert runner * msda residual to identity * rename inp_identity to identity * fix name * fix transformer * remove key in msda * remove assert for key Co-authored-by: HIT-cwh <2892770585@qq.com> Co-authored-by: bkhuang <congee524@gmail.com> Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> 2021-06-11 18:09:31 +08:00			`@pytest.mark.parametrize('channels', [`
			`4,`
			`30,`
			`32,`
			`64,`
			`71,`
			`1025,`
			`])`
[Feature]: support Multi-Scale-DeformAttention in deformable-detr (#878) * add c++ ms_deform_atten * fix cpp lint * fix cpp lint * clang format * remove cmakefile * google style * clang-format precommit * use clang-format-lint-action * add transformer base class * add merge * add docstr * add pyargs * fix according to commments * resiger module * change to use basemodule * add _ between build function * split the name * fix according to comments * fix lint and fix unitest * fix cpp lint * fix bug of deformdetr_atten * fix drop out * fix residual * use CUDA_1D_KERNEL_LOOP 2021-04-23 16:35:15 +08:00			`def test_gradient_numerical(channels,`
			`grad_value=True,`
			`grad_sampling_loc=True,`
			`grad_attn_weight=True):`

			`N, M, _ = 1, 2, 2`
			`Lq, L, P = 2, 2, 2`
			`shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()`
			`level_start_index = torch.cat((shapes.new_zeros(`
			`(1, )), shapes.prod(1).cumsum(0)[:-1]))`
			`S = sum([(H * W).item() for H, W in shapes])`

			`value = torch.rand(N, S, M, channels).cuda() * 0.01`
			`sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()`
			`attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5`
			`attention_weights /= attention_weights.sum(`
			`-1, keepdim=True).sum(`
			`-2, keepdim=True)`
			`im2col_step = 2`

			`func = MultiScaleDeformableAttnFunction.apply`

			`value.requires_grad = grad_value`
			`sampling_locations.requires_grad = grad_sampling_loc`
			`attention_weights.requires_grad = grad_attn_weight`
Add ms_deformable_attn in parrots (#1042) 2021-05-25 13:13:05 +08:00			`if _USING_PARROTS:`
			`assert gradcheck(`
			`func, (value.double(), shapes, level_start_index,`
			`sampling_locations.double(), attention_weights.double(),`
			`im2col_step),`
			`no_grads=[shapes, level_start_index])`
			`else:`
			`assert gradcheck(func, (value.double(), shapes, level_start_index,`
			`sampling_locations.double(),`
			`attention_weights.double(), im2col_step))`
Refactor the baseclass related to transformer (#978) * minor changes * change to modulist * change to Sequential * replace dropout with attn_drop and proj_drop in MultiheadAttention * add operation_name for attn * add drop path and move all ffn args to ffncfgs * fix typo * fix a bug when use default value of ffn_cfgs * fix ffns * add deprecate warning * fix deprecate warning * change to pop kwargs * support register FFN of transformer * support batch first * fix batch first wapper * fix forward wapper * fix typo * fix lint * add unitest for transformer * fix unitest * fix equal * use allclose * fix comments * fix comments * change configdict to dict * move drop to a file * add comments for drop path * add noqa 501 * move bnc wapper to MultiheadAttention * move bnc wapper to MultiheadAttention * use dep warning * resolve comments * add unitest: * rename residual to identity * revert runner * msda residual to identity * rename inp_identity to identity * fix name * fix transformer * remove key in msda * remove assert for key Co-authored-by: HIT-cwh <2892770585@qq.com> Co-authored-by: bkhuang <congee524@gmail.com> Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> 2021-06-11 18:09:31 +08:00

			`def test_multiscale_deformable_attention():`
			`with pytest.raises(ValueError):`
			`# embed_dims must be divisible by num_heads,`
			`MultiScaleDeformableAttention(`
			`embed_dims=256,`
			`num_heads=7,`
			`)`
			`with pytest.raises(ValueError):`
			`# embed_dims must be divisible by num_heads,`
			`MultiScaleDeformableAttention(`
			`embed_dims=256,`
			`num_heads=7,`
			`)`

			`MultiScaleDeformableAttention(embed_dims=256, num_heads=8)`