# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import List, Optional

import numpy as np
import torch
import torchvision
from packaging import version
from torch import Tensor
from torch.autograd import Function

if version.parse(torchvision.__version__) < version.parse('0.7'):
    from torchvision.ops import _new_empty_tensor
    from torchvision.ops.misc import _output_size


@torch.no_grad()
def accuracy(output, target, topk=(1, )):
    """Computes the precision@k for the specified values of k"""
    if target.numel() == 0:
        return [torch.zeros([], device=output.device)]
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
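

# Editor-added usage sketch (not part of the original file): `output` holds
# per-class logits and `target` holds integer class labels, so the top-1/top-5
# accuracies come back as scalar percentage tensors.
def _accuracy_usage_example():
    output = torch.randn(8, 10)           # 8 samples, 10 classes (logits)
    target = torch.randint(0, 10, (8, ))  # ground-truth class indices
    top1, top5 = accuracy(output, target, topk=(1, 5))
    return top1.item(), top5.item()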


def interpolate(input,
                size=None,
                scale_factor=None,
                mode='nearest',
                align_corners=None):
    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
    """
    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
    This will eventually be supported natively by PyTorch, and this
    function can go away.
    """
    if version.parse(torchvision.__version__) < version.parse('0.7'):
        if input.numel() > 0:
            return torch.nn.functional.interpolate(input, size, scale_factor,
                                                   mode, align_corners)

        output_shape = _output_size(2, input, size, scale_factor)
        output_shape = list(input.shape[:-2]) + list(output_shape)
        return _new_empty_tensor(input, output_shape)
    else:
        return torchvision.ops.misc.interpolate(input, size, scale_factor,
                                                mode, align_corners)
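

# Editor-added illustration (hypothetical shapes): on the torchvision versions
# this file targets, the wrapper returns a correctly shaped empty tensor for an
# empty batch instead of erroring, and behaves like the standard op otherwise.
def _interpolate_usage_example():
    feat = torch.randn(0, 256, 32, 32)  # empty batch of feature maps
    out = interpolate(feat, scale_factor=2.0, mode='nearest')
    return out.shape  # torch.Size([0, 256, 64, 64])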


def select_single_mlvl(mlvl_tensors, batch_id, detach=True):
    """Extract a multi-scale single image tensor from a multi-scale batch
    tensor based on batch index.

    Note: The default value of detach is True, because the proposal gradient
    needs to be detached during the training of two-stage models, e.g.
    Cascade Mask R-CNN.

    Args:
        mlvl_tensors (list[Tensor]): Batch tensor for all scale levels,
            each is a 4D-tensor.
        batch_id (int): Batch index.
        detach (bool): Whether to detach the gradient. Default: True.

    Returns:
        list[Tensor]: Multi-scale single image tensor.
    """
    assert isinstance(mlvl_tensors, (list, tuple))
    num_levels = len(mlvl_tensors)

    if detach:
        mlvl_tensor_list = [
            mlvl_tensors[i][batch_id].detach() for i in range(num_levels)
        ]
    else:
        mlvl_tensor_list = [
            mlvl_tensors[i][batch_id] for i in range(num_levels)
        ]
    return mlvl_tensor_list
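

# Editor-added usage sketch (hypothetical shapes): pick image 0 out of a
# batched FPN-style output with three scale levels.
def _select_single_mlvl_usage_example():
    mlvl_scores = [torch.randn(2, 80, s, s) for s in (64, 32, 16)]
    single_img_scores = select_single_mlvl(mlvl_scores, batch_id=0)
    return [t.shape for t in single_img_scores]  # three tensors of shape (80, s, s)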


def filter_scores_and_topk(scores, score_thr, topk, results=None):
    """Filter results using score threshold and topk candidates.

    Args:
        scores (Tensor): The scores, shape (num_bboxes, K).
        score_thr (float): The score filter threshold.
        topk (int): The number of topk candidates.
        results (dict or list or Tensor, Optional): The results to
            which the filtering rule is to be applied. The shape
            of each item is (num_bboxes, N).

    Returns:
        tuple: Filtered results

            - scores (Tensor): The scores after being filtered, \
                shape (num_bboxes_filtered, ).
            - labels (Tensor): The class labels, shape \
                (num_bboxes_filtered, ).
            - anchor_idxs (Tensor): The anchor indexes, shape \
                (num_bboxes_filtered, ).
            - filtered_results (dict or list or Tensor, Optional): \
                The filtered results. The shape of each item is \
                (num_bboxes_filtered, N).
    """
    valid_mask = scores > score_thr
    scores = scores[valid_mask]
    valid_idxs = torch.nonzero(valid_mask)

    num_topk = min(topk, valid_idxs.size(0))
    # torch.sort is actually faster than .topk (at least on GPUs)
    scores, idxs = scores.sort(descending=True)
    scores = scores[:num_topk]
    topk_idxs = valid_idxs[idxs[:num_topk]]
    keep_idxs, labels = topk_idxs.unbind(dim=1)

    filtered_results = None
    if results is not None:
        if isinstance(results, dict):
            filtered_results = {k: v[keep_idxs] for k, v in results.items()}
        elif isinstance(results, list):
            filtered_results = [result[keep_idxs] for result in results]
        elif isinstance(results, torch.Tensor):
            filtered_results = results[keep_idxs]
        else:
            raise NotImplementedError(
                f'Only supports dict or list or Tensor, '
                f'but got {type(results)}.')
    return scores, labels, keep_idxs, filtered_results
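

# Editor-added sketch (hypothetical numbers): keep at most 100 candidates whose
# per-class score exceeds 0.05, and filter the matching prior boxes in lockstep.
def _filter_scores_and_topk_usage_example():
    scores = torch.rand(1000, 80)  # (num_bboxes, num_classes)
    priors = torch.rand(1000, 4)   # extra per-bbox data to filter alongside
    scores, labels, keep_idxs, filtered = filter_scores_and_topk(
        scores, score_thr=0.05, topk=100, results=dict(priors=priors))
    return scores.shape, labels.shape, filtered['priors'].shape  # at most 100 rows each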


def inverse_sigmoid(x, eps=1e-3):
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)
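

# Editor-added note: `inverse_sigmoid` is the logit function with clamping, so
# for x in (eps, 1 - eps) it satisfies sigmoid(inverse_sigmoid(x)) == x up to
# floating-point error.
def _inverse_sigmoid_usage_example():
    x = torch.tensor([0.0, 0.25, 0.5, 0.999])
    recovered = torch.sigmoid(inverse_sigmoid(x))
    return torch.allclose(recovered, x.clamp(1e-3, 1 - 1e-3), atol=1e-4)  # True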


def gen_encoder_output_proposals(memory: Tensor,
                                 memory_padding_mask: Tensor,
                                 spatial_shapes: Tensor,
                                 learnedwh=None):
    """
    Input:
        - memory: bs, \sum{hw}, d_model
        - memory_padding_mask: bs, \sum{hw}
        - spatial_shapes: nlevel, 2
        - learnedwh: 2
    Output:
        - output_memory: bs, \sum{hw}, d_model
        - output_proposals: bs, \sum{hw}, 4
    """
    N_, S_, C_ = memory.shape
    base_scale = 4.0
    proposals = []
    _cur = 0
    for lvl, (H_, W_) in enumerate(spatial_shapes):
        mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(
            N_, H_, W_, 1)
        valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
        valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)

        grid_y, grid_x = torch.meshgrid(
            torch.linspace(
                0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
            torch.linspace(
                0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
        grid = torch.cat(
            [grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2

        scale = torch.cat([valid_W.unsqueeze(-1),
                           valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
        grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale

        if learnedwh is not None:
            wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl)
        else:
            wh = torch.ones_like(grid) * 0.05 * (2.0**lvl)

        # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1)
        # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
        # wh = torch.ones_like(grid) / scale
        proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
        proposals.append(proposal)
        _cur += (H_ * W_)
    output_proposals = torch.cat(proposals, 1)
    output_proposals_valid = ((output_proposals > 0.01) &
                              (output_proposals < 0.99)).all(
                                  -1, keepdim=True)
    output_proposals = torch.log(output_proposals /
                                 (1 - output_proposals))  # unsigmoid
    output_proposals = output_proposals.masked_fill(
        memory_padding_mask.unsqueeze(-1), float('inf'))
    output_proposals = output_proposals.masked_fill(~output_proposals_valid,
                                                    float('inf'))

    output_memory = memory
    output_memory = output_memory.masked_fill(
        memory_padding_mask.unsqueeze(-1), float(0))
    output_memory = output_memory.masked_fill(~output_proposals_valid,
                                              float(0))

    # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
    # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf'))

    return output_memory, output_proposals
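

# Editor-added sketch (hypothetical sizes) of the two-stage deformable-DETR
# style proposal generation: two feature levels of 8x8 and 4x4 give
# sum(hw) = 80 encoder tokens per image.
def _gen_encoder_output_proposals_usage_example():
    bs, d_model = 2, 256
    spatial_shapes = torch.tensor([[8, 8], [4, 4]], dtype=torch.long)
    num_tokens = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())
    memory = torch.randn(bs, num_tokens, d_model)
    padding_mask = torch.zeros(bs, num_tokens, dtype=torch.bool)  # no padding
    out_memory, out_proposals = gen_encoder_output_proposals(
        memory, padding_mask, spatial_shapes)
    return out_memory.shape, out_proposals.shape  # (2, 80, 256) and (2, 80, 4)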


def gen_sineembed_for_position(pos_tensor):
    # n_query, bs, _ = pos_tensor.size()
    # sineembed_tensor = torch.zeros(n_query, bs, 256)
    scale = 2 * math.pi
    dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
    dim_t = 10000**(2 * (dim_t // 2) / 128)
    x_embed = pos_tensor[:, :, 0] * scale
    y_embed = pos_tensor[:, :, 1] * scale
    pos_x = x_embed[:, :, None] / dim_t
    pos_y = y_embed[:, :, None] / dim_t
    pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()),
                        dim=3).flatten(2)
    pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()),
                        dim=3).flatten(2)
    if pos_tensor.size(-1) == 2:
        pos = torch.cat((pos_y, pos_x), dim=2)
    elif pos_tensor.size(-1) == 4:
        w_embed = pos_tensor[:, :, 2] * scale
        pos_w = w_embed[:, :, None] / dim_t
        pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()),
                            dim=3).flatten(2)

        h_embed = pos_tensor[:, :, 3] * scale
        pos_h = h_embed[:, :, None] / dim_t
        pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()),
                            dim=3).flatten(2)

        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
    else:
        raise ValueError('Unknown pos_tensor shape(-1):{}'.format(
            pos_tensor.size(-1)))
    return pos
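

# Editor-added sketch (hypothetical sizes): DETR-style reference points are
# laid out as (n_query, bs, 2) for (x, y) or (n_query, bs, 4) for (x, y, w, h);
# each normalized coordinate is expanded into a 128-dim sine/cosine embedding.
def _gen_sineembed_usage_example():
    ref_points = torch.rand(300, 2, 4)  # 300 queries, batch of 2, (x, y, w, h)
    pos_embed = gen_sineembed_for_position(ref_points)
    return pos_embed.shape  # torch.Size([300, 2, 512]) = 4 coords x 128 dims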


class SigmoidGeometricMean(Function):
    """Forward and backward function of the geometric mean of two sigmoid
    functions.

    This implementation with an analytical gradient substitutes the autograd
    function of (x.sigmoid() * y.sigmoid()).sqrt(), which incurs NaN during
    gradient backpropagation if both x and y are very small values.
    """

    @staticmethod
    def forward(ctx, x, y):
        x_sigmoid = x.sigmoid()
        y_sigmoid = y.sigmoid()
        z = (x_sigmoid * y_sigmoid).sqrt()
        ctx.save_for_backward(x_sigmoid, y_sigmoid, z)
        return z

    @staticmethod
    def backward(ctx, grad_output):
        x_sigmoid, y_sigmoid, z = ctx.saved_tensors
        grad_x = grad_output * z * (1 - x_sigmoid) / 2
        grad_y = grad_output * z * (1 - y_sigmoid) / 2
        return grad_x, grad_y


sigmoid_geometric_mean = SigmoidGeometricMean.apply
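

# Editor-added sketch: the custom Function keeps gradients finite where the
# naive autograd expression (x.sigmoid() * y.sigmoid()).sqrt() would produce
# NaN, e.g. for strongly negative logits whose sigmoids underflow to zero.
def _sigmoid_geometric_mean_usage_example():
    x = torch.full((3, ), -200.0, requires_grad=True)
    y = torch.full((3, ), -200.0, requires_grad=True)
    sigmoid_geometric_mean(x, y).sum().backward()
    return torch.isfinite(x.grad).all()  # tensor(True)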


def fp16_clamp(x, min=None, max=None):
    if not x.is_cuda and x.dtype == torch.float16:
        # clamp is not implemented for float16 on CPU, so round-trip via float32
        return x.float().clamp(min, max).half()

    return x.clamp(min, max)
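

# Editor-added sketch: a CPU half-precision input takes the float32 round-trip
# branch and still comes back as float16.
def _fp16_clamp_usage_example():
    x = torch.tensor([-2.0, 0.5, 3.0], dtype=torch.float16)
    return fp16_clamp(x, min=0.0, max=1.0)  # clamps to [0, 0.5, 1] in float16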