# Copyright (c) OpenMMLab. All rights reserved. import math from typing import List, Optional import numpy as np import torch import torchvision from packaging import version from torch import Tensor from torch.autograd import Function if version.parse(torchvision.__version__) < version.parse('0.7'): from torchvision.ops import _new_empty_tensor from torchvision.ops.misc import _output_size @torch.no_grad() def accuracy(output, target, topk=(1, )): """Computes the precision@k for the specified values of k""" if target.numel() == 0: return [torch.zeros([], device=output.device)] maxk = max(topk) batch_size = target.size(0) _, pred = output.topk(maxk, 1, True, True) pred = pred.t() correct = pred.eq(target.view(1, -1).expand_as(pred)) res = [] for k in topk: correct_k = correct[:k].view(-1).float().sum(0) res.append(correct_k.mul_(100.0 / batch_size)) return res def interpolate(input, size=None, scale_factor=None, mode='nearest', align_corners=None): # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor """ Equivalent to nn.functional.interpolate, but with support for empty batch sizes. This will eventually be supported natively by PyTorch, and this class can go away. """ if version.parse(torchvision.__version__) < version.parse('0.7'): if input.numel() > 0: return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners) output_shape = _output_size(2, input, size, scale_factor) output_shape = list(input.shape[:-2]) + list(output_shape) return _new_empty_tensor(input, output_shape) else: return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) def select_single_mlvl(mlvl_tensors, batch_id, detach=True): """Extract a multi-scale single image tensor from a multi-scale batch tensor based on batch index. Note: The default value of detach is True, because the proposal gradient needs to be detached during the training of the two-stage model. E.g Cascade Mask R-CNN. Args: mlvl_tensors (list[Tensor]): Batch tensor for all scale levels, each is a 4D-tensor. batch_id (int): Batch index. detach (bool): Whether detach gradient. Default True. Returns: list[Tensor]: Multi-scale single image tensor. """ assert isinstance(mlvl_tensors, (list, tuple)) num_levels = len(mlvl_tensors) if detach: mlvl_tensor_list = [ mlvl_tensors[i][batch_id].detach() for i in range(num_levels) ] else: mlvl_tensor_list = [ mlvl_tensors[i][batch_id] for i in range(num_levels) ] return mlvl_tensor_list def inverse_sigmoid(x, eps=1e-3): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1 / x2) def gen_encoder_output_proposals(memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None): """ Input: - memory: bs, \sum{hw}, d_model - memory_padding_mask: bs, \sum{hw} - spatial_shapes: nlevel, 2 - learnedwh: 2 Output: - output_memory: bs, \sum{hw}, d_model - output_proposals: bs, \sum{hw}, 4 """ N_, S_, C_ = memory.shape base_scale = 4.0 proposals = [] _cur = 0 for lvl, (H_, W_) in enumerate(spatial_shapes): mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view( N_, H_, W_, 1) valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) # import ipdb; ipdb.set_trace() grid_y, grid_x = torch.meshgrid( torch.linspace( 0, H_ - 1, H_, dtype=torch.float32, device=memory.device), torch.linspace( 0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) grid = torch.cat( [grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale if learnedwh is not None: # import ipdb; ipdb.set_trace() wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl) else: wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1) # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale # wh = torch.ones_like(grid) / scale proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) proposals.append(proposal) _cur += (H_ * W_) # import ipdb; ipdb.set_trace() output_proposals = torch.cat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( -1, keepdim=True) output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid output_proposals = output_proposals.masked_fill( memory_padding_mask.unsqueeze(-1), float('inf')) output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) output_memory = memory output_memory = output_memory.masked_fill( memory_padding_mask.unsqueeze(-1), float(0)) output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) return output_memory, output_proposals def gen_sineembed_for_position(pos_tensor): # n_query, bs, _ = pos_tensor.size() # sineembed_tensor = torch.zeros(n_query, bs, 256) scale = 2 * math.pi dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) dim_t = 10000**(2 * (dim_t // 2) / 128) x_embed = pos_tensor[:, :, 0] * scale y_embed = pos_tensor[:, :, 1] * scale pos_x = x_embed[:, :, None] / dim_t pos_y = y_embed[:, :, None] / dim_t pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) if pos_tensor.size(-1) == 2: pos = torch.cat((pos_y, pos_x), dim=2) elif pos_tensor.size(-1) == 4: w_embed = pos_tensor[:, :, 2] * scale pos_w = w_embed[:, :, None] / dim_t pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) h_embed = pos_tensor[:, :, 3] * scale pos_h = h_embed[:, :, None] / dim_t pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) else: raise ValueError('Unknown pos_tensor shape(-1):{}'.format( pos_tensor.size(-1))) return pos class SigmoidGeometricMean(Function): """Forward and backward function of geometric mean of two sigmoid functions. This implementation with analytical gradient function substitutes the autograd function of (x.sigmoid() * y.sigmoid()).sqrt(). The original implementation incurs none during gradient backprapagation if both x and y are very small values. """ @staticmethod def forward(ctx, x, y): x_sigmoid = x.sigmoid() y_sigmoid = y.sigmoid() z = (x_sigmoid * y_sigmoid).sqrt() ctx.save_for_backward(x_sigmoid, y_sigmoid, z) return z @staticmethod def backward(ctx, grad_output): x_sigmoid, y_sigmoid, z = ctx.saved_tensors grad_x = grad_output * z * (1 - x_sigmoid) / 2 grad_y = grad_output * z * (1 - y_sigmoid) / 2 return grad_x, grad_y sigmoid_geometric_mean = SigmoidGeometricMean.apply def fp16_clamp(x, min=None, max=None): if not x.is_cuda and x.dtype == torch.float16: # clamp for cpu float16, tensor fp16 has no clamp implementation return x.float().clamp(min, max).half() return x.clamp(min, max)