# EasyCV/easycv/predictors/bevformer_predictor.py

# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import pickle

import mmcv
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC

from easycv.core.bbox import get_box_type
from easycv.datasets.registry import PIPELINES
from easycv.datasets.shared.pipelines.format import to_tensor
from easycv.datasets.shared.pipelines.transforms import Compose
from easycv.framework.errors import ValueError
from easycv.predictors.base import InputProcessor, PredictorV2
from easycv.predictors.builder import PREDICTORS
from easycv.utils.misc import encode_str_to_tensor
from easycv.utils.registry import build_from_cfg


class BEVFormerInputProcessor(InputProcessor):
    """Process inputs for the BEVFormer model.

    Args:
        cfg (Config): Config instance.
        pipelines (list[dict]): Data pipeline configs.
        batch_size (int): Batch size for forward.
        use_camera (bool): Whether to use camera data.
        box_type_3d (str): Box type.
        adapt_jit (bool): Whether to adapt the processed inputs for jit/blade script models.
        threads (int): Number of processes to process inputs.
    """

def __init__(self,
cfg,
pipelines=None,
batch_size=1,
use_camera=True,
box_type_3d='LiDAR',
adapt_jit=False,
threads=8):
self.use_camera = use_camera
self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
self.adapt_jit = adapt_jit
super(BEVFormerInputProcessor, self).__init__(
cfg, pipelines=pipelines, batch_size=batch_size, threads=threads)

    def _prepare_input_dict(self, data_info):
from nuscenes.eval.common.utils import Quaternion, quaternion_yaw
input_dict = dict(
ego2global_translation=data_info['ego2global_translation'],
ego2global_rotation=data_info['ego2global_rotation'],
scene_token=data_info['scene_token'],
can_bus=data_info['can_bus'])
if self.use_camera:
image_paths = []
lidar2img_rts = []
lidar2cam_rts = []
cam_intrinsics = []
for cam_type, cam_info in data_info['cams'].items():
cam_info['data_path'] = os.path.expanduser(
cam_info['data_path'])
image_paths.append(cam_info['data_path'])
# obtain lidar to image transformation matrix
lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
lidar2cam_t = cam_info[
'sensor2lidar_translation'] @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
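                # Note: lidar2cam_rt is built as the transpose of the usual 4x4
                # extrinsic matrix, so lidar2cam_rt.T below is the actual
                # lidar -> camera transform.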
intrinsic = cam_info['cam_intrinsic']
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
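                # lidar2img maps homogeneous lidar coordinates to image pixel
                # coordinates: lidar -> camera (lidar2cam_rt.T), then camera ->
                # image plane (viewpad).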
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
lidar2img_rts.append(lidar2img_rt)
cam_intrinsics.append(viewpad)
lidar2cam_rts.append(lidar2cam_rt.T)
input_dict.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
cam_intrinsic=cam_intrinsics,
lidar2cam=lidar2cam_rts,
))
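        # Write the current ego pose into the CAN bus signal: can_bus[:3] is the ego
        # translation, can_bus[3:7] the ego rotation quaternion, can_bus[-2] the yaw
        # (patch) angle in radians and can_bus[-1] the same angle in degrees.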
rotation = Quaternion(input_dict['ego2global_rotation'])
translation = input_dict['ego2global_translation']
can_bus = input_dict['can_bus']
can_bus[:3] = translation
can_bus[3:7] = rotation
patch_angle = quaternion_yaw(rotation) / np.pi * 180
if patch_angle < 0:
patch_angle += 360
can_bus[-2] = patch_angle / 180 * np.pi
can_bus[-1] = patch_angle
input_dict['img_fields'] = []
input_dict['bbox3d_fields'] = []
input_dict['pts_mask_fields'] = []
input_dict['pts_seg_fields'] = []
input_dict['bbox_fields'] = []
input_dict['mask_fields'] = []
input_dict['seg_fields'] = []
input_dict['box_type_3d'] = self.box_type_3d
input_dict['box_mode_3d'] = self.box_mode_3d
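        # Load the multi-view camera images listed in `img_filename` into the sample.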
load_pipelines = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True)
]
load_pipelines = Compose(
[build_from_cfg(p, PIPELINES) for p in load_pipelines])
result = load_pipelines(input_dict)
return result

    def process_single(self, input):
        """Process a single input sample.

        Args:
            input (str | dict): Pickle file path or a loaded data info dict; the content
                format is the same as that of the nuScenes infos file.
        """
data_info = mmcv.load(input) if isinstance(input, str) else input
result = self._prepare_input_dict(data_info)
result = self.processor(result)
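        # For jit/blade script models, the required meta fields are exported from
        # img_metas as plain tensors (the scene token is pickled into a uint8 tensor)
        # so they can be fed to the traced model.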
if self.adapt_jit:
result['can_bus'] = DC(
to_tensor(result['img_metas'][0]._data['can_bus']),
cpu_only=False)
result['lidar2img'] = DC(
to_tensor(result['img_metas'][0]._data['lidar2img']),
cpu_only=False)
result['scene_token'] = DC(
torch.tensor(
bytearray(
pickle.dumps(
result['img_metas'][0]._data['scene_token'])),
dtype=torch.uint8),
cpu_only=False)
result['img_shape'] = DC(
to_tensor(result['img_metas'][0]._data['img_shape']),
cpu_only=False)
else:
result['can_bus'] = DC(
torch.stack(
[to_tensor(result['img_metas'][0]._data['can_bus'])]),
cpu_only=False)
result['lidar2img'] = DC(
torch.stack(
[to_tensor(result['img_metas'][0]._data['lidar2img'])]),
cpu_only=False)
return result


@PREDICTORS.register_module()
class BEVFormerPredictor(PredictorV2):
"""Predictor for BEVFormer.
Args:
model_path (str): Path of model path.
config_file (Optinal[str]): config file path for model and processor to init. Defaults to None.
batch_size (int): batch size for forward.
device (str | torch.device): Support str('cuda' or 'cpu') or torch.device, if is None, detect device automatically.
save_results (bool): Whether to save predict results.
save_path (str): File path for saving results, only valid when `save_results` is True.
pipelines (list[dict]): Data pipeline configs.
box_type_3d (str): Box type.
use_camera (bool): Whether use camera data.
score_threshold (float): Score threshold to filter inference results.
input_processor_threads (int): Number of processes to process inputs.
mode (str): The image mode into the model.
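
    Example:
        A minimal usage sketch. The checkpoint path, config path and the sample
        pickle file below are placeholders (assumed paths), not files shipped with
        this repo; the sample pickle should follow the nuScenes infos format.

        >>> predictor = BEVFormerPredictor(
        ...     model_path='path/to/bevformer_model.pth',
        ...     config_file='path/to/bevformer_config.py')
        >>> results = predictor(['path/to/sample_info.pkl'])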
"""

    def __init__(self,
model_path,
config_file=None,
batch_size=1,
device=None,
save_results=False,
save_path=None,
pipelines=None,
box_type_3d='LiDAR',
use_camera=True,
score_threshold=0.1,
model_type=None,
input_processor_threads=8,
mode='BGR',
*arg,
**kwargs):
        if batch_size > 1:
            raise ValueError(
                f'Only batch_size=1 is supported now, but got batch_size={batch_size}'
            )
self.model_type = model_type
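        # Infer the script model type from the checkpoint file suffix when not specified.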
if self.model_type is None:
if model_path.endswith('jit'):
self.model_type = 'jit'
elif model_path.endswith('blade'):
self.model_type = 'blade'
self.is_jit_model = self.model_type in ['jit', 'blade']
self.use_camera = use_camera
self.score_threshold = score_threshold
self.result_key = 'pts_bbox'
self.box_type_3d_str = box_type_3d
self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
super(BEVFormerPredictor, self).__init__(
model_path,
config_file=config_file,
batch_size=batch_size,
device=device,
save_results=save_results,
save_path=save_path,
pipelines=pipelines,
input_processor_threads=input_processor_threads,
mode=mode,
*arg,
**kwargs)
self.CLASSES = self.cfg.get('CLASSES', None)
        # The initial prev_bev should be the weight of self.model.pts_bbox_head.bev_embedding,
        # but that weight cannot be extracted from the blade model.
        # So we use dummy data as the initial value; it will not actually be used,
        # it only exists to adapt the inputs to the jit and blade models.
        # init_prev_bev = self.model.pts_bbox_head.bev_embedding.weight.clone().detach()
        # init_prev_bev = init_prev_bev[:, None, :]  # [40000, 256] -> [40000, 1, 256]
dummy_prev_bev = torch.rand(
[self.cfg.bev_h * self.cfg.bev_w, 1,
self.cfg.embed_dim]).to(self.device)
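        # Temporal state carried across frames: previous BEV features, previous scene
        # token, ego position and yaw angle; updated in model_forward for jit/blade models.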
self.prev_frame_info = {
'prev_bev': dummy_prev_bev.to(self.device),
'prev_scene_token': encode_str_to_tensor('dummy_prev_scene_token'),
'prev_pos': torch.tensor(0),
'prev_angle': torch.tensor(0),
}

    def get_input_processor(self):
return BEVFormerInputProcessor(
self.cfg,
pipelines=self.pipelines,
batch_size=self.batch_size,
use_camera=self.use_camera,
box_type_3d=self.box_type_3d_str,
adapt_jit=self.is_jit_model,
threads=self.input_processor_threads)

    def prepare_model(self):
if self.is_jit_model:
model = torch.jit.load(self.model_path, map_location=self.device)
return model
return super().prepare_model()

    def model_forward(self, inputs):
if self.is_jit_model:
with torch.no_grad():
img = inputs['img'][0][0]
img_metas = {
'can_bus': inputs['can_bus'][0],
'lidar2img': inputs['lidar2img'][0],
'img_shape': inputs['img_shape'][0],
'scene_token': inputs['scene_token'][0],
'prev_bev': self.prev_frame_info['prev_bev'],
'prev_pos': self.prev_frame_info['prev_pos'],
'prev_angle': self.prev_frame_info['prev_angle'],
'prev_scene_token':
self.prev_frame_info['prev_scene_token']
}
inputs = (img, img_metas)
outputs = self.model(*inputs)
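                # Script model outputs: (scores_3d, labels_3d, raw boxes_3d tensor,
                # (prev_bev, prev_pos, prev_angle, prev_scene_token)).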
# update prev_frame_info
self.prev_frame_info['prev_bev'] = outputs[3][0]
self.prev_frame_info['prev_pos'] = outputs[3][1]
self.prev_frame_info['prev_angle'] = outputs[3][2]
self.prev_frame_info['prev_scene_token'] = outputs[3][3]
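                # Pack the outputs into the standard `pts_bbox` result format, wrapping
                # the raw box tensor into the configured 3D box type.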
outputs = {
'pts_bbox': [{
'scores_3d':
outputs[0],
'labels_3d':
outputs[1],
'boxes_3d':
self.box_type_3d(outputs[2].cpu(), outputs[2].size()[-1])
}],
}
return outputs
return super().model_forward(inputs)

    def visualize(self, inputs, results, out_dir, show=False, pipeline=None):
raise NotImplementedError