### v1.3.18 Some ops have different implementations on different devices. Lots of macros and type checks are scattered across several files, which makes the code hard to maintain. For example: ```c++ if (input.device().is_cuda()) { #ifdef MMCV_WITH_CUDA CHECK_CUDA_INPUT(input); CHECK_CUDA_INPUT(rois); CHECK_CUDA_INPUT(output); CHECK_CUDA_INPUT(argmax_y); CHECK_CUDA_INPUT(argmax_x); roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); #else AT_ERROR("RoIAlign is not compiled with GPU support"); #endif } else { CHECK_CPU_INPUT(input); CHECK_CPU_INPUT(rois); CHECK_CPU_INPUT(output); CHECK_CPU_INPUT(argmax_y); CHECK_CPU_INPUT(argmax_x); roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } ``` A registry and a dispatcher have been added to manage these implementations. ```c++ void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned); void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned) { ROIAlignForwardCUDAKernelLauncher( input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } // register cuda implementation void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned); REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda); // roi_align.cpp // use the dispatcher to invoke different implementation depending on device type of input tensors. 
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned) { DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } ``` ### v1.3.11 In order to flexibly support more backends and hardware like `NVIDIA GPUs` and `AMD GPUs`, the `mmcv/ops/csrc` directory has been refactored. Note that this refactoring does not affect API usage. For related information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206). The original directory was organized as follows. ``` . ├── common_cuda_helper.hpp ├── ops_cuda_kernel.cuh ├── pytorch_cpp_helper.hpp ├── pytorch_cuda_helper.hpp ├── parrots_cpp_helper.hpp ├── parrots_cuda_helper.hpp ├── parrots_cudawarpfunction.cuh ├── onnxruntime │   ├── onnxruntime_register.h │   ├── onnxruntime_session_options_config_keys.h │   ├── ort_mmcv_utils.h │   ├── ... │   ├── onnx_ops.h │   └── cpu │ ├── onnxruntime_register.cpp │      ├── ... │      └── onnx_ops_impl.cpp ├── parrots │   ├── ... │   ├── ops.cpp │   ├── ops_cuda.cu │   ├── ops_parrots.cpp │   └── ops_pytorch.h ├── pytorch │   ├── ... │   ├── ops.cpp │   ├── ops_cuda.cu │   ├── pybind.cpp └── tensorrt ├── trt_cuda_helper.cuh ├── trt_plugin_helper.hpp ├── trt_plugin.hpp ├── trt_serialize.hpp ├── ... ├── trt_ops.hpp └── plugins    ├── trt_cuda_helper.cu    ├── trt_plugin.cpp    ├── ...    ├── trt_ops.cpp    └── trt_ops_kernel.cu ``` After refactoring, it is organized as follows. ``` . ├── common │ ├── box_iou_rotated_utils.hpp │ ├── parrots_cpp_helper.hpp │ ├── parrots_cuda_helper.hpp │ ├── pytorch_cpp_helper.hpp │ ├── pytorch_cuda_helper.hpp │   └── cuda │   ├── common_cuda_helper.hpp │   ├── parrots_cudawarpfunction.cuh │   ├── ... 
│   └── ops_cuda_kernel.cuh ├── onnxruntime │   ├── onnxruntime_register.h │   ├── onnxruntime_session_options_config_keys.h │   ├── ort_mmcv_utils.h │   ├── ... │   ├── onnx_ops.h │   └── cpu │ ├── onnxruntime_register.cpp │      ├── ... │      └── onnx_ops_impl.cpp ├── parrots │   ├── ... │   ├── ops.cpp │   ├── ops_parrots.cpp │   └── ops_pytorch.h ├── pytorch │   ├── info.cpp │   ├── pybind.cpp │   ├── ... │   ├── ops.cpp │   └── cuda │      ├── ... │      └── ops_cuda.cu └── tensorrt ├── trt_cuda_helper.cuh ├── trt_plugin_helper.hpp ├── trt_plugin.hpp ├── trt_serialize.hpp ├── ... ├── trt_ops.hpp └── plugins    ├── trt_cuda_helper.cu    ├── trt_plugin.cpp    ├── ...    ├── trt_ops.cpp    └── trt_ops_kernel.cu ```