From d9549fba04851755709fea59699f5f846d63ce62 Mon Sep 17 00:00:00 2001
From: zhuyuanhao <zhuyuanhao@sensetime.com>
Date: Mon, 29 Jun 2020 18:48:50 +0800
Subject: [PATCH] fix cpp header error (#371)

* 1. use macro USE_PARROTS control header include
2. add clang-format google style in pre-commit

* use MMCV_ macros
---
 .pre-commit-config.yaml                       |   2 +-
 mmcv/ops/csrc/bbox_overlaps_cuda_kernel.cuh   |   9 +-
 mmcv/ops/csrc/carafe_cuda_kernel.cuh          |  11 ++
 mmcv/ops/csrc/carafe_naive_cuda_kernel.cuh    |   8 +-
 mmcv/ops/csrc/cc_attention_cuda_kernel.cuh    |  12 +-
 mmcv/ops/csrc/deform_conv_cuda_kernel.cuh     |  12 +-
 mmcv/ops/csrc/deform_roi_pool_cuda_kernel.cuh |  12 +-
 mmcv/ops/csrc/masked_conv2d_cuda_kernel.cuh   |  11 ++
 .../modulated_deform_conv_cuda_kernel.cuh     |  12 +-
 .../{nms_kernel.cuh => nms_cuda_kernel.cuh}   |  12 +-
 mmcv/ops/csrc/parrots/deform_conv_cuda.cu     |   2 +-
 mmcv/ops/csrc/parrots/focal_loss_cuda.cu      |   4 +-
 mmcv/ops/csrc/parrots/nms_cuda.cu             |   2 +-
 mmcv/ops/csrc/parrots/roi_align_cuda.cu       |   2 +-
 mmcv/ops/csrc/parrots/roi_pool_cuda.cu        |   2 +-
 mmcv/ops/csrc/psamask_cuda_kernel.cuh         |  13 +-
 mmcv/ops/csrc/pytorch/bbox_overlaps.cpp       |   4 +-
 mmcv/ops/csrc/pytorch/carafe.cpp              |   6 +-
 mmcv/ops/csrc/pytorch/carafe_naive.cpp        |   6 +-
 mmcv/ops/csrc/pytorch/cc_attention.cpp        |  10 +-
 mmcv/ops/csrc/pytorch/deform_conv.cpp         |   8 +-
 mmcv/ops/csrc/pytorch/deform_roi_pool.cpp     |   6 +-
 mmcv/ops/csrc/pytorch/focal_loss.cpp          |  10 +-
 mmcv/ops/csrc/pytorch/focal_loss_cuda.cu      |   4 +-
 mmcv/ops/csrc/pytorch/info.cpp                |   4 +-
 mmcv/ops/csrc/pytorch/masked_conv2d.cpp       |   6 +-
 .../csrc/pytorch/modulated_deform_conv.cpp    |   6 +-
 mmcv/ops/csrc/pytorch/nms.cpp                 |   4 +-
 mmcv/ops/csrc/pytorch/nms_cuda.cu             |   2 +-
 mmcv/ops/csrc/pytorch/psamask.cpp             |   6 +-
 mmcv/ops/csrc/pytorch/roi_align.cpp           |   6 +-
 mmcv/ops/csrc/pytorch/roi_align_cuda.cu       |   2 +-
 mmcv/ops/csrc/pytorch/roi_pool.cpp            |   6 +-
 mmcv/ops/csrc/pytorch/roi_pool_cuda.cu        |   2 +-
 mmcv/ops/csrc/pytorch/sync_bn.cpp             |  12 +-
 ...n_kernel.cuh => roi_align_cuda_kernel.cuh} |  12 +-
 ...ol_kernel.cuh => roi_pool_cuda_kernel.cuh} |  12 +-
 ...cuh => sigmoid_focal_loss_cuda_kernel.cuh} |  13 +-
 ...cuh => softmax_focal_loss_cuda_kernel.cuh} |  13 +-
 mmcv/ops/csrc/softnms_kernel.cuh              | 160 ------------------
 mmcv/ops/csrc/sync_bn_cuda_kernel.cuh         |  12 +-
 setup.py                                      |  24 ++-
 42 files changed, 216 insertions(+), 266 deletions(-)
 rename mmcv/ops/csrc/{nms_kernel.cuh => nms_cuda_kernel.cuh} (92%)
 rename mmcv/ops/csrc/{roi_align_kernel.cuh => roi_align_cuda_kernel.cuh} (97%)
 rename mmcv/ops/csrc/{roi_pool_kernel.cuh => roi_pool_cuda_kernel.cuh} (94%)
 rename mmcv/ops/csrc/{sigmoid_focal_loss_kernel.cuh => sigmoid_focal_loss_cuda_kernel.cuh} (89%)
 rename mmcv/ops/csrc/{softmax_focal_loss_kernel.cuh => softmax_focal_loss_cuda_kernel.cuh} (88%)
 delete mode 100644 mmcv/ops/csrc/softnms_kernel.cuh

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4d1088623..a5c95dce9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,6 +33,6 @@ repos:
       - id: clang-format
         name: clang-format
         description: Format files with ClangFormat
-        entry: clang-format -i
+        entry: clang-format -style=google -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
diff --git a/mmcv/ops/csrc/bbox_overlaps_cuda_kernel.cuh b/mmcv/ops/csrc/bbox_overlaps_cuda_kernel.cuh
index 31d79e14f..e5fccabae 100644
--- a/mmcv/ops/csrc/bbox_overlaps_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/bbox_overlaps_cuda_kernel.cuh
@@ -1,6 +1,12 @@
 #ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH
 #define BBOX_OVERLAPS_CUDA_KERNEL_CUH
 
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
 template <typename T>
 __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
                                           T* ious, const int num_bbox1,
@@ -73,4 +79,5 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
     }
   }
 }
-#endif
+
+#endif  // BBOX_OVERLAPS_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/carafe_cuda_kernel.cuh b/mmcv/ops/csrc/carafe_cuda_kernel.cuh
index 67c865c71..e9b569d3b 100644
--- a/mmcv/ops/csrc/carafe_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/carafe_cuda_kernel.cuh
@@ -1,3 +1,12 @@
+#ifndef CARAFE_CUDA_KERNEL_CUH
+#define CARAFE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
 #define WARP_SIZE 32
 #define THREADS_PER_PIXEL 32
 #define MAX_SHARED_MEMORY 49152
@@ -301,3 +310,5 @@ __global__ void CARAFEBackward_Mask(const int num_kernels,
     mask_diff[mask_id] = output_val;
   }
 }
+
+#endif  // CARAFE_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/carafe_naive_cuda_kernel.cuh b/mmcv/ops/csrc/carafe_naive_cuda_kernel.cuh
index b51ad2414..6f375162c 100644
--- a/mmcv/ops/csrc/carafe_naive_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/carafe_naive_cuda_kernel.cuh
@@ -1,6 +1,12 @@
 #ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH
 #define CARAFE_NAIVE_CUDA_KERNEL_CUH
 
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
 __device__ inline int Loc2Index(const int n, const int c, const int h,
                                 const int w, const int channel_num,
                                 const int height, const int width) {
@@ -101,4 +107,4 @@ __global__ void carafe_naive_backward_cuda_kernel(
   }
 }
 
-#endif
+#endif  // CARAFE_NAIVE_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/cc_attention_cuda_kernel.cuh b/mmcv/ops/csrc/cc_attention_cuda_kernel.cuh
index 74dfbf277..0dd9c33c6 100644
--- a/mmcv/ops/csrc/cc_attention_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/cc_attention_cuda_kernel.cuh
@@ -1,5 +1,11 @@
-#ifndef CA_CUDA_KERNEL_CUH
-#define CA_CUDA_KERNEL_CUH
+#ifndef CC_ATTENTION_CUDA_KERNEL_CUH
+#define CC_ATTENTION_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 template <typename T>
 __global__ void ca_forward_kernel(const T *t, const T *f, T *weight, int num,
@@ -176,4 +182,4 @@ __global__ void ca_map_backward_kernel_g(const T *dout, const T *weight,
   }
 }
 
-#endif
+#endif  // CC_ATTENTION_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/deform_conv_cuda_kernel.cuh b/mmcv/ops/csrc/deform_conv_cuda_kernel.cuh
index a5e0b36fb..b6ddf34c9 100644
--- a/mmcv/ops/csrc/deform_conv_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/deform_conv_cuda_kernel.cuh
@@ -63,8 +63,14 @@
 // modified from
 // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
 
-#ifndef DEFORM_CONV_KERNEL_CUH
-#define DEFORM_CONV_KERNEL_CUH
+#ifndef DEFORM_CONV_CUDA_KERNEL_CUH
+#define DEFORM_CONV_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 template <typename T>
 __device__ T deformable_im2col_bilinear(const T *input, const int data_width,
@@ -353,4 +359,4 @@ __global__ void deformable_col2im_coord_gpu_kernel(
   }
 }
 
-#endif
+#endif  // DEFORM_CONV_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/deform_roi_pool_cuda_kernel.cuh b/mmcv/ops/csrc/deform_roi_pool_cuda_kernel.cuh
index 6ac3dda0f..e6b59b399 100644
--- a/mmcv/ops/csrc/deform_roi_pool_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/deform_roi_pool_cuda_kernel.cuh
@@ -1,5 +1,11 @@
-#ifndef DEFORM_POOL_KERNEL_CUH
-#define DEFORM_POOL_KERNEL_CUH
+#ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH
+#define DEFORM_ROI_POOL_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 template <typename T>
 __global__ void deform_roi_pool_forward_cuda_kernel(
@@ -174,4 +180,4 @@ __global__ void deform_roi_pool_backward_cuda_kernel(
   }
 }
 
-#endif
+#endif  // DEFORM_ROI_POOL_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/masked_conv2d_cuda_kernel.cuh b/mmcv/ops/csrc/masked_conv2d_cuda_kernel.cuh
index 60054407e..4be8329ae 100644
--- a/mmcv/ops/csrc/masked_conv2d_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/masked_conv2d_cuda_kernel.cuh
@@ -1,3 +1,12 @@
+#ifndef MASKED_CONV2D_CUDA_KERNEL_CUH
+#define MASKED_CONV2D_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
 template <typename scalar_t>
 __global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
                                     const int height, const int width,
@@ -48,3 +57,5 @@ __global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
     data_im[(c_im * height + h_im) * width + w_im] = data_col[index];
   }
 }
+
+#endif  // MASKED_CONV2D_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/modulated_deform_conv_cuda_kernel.cuh b/mmcv/ops/csrc/modulated_deform_conv_cuda_kernel.cuh
index 4c12863a6..9953a0945 100644
--- a/mmcv/ops/csrc/modulated_deform_conv_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/modulated_deform_conv_cuda_kernel.cuh
@@ -63,8 +63,14 @@
 // modified from
 // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
 
-#ifndef MODULATED_DEFORM_CONV_KERNEL_CUH
-#define MODULATED_DEFORM_CONV_KERNEL_CUH
+#ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
+#define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 template <typename T>
 __device__ T dmcn_im2col_bilinear(const T *input, const int data_width,
@@ -385,4 +391,4 @@ __global__ void modulated_deformable_col2im_coord_gpu_kernel(
   }
 }
 
-#endif
+#endif  // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/nms_kernel.cuh b/mmcv/ops/csrc/nms_cuda_kernel.cuh
similarity index 92%
rename from mmcv/ops/csrc/nms_kernel.cuh
rename to mmcv/ops/csrc/nms_cuda_kernel.cuh
index d37e74168..bde6334fa 100644
--- a/mmcv/ops/csrc/nms_kernel.cuh
+++ b/mmcv/ops/csrc/nms_cuda_kernel.cuh
@@ -1,5 +1,11 @@
-#ifndef NMS_KERNEL_CUH
-#define NMS_KERNEL_CUH
+#ifndef NMS_CUDA_KERNEL_CUH
+#define NMS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 int const threadsPerBlock = sizeof(unsigned long long int) * 8;
@@ -60,4 +66,4 @@ __global__ void nms_cuda(const int n_boxes, const float iou_threshold,
     dev_mask[cur_box_idx * gridDim.y + col_start] = t;
   }
 }
-#endif
+#endif  // NMS_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/parrots/deform_conv_cuda.cu b/mmcv/ops/csrc/parrots/deform_conv_cuda.cu
index 859b8c6d6..6aa1147ff 100644
--- a/mmcv/ops/csrc/parrots/deform_conv_cuda.cu
+++ b/mmcv/ops/csrc/parrots/deform_conv_cuda.cu
@@ -406,7 +406,7 @@ void DeformConvBackwardInputCUDAKernelLauncher(
   }
 }
 
-int DeformConvBackwardParametersCUDAKernelLauncher(
+void DeformConvBackwardParametersCUDAKernelLauncher(
     DArrayLite input, DArrayLite offset, DArrayLite gradOutput,
     DArrayLite gradWeight, DArrayLite columns, DArrayLite ones, int kW, int kH,
     int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group,
diff --git a/mmcv/ops/csrc/parrots/focal_loss_cuda.cu b/mmcv/ops/csrc/parrots/focal_loss_cuda.cu
index b25347cc5..2bba0dcc9 100644
--- a/mmcv/ops/csrc/parrots/focal_loss_cuda.cu
+++ b/mmcv/ops/csrc/parrots/focal_loss_cuda.cu
@@ -1,6 +1,6 @@
 #include "parrots_cuda_helper.hpp"
-#include "sigmoid_focal_loss_kernel.cuh"
-#include "softmax_focal_loss_kernel.cuh"
+#include "sigmoid_focal_loss_cuda_kernel.cuh"
+#include "softmax_focal_loss_cuda_kernel.cuh"
 
 void SigmoidFocalLossForwardCUDAKernelLauncher(
     const DArrayLite input, const DArrayLite target, const DArrayLite weight,
diff --git a/mmcv/ops/csrc/parrots/nms_cuda.cu b/mmcv/ops/csrc/parrots/nms_cuda.cu
index 477d239c0..840e79264 100644
--- a/mmcv/ops/csrc/parrots/nms_cuda.cu
+++ b/mmcv/ops/csrc/parrots/nms_cuda.cu
@@ -1,4 +1,4 @@
-#include "nms_kernel.cuh"
+#include "nms_cuda_kernel.cuh"
 #include "parrots_cuda_helper.hpp"
 
 DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
diff --git a/mmcv/ops/csrc/parrots/roi_align_cuda.cu b/mmcv/ops/csrc/parrots/roi_align_cuda.cu
index 5e640b7e0..2c3983111 100644
--- a/mmcv/ops/csrc/parrots/roi_align_cuda.cu
+++ b/mmcv/ops/csrc/parrots/roi_align_cuda.cu
@@ -1,5 +1,5 @@
 #include "parrots_cuda_helper.hpp"
-#include "roi_align_kernel.cuh"
+#include "roi_align_cuda_kernel.cuh"
 
 void ROIAlignForwardCUDAKernelLauncher(const DArrayLite input,
                                        const DArrayLite rois, DArrayLite output,
diff --git a/mmcv/ops/csrc/parrots/roi_pool_cuda.cu b/mmcv/ops/csrc/parrots/roi_pool_cuda.cu
index 5d0e89d10..a94ffcea2 100644
--- a/mmcv/ops/csrc/parrots/roi_pool_cuda.cu
+++ b/mmcv/ops/csrc/parrots/roi_pool_cuda.cu
@@ -1,5 +1,5 @@
 #include "parrots_cuda_helper.hpp"
-#include "roi_pool_kernel.cuh"
+#include "roi_pool_cuda_kernel.cuh"
 
 void ROIPoolForwardCUDAKernelLauncher(const DArrayLite input,
                                       const DArrayLite rois, DArrayLite output,
diff --git a/mmcv/ops/csrc/psamask_cuda_kernel.cuh b/mmcv/ops/csrc/psamask_cuda_kernel.cuh
index c4c2c07b4..3e22944c8 100644
--- a/mmcv/ops/csrc/psamask_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/psamask_cuda_kernel.cuh
@@ -1,5 +1,12 @@
-#ifndef PSAMASK_CUDA_CUH
-#define PSAMASK_CUDA_CUH
+#ifndef PSAMASK_CUDA_KERNEL_CUH
+#define PSAMASK_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
 // CUDA: grid stride looping
 #ifndef CUDA_KERNEL_LOOP
 #define CUDA_KERNEL_LOOP(i, n)                                 \
@@ -130,4 +137,4 @@ __global__ void psamask_distribute_backward_cuda(
   }
 }
 
-#endif
+#endif  // PSAMASK_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp b/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp
index 5240bfa6d..23bf7d434 100644
--- a/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp
+++ b/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
                                     Tensor ious, const int mode,
                                     const bool aligned, const int offset);
@@ -14,7 +14,7 @@ void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
 void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                    const int mode, const bool aligned, const int offset) {
   if (bboxes1.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(bboxes1);
     CHECK_CUDA_INPUT(bboxes2);
     CHECK_CUDA_INPUT(ious);
diff --git a/mmcv/ops/csrc/pytorch/carafe.cpp b/mmcv/ops/csrc/pytorch/carafe.cpp
index b627b425d..67619284f 100644
--- a/mmcv/ops/csrc/pytorch/carafe.cpp
+++ b/mmcv/ops/csrc/pytorch/carafe.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
                                      Tensor rfeatures, Tensor routput,
                                      Tensor rmasks, Tensor output,
@@ -38,7 +38,7 @@ void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
                     Tensor routput, Tensor rmasks, Tensor output,
                     int kernel_size, int group_size, int scale_factor) {
   if (features.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(features);
     CHECK_CUDA_INPUT(masks);
     CHECK_CUDA_INPUT(rfeatures);
@@ -61,7 +61,7 @@ void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
                      Tensor mask_grad, int kernel_size, int group_size,
                      int scale_factor) {
   if (top_grad.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(top_grad);
     CHECK_CUDA_INPUT(rfeatures);
     CHECK_CUDA_INPUT(masks);
diff --git a/mmcv/ops/csrc/pytorch/carafe_naive.cpp b/mmcv/ops/csrc/pytorch/carafe_naive.cpp
index 9e3679c3b..bb0aa0978 100644
--- a/mmcv/ops/csrc/pytorch/carafe_naive.cpp
+++ b/mmcv/ops/csrc/pytorch/carafe_naive.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
                                           const Tensor masks, Tensor output,
                                           const int kernel_size,
@@ -32,7 +32,7 @@ void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
 void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
                           int kernel_size, int group_size, int scale_factor) {
   if (features.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(features);
     CHECK_CUDA_INPUT(masks);
     CHECK_CUDA_INPUT(output);
@@ -50,7 +50,7 @@ void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
                            Tensor bottom_grad, Tensor mask_grad,
                            int kernel_size, int group_size, int scale_factor) {
   if (top_grad.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(top_grad);
     CHECK_CUDA_INPUT(features);
     CHECK_CUDA_INPUT(masks);
diff --git a/mmcv/ops/csrc/pytorch/cc_attention.cpp b/mmcv/ops/csrc/pytorch/cc_attention.cpp
index 37c14d6eb..9d7e48a4d 100644
--- a/mmcv/ops/csrc/pytorch/cc_attention.cpp
+++ b/mmcv/ops/csrc/pytorch/cc_attention.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f, Tensor weight);
 
 void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
@@ -33,7 +33,7 @@ void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
 
 void ca_forward(const Tensor t, const Tensor f, Tensor weight) {
   if (t.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(t);
     CHECK_CUDA_INPUT(f);
     CHECK_CUDA_INPUT(weight);
@@ -49,7 +49,7 @@ void ca_forward(const Tensor t, const Tensor f, Tensor weight) {
 void ca_backward(const Tensor dw, const Tensor t, const Tensor f, Tensor dt,
                  Tensor df) {
   if (dw.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(dw);
     CHECK_CUDA_INPUT(t);
     CHECK_CUDA_INPUT(f);
@@ -66,7 +66,7 @@ void ca_backward(const Tensor dw, const Tensor t, const Tensor f, Tensor dt,
 
 void ca_map_forward(const Tensor weight, const Tensor g, Tensor out) {
   if (weight.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(weight);
     CHECK_CUDA_INPUT(g);
     CHECK_CUDA_INPUT(out);
@@ -82,7 +82,7 @@ void ca_map_forward(const Tensor weight, const Tensor g, Tensor out) {
 void ca_map_backward(const Tensor dout, const Tensor weight, const Tensor g,
                      Tensor dw, Tensor dg) {
   if (dout.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(dout);
     CHECK_CUDA_INPUT(weight);
     CHECK_CUDA_INPUT(g);
diff --git a/mmcv/ops/csrc/pytorch/deform_conv.cpp b/mmcv/ops/csrc/pytorch/deform_conv.cpp
index ca1e8c20e..9b2dc4495 100644
--- a/mmcv/ops/csrc/pytorch/deform_conv.cpp
+++ b/mmcv/ops/csrc/pytorch/deform_conv.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void DeformConvForwardCUDAKernelLauncher(Tensor input, Tensor weight,
                                          Tensor offset, Tensor output,
                                          Tensor columns, Tensor ones, int kW,
@@ -62,7 +62,7 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
                          int dilationW, int dilationH, int group,
                          int deformable_group, int im2col_step) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(offset);
     CHECK_CUDA_INPUT(weight);
@@ -88,7 +88,7 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
                                 int dilationW, int dilationH, int group,
                                 int deformable_group, int im2col_step) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(offset);
     CHECK_CUDA_INPUT(gradOutput);
@@ -117,7 +117,7 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
                                      int deformable_group, float scale,
                                      int im2col_step) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(offset);
     CHECK_CUDA_INPUT(gradOutput);
diff --git a/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp b/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp
index 793cd2f17..1022ea91f 100644
--- a/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp
+++ b/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                             Tensor offset, Tensor output,
                                             int pooled_height, int pooled_width,
@@ -38,7 +38,7 @@ void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
                              float spatial_scale, int sampling_ratio,
                              float gamma) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(rois);
     CHECK_CUDA_INPUT(offset);
@@ -61,7 +61,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
                               int pooled_width, float spatial_scale,
                               int sampling_ratio, float gamma) {
   if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(grad_output);
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(rois);
diff --git a/mmcv/ops/csrc/pytorch/focal_loss.cpp b/mmcv/ops/csrc/pytorch/focal_loss.cpp
index a792e7ad5..ae650611e 100644
--- a/mmcv/ops/csrc/pytorch/focal_loss.cpp
+++ b/mmcv/ops/csrc/pytorch/focal_loss.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight, Tensor output,
                                                const float gamma,
@@ -54,7 +54,7 @@ void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
 void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                 Tensor output, float gamma, float alpha) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(target);
     CHECK_CUDA_INPUT(weight);
@@ -73,7 +73,7 @@ void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
 void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                  Tensor grad_input, float gamma, float alpha) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(target);
     CHECK_CUDA_INPUT(weight);
@@ -92,7 +92,7 @@ void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
 void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                 Tensor output, float gamma, float alpha) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(target);
     CHECK_CUDA_INPUT(weight);
@@ -112,7 +112,7 @@ void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                  Tensor buff, Tensor grad_input, float gamma,
                                  float alpha) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(target);
     CHECK_CUDA_INPUT(weight);
diff --git a/mmcv/ops/csrc/pytorch/focal_loss_cuda.cu b/mmcv/ops/csrc/pytorch/focal_loss_cuda.cu
index ca069edc0..508f449ba 100644
--- a/mmcv/ops/csrc/pytorch/focal_loss_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/focal_loss_cuda.cu
@@ -1,6 +1,6 @@
 #include "pytorch_cuda_helper.hpp"
-#include "sigmoid_focal_loss_kernel.cuh"
-#include "softmax_focal_loss_kernel.cuh"
+#include "sigmoid_focal_loss_cuda_kernel.cuh"
+#include "softmax_focal_loss_cuda_kernel.cuh"
 
 void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                                Tensor weight, Tensor output,
diff --git a/mmcv/ops/csrc/pytorch/info.cpp b/mmcv/ops/csrc/pytorch/info.cpp
index f3db1d7ac..a2ebafa84 100644
--- a/mmcv/ops/csrc/pytorch/info.cpp
+++ b/mmcv/ops/csrc/pytorch/info.cpp
@@ -2,13 +2,13 @@
 // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 #include <cuda_runtime_api.h>
 int get_cudart_version() { return CUDART_VERSION; }
 #endif
 
 std::string get_compiling_cuda_version() {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
   std::ostringstream oss;
   // copied from
   // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
diff --git a/mmcv/ops/csrc/pytorch/masked_conv2d.cpp b/mmcv/ops/csrc/pytorch/masked_conv2d.cpp
index 21d402df1..8efc8eba7 100644
--- a/mmcv/ops/csrc/pytorch/masked_conv2d.cpp
+++ b/mmcv/ops/csrc/pytorch/masked_conv2d.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
                                            const Tensor mask_h_idx,
                                            const Tensor mask_w_idx,
@@ -39,7 +39,7 @@ void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                            const int kernel_h, const int kernel_w,
                            const int pad_h, const int pad_w) {
   if (im.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(im);
     CHECK_CUDA_INPUT(mask_h_idx);
     CHECK_CUDA_INPUT(mask_w_idx);
@@ -58,7 +58,7 @@ void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                            const Tensor mask_w_idx, Tensor im, int height,
                            int width, int channels) {
   if (col.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(col);
     CHECK_CUDA_INPUT(mask_h_idx);
     CHECK_CUDA_INPUT(mask_w_idx);
diff --git a/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp b/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
index 5a95e72f9..a7e6013fb 100644
--- a/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
+++ b/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void ModulatedDeformConvForwardCUDAKernelLauncher(
     Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
     Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
@@ -50,7 +50,7 @@ void modulated_deform_conv_forward(
     const int dilation_h, const int dilation_w, const int group,
     const int deformable_group, const bool with_bias) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(weight);
     CHECK_CUDA_INPUT(bias);
@@ -80,7 +80,7 @@ void modulated_deform_conv_backward(
     int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
     const bool with_bias) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(weight);
     CHECK_CUDA_INPUT(bias);
diff --git a/mmcv/ops/csrc/pytorch/nms.cpp b/mmcv/ops/csrc/pytorch/nms.cpp
index 10ef8ae59..73e852bee 100644
--- a/mmcv/ops/csrc/pytorch/nms.cpp
+++ b/mmcv/ops/csrc/pytorch/nms.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                              int offset);
 
@@ -62,7 +62,7 @@ Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
 
 Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
   if (boxes.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(boxes);
     CHECK_CUDA_INPUT(scores);
     return nms_cuda(boxes, scores, iou_threshold, offset);
diff --git a/mmcv/ops/csrc/pytorch/nms_cuda.cu b/mmcv/ops/csrc/pytorch/nms_cuda.cu
index 63ded78a4..893acae52 100644
--- a/mmcv/ops/csrc/pytorch/nms_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/nms_cuda.cu
@@ -1,4 +1,4 @@
-#include "nms_kernel.cuh"
+#include "nms_cuda_kernel.cuh"
 #include "pytorch_cuda_helper.hpp"
 
 Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
diff --git a/mmcv/ops/csrc/pytorch/psamask.cpp b/mmcv/ops/csrc/pytorch/psamask.cpp
index 5fa3e6d1d..d5d0e5685 100644
--- a/mmcv/ops/csrc/pytorch/psamask.cpp
+++ b/mmcv/ops/csrc/pytorch/psamask.cpp
@@ -182,7 +182,7 @@ void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
                                 grad_input);
 }
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
                                       Tensor output, const int num_,
                                       const int h_feature, const int w_feature,
@@ -221,7 +221,7 @@ void psamask_forward(const Tensor input, Tensor output, const int psa_type,
                      const int h_mask, const int w_mask, const int half_h_mask,
                      const int half_w_mask) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(output);
     psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
@@ -240,7 +240,7 @@ void psamask_backward(Tensor grad_output, const Tensor grad_input,
                       const int w_feature, const int h_mask, const int w_mask,
                       const int half_h_mask, const int half_w_mask) {
   if (grad_input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(grad_input);
     CHECK_CUDA_INPUT(grad_output);
     psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
diff --git a/mmcv/ops/csrc/pytorch/roi_align.cpp b/mmcv/ops/csrc/pytorch/roi_align.cpp
index 459a650fe..5b1d63ca3 100644
--- a/mmcv/ops/csrc/pytorch/roi_align.cpp
+++ b/mmcv/ops/csrc/pytorch/roi_align.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                        Tensor argmax_y, Tensor argmax_x,
                                        int aligned_height, int aligned_width,
@@ -40,7 +40,7 @@ void roi_align_forward(Tensor input, Tensor rois, Tensor output,
                        int aligned_width, float spatial_scale,
                        int sampling_ratio, int pool_mode, bool aligned) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(rois);
     CHECK_CUDA_INPUT(output);
@@ -63,7 +63,7 @@ void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
                         int aligned_width, float spatial_scale,
                         int sampling_ratio, int pool_mode, bool aligned) {
   if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(grad_output);
     CHECK_CUDA_INPUT(rois);
     CHECK_CUDA_INPUT(argmax_y);
diff --git a/mmcv/ops/csrc/pytorch/roi_align_cuda.cu b/mmcv/ops/csrc/pytorch/roi_align_cuda.cu
index 63aaf1774..822e9183f 100644
--- a/mmcv/ops/csrc/pytorch/roi_align_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/roi_align_cuda.cu
@@ -1,5 +1,5 @@
 #include "pytorch_cuda_helper.hpp"
-#include "roi_align_kernel.cuh"
+#include "roi_align_cuda_kernel.cuh"
 
 void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                        Tensor argmax_y, Tensor argmax_x,
diff --git a/mmcv/ops/csrc/pytorch/roi_pool.cpp b/mmcv/ops/csrc/pytorch/roi_pool.cpp
index 1cee9d7f8..e41270512 100644
--- a/mmcv/ops/csrc/pytorch/roi_pool.cpp
+++ b/mmcv/ops/csrc/pytorch/roi_pool.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax, int pooled_height,
                                       int pooled_width, float spatial_scale);
@@ -29,7 +29,7 @@ void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                       int pooled_height, int pooled_width,
                       float spatial_scale) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(rois);
     CHECK_CUDA_INPUT(output);
@@ -49,7 +49,7 @@ void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                        Tensor grad_input, int pooled_height, int pooled_width,
                        float spatial_scale) {
   if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(grad_output);
     CHECK_CUDA_INPUT(rois);
     CHECK_CUDA_INPUT(argmax);
diff --git a/mmcv/ops/csrc/pytorch/roi_pool_cuda.cu b/mmcv/ops/csrc/pytorch/roi_pool_cuda.cu
index 9a17743ae..313f1d74c 100644
--- a/mmcv/ops/csrc/pytorch/roi_pool_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/roi_pool_cuda.cu
@@ -1,5 +1,5 @@
 #include "pytorch_cuda_helper.hpp"
-#include "roi_pool_kernel.cuh"
+#include "roi_pool_cuda_kernel.cuh"
 
 void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax, int pooled_height,
diff --git a/mmcv/ops/csrc/pytorch/sync_bn.cpp b/mmcv/ops/csrc/pytorch/sync_bn.cpp
index 40a4a05c0..94b9b7c94 100644
--- a/mmcv/ops/csrc/pytorch/sync_bn.cpp
+++ b/mmcv/ops/csrc/pytorch/sync_bn.cpp
@@ -1,6 +1,6 @@
 #include "pytorch_cpp_helper.hpp"
 
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
 void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);
 
 void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
@@ -61,7 +61,7 @@ void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
 
 void sync_bn_forward_mean(const Tensor input, Tensor mean) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(mean);
     sync_bn_forward_mean_cuda(input, mean);
@@ -75,7 +75,7 @@ void sync_bn_forward_mean(const Tensor input, Tensor mean) {
 
 void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(mean);
     CHECK_CUDA_INPUT(var);
@@ -95,7 +95,7 @@ void sync_bn_forward_output(const Tensor input, const Tensor mean,
                             Tensor output, float eps, float momentum,
                             int group_size) {
   if (input.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(input);
     CHECK_CUDA_INPUT(mean);
     CHECK_CUDA_INPUT(var);
@@ -120,7 +120,7 @@ void sync_bn_forward_output(const Tensor input, const Tensor mean,
 void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
                             Tensor grad_weight, Tensor grad_bias) {
   if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(grad_output);
     CHECK_CUDA_INPUT(norm);
     CHECK_CUDA_INPUT(grad_weight);
@@ -139,7 +139,7 @@ void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
                            const Tensor norm, const Tensor std,
                            Tensor grad_input) {
   if (grad_output.device().is_cuda()) {
-#ifdef WITH_CUDA
+#ifdef MMCV_WITH_CUDA
     CHECK_CUDA_INPUT(grad_output);
     CHECK_CUDA_INPUT(weight);
     CHECK_CUDA_INPUT(grad_weight);
diff --git a/mmcv/ops/csrc/roi_align_kernel.cuh b/mmcv/ops/csrc/roi_align_cuda_kernel.cuh
similarity index 97%
rename from mmcv/ops/csrc/roi_align_kernel.cuh
rename to mmcv/ops/csrc/roi_align_cuda_kernel.cuh
index ff1618c29..fb300747a 100644
--- a/mmcv/ops/csrc/roi_align_kernel.cuh
+++ b/mmcv/ops/csrc/roi_align_cuda_kernel.cuh
@@ -1,5 +1,11 @@
-#ifndef ROI_ALIGN_KERNEL_CUH
-#define ROI_ALIGN_KERNEL_CUH
+#ifndef ROI_ALIGN_CUDA_KERNEL_CUH
+#define ROI_ALIGN_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 /*** Forward ***/
 template <typename T>
@@ -196,4 +202,4 @@ __global__ void roi_align_backward_cuda_kernel(
   }
 }
 
-#endif  // ROI_ALIGN_KERNEL_CUH
+#endif  // ROI_ALIGN_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/roi_pool_kernel.cuh b/mmcv/ops/csrc/roi_pool_cuda_kernel.cuh
similarity index 94%
rename from mmcv/ops/csrc/roi_pool_kernel.cuh
rename to mmcv/ops/csrc/roi_pool_cuda_kernel.cuh
index 5aa3f3154..d499ade76 100644
--- a/mmcv/ops/csrc/roi_pool_kernel.cuh
+++ b/mmcv/ops/csrc/roi_pool_cuda_kernel.cuh
@@ -1,7 +1,11 @@
-#ifndef ROI_POOL_KERNEL_CUH
-#define ROI_POOL_KERNEL_CUH
+#ifndef ROI_POOL_CUDA_KERNEL_CUH
+#define ROI_POOL_CUDA_KERNEL_CUH
 
-#include <cuda.h>
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 template <typename T>
 __global__ void roi_pool_forward_cuda_kernel(
@@ -85,4 +89,4 @@ __global__ void roi_pool_backward_cuda_kernel(
   }
 }
 
-#endif
+#endif  // ROI_POOL_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/sigmoid_focal_loss_kernel.cuh b/mmcv/ops/csrc/sigmoid_focal_loss_cuda_kernel.cuh
similarity index 89%
rename from mmcv/ops/csrc/sigmoid_focal_loss_kernel.cuh
rename to mmcv/ops/csrc/sigmoid_focal_loss_cuda_kernel.cuh
index fc3830d23..fb7c636eb 100644
--- a/mmcv/ops/csrc/sigmoid_focal_loss_kernel.cuh
+++ b/mmcv/ops/csrc/sigmoid_focal_loss_cuda_kernel.cuh
@@ -1,5 +1,11 @@
-#ifndef SIGMOID_FOCAL_LOSS_KERNEL_CUH
-#define SIGMOID_FOCAL_LOSS_KERNEL_CUH
+#ifndef SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
+#define SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 template <typename T>
 __global__ void sigmoid_focal_loss_forward_cuda_kernel(
@@ -60,4 +66,5 @@ __global__ void sigmoid_focal_loss_backward_cuda_kernel(
     }
   }
 }
-#endif
+
+#endif  // SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/softmax_focal_loss_kernel.cuh b/mmcv/ops/csrc/softmax_focal_loss_cuda_kernel.cuh
similarity index 88%
rename from mmcv/ops/csrc/softmax_focal_loss_kernel.cuh
rename to mmcv/ops/csrc/softmax_focal_loss_cuda_kernel.cuh
index 09e91b75d..9d44e9ee9 100644
--- a/mmcv/ops/csrc/softmax_focal_loss_kernel.cuh
+++ b/mmcv/ops/csrc/softmax_focal_loss_cuda_kernel.cuh
@@ -1,5 +1,11 @@
-#ifndef SOFTMAX_FOCAL_LOSS_KERNEL_CUH
-#define SOFTMAX_FOCAL_LOSS_KERNEL_CUH
+#ifndef SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
+#define SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 template <typename T>
 __global__ void softmax_focal_loss_forward_cuda_kernel(
@@ -61,4 +67,5 @@ __global__ void softmax_focal_loss_backward_cuda2_kernel(
     }
   }
 }
-#endif
+
+#endif  // SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
diff --git a/mmcv/ops/csrc/softnms_kernel.cuh b/mmcv/ops/csrc/softnms_kernel.cuh
deleted file mode 100644
index ef9a28681..000000000
--- a/mmcv/ops/csrc/softnms_kernel.cuh
+++ /dev/null
@@ -1,160 +0,0 @@
-#ifndef SOFTNMS_KERNEL_CUH
-#define SOFTNMS_KERNEL_CUH
-
-#include <cuda.h>
-
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long int) * 8;
-
-template <typename scalar_t>
-__device__ inline scalar_t devIoU(scalar_t const *const a,
-                                  scalar_t const *const b) {
-  scalar_t left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
-  scalar_t top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
-  scalar_t width = fmaxf(right - left + 1, 0.f),
-           height = fmaxf(bottom - top + 1, 0.f);
-  scalar_t interS = width * height;
-  scalar_t Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  scalar_t Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return interS / (Sa + Sb - interS);
-}
-
-template <typename scalar_t>
-__global__ void softnms_max_kernel(const int n_boxes,
-                                   const scalar_t overlap_thresh,
-                                   const scalar_t *dev_boxes, int *order,
-                                   float *max_value, int *max_index) {
-  __shared__ float maximum[threadsPerBlock];
-  __shared__ int max_id[threadsPerBlock];
-
-  unsigned int tid = threadIdx.x;
-  unsigned int idx = blockIdx.x * threadsPerBlock + threadIdx.x;
-
-  if (idx >= n_boxes) {
-    return;
-  }
-
-  const int block_size = fminf(n_boxes + tid - idx, threadsPerBlock);
-  int *l_order = order + (idx - tid);
-  if (l_order[tid] == 0 && dev_boxes[idx * 5 + 4] >= overlap_thresh) {
-    maximum[tid] = dev_boxes[idx * 5 + 4];
-  } else {
-    maximum[tid] = -1.0;
-  }
-  max_id[tid] = tid;
-  __syncthreads();
-
-  if (block_size >= 1024 && tid < 512) {
-    if (maximum[tid] < maximum[tid + 512]) {
-      maximum[tid] = maximum[tid + 512];
-      max_id[tid] = max_id[tid + 512];
-    }
-  }
-  if (block_size >= 512 && tid < 256) {
-    if (maximum[tid] < maximum[tid + 256]) {
-      maximum[tid] = maximum[tid + 256];
-      max_id[tid] = max_id[tid + 256];
-    }
-  }
-  if (block_size >= 256 && tid < 128) {
-    if (maximum[tid] < maximum[tid + 128]) {
-      maximum[tid] = maximum[tid + 128];
-      max_id[tid] = max_id[tid + 128];
-    }
-  }
-  if (block_size >= 128 && tid < 64) {
-    if (maximum[tid] < maximum[tid + 64]) {
-      maximum[tid] = maximum[tid + 64];
-      max_id[tid] = max_id[tid + 64];
-    }
-  }
-  if (tid < 32) {
-    volatile float *vmaximum = maximum;
-    volatile int *vmax_id = max_id;
-    if (block_size >= 64 && vmaximum[tid] < vmaximum[tid + 32]) {
-      vmaximum[tid] = vmaximum[tid + 32];
-      vmax_id[tid] = vmax_id[tid + 32];
-    }
-    if (block_size >= 32 && tid < 16 && vmaximum[tid] < vmaximum[tid + 16]) {
-      vmaximum[tid] = vmaximum[tid + 16];
-      vmax_id[tid] = vmax_id[tid + 16];
-    }
-    if (block_size >= 16 && tid < 8 && vmaximum[tid] < vmaximum[tid + 8]) {
-      vmaximum[tid] = vmaximum[tid + 8];
-      vmax_id[tid] = vmax_id[tid + 8];
-    }
-    if (block_size >= 8 && tid < 4 && vmaximum[tid] < vmaximum[tid + 4]) {
-      vmaximum[tid] = vmaximum[tid + 4];
-      vmax_id[tid] = vmax_id[tid + 4];
-    }
-    if (block_size >= 4 && tid < 2 && vmaximum[tid] < vmaximum[tid + 2]) {
-      vmaximum[tid] = vmaximum[tid + 2];
-      vmax_id[tid] = vmax_id[tid + 2];
-    }
-    if (block_size >= 2 && tid < 1 && vmaximum[tid] < vmaximum[tid + 1]) {
-      vmaximum[tid] = vmaximum[tid + 1];
-      vmax_id[tid] = vmax_id[tid + 1];
-    }
-  }
-  if (tid == 0) {
-    max_value[blockIdx.x] = maximum[0];
-    max_index[blockIdx.x] = max_id[0];
-  }
-}
-
-template <typename scalar_t>
-__global__ void softnms_update_kernel(const int n_boxes, const scalar_t sigma,
-                                      const scalar_t n_thresh,
-                                      const unsigned int method,
-                                      const scalar_t overlap_thresh,
-                                      scalar_t *dev_boxes, int *order,
-                                      unsigned long long *keep, int max_id) {
-  const int col_start = blockIdx.x;
-
-  const int col_size =
-      fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-  const int cur_idx = threadsPerBlock * col_start + threadIdx.x;
-  const int tid = threadIdx.x;
-
-  if (cur_idx >= n_boxes) {
-    return;
-  }
-  __shared__ scalar_t cur_max_boxes[5];
-
-  cur_max_boxes[0] = dev_boxes[max_id * 5 + 0];
-  cur_max_boxes[1] = dev_boxes[max_id * 5 + 1];
-  cur_max_boxes[2] = dev_boxes[max_id * 5 + 2];
-  cur_max_boxes[3] = dev_boxes[max_id * 5 + 3];
-  cur_max_boxes[4] = dev_boxes[max_id * 5 + 4];
-
-  __syncthreads();
-
-  if (cur_idx != max_id && tid < col_size && order[cur_idx] == 0 &&
-      (!(keep[col_start] & (1ULL << tid)))) {
-    scalar_t block_boxes[5];
-    block_boxes[0] = dev_boxes[cur_idx * 5 + 0];
-    block_boxes[1] = dev_boxes[cur_idx * 5 + 1];
-    block_boxes[2] = dev_boxes[cur_idx * 5 + 2];
-    block_boxes[3] = dev_boxes[cur_idx * 5 + 3];
-    block_boxes[4] = dev_boxes[cur_idx * 5 + 4];
-
-    scalar_t ovr = devIoU(cur_max_boxes, block_boxes);
-    scalar_t weight = 1.0;
-    if (method == 1) {
-      if (ovr > n_thresh) {
-        weight = 1.0 - ovr;
-      }
-    } else if (method == 2) {
-      weight = exp(-(ovr * ovr) / sigma);
-    } else if (ovr >= n_thresh) {
-      weight = 0.0;
-    }
-    block_boxes[4] *= weight;
-    dev_boxes[cur_idx * 5 + 4] = block_boxes[4];
-    if (block_boxes[4] < overlap_thresh) {
-      keep[col_start] |= 1ULL << tid;
-    }
-  }
-}
-
-#endif
diff --git a/mmcv/ops/csrc/sync_bn_cuda_kernel.cuh b/mmcv/ops/csrc/sync_bn_cuda_kernel.cuh
index 5e2fc1919..41844f8d0 100644
--- a/mmcv/ops/csrc/sync_bn_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/sync_bn_cuda_kernel.cuh
@@ -1,5 +1,11 @@
-#ifndef SYNC_BN_KERNEL_CUH
-#define SYNC_BN_KERNEL_CUH
+#ifndef SYNCBN_CUDA_KERNEL_CUH
+#define SYNCBN_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
 
 template <typename T>
 __global__ void sync_bn_forward_mean_cuda_kernel(const T *input, float *mean,
@@ -321,4 +327,4 @@ __global__ void sync_bn_backward_data_cuda_kernel(
   }
 }
 
-#endif  // SYNC_BN_KERNEL_CUH
+#endif  // SYNCBN_CUDA_KERNEL_CUH
diff --git a/setup.py b/setup.py
index bedfe4545..17916483c 100644
--- a/setup.py
+++ b/setup.py
@@ -150,22 +150,23 @@ def get_extensions():
 
     try:
         import torch
-        cuda_args = [
-            '-gencode=arch=compute_52,code=sm_52',
-            '-gencode=arch=compute_60,code=sm_60',
-            '-gencode=arch=compute_61,code=sm_61',
-            '-gencode=arch=compute_70,code=sm_70',
-            '-gencode=arch=compute_70,code=compute_70'
-        ]
         ext_name = 'mmcv._ext'
         if torch.__version__ == 'parrots':
             from parrots.utils.build_extension import BuildExtension, Extension
+            cuda_args = [
+                '-gencode=arch=compute_60,code=sm_60',
+                '-gencode=arch=compute_61,code=sm_61',
+                '-gencode=arch=compute_70,code=sm_70',
+                '-gencode=arch=compute_70,code=compute_70'
+            ]
+            define_macros = [('MMCV_USE_PARROTS', None)]
             op_files = glob.glob('./mmcv/ops/csrc/parrots/*')
             include_path = os.path.abspath('./mmcv/ops/csrc')
             ext_ops = Extension(
                 name=ext_name,
                 sources=op_files,
                 include_dirs=[include_path],
+                define_macros=define_macros,
                 extra_compile_args={
                     'nvcc': cuda_args,
                     'cxx': [],
@@ -177,12 +178,19 @@ def get_extensions():
                                                    CUDAExtension, CppExtension)
             # prevent ninja from using too many resources
             os.environ.setdefault('MAX_JOBS', '4')
+            cuda_args = [
+                '-gencode=arch=compute_52,code=sm_52',
+                '-gencode=arch=compute_60,code=sm_60',
+                '-gencode=arch=compute_61,code=sm_61',
+                '-gencode=arch=compute_70,code=sm_70',
+                '-gencode=arch=compute_70,code=compute_70'
+            ]
             define_macros = []
             extra_compile_args = {'cxx': []}
 
             if (torch.cuda.is_available()
                     or os.getenv('FORCE_CUDA', '0') == '1'):
-                define_macros += [('WITH_CUDA', None)]
+                define_macros += [('MMCV_WITH_CUDA', None)]
                 extra_compile_args['nvcc'] = cuda_args
                 op_files = glob.glob('./mmcv/ops/csrc/pytorch/*')
                 extension = CUDAExtension