12 #include "DeviceDefs.cuh"
13 #include "Float16.cuh"
15 namespace faiss {
namespace gpu {
/// Warp shuffle wrapper that selects the right intrinsic for the toolkit.
///
/// @param val      value contributed by the calling lane
/// @param srcLane  forwarded as the source-lane argument of the intrinsic
/// @param width    forwarded as the intrinsic's width argument
///                 (defaults to kWarpSize)
/// @return whatever the underlying shuffle intrinsic returns for this lane
///
/// With CUDA 9.0 or newer the call goes to `__shfl_sync` using the mask
/// 0xffffffff, i.e. every lane of the warp is named as a participant.
/// Older toolkits fall back to the mask-less `__shfl`.
template <typename T>
inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
    return __shfl_sync(0xffffffff, val, srcLane, width);
#else
    return __shfl(val, srcLane, width);
#endif
}
/// Pointer overload of shfl().
///
/// The pointer is reinterpreted as a `long long`, shuffled through the
/// value overload of shfl(), and cast back to `T*`. The static_assert
/// rejects any target where a pointer and `long long` differ in size,
/// since the round trip would then lose bits.
///
/// @param val      pointer contributed by the calling lane
/// @param srcLane  forwarded to shfl()
/// @param width    forwarded to shfl() (defaults to kWarpSize)
/// @return the shuffled bit pattern, reinterpreted as `T*`
template <typename T>
inline __device__ T* shfl(T* const val, int srcLane, int width = kWarpSize) {
    static_assert(sizeof(T*) == sizeof(long long), "pointer size");
    long long v = (long long)val;

    return (T*)shfl(v, srcLane, width);
}
/// Warp shuffle-up wrapper that selects the right intrinsic for the toolkit.
///
/// @param val    value contributed by the calling lane
/// @param delta  forwarded as the lane-offset argument of the intrinsic
/// @param width  forwarded as the intrinsic's width argument
///               (defaults to kWarpSize)
/// @return whatever the underlying shuffle intrinsic returns for this lane
///
/// With CUDA 9.0 or newer the call goes to `__shfl_up_sync` using the mask
/// 0xffffffff, i.e. every lane of the warp is named as a participant.
/// Older toolkits fall back to the mask-less `__shfl_up`.
template <typename T>
inline __device__ T shfl_up(
        const T val,
        unsigned int delta,
        int width = kWarpSize) {
#if CUDA_VERSION >= 9000
    return __shfl_up_sync(0xffffffff, val, delta, width);
#else
    return __shfl_up(val, delta, width);
#endif
}
/// Pointer overload of shfl_up().
///
/// The pointer is reinterpreted as a `long long`, shuffled through the
/// value overload of shfl_up(), and cast back to `T*`. The static_assert
/// rejects any target where a pointer and `long long` differ in size.
///
/// @param val    pointer contributed by the calling lane
/// @param delta  forwarded to shfl_up()
/// @param width  forwarded to shfl_up() (defaults to kWarpSize)
/// @return the shuffled bit pattern, reinterpreted as `T*`
template <typename T>
inline __device__ T* shfl_up(
        T* const val,
        unsigned int delta,
        int width = kWarpSize) {
    static_assert(sizeof(T*) == sizeof(long long), "pointer size");
    long long v = (long long)val;

    return (T*)shfl_up(v, delta, width);
}
/// Warp shuffle-down wrapper that selects the right intrinsic for the
/// toolkit.
///
/// @param val    value contributed by the calling lane
/// @param delta  forwarded as the lane-offset argument of the intrinsic
/// @param width  forwarded as the intrinsic's width argument
///               (defaults to kWarpSize)
/// @return whatever the underlying shuffle intrinsic returns for this lane
///
/// With CUDA 9.0 or newer the call goes to `__shfl_down_sync` using the
/// mask 0xffffffff, i.e. every lane of the warp is named as a participant.
/// Older toolkits fall back to the mask-less `__shfl_down`.
template <typename T>
inline __device__ T shfl_down(
        const T val,
        unsigned int delta,
        int width = kWarpSize) {
#if CUDA_VERSION >= 9000
    return __shfl_down_sync(0xffffffff, val, delta, width);
#else
    return __shfl_down(val, delta, width);
#endif
}
/// Pointer overload of shfl_down().
///
/// The pointer is reinterpreted as a `long long`, shuffled through the
/// value overload of shfl_down(), and cast back to `T*`. The static_assert
/// rejects any target where a pointer and `long long` differ in size.
///
/// @param val    pointer contributed by the calling lane
/// @param delta  forwarded to shfl_down()
/// @param width  forwarded to shfl_down() (defaults to kWarpSize)
/// @return the shuffled bit pattern, reinterpreted as `T*`
template <typename T>
inline __device__ T* shfl_down(
        T* const val,
        unsigned int delta,
        int width = kWarpSize) {
    static_assert(sizeof(T*) == sizeof(long long), "pointer size");
    long long v = (long long)val;
    return (T*)shfl_down(v, delta, width);
}
/// Warp shuffle-xor wrapper that selects the right intrinsic for the
/// toolkit.
///
/// @param val       value contributed by the calling lane
/// @param laneMask  forwarded as the lane-mask argument of the intrinsic
/// @param width     forwarded as the intrinsic's width argument
///                  (defaults to kWarpSize)
/// @return whatever the underlying shuffle intrinsic returns for this lane
///
/// With CUDA 9.0 or newer the call goes to `__shfl_xor_sync` using the
/// mask 0xffffffff, i.e. every lane of the warp is named as a participant.
/// Older toolkits fall back to the mask-less `__shfl_xor`.
template <typename T>
inline __device__ T shfl_xor(
        const T val,
        int laneMask,
        int width = kWarpSize) {
#if CUDA_VERSION >= 9000
    return __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
    return __shfl_xor(val, laneMask, width);
#endif
}
/// Pointer overload of shfl_xor().
///
/// The pointer is reinterpreted as a `long long`, shuffled through the
/// value overload of shfl_xor(), and cast back to `T*`. The static_assert
/// rejects any target where a pointer and `long long` differ in size.
///
/// @param val       pointer contributed by the calling lane
/// @param laneMask  forwarded to shfl_xor()
/// @param width     forwarded to shfl_xor() (defaults to kWarpSize)
/// @return the shuffled bit pattern, reinterpreted as `T*`
template <typename T>
inline __device__ T* shfl_xor(
        T* const val,
        int laneMask,
        int width = kWarpSize) {
    static_assert(sizeof(T*) == sizeof(long long), "pointer size");
    long long v = (long long)val;
    return (T*)shfl_xor(v, laneMask, width);
}
95 #ifdef FAISS_USE_FLOAT16
97 #if CUDA_VERSION < 9000
/// `half` overload of shfl(), compiled only when FAISS_USE_FLOAT16 is
/// defined and CUDA_VERSION < 9000 (see the enclosing preprocessor guards).
///
/// The `x` member of `v` is widened to `unsigned int`, exchanged with the
/// legacy `__shfl` intrinsic, then narrowed back to `unsigned short` and
/// stored in the `x` member of the returned `half`.
///
/// @param v        half value contributed by the calling lane
/// @param srcLane  forwarded as the source-lane argument of `__shfl`
/// @param width    forwarded as the width argument (defaults to kWarpSize)
/// @return a `half` whose `x` member holds the shuffled bits
inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) {
    unsigned int vu = v.x;
    vu = __shfl(vu, srcLane, width);

    half h;
    h.x = (unsigned short)vu;
    return h;
}
/// `half` overload of shfl_xor(), compiled only when FAISS_USE_FLOAT16 is
/// defined and CUDA_VERSION < 9000 (see the enclosing preprocessor guards).
///
/// The `x` member of `v` is widened to `unsigned int`, exchanged with the
/// legacy `__shfl_xor` intrinsic, then narrowed back to `unsigned short`
/// and stored in the `x` member of the returned `half`.
///
/// @param v         half value contributed by the calling lane
/// @param laneMask  forwarded as the lane-mask argument of `__shfl_xor`
/// @param width     forwarded as the width argument (defaults to kWarpSize)
/// @return a `half` whose `x` member holds the shuffled bits
inline __device__ half shfl_xor(half v, int laneMask, int width = kWarpSize) {
    unsigned int vu = v.x;
    vu = __shfl_xor(vu, laneMask, width);

    half h;
    h.x = (unsigned short)vu;
    return h;
}
117 #endif // CUDA_VERSION
118 #endif // FAISS_USE_FLOAT16