14 #include "DeviceDefs.cuh"
15 #include "Float16.cuh"
17 namespace faiss {
namespace gpu {
// Warp shuffle: returns the copy of `val` held by lane `srcLane`.
//
//   val     - this lane's value offered to the exchange
//   srcLane - lane index to read from; forwarded unchanged to the intrinsic
//   width   - segment width forwarded to the intrinsic (defaults to kWarpSize)
//
// When built with CUDA 9.0 or newer the *_sync intrinsic is used with a
// participant mask of 0xffffffff, which names every lane of the warp as a
// participant. Older toolkits fall back to the legacy mask-less __shfl.
template <typename T>
inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_sync(0xffffffff, val, srcLane, width);
#else
  return __shfl(val, srcLane, width);
#endif
}
// Pointer overload of shfl: exchanges a pointer value between lanes.
//
// The pointer is shuffled as a `long long` bit pattern through the value
// overload of shfl and converted back on return; the static_assert rejects
// any target where a pointer and a long long differ in size.
//
//   val     - this lane's pointer offered to the exchange
//   srcLane - lane index to read from
//   width   - segment width forwarded to the value overload
template <typename T>
inline __device__ T* shfl(T* const val, int srcLane, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = reinterpret_cast<long long>(val);

  return reinterpret_cast<T*>(shfl(v, srcLane, width));
}
// Warp shuffle-up: thin wrapper that forwards `val`, `delta` and `width` to
// the shuffle-up intrinsic and returns its result.
//
//   val   - this lane's value offered to the exchange
//   delta - lane offset forwarded unchanged to the intrinsic
//   width - segment width forwarded to the intrinsic (defaults to kWarpSize)
//
// When built with CUDA 9.0 or newer the *_sync intrinsic is used with a
// participant mask of 0xffffffff, which names every lane of the warp as a
// participant. Older toolkits fall back to the legacy mask-less __shfl_up.
template <typename T>
inline __device__ T shfl_up(
    const T val,
    unsigned int delta,
    int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(0xffffffff, val, delta, width);
#else
  return __shfl_up(val, delta, width);
#endif
}
// Pointer overload of shfl_up: exchanges a pointer value between lanes.
//
// The pointer is shuffled as a `long long` bit pattern through the value
// overload of shfl_up and converted back on return; the static_assert rejects
// any target where a pointer and a long long differ in size.
//
//   val   - this lane's pointer offered to the exchange
//   delta - lane offset forwarded to the value overload
//   width - segment width forwarded to the value overload
template <typename T>
inline __device__ T* shfl_up(
    T* const val,
    unsigned int delta,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = reinterpret_cast<long long>(val);

  return reinterpret_cast<T*>(shfl_up(v, delta, width));
}
// Warp shuffle-down: thin wrapper that forwards `val`, `delta` and `width`
// to the shuffle-down intrinsic and returns its result.
//
//   val   - this lane's value offered to the exchange
//   delta - lane offset forwarded unchanged to the intrinsic
//   width - segment width forwarded to the intrinsic (defaults to kWarpSize)
//
// When built with CUDA 9.0 or newer the *_sync intrinsic is used with a
// participant mask of 0xffffffff, which names every lane of the warp as a
// participant. Older toolkits fall back to the legacy mask-less __shfl_down.
template <typename T>
inline __device__ T shfl_down(
    const T val,
    unsigned int delta,
    int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(0xffffffff, val, delta, width);
#else
  return __shfl_down(val, delta, width);
#endif
}
// Pointer overload of shfl_down: exchanges a pointer value between lanes.
//
// The pointer is shuffled as a `long long` bit pattern through the value
// overload of shfl_down and converted back on return; the static_assert
// rejects any target where a pointer and a long long differ in size.
//
//   val   - this lane's pointer offered to the exchange
//   delta - lane offset forwarded to the value overload
//   width - segment width forwarded to the value overload
template <typename T>
inline __device__ T* shfl_down(
    T* const val,
    unsigned int delta,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = reinterpret_cast<long long>(val);
  return reinterpret_cast<T*>(shfl_down(v, delta, width));
}
// Warp shuffle-xor: thin wrapper that forwards `val`, `laneMask` and `width`
// to the shuffle-xor intrinsic and returns its result.
//
//   val      - this lane's value offered to the exchange
//   laneMask - lane mask forwarded unchanged to the intrinsic
//   width    - segment width forwarded to the intrinsic (defaults to kWarpSize)
//
// When built with CUDA 9.0 or newer the *_sync intrinsic is used with a
// participant mask of 0xffffffff, which names every lane of the warp as a
// participant. Older toolkits fall back to the legacy mask-less __shfl_xor.
template <typename T>
inline __device__ T shfl_xor(
    const T val,
    int laneMask,
    int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
  return __shfl_xor(val, laneMask, width);
#endif
}
// Pointer overload of shfl_xor: exchanges a pointer value between lanes.
//
// The pointer is shuffled as a `long long` bit pattern through the value
// overload of shfl_xor and converted back on return; the static_assert
// rejects any target where a pointer and a long long differ in size.
//
//   val      - this lane's pointer offered to the exchange
//   laneMask - lane mask forwarded to the value overload
//   width    - segment width forwarded to the value overload
template <typename T>
inline __device__ T* shfl_xor(
    T* const val,
    int laneMask,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = reinterpret_cast<long long>(val);
  return reinterpret_cast<T*>(shfl_xor(v, laneMask, width));
}
97 #ifdef FAISS_USE_FLOAT16
99 #if CUDA_VERSION < 9000
// half overload of shfl for toolkits older than CUDA 9.0 (this definition
// sits inside the enclosing `#if CUDA_VERSION < 9000` guard).
//
// The 16-bit payload stored in `v.x` is widened to an unsigned int, shuffled
// with the legacy __shfl intrinsic, and the result is narrowed back into the
// `x` field of a fresh half.
//
//   v       - this lane's half value offered to the exchange
//   srcLane - lane index to read from; forwarded unchanged to the intrinsic
//   width   - segment width forwarded to the intrinsic (defaults to kWarpSize)
inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl(vu, srcLane, width);

  half h;
  // Only the low 16 bits carry the half's payload.
  h.x = (unsigned short) vu;
  return h;
}
// half overload of shfl_xor for toolkits older than CUDA 9.0 (this
// definition sits inside the enclosing `#if CUDA_VERSION < 9000` guard).
//
// The 16-bit payload stored in `v.x` is widened to an unsigned int, shuffled
// with the legacy __shfl_xor intrinsic, and the result is narrowed back into
// the `x` field of a fresh half.
//
//   v        - this lane's half value offered to the exchange
//   laneMask - lane mask forwarded unchanged to the intrinsic
//   width    - segment width forwarded to the intrinsic (defaults to kWarpSize)
inline __device__ half shfl_xor(half v, int laneMask, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl_xor(vu, laneMask, width);

  half h;
  // Only the low 16 bits carry the half's payload.
  h.x = (unsigned short) vu;
  return h;
}
119 #endif // CUDA_VERSION
120 #endif // FAISS_USE_FLOAT16