13 #include "DeviceDefs.cuh"
14 #include "Float16.cuh"
16 namespace faiss {
namespace gpu {
// Warp shuffle (indexed): forwards `val` to the CUDA shuffle intrinsic so
// that each calling lane receives the `val` held by lane `srcLane` of its
// `width`-sized sub-group.
//
// val     - value contributed by the calling lane
// srcLane - lane index to read from
// width   - logical sub-group size; defaults to kWarpSize
//
// On CUDA >= 9.0 this uses __shfl_sync with the full mask 0xffffffff. Per the
// CUDA programming guide, every lane named in the mask has to execute the
// call, so this wrapper is meant to be called from warp-converged code.
// Older toolkits fall back to the legacy mask-less __shfl.
template <typename T>
inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
    return __shfl_sync(0xffffffff, val, srcLane, width);
#else
    return __shfl(val, srcLane, width);
#endif
}
// Pointer overload of shfl(): the pointer's bit pattern is carried through
// the value overload as a `long long` and converted back to `T*` afterwards.
//
// val     - pointer contributed by the calling lane
// srcLane - lane index to read from
// width   - logical sub-group size; defaults to kWarpSize
//
// The static_assert rejects platforms where a pointer does not have exactly
// the size of `long long`, since the round-trip would then lose or invent
// bits.
template <typename T>
inline __device__ T* shfl(T* const val, int srcLane, int width = kWarpSize) {
    static_assert(sizeof(T*) == sizeof(long long), "pointer size");

    // Integer image of the pointer; resolves to the non-pointer shfl() below.
    const long long bits = reinterpret_cast<long long>(val);

    return reinterpret_cast<T*>(shfl(bits, srcLane, width));
}
// Warp shuffle (up): forwards `val` to the CUDA shuffle-up intrinsic, which
// reads from the lane `delta` positions below the caller within its
// `width`-sized sub-group. Per the CUDA programming guide, lanes whose source
// would fall below the start of the sub-group get their own `val` back.
//
// val   - value contributed by the calling lane
// delta - distance (in lanes) to the source lane
// width - logical sub-group size; defaults to kWarpSize
//
// On CUDA >= 9.0 this uses __shfl_up_sync with the full mask 0xffffffff, so
// all lanes named in the mask must execute the call; older toolkits use the
// legacy mask-less __shfl_up.
template <typename T>
inline __device__ T shfl_up(
        const T val,
        unsigned int delta,
        int width = kWarpSize) {
#if CUDA_VERSION >= 9000
    return __shfl_up_sync(0xffffffff, val, delta, width);
#else
    return __shfl_up(val, delta, width);
#endif
}
// Pointer overload of shfl_up(): the pointer's bit pattern is carried through
// the value overload as a `long long` and converted back to `T*` afterwards.
//
// val   - pointer contributed by the calling lane
// delta - distance (in lanes) to the source lane
// width - logical sub-group size; defaults to kWarpSize
//
// The static_assert rejects platforms where a pointer does not have exactly
// the size of `long long`.
template <typename T>
inline __device__ T* shfl_up(
        T* const val,
        unsigned int delta,
        int width = kWarpSize) {
    static_assert(sizeof(T*) == sizeof(long long), "pointer size");

    // Integer image of the pointer; resolves to the non-pointer shfl_up().
    const long long bits = reinterpret_cast<long long>(val);

    return reinterpret_cast<T*>(shfl_up(bits, delta, width));
}
// Warp shuffle (down): forwards `val` to the CUDA shuffle-down intrinsic,
// which reads from the lane `delta` positions above the caller within its
// `width`-sized sub-group. Per the CUDA programming guide, lanes whose source
// would fall past the end of the sub-group get their own `val` back.
//
// val   - value contributed by the calling lane
// delta - distance (in lanes) to the source lane
// width - logical sub-group size; defaults to kWarpSize
//
// On CUDA >= 9.0 this uses __shfl_down_sync with the full mask 0xffffffff, so
// all lanes named in the mask must execute the call; older toolkits use the
// legacy mask-less __shfl_down.
template <typename T>
inline __device__ T shfl_down(
        const T val,
        unsigned int delta,
        int width = kWarpSize) {
#if CUDA_VERSION >= 9000
    return __shfl_down_sync(0xffffffff, val, delta, width);
#else
    return __shfl_down(val, delta, width);
#endif
}
// Pointer overload of shfl_down(): the pointer's bit pattern is carried
// through the value overload as a `long long` and converted back to `T*`
// afterwards.
//
// val   - pointer contributed by the calling lane
// delta - distance (in lanes) to the source lane
// width - logical sub-group size; defaults to kWarpSize
//
// The static_assert rejects platforms where a pointer does not have exactly
// the size of `long long`.
template <typename T>
inline __device__ T* shfl_down(
        T* const val,
        unsigned int delta,
        int width = kWarpSize) {
    static_assert(sizeof(T*) == sizeof(long long), "pointer size");

    // Integer image of the pointer; resolves to the non-pointer shfl_down().
    const long long bits = reinterpret_cast<long long>(val);

    return reinterpret_cast<T*>(shfl_down(bits, delta, width));
}
// Warp shuffle (xor / butterfly): forwards `val` to the CUDA shuffle-xor
// intrinsic, which reads from the lane whose index is the caller's lane index
// XOR `laneMask`.
//
// val      - value contributed by the calling lane
// laneMask - bit mask XOR-ed with the caller's lane index to pick the source
// width    - logical sub-group size; defaults to kWarpSize
//
// On CUDA >= 9.0 this uses __shfl_xor_sync with the full mask 0xffffffff, so
// all lanes named in the mask must execute the call; older toolkits use the
// legacy mask-less __shfl_xor.
template <typename T>
inline __device__ T shfl_xor(
        const T val,
        int laneMask,
        int width = kWarpSize) {
#if CUDA_VERSION >= 9000
    return __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
    return __shfl_xor(val, laneMask, width);
#endif
}
// Pointer overload of shfl_xor(): the pointer's bit pattern is carried
// through the value overload as a `long long` and converted back to `T*`
// afterwards.
//
// val      - pointer contributed by the calling lane
// laneMask - bit mask XOR-ed with the caller's lane index to pick the source
// width    - logical sub-group size; defaults to kWarpSize
//
// The static_assert rejects platforms where a pointer does not have exactly
// the size of `long long`.
template <typename T>
inline __device__ T* shfl_xor(
        T* const val,
        int laneMask,
        int width = kWarpSize) {
    static_assert(sizeof(T*) == sizeof(long long), "pointer size");

    // Integer image of the pointer; resolves to the non-pointer shfl_xor().
    const long long bits = reinterpret_cast<long long>(val);

    return reinterpret_cast<T*>(shfl_xor(bits, laneMask, width));
}
96 #ifdef FAISS_USE_FLOAT16
98 #if CUDA_VERSION < 9000
// `half` overload of shfl() for toolkits older than CUDA 9.0 (this
// definition sits inside the FAISS_USE_FLOAT16 / CUDA_VERSION < 9000 guards).
//
// The 16-bit payload `v.x` is widened to an unsigned int, exchanged with the
// legacy __shfl intrinsic, and narrowed back into a fresh `half`.
//
// v       - half value contributed by the calling lane
// srcLane - lane index to read from
// width   - logical sub-group size; defaults to kWarpSize
inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) {
    unsigned int vu = v.x;
    vu = __shfl(vu, srcLane, width);

    half h;
    h.x = (unsigned short)vu;
    return h;
}
// `half` overload of shfl_xor() for toolkits older than CUDA 9.0 (this
// definition sits inside the FAISS_USE_FLOAT16 / CUDA_VERSION < 9000 guards).
//
// The 16-bit payload `v.x` is widened to an unsigned int, exchanged with the
// legacy __shfl_xor intrinsic, and narrowed back into a fresh `half`.
//
// v        - half value contributed by the calling lane
// laneMask - bit mask XOR-ed with the caller's lane index to pick the source
// width    - logical sub-group size; defaults to kWarpSize
inline __device__ half shfl_xor(half v, int laneMask, int width = kWarpSize) {
    unsigned int vu = v.x;
    vu = __shfl_xor(vu, laneMask, width);

    half h;
    h.x = (unsigned short)vu;
    return h;
}
118 #endif // CUDA_VERSION
119 #endif // FAISS_USE_FLOAT16