14 #include "DeviceDefs.cuh"
15 #include "Float16.cuh"
17 namespace faiss {
namespace gpu {
// Warp shuffle: each lane receives the copy of `val` held by lane `srcLane`
// of its `width`-wide sub-group of the warp.
//
// The mask-less __shfl intrinsic is deprecated as of CUDA 9 and is not
// available on compute capability 7.0+, so the *_sync variant is used
// whenever the toolkit provides it. The full-warp mask means every lane of
// the warp is expected to execute this call.
template <typename T>
inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return __shfl_sync(0xffffffff, val, srcLane, width);
#else
  return __shfl(val, srcLane, width);
#endif
}
// Pointer overload of shfl: the pointer is shuffled as a 64-bit integer,
// since the shuffle intrinsics have no overload for pointer types.
//
// Uses __shfl_sync when available (the mask-less __shfl is deprecated as of
// CUDA 9 and not available on compute capability 7.0+). The full-warp mask
// means every lane of the warp is expected to execute this call.
template <typename T>
inline __device__ T* shfl(T* const val, int srcLane, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  const long long v = reinterpret_cast<long long>(val);
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return reinterpret_cast<T*>(__shfl_sync(0xffffffff, v, srcLane, width));
#else
  return reinterpret_cast<T*>(__shfl(v, srcLane, width));
#endif
}
// Warp shuffle-up: each lane receives the copy of `val` held by the lane
// `delta` positions below it within its `width`-wide sub-group; lanes with no
// such source keep their own value (per the CUDA shuffle semantics).
//
// Uses __shfl_up_sync when available (the mask-less __shfl_up is deprecated
// as of CUDA 9 and not available on compute capability 7.0+). The full-warp
// mask means every lane of the warp is expected to execute this call.
template <typename T>
inline __device__ T shfl_up(
    const T val,
    unsigned int delta,
    int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return __shfl_up_sync(0xffffffff, val, delta, width);
#else
  return __shfl_up(val, delta, width);
#endif
}
// Pointer overload of shfl_up: the pointer is shuffled as a 64-bit integer,
// since the shuffle intrinsics have no overload for pointer types.
//
// Uses __shfl_up_sync when available (the mask-less __shfl_up is deprecated
// as of CUDA 9 and not available on compute capability 7.0+). The full-warp
// mask means every lane of the warp is expected to execute this call.
template <typename T>
inline __device__ T* shfl_up(
    T* const val,
    unsigned int delta,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  const long long v = reinterpret_cast<long long>(val);
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return reinterpret_cast<T*>(__shfl_up_sync(0xffffffff, v, delta, width));
#else
  return reinterpret_cast<T*>(__shfl_up(v, delta, width));
#endif
}
// Warp shuffle-down: each lane receives the copy of `val` held by the lane
// `delta` positions above it within its `width`-wide sub-group; lanes with no
// such source keep their own value (per the CUDA shuffle semantics).
//
// Uses __shfl_down_sync when available (the mask-less __shfl_down is
// deprecated as of CUDA 9 and not available on compute capability 7.0+). The
// full-warp mask means every lane of the warp is expected to execute this
// call.
template <typename T>
inline __device__ T shfl_down(
    const T val,
    unsigned int delta,
    int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return __shfl_down_sync(0xffffffff, val, delta, width);
#else
  return __shfl_down(val, delta, width);
#endif
}
// Pointer overload of shfl_down: the pointer is shuffled as a 64-bit integer,
// since the shuffle intrinsics have no overload for pointer types.
//
// Uses __shfl_down_sync when available (the mask-less __shfl_down is
// deprecated as of CUDA 9 and not available on compute capability 7.0+). The
// full-warp mask means every lane of the warp is expected to execute this
// call.
template <typename T>
inline __device__ T* shfl_down(
    T* const val,
    unsigned int delta,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  const long long v = reinterpret_cast<long long>(val);
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return reinterpret_cast<T*>(__shfl_down_sync(0xffffffff, v, delta, width));
#else
  return reinterpret_cast<T*>(__shfl_down(v, delta, width));
#endif
}
// Warp butterfly shuffle: each lane receives the copy of `val` held by the
// lane whose id is (own lane id XOR `laneMask`), within its `width`-wide
// sub-group.
//
// Uses __shfl_xor_sync when available (the mask-less __shfl_xor is deprecated
// as of CUDA 9 and not available on compute capability 7.0+). The full-warp
// mask means every lane of the warp is expected to execute this call.
template <typename T>
inline __device__ T shfl_xor(
    const T val,
    int laneMask,
    int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
  return __shfl_xor(val, laneMask, width);
#endif
}
// Pointer overload of shfl_xor: the pointer is shuffled as a 64-bit integer,
// since the shuffle intrinsics have no overload for pointer types.
//
// Uses __shfl_xor_sync when available (the mask-less __shfl_xor is deprecated
// as of CUDA 9 and not available on compute capability 7.0+). The full-warp
// mask means every lane of the warp is expected to execute this call.
template <typename T>
inline __device__ T* shfl_xor(
    T* const val,
    int laneMask,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  const long long v = reinterpret_cast<long long>(val);
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return reinterpret_cast<T*>(__shfl_xor_sync(0xffffffff, v, laneMask, width));
#else
  return reinterpret_cast<T*>(__shfl_xor(v, laneMask, width));
#endif
}
79 #ifdef FAISS_USE_FLOAT16
// half overload of shfl: each lane receives the half value held by lane
// `srcLane` of its `width`-wide sub-group.
//
// On CUDA 9+ the toolkit's half header supplies a native half overload of
// __shfl_sync, and the raw `.x` storage of `half` is no longer directly
// accessible, so the intrinsic is called on the half value itself. The
// full-warp mask means every lane of the warp is expected to execute this
// call.
// NOTE(review): assumes Float16.cuh pulls in the CUDA half header that
// declares that overload -- confirm.
inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return __shfl_sync(0xffffffff, v, srcLane, width);
#else
  // Older toolkits: widen the 16-bit payload to 32 bits, shuffle it, and
  // narrow it back into a half.
  unsigned int vu = v.x;
  vu = __shfl(vu, srcLane, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
#endif
}
90 inline __device__ half shfl_xor(half v,
91 int laneMask,
int width = kWarpSize) {
92 unsigned int vu = v.x;
93 vu = __shfl_xor(vu, laneMask, width);
96 h.x = (
unsigned short) vu;