15 #include "DeviceDefs.cuh"
16 #include "Float16.cuh"
18 namespace faiss {
namespace gpu {
/// Warp shuffle: returns the copy of `val` held by lane `srcLane`.
///
/// @param val     value contributed by the calling lane
/// @param srcLane lane index to read from
/// @param width   forwarded unchanged to the intrinsic as its `width`
///                argument (defaults to kWarpSize)
/// @return the value obtained from the shuffle intrinsic
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
template <typename T>
inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // The mask-less __shfl is deprecated since CUDA 9 and is not available
  // when compiling for Volta and newer architectures.
  return __shfl_sync(0xffffffffu, val, srcLane, width);
#else
  return __shfl(val, srcLane, width);
#endif
}
/// Warp shuffle for pointers: returns the pointer held by lane `srcLane`.
///
/// The pointer is carried through the shuffle as a `long long`, which is why
/// the pointer size is checked at compile time.
///
/// @param val     pointer contributed by the calling lane
/// @param srcLane lane index to read from
/// @param width   forwarded unchanged to the intrinsic as its `width`
///                argument (defaults to kWarpSize)
/// @return the shuffled value cast back to `T*`
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
template <typename T>
inline __device__ T* shfl(T* const val, int srcLane, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // The mask-less __shfl is deprecated since CUDA 9 and is not available
  // when compiling for Volta and newer architectures.
  return (T*) __shfl_sync(0xffffffffu, v, srcLane, width);
#else
  return (T*) __shfl(v, srcLane, width);
#endif
}
/// Warp shuffle-up: forwards `val`, `delta` and `width` to the shuffle-up
/// intrinsic and returns its result.
///
/// @param val   value contributed by the calling lane
/// @param delta lane offset passed to the intrinsic
/// @param width forwarded unchanged to the intrinsic as its `width`
///              argument (defaults to kWarpSize)
/// @return the value obtained from the shuffle intrinsic
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
template <typename T>
inline __device__ T shfl_up(
    const T val,
    unsigned int delta,
    int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // The mask-less __shfl_up is deprecated since CUDA 9 and is not available
  // when compiling for Volta and newer architectures.
  return __shfl_up_sync(0xffffffffu, val, delta, width);
#else
  return __shfl_up(val, delta, width);
#endif
}
/// Warp shuffle-up for pointers.
///
/// The pointer is carried through the shuffle as a `long long`, which is why
/// the pointer size is checked at compile time.
///
/// @param val   pointer contributed by the calling lane
/// @param delta lane offset passed to the intrinsic
/// @param width forwarded unchanged to the intrinsic as its `width`
///              argument (defaults to kWarpSize)
/// @return the shuffled value cast back to `T*`
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
template <typename T>
inline __device__ T* shfl_up(
    T* const val,
    unsigned int delta,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // The mask-less __shfl_up is deprecated since CUDA 9 and is not available
  // when compiling for Volta and newer architectures.
  return (T*) __shfl_up_sync(0xffffffffu, v, delta, width);
#else
  return (T*) __shfl_up(v, delta, width);
#endif
}
/// Warp shuffle-down: forwards `val`, `delta` and `width` to the shuffle-down
/// intrinsic and returns its result.
///
/// @param val   value contributed by the calling lane
/// @param delta lane offset passed to the intrinsic
/// @param width forwarded unchanged to the intrinsic as its `width`
///              argument (defaults to kWarpSize)
/// @return the value obtained from the shuffle intrinsic
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
template <typename T>
inline __device__ T shfl_down(
    const T val,
    unsigned int delta,
    int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // The mask-less __shfl_down is deprecated since CUDA 9 and is not
  // available when compiling for Volta and newer architectures.
  return __shfl_down_sync(0xffffffffu, val, delta, width);
#else
  return __shfl_down(val, delta, width);
#endif
}
/// Warp shuffle-down for pointers.
///
/// The pointer is carried through the shuffle as a `long long`, which is why
/// the pointer size is checked at compile time.
///
/// @param val   pointer contributed by the calling lane
/// @param delta lane offset passed to the intrinsic
/// @param width forwarded unchanged to the intrinsic as its `width`
///              argument (defaults to kWarpSize)
/// @return the shuffled value cast back to `T*`
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
template <typename T>
inline __device__ T* shfl_down(
    T* const val,
    unsigned int delta,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // The mask-less __shfl_down is deprecated since CUDA 9 and is not
  // available when compiling for Volta and newer architectures.
  return (T*) __shfl_down_sync(0xffffffffu, v, delta, width);
#else
  return (T*) __shfl_down(v, delta, width);
#endif
}
/// Warp shuffle-xor: forwards `val`, `laneMask` and `width` to the shuffle-xor
/// intrinsic and returns its result.
///
/// @param val      value contributed by the calling lane
/// @param laneMask lane mask passed to the intrinsic
/// @param width    forwarded unchanged to the intrinsic as its `width`
///                 argument (defaults to kWarpSize)
/// @return the value obtained from the shuffle intrinsic
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
template <typename T>
inline __device__ T shfl_xor(
    const T val,
    int laneMask,
    int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // The mask-less __shfl_xor is deprecated since CUDA 9 and is not available
  // when compiling for Volta and newer architectures.
  return __shfl_xor_sync(0xffffffffu, val, laneMask, width);
#else
  return __shfl_xor(val, laneMask, width);
#endif
}
/// Warp shuffle-xor for pointers.
///
/// The pointer is carried through the shuffle as a `long long`, which is why
/// the pointer size is checked at compile time.
///
/// @param val      pointer contributed by the calling lane
/// @param laneMask lane mask passed to the intrinsic
/// @param width    forwarded unchanged to the intrinsic as its `width`
///                 argument (defaults to kWarpSize)
/// @return the shuffled value cast back to `T*`
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
template <typename T>
inline __device__ T* shfl_xor(
    T* const val,
    int laneMask,
    int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // The mask-less __shfl_xor is deprecated since CUDA 9 and is not available
  // when compiling for Volta and newer architectures.
  return (T*) __shfl_xor_sync(0xffffffffu, v, laneMask, width);
#else
  return (T*) __shfl_xor(v, laneMask, width);
#endif
}
80 #ifdef FAISS_USE_FLOAT16
/// Warp shuffle for `half`: returns the copy of `v` held by lane `srcLane`.
///
/// @param v       value contributed by the calling lane
/// @param srcLane lane index to read from
/// @param width   forwarded unchanged to the intrinsic as its `width`
///                argument (defaults to kWarpSize)
/// @return the shuffled half value
///
/// On CUDA 9+ the synchronizing intrinsic is used with a full-warp mask, so
/// every lane of the warp must reach this call together.
inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  // CUDA 9+ no longer exposes the raw `.x` member of `half` and deprecates
  // the mask-less __shfl; use the half overload of __shfl_sync instead.
  // NOTE(review): assumes <cuda_fp16.h> is reachable through Float16.cuh --
  // confirm.
  return __shfl_sync(0xffffffffu, v, srcLane, width);
#else
  // Pre-CUDA-9 path: shuffle the 16 raw bits widened to an unsigned int,
  // then narrow them back into a half.
  unsigned int vu = v.x;
  vu = __shfl(vu, srcLane, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
#endif
}
91 inline __device__ half shfl_xor(half v,
92 int laneMask,
int width = kWarpSize) {
93 unsigned int vu = v.x;
94 vu = __shfl_xor(vu, laneMask, width);
97 h.x = (
unsigned short) vu;