// Faiss
// All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
// WarpShuffles.cuh
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */


#pragma once

#include <cuda.h>
#include "DeviceDefs.cuh"
#include "Float16.cuh"

namespace faiss { namespace gpu {

// Warp shuffle: every participating lane receives the value held by lane
// `srcLane` within its `width`-sized group (width defaults to the full warp).
template <typename T>
inline __device__ T shfl(const T val,
                         int srcLane, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  // CUDA 9+ shuffles require an explicit participation mask; the full warp
  // is assumed to take part.
  T result = __shfl_sync(0xffffffff, val, srcLane, width);
#else
  // Legacy (pre-9.0) shuffle; relies on implicit warp synchrony.
  T result = __shfl(val, srcLane, width);
#endif
  return result;
}

// The CUDA SDK provides no shuffle overloads for pointer types; reinterpret
// the pointer as a 64-bit integer, shuffle that, and cast it back.
template <typename T>
inline __device__ T* shfl(T* const val,
                          int srcLane, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long asBits = (long long) val;
  asBits = shfl(asBits, srcLane, width);
  return (T*) asBits;
}

// Warp shuffle up: lane i receives the value from lane i - delta within its
// `width`-sized group (per CUDA semantics, lanes with no source lane keep
// their own value).
template <typename T>
inline __device__ T shfl_up(const T val,
                            unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  // Explicit full-warp participation mask required on CUDA 9+.
  T result = __shfl_up_sync(0xffffffff, val, delta, width);
#else
  T result = __shfl_up(val, delta, width);
#endif
  return result;
}

// Pointer overload of shfl_up (the CUDA SDK has none): route the pointer
// bits through the 64-bit integer shuffle above.
template <typename T>
inline __device__ T* shfl_up(T* const val,
                             unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long asBits = (long long) val;
  asBits = shfl_up(asBits, delta, width);
  return (T*) asBits;
}

// Warp shuffle down: lane i receives the value from lane i + delta within
// its `width`-sized group (per CUDA semantics, lanes with no source lane
// keep their own value).
template <typename T>
inline __device__ T shfl_down(const T val,
                              unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  // Explicit full-warp participation mask required on CUDA 9+.
  T result = __shfl_down_sync(0xffffffff, val, delta, width);
#else
  T result = __shfl_down(val, delta, width);
#endif
  return result;
}

// Pointer overload of shfl_down (the CUDA SDK has none): route the pointer
// bits through the 64-bit integer shuffle above.
template <typename T>
inline __device__ T* shfl_down(T* const val,
                               unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long asBits = (long long) val;
  asBits = shfl_down(asBits, delta, width);
  return (T*) asBits;
}

// Warp butterfly exchange: lane i trades values with lane (i ^ laneMask)
// within its `width`-sized group.
template <typename T>
inline __device__ T shfl_xor(const T val,
                             int laneMask, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  // Explicit full-warp participation mask required on CUDA 9+.
  T result = __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
  T result = __shfl_xor(val, laneMask, width);
#endif
  return result;
}

// Pointer overload of shfl_xor (the CUDA SDK has none): route the pointer
// bits through the 64-bit integer shuffle above.
template <typename T>
inline __device__ T* shfl_xor(T* const val,
                              int laneMask, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long asBits = (long long) val;
  asBits = shfl_xor(asBits, laneMask, width);
  return (T*) asBits;
}

#ifdef FAISS_USE_FLOAT16
// CUDA 9.0 ships native half shuffles; these wrappers are only needed on
// older toolkits, where the raw 16-bit payload (the legacy half struct's
// `x` member) is moved through a 32-bit integer shuffle.
#if CUDA_VERSION < 9000
inline __device__ half shfl(half v,
                            int srcLane, int width = kWarpSize) {
  // Widen the 16-bit payload so the unsigned-int shuffle can carry it.
  unsigned int bits = v.x;
  bits = __shfl(bits, srcLane, width);

  half out;
  out.x = (unsigned short) bits;
  return out;
}

inline __device__ half shfl_xor(half v,
                                int laneMask, int width = kWarpSize) {
  unsigned int bits = v.x;
  bits = __shfl_xor(bits, laneMask, width);

  half out;
  out.x = (unsigned short) bits;
  return out;
}
#endif // CUDA_VERSION
#endif // FAISS_USE_FLOAT16

} } // namespace