// Faiss GPU: WarpShuffles.cuh
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 
10 #pragma once
11 
12 #include <cuda.h>
13 #include "DeviceDefs.cuh"
14 #include "Float16.cuh"
15 
16 namespace faiss { namespace gpu {
17 
// Warp shuffle: each lane reads `val` from lane `srcLane` within its group of
// `width` lanes. On CUDA 9.0+ the legacy mask-less intrinsics are deprecated
// (removed for Volta+), so the *_sync form is used with a full-warp mask;
// older toolkits fall back to the legacy intrinsic.
template <typename T>
inline __device__ T shfl(const T val,
                         int srcLane, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_sync(0xffffffff, val, srcLane, width);
#else
  return __shfl(val, srcLane, width);
#endif
}
27 
// The CUDA SDK provides no shuffle overload for pointer types, so round-trip
// the pointer through a 64-bit integer and shuffle the raw bits instead.
template <typename T>
inline __device__ T* shfl(T* const val,
                          int srcLane, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");

  long long bits = reinterpret_cast<long long>(val);
  bits = shfl(bits, srcLane, width);
  return reinterpret_cast<T*>(bits);
}
37 
// Warp shuffle up: each lane reads `val` from the lane `delta` positions
// below it (lane - delta) within its group of `width` lanes; lanes with no
// source keep their own value. Uses the *_sync intrinsic with a full-warp
// mask on CUDA 9.0+, the legacy intrinsic otherwise.
template <typename T>
inline __device__ T shfl_up(const T val,
                            unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(0xffffffff, val, delta, width);
#else
  return __shfl_up(val, delta, width);
#endif
}
47 
// The CUDA SDK provides no shuffle overload for pointer types, so round-trip
// the pointer through a 64-bit integer and shuffle the raw bits instead.
template <typename T>
inline __device__ T* shfl_up(T* const val,
                             unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");

  long long bits = reinterpret_cast<long long>(val);
  bits = shfl_up(bits, delta, width);
  return reinterpret_cast<T*>(bits);
}
57 
// Warp shuffle down: each lane reads `val` from the lane `delta` positions
// above it (lane + delta) within its group of `width` lanes; lanes with no
// source keep their own value. Uses the *_sync intrinsic with a full-warp
// mask on CUDA 9.0+, the legacy intrinsic otherwise.
template <typename T>
inline __device__ T shfl_down(const T val,
                              unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(0xffffffff, val, delta, width);
#else
  return __shfl_down(val, delta, width);
#endif
}
67 
// The CUDA SDK provides no shuffle overload for pointer types, so round-trip
// the pointer through a 64-bit integer and shuffle the raw bits instead.
template <typename T>
inline __device__ T* shfl_down(T* const val,
                               unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");

  long long bits = reinterpret_cast<long long>(val);
  bits = shfl_down(bits, delta, width);
  return reinterpret_cast<T*>(bits);
}
76 
// Warp shuffle xor (butterfly): each lane reads `val` from the lane whose id
// is (lane XOR laneMask) within its group of `width` lanes. Uses the *_sync
// intrinsic with a full-warp mask on CUDA 9.0+, the legacy intrinsic
// otherwise.
template <typename T>
inline __device__ T shfl_xor(const T val,
                             int laneMask, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
  return __shfl_xor(val, laneMask, width);
#endif
}
86 
// The CUDA SDK provides no shuffle overload for pointer types, so round-trip
// the pointer through a 64-bit integer and shuffle the raw bits instead.
template <typename T>
inline __device__ T* shfl_xor(T* const val,
                              int laneMask, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");

  long long bits = reinterpret_cast<long long>(val);
  bits = shfl_xor(bits, laneMask, width);
  return reinterpret_cast<T*>(bits);
}
95 
96 #ifdef FAISS_USE_FLOAT16
97 // CUDA 9.0 has half shuffle
98 #if CUDA_VERSION < 9000
// Pre-CUDA-9.0 `half` is a plain struct wrapping a 16-bit pattern in member
// `x` and has no shuffle intrinsic of its own, so shuffle the raw bits as an
// unsigned int and rebuild the half from the result. (Only compiled when
// CUDA_VERSION < 9000 — newer toolkits provide half shuffles directly.)
inline __device__ half shfl(half v,
                            int srcLane, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl(vu, srcLane, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
}
108 
// Butterfly shuffle for pre-CUDA-9.0 `half`: shuffle the raw 16-bit pattern
// (widened to unsigned int) with the legacy xor intrinsic, then rebuild the
// half. (Only compiled when CUDA_VERSION < 9000.)
inline __device__ half shfl_xor(half v,
                                int laneMask, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl_xor(vu, laneMask, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
}
118 #endif // CUDA_VERSION
119 #endif // FAISS_USE_FLOAT16
120 
121 } } // namespace