WarpShuffles.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include <cuda.h>
#include "DeviceDefs.cuh"
#include "Float16.cuh"

// Warp shuffle wrappers: on CUDA 9.0+ these map to the *_sync intrinsics
// with a full-warp mask; on older toolkits they fall back to the legacy
// warp shuffle intrinsics.
namespace faiss { namespace gpu {

template <typename T>
inline __device__ T shfl(const T val,
                         int srcLane, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_sync(0xffffffff, val, srcLane, width);
#else
  return __shfl(val, srcLane, width);
#endif
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl(T* const val,
                          int srcLane, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;

  return (T*) shfl(v, srcLane, width);
}

template <typename T>
inline __device__ T shfl_up(const T val,
                            unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(0xffffffff, val, delta, width);
#else
  return __shfl_up(val, delta, width);
#endif
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_up(T* const val,
                             unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;

  return (T*) shfl_up(v, delta, width);
}
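
// A minimal usage sketch (an illustration, not part of the original header),
// assuming the usual kWarpSize of 32 and blocks whose threads are laid out
// warp-aligned along x: an inclusive warp prefix sum built on the shfl_up
// wrapper above. warpInclusiveScan is a hypothetical name for this example.
template <typename T>
inline __device__ T warpInclusiveScan(T val) {
  int laneId = threadIdx.x & (kWarpSize - 1);

#pragma unroll
  for (int offset = 1; offset < kWarpSize; offset *= 2) {
    // Pull the running sum from the lane `offset` positions below us;
    // every lane makes the call, so the warp stays converged
    T n = shfl_up(val, offset);

    // Lanes below `offset` have no predecessor at this distance
    if (laneId >= offset) {
      val += n;
    }
  }

  // Each lane now holds the sum of its own value and all lower lanes
  return val;
}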

template <typename T>
inline __device__ T shfl_down(const T val,
                              unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(0xffffffff, val, delta, width);
#else
  return __shfl_down(val, delta, width);
#endif
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_down(T* const val,
                               unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) shfl_down(v, delta, width);
}

template <typename T>
inline __device__ T shfl_xor(const T val,
                             int laneMask, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
  return __shfl_xor(val, laneMask, width);
#endif
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_xor(T* const val,
                              int laneMask, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) shfl_xor(v, laneMask, width);
}
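
// Another minimal sketch (illustration only, not part of the original
// header): a butterfly warp-wide sum built on the shfl_xor wrapper above,
// assuming all threads of the warp are active. warpSumAll is a hypothetical
// name for this example.
template <typename T>
inline __device__ T warpSumAll(T val) {
#pragma unroll
  for (int mask = kWarpSize / 2; mask > 0; mask >>= 1) {
    // Exchange partial sums with the lane whose index differs in one bit;
    // each step halves the number of distinct partial sums
    val += shfl_xor(val, mask);
  }

  // Every lane ends up holding the sum over the entire warp
  return val;
}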
#ifdef FAISS_USE_FLOAT16
// CUDA 9.0+ provides warp shuffles for half natively; these fallbacks are
// only needed on older toolkits
#if CUDA_VERSION < 9000
inline __device__ half shfl(half v,
                            int srcLane, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl(vu, srcLane, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
}

inline __device__ half shfl_xor(half v,
                                int laneMask, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl_xor(vu, laneMask, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
}
#endif // CUDA_VERSION
#endif // FAISS_USE_FLOAT16

} } // namespace