Faiss
WarpShuffles.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include <cuda.h>
#include "DeviceDefs.cuh"
#include "Float16.cuh"

namespace faiss { namespace gpu {

template <typename T>
inline __device__ T shfl(const T val,
                         int srcLane, int width = kWarpSize) {
  return __shfl(val, srcLane, width);
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl(T* const val,
                          int srcLane, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) __shfl(v, srcLane, width);
}

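// Usage sketch (added; illustrative, not part of the original header):
// broadcasting a pointer computed by one lane to the whole warp goes
// through the T* overload above.
inline __device__ float* broadcastPointerExample(float* p) {
  // Every lane receives lane 0's value of p; the inputs of the other
  // lanes are ignored by the shuffle.
  return shfl(p, 0);
}
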
template <typename T>
inline __device__ T shfl_up(const T val,
                            unsigned int delta, int width = kWarpSize) {
  return __shfl_up(val, delta, width);
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_up(T* const val,
                             unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) __shfl_up(v, delta, width);
}

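// Usage sketch (added; illustrative, not part of the original header): an
// inclusive warp prefix sum built on shfl_up. __shfl_up does not wrap
// around lane 0, so lanes below `offset` receive their own value back and
// the add must be guarded by the caller-supplied lane index.
template <typename T>
inline __device__ T warpPrefixSumExample(T val, int laneId) {
  for (int offset = 1; offset < kWarpSize; offset *= 2) {
    T n = shfl_up(val, offset);
    if (laneId >= offset) {
      val += n;
    }
  }
  return val;
}
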
template <typename T>
inline __device__ T shfl_down(const T val,
                              unsigned int delta, int width = kWarpSize) {
  return __shfl_down(val, delta, width);
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_down(T* const val,
                               unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) __shfl_down(v, delta, width);
}

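// Usage sketch (added; illustrative, not part of the original header): a
// warp sum reduction built on shfl_down. After the loop, lane 0 holds the
// sum over the full warp; the other lanes hold partial sums.
template <typename T>
inline __device__ T warpReduceSumExample(T val) {
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    val += shfl_down(val, offset);
  }
  return val;
}
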
template <typename T>
inline __device__ T shfl_xor(const T val,
                             int laneMask, int width = kWarpSize) {
  return __shfl_xor(val, laneMask, width);
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_xor(T* const val,
                              int laneMask, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) __shfl_xor(v, laneMask, width);
}

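// Usage sketch (added; illustrative, not part of the original header): a
// butterfly warp reduction built on shfl_xor. Unlike the shfl_down
// variant, every lane ends up holding the full warp-wide sum.
template <typename T>
inline __device__ T warpAllReduceSumExample(T val) {
  for (int mask = kWarpSize / 2; mask > 0; mask /= 2) {
    val += shfl_xor(val, mask);
  }
  return val;
}
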
#ifdef FAISS_USE_FLOAT16
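// Note (added; not in the original header): with the pre-CUDA-9 toolkits
// this header targets, `half` is a plain struct carrying its 16 payload
// bits in the `.x` member, and __shfl has no half overload, so the bits
// are widened to an unsigned int, shuffled, and narrowed back.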
inline __device__ half shfl(half v,
                            int srcLane, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl(vu, srcLane, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
}

inline __device__ half shfl_xor(half v,
                                int laneMask, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl_xor(vu, laneMask, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
}
#endif

} } // namespace