WarpShuffles.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include <cuda.h>
#include "DeviceDefs.cuh"
#include "Float16.cuh"

namespace faiss { namespace gpu {

template <typename T>
inline __device__ T shfl(const T val,
                         int srcLane, int width = kWarpSize) {
  return __shfl(val, srcLane, width);
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl(T* const val,
                          int srcLane, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) __shfl(v, srcLane, width);
}
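
// Usage sketch (not part of the original header): the pointer overload above
// makes it possible to broadcast a pointer computed by one lane to the whole
// warp. `warpBroadcastPtr` is a hypothetical helper for illustration only.
template <typename T>
inline __device__ T* warpBroadcastPtr(T* const p) {
  // Every lane receives lane 0's value of `p`
  return shfl(p, 0);
}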

template <typename T>
inline __device__ T shfl_up(const T val,
                            unsigned int delta, int width = kWarpSize) {
  return __shfl_up(val, delta, width);
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_up(T* const val,
                             unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) __shfl_up(v, delta, width);
}
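
// Usage sketch (not part of the original header): an inclusive warp prefix
// sum built on shfl_up. For lanes whose source lane would fall below lane 0,
// __shfl_up returns the caller's own value, so the add must be guarded by
// the lane index. `warpInclusiveScan` is a hypothetical helper for
// illustration only.
inline __device__ float warpInclusiveScan(float val) {
  int laneId = threadIdx.x % kWarpSize;
  for (int offset = 1; offset < kWarpSize; offset *= 2) {
    float n = shfl_up(val, offset);
    if (laneId >= offset) {
      val += n;
    }
  }
  return val;
}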

template <typename T>
inline __device__ T shfl_down(const T val,
                              unsigned int delta, int width = kWarpSize) {
  return __shfl_down(val, delta, width);
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_down(T* const val,
                               unsigned int delta, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) __shfl_down(v, delta, width);
}
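
// Usage sketch (not part of the original header): a full-warp sum reduction
// built on shfl_down. After the loop, lane 0 holds the sum over all
// kWarpSize lanes; the other lanes hold partial sums. `warpReduceSum` is a
// hypothetical helper for illustration only.
inline __device__ float warpReduceSum(float val) {
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    val += shfl_down(val, offset);
  }
  return val;
}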

template <typename T>
inline __device__ T shfl_xor(const T val,
                             int laneMask, int width = kWarpSize) {
  return __shfl_xor(val, laneMask, width);
}

// CUDA SDK does not provide specializations for T*
template <typename T>
inline __device__ T* shfl_xor(T* const val,
                              int laneMask, int width = kWarpSize) {
  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
  long long v = (long long) val;
  return (T*) __shfl_xor(v, laneMask, width);
}
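
// Usage sketch (not part of the original header): a butterfly all-reduce
// built on shfl_xor. Unlike the shfl_down version, every lane ends up
// holding the full warp-wide sum. `warpAllReduceSum` is a hypothetical
// helper for illustration only.
inline __device__ float warpAllReduceSum(float val) {
  for (int mask = kWarpSize / 2; mask > 0; mask /= 2) {
    val += shfl_xor(val, mask);
  }
  return val;
}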

#ifdef FAISS_USE_FLOAT16
// CUDA SDK does not provide shuffles for half; move the raw 16-bit pattern
// through a 32-bit integer shuffle instead
inline __device__ half shfl(half v,
                            int srcLane, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl(vu, srcLane, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
}

inline __device__ half shfl_xor(half v,
                                int laneMask, int width = kWarpSize) {
  unsigned int vu = v.x;
  vu = __shfl_xor(vu, laneMask, width);

  half h;
  h.x = (unsigned short) vu;
  return h;
}
#endif

} } // namespace