// Faiss GPU — LoadStoreOperators.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */
8 
9 
10 #pragma once
11 
12 #include "Float16.cuh"
13 
14 #ifndef __HALF2_TO_UI
15 // cuda_fp16.hpp doesn't export this
16 #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
17 #endif
18 
19 
//
// Templated wrappers to express load/store for different scalar and vector
// types, so kernels can have the same written form but can operate
// over half and float, and on vector types transparently
//
25 
26 namespace faiss { namespace gpu {
27 
// Generic fallback: a plain dereference, letting the compiler pick the
// widest natural access for T. Specializations below override this with
// explicitly vectorized PTX for the packed half types.
template <typename T>
struct LoadStore {
  // Read one T from `p`; `p` must be suitably aligned for T.
  static inline __device__ T load(void* p) {
    return *reinterpret_cast<T*>(p);
  }

  // Write `v` to `p`; `p` must be suitably aligned for T.
  static inline __device__ void store(void* p, const T& v) {
    *reinterpret_cast<T*>(p) = v;
  }
};
38 
39 #ifdef FAISS_USE_FLOAT16
40 
// Four fp16 values (two packed half2 words) moved as one 64-bit
// vectorized memory transaction.
template <>
struct LoadStore<Half4> {
  // Load four halves with a single ld.global.v2.u32. The PTX uses the
  // .global state space, so `p` must point to global memory; the v2.u32
  // form additionally requires 8-byte alignment of `p`.
  static inline __device__ Half4 load(void* p) {
    Half4 out;
#if CUDA_VERSION >= 9000
    // CUDA 9 made __half2 an opaque struct; __HALF2_TO_UI reinterprets it
    // as the raw 32-bit register that the "r" asm constraint expects.
    asm("ld.global.v2.u32 {%0, %1}, [%2];" :
        "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p));
#else
    // Pre-CUDA 9 exposes the half2 payload directly as the .x member.
    asm("ld.global.v2.u32 {%0, %1}, [%2];" :
        "=r"(out.a.x), "=r"(out.b.x) : "l"(p));
#endif
    return out;
  }

  // Store four halves with a single st.v2.u32.
  // NOTE(review): the store omits the .global state-space qualifier and so
  // uses generic addressing, unlike the load above — presumably
  // intentional; confirm against the PTX ISA if this is ever revisited.
  static inline __device__ void store(void* p, Half4& v) {
#if CUDA_VERSION >= 9000
    asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p),
        "r"(__HALF2_TO_UI(v.a)), "r"(__HALF2_TO_UI(v.b)));
#else
    asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x));
#endif
  }
};
64 
// Eight fp16 values (a Half8 is two Half4s, i.e. four packed half2 words)
// moved as one 128-bit vectorized memory transaction.
template <>
struct LoadStore<Half8> {
  // Load eight halves with a single ld.global.v4.u32. The PTX uses the
  // .global state space, so `p` must point to global memory; the v4.u32
  // form additionally requires 16-byte alignment of `p`.
  static inline __device__ Half8 load(void* p) {
    Half8 out;
#if CUDA_VERSION >= 9000
    // CUDA 9 made __half2 an opaque struct; __HALF2_TO_UI reinterprets it
    // as the raw 32-bit register that the "r" asm constraint expects.
    asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(__HALF2_TO_UI(out.a.a)), "=r"(__HALF2_TO_UI(out.a.b)),
        "=r"(__HALF2_TO_UI(out.b.a)), "=r"(__HALF2_TO_UI(out.b.b)) : "l"(p));
#else
    // Pre-CUDA 9 exposes the half2 payload directly as the .x member.
    asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(out.a.a.x), "=r"(out.a.b.x),
        "=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p));
#endif
    return out;
  }

  // Store eight halves with a single st.v4.u32.
  // NOTE(review): like the Half4 store, this omits the .global state-space
  // qualifier and uses generic addressing — presumably intentional; confirm.
  static inline __device__ void store(void* p, Half8& v) {
#if CUDA_VERSION >= 9000
    asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
        : : "l"(p), "r"(__HALF2_TO_UI(v.a.a)), "r"(__HALF2_TO_UI(v.a.b)),
        "r"(__HALF2_TO_UI(v.b.a)), "r"(__HALF2_TO_UI(v.b.b)));
#else
    asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
        : : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x));
#endif
  }
};
92 
93 #endif // FAISS_USE_FLOAT16
94 
95 } } // namespace