Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
LoadStoreOperators.cuh
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #pragma once
12 
13 #include "Float16.cuh"
14 
15 #ifndef __HALF2_TO_UI
16 // cuda_fp16.hpp doesn't export this
17 #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
18 #endif
19 
20 
21 //
22 // Templated wrappers to express load/store for different scalar and vector
23 // types, so kernels can have the same written form but can operate
24 // over half and float, and on vector types transparently
25 //
26 
27 namespace faiss { namespace gpu {
28 
// Default implementation: a direct pointer dereference for both load and
// store. The specializations below override this for the half-precision
// vector types that need explicit PTX to get a single wide transaction.
template <typename T>
struct LoadStore {
  static inline __device__ T load(void* p) {
    return *static_cast<T*>(p);
  }

  static inline __device__ void store(void* p, const T& v) {
    *static_cast<T*>(p) = v;
  }
};
39 
40 #ifdef FAISS_USE_FLOAT16
41 
template <>
struct LoadStore<Half4> {
  // Loads a Half4 (two 32-bit half2 words, 64 bits total) from global
  // memory with one vectorized PTX instruction. p must be a global-space
  // address aligned to 8 bytes (PTX vector loads require the address to be
  // naturally aligned to the full access size).
  static inline __device__ Half4 load(void* p) {
    Half4 out;
#if CUDA_VERSION >= 9000
    // CUDA 9+ __half/__half2 no longer expose a public .x storage member,
    // so each half2 is reinterpreted as an unsigned int for the "=r"
    // output constraints via __HALF2_TO_UI.
    asm("ld.global.v2.u32 {%0, %1}, [%2];" :
        "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p));
#else
    asm("ld.global.v2.u32 {%0, %1}, [%2];" :
        "=r"(out.a.x), "=r"(out.b.x) : "l"(p));
#endif
    return out;
  }

  // Stores a Half4 (64 bits) to p with one vectorized PTX instruction;
  // same 8-byte alignment requirement as load().
  // NOTE(review): the store uses the generic state space ("st.v2.u32")
  // while the load is explicitly global ("ld.global.v2.u32"). Generic
  // addressing still resolves correctly for global pointers, but the
  // asymmetry looks unintentional — confirm whether "st.global" was meant.
  static inline __device__ void store(void* p, Half4& v) {
#if CUDA_VERSION >= 9000
    asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p),
        "r"(__HALF2_TO_UI(v.a)), "r"(__HALF2_TO_UI(v.b)));
#else
    asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x));
#endif
  }
};
65 
template <>
struct LoadStore<Half8> {
  // Loads a Half8 (two Half4 halves = four 32-bit half2 words, 128 bits
  // total) from global memory with one vectorized PTX instruction. p must
  // be a global-space address aligned to 16 bytes (PTX vector loads
  // require natural alignment to the full access size).
  static inline __device__ Half8 load(void* p) {
    Half8 out;
#if CUDA_VERSION >= 9000
    // CUDA 9+ __half/__half2 no longer expose a public .x storage member,
    // so each half2 is reinterpreted as an unsigned int for the "=r"
    // output constraints via __HALF2_TO_UI.
    asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(__HALF2_TO_UI(out.a.a)), "=r"(__HALF2_TO_UI(out.a.b)),
        "=r"(__HALF2_TO_UI(out.b.a)), "=r"(__HALF2_TO_UI(out.b.b)) : "l"(p));
#else
    asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
        "=r"(out.a.a.x), "=r"(out.a.b.x),
        "=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p));
#endif
    return out;
  }

  // Stores a Half8 (128 bits) to p with one vectorized PTX instruction;
  // same 16-byte alignment requirement as load().
  // NOTE(review): as in the Half4 specialization, the store uses the
  // generic state space ("st.v4.u32") while the load is explicitly global
  // ("ld.global.v4.u32") — confirm whether "st.global" was intended.
  static inline __device__ void store(void* p, Half8& v) {
#if CUDA_VERSION >= 9000
    asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
        : : "l"(p), "r"(__HALF2_TO_UI(v.a.a)), "r"(__HALF2_TO_UI(v.a.b)),
        "r"(__HALF2_TO_UI(v.b.a)), "r"(__HALF2_TO_UI(v.b.b)));
#else
    asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
        : : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x));
#endif
  }
};
93 
94 #endif // FAISS_USE_FLOAT16
95 
96 } } // namespace