Float16.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <cuda.h>
#include "../GpuResources.h"
#include "DeviceTensor.cuh"

// For float16, we use the half datatype, expecting it to be a struct
// as in CUDA 7.5.
#if CUDA_VERSION >= 7050
#define FAISS_USE_FLOAT16 1

// Some compute capabilities have full float16 ALUs.
#if __CUDA_ARCH__ >= 530
#define FAISS_USE_FULL_FLOAT16 1
#endif // __CUDA_ARCH__

#endif // CUDA_VERSION

#ifdef FAISS_USE_FLOAT16
#include <cuda_fp16.h>
#endif

namespace faiss { namespace gpu {

#ifdef FAISS_USE_FLOAT16

// 64 bits (8 bytes) containing 4 half (float16) values
struct Half4 {
  half2 a;
  half2 b;
};

inline __device__ float4 half4ToFloat4(Half4 v) {
  float2 a = __half22float2(v.a);
  float2 b = __half22float2(v.b);

  float4 out;
  out.x = a.x;
  out.y = a.y;
  out.z = b.x;
  out.w = b.y;

  return out;
}

inline __device__ Half4 float4ToHalf4(float4 v) {
  float2 a;
  a.x = v.x;
  a.y = v.y;

  float2 b;
  b.x = v.z;
  b.y = v.w;

  Half4 out;
  out.a = __float22half2_rn(a);
  out.b = __float22half2_rn(b);

  return out;
}

// 128 bits (16 bytes) containing 8 half (float16) values
struct Half8 {
  Half4 a;
  Half4 b;
};
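
// Usage sketch (illustrative; this kernel is hypothetical, not part of
// the Faiss API): a pointwise kernel that loads packed Half4 values and
// widens them to float4 via half4ToFloat4. Packing four values per load
// keeps this memory-bound conversion on the vectorized load/store path.
static __global__ void widenHalf4(const Half4* in, float4* out, int num) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num) {
    out[i] = half4ToFloat4(in[i]);
  }
}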

/// Returns true if the given device supports native float16 math
bool getDeviceSupportsFloat16Math(int device);

/// Copies `in` to `out` while performing a float32 -> float16 conversion
void runConvertToFloat16(half* out,
                         const float* in,
                         size_t num,
                         cudaStream_t stream);

/// Copies `in` to `out` while performing a float16 -> float32
/// conversion
void runConvertToFloat32(float* out,
                         const half* in,
                         size_t num,
                         cudaStream_t stream);
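
// Usage sketch (illustrative; `exampleRoundTrip` is hypothetical, not a
// Faiss function): narrow a device-resident float32 buffer to float16 and
// widen it back, asynchronously on `stream`. Both buffers must hold `num`
// elements; allocation and error checking are omitted. Note that even
// where getDeviceSupportsFloat16Math() returns false, float16 remains
// usable for storage; only native half arithmetic is gated on it.
inline void exampleRoundTrip(float* devFloats,
                             half* devHalfs,
                             size_t num,
                             cudaStream_t stream) {
  runConvertToFloat16(devHalfs, devFloats, num, stream);
  // ... launch float16 kernels on devHalfs here ...
  runConvertToFloat32(devFloats, devHalfs, num, stream);
}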

template <int Dim>
void toHalf(cudaStream_t stream,
            Tensor<float, Dim, true>& in,
            Tensor<half, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous (the `true`), so apply a pointwise
  // kernel to convert
  runConvertToFloat16(out.data(), in.data(), in.numElements(), stream);
}

template <int Dim>
DeviceTensor<half, Dim, true> toHalf(GpuResources* resources,
                                     cudaStream_t stream,
                                     Tensor<float, Dim, true>& in) {
  DeviceTensor<half, Dim, true> out;
  if (resources) {
    out = DeviceTensor<half, Dim, true>(
        resources->getMemoryManagerCurrentDevice(),
        in.sizes(),
        stream);
  } else {
    out = DeviceTensor<half, Dim, true>(in.sizes());
  }

  toHalf<Dim>(stream, in, out);
  return out;
}

template <int Dim>
void fromHalf(cudaStream_t stream,
              Tensor<half, Dim, true>& in,
              Tensor<float, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous (the `true`), so apply a pointwise
  // kernel to convert
  runConvertToFloat32(out.data(), in.data(), in.numElements(), stream);
}

template <int Dim>
DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
                                        cudaStream_t stream,
                                        Tensor<half, Dim, true>& in) {
  DeviceTensor<float, Dim, true> out;
  if (resources) {
    out = DeviceTensor<float, Dim, true>(
        resources->getMemoryManagerCurrentDevice(),
        in.sizes(),
        stream);
  } else {
    out = DeviceTensor<float, Dim, true>(in.sizes());
  }

  fromHalf<Dim>(stream, in, out);
  return out;
}
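
// Usage sketch (illustrative; `exampleTensorRoundTrip` is hypothetical):
// narrow a 2-D float32 tensor to float16 and widen it back using the
// allocating overloads above. When `resources` is non-null, the
// temporaries are drawn from its memory manager for the current device.
inline void exampleTensorRoundTrip(GpuResources* resources,
                                   cudaStream_t stream,
                                   Tensor<float, 2, true>& vecs) {
  auto vecsHalf = toHalf<2>(resources, stream, vecs);
  auto vecsBack = fromHalf<2>(resources, stream, vecsHalf);
}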

/// Converts a float32 value to a float16 (half) value on the host
__half hostFloat2Half(float v);
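
// Usage sketch (illustrative; `exampleFillHostHalf` is hypothetical):
// convert a host-side buffer one value at a time, e.g. to stage float16
// data before copying it to the device. For device-resident data, prefer
// runConvertToFloat16 above.
inline void exampleFillHostHalf(half* out, const float* in, size_t num) {
  for (size_t i = 0; i < num; ++i) {
    out[i] = hostFloat2Half(in[i]);
  }
}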

#endif // FAISS_USE_FLOAT16

} } // namespace