Faiss
Float16.cuh
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <cuda.h>
#include "../GpuResources.h"
#include "DeviceTensor.cuh"

// For float16, we use the half datatype, expecting it to be a struct
// as in CUDA 7.5.
#if CUDA_VERSION >= 7050
#define FAISS_USE_FLOAT16 1

// Some compute capabilities have full float16 ALUs.
#if __CUDA_ARCH__ >= 530
#define FAISS_USE_FULL_FLOAT16 1
#endif // __CUDA_ARCH__ >= 530

#endif // CUDA_VERSION

#ifdef FAISS_USE_FLOAT16
#include <cuda_fp16.h>
#endif

namespace faiss { namespace gpu {

#ifdef FAISS_USE_FLOAT16

// 64 bits containing 4 half (float16) values
struct Half4 {
  half2 a;
  half2 b;
};

inline __device__ float4 half4ToFloat4(Half4 v) {
  float2 a = __half22float2(v.a);
  float2 b = __half22float2(v.b);

  float4 out;
  out.x = a.x;
  out.y = a.y;
  out.z = b.x;
  out.w = b.y;

  return out;
}

inline __device__ Half4 float4ToHalf4(float4 v) {
  float2 a;
  a.x = v.x;
  a.y = v.y;

  float2 b;
  b.x = v.z;
  b.y = v.w;

  Half4 out;
  out.a = __float22half2_rn(a);
  out.b = __float22half2_rn(b);

  return out;
}

// 128 bits containing 8 half (float16) values
struct Half8 {
  Half4 a;
  Half4 b;
};
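
// Example (illustrative sketch; `sumHalf4` and its arguments are hypothetical):
// a kernel can load four packed float16 values at once and widen them to
// float32 with half4ToFloat4 before doing arithmetic.
//
//   __global__ void sumHalf4(const Half4* vecs, float* out, int num) {
//     float sum = 0.0f;
//     for (int i = threadIdx.x; i < num; i += blockDim.x) {
//       float4 v = half4ToFloat4(vecs[i]);
//       sum += v.x + v.y + v.z + v.w;
//     }
//     atomicAdd(out, sum);
//   }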

/// Returns true if the given device supports native float16 math
bool getDeviceSupportsFloat16Math(int device);
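
// Example (illustrative; the device index is a placeholder):
//
//   if (getDeviceSupportsFloat16Math(0)) {
//     // compute capability >= 5.3: native half-precision arithmetic available
//   } else {
//     // fall back to converting to float32 for math
//   }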

/// Copies `in` to `out` while performing a float32 -> float16 conversion
void runConvertToFloat16(half* out,
                         const float* in,
                         size_t num,
                         cudaStream_t stream);

/// Copies `in` to `out` while performing a float16 -> float32
/// conversion
void runConvertToFloat32(float* out,
                         const half* in,
                         size_t num,
                         cudaStream_t stream);
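
// Example (illustrative; devFloats, devHalfs, numElems and stream are assumed
// to be a float32 device buffer, a half device buffer of the same length, the
// element count, and the caller's CUDA stream):
//
//   runConvertToFloat16(devHalfs, devFloats, numElems, stream);  // fp32 -> fp16
//   runConvertToFloat32(devFloats, devHalfs, numElems, stream);  // fp16 -> fp32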

template <int Dim>
void toHalf(cudaStream_t stream,
            Tensor<float, Dim, true>& in,
            Tensor<half, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous (the `true`), so apply a pointwise
  // kernel to convert
  runConvertToFloat16(out.data(), in.data(), in.numElements(), stream);
}

template <int Dim>
DeviceTensor<half, Dim, true> toHalf(GpuResources* resources,
                                     cudaStream_t stream,
                                     Tensor<float, Dim, true>& in) {
  DeviceTensor<half, Dim, true> out;
  if (resources) {
    out = std::move(DeviceTensor<half, Dim, true>(
                      resources->getMemoryManagerCurrentDevice(),
                      in.sizes(),
                      stream));
  } else {
    out = std::move(DeviceTensor<half, Dim, true>(in.sizes()));
  }

  toHalf<Dim>(stream, in, out);
  return out;
}
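
// Example (illustrative; `resources`, `stream` and a contiguous 2-d float32
// tensor `queries` are assumed to come from the caller). The returned tensor
// is newly allocated (temp memory if `resources` is non-null) and holds the
// converted float16 data:
//
//   DeviceTensor<half, 2, true> queriesHalf =
//     toHalf<2>(resources, stream, queries);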

template <int Dim>
void fromHalf(cudaStream_t stream,
              Tensor<half, Dim, true>& in,
              Tensor<float, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous (the `true`), so apply a pointwise
  // kernel to convert
  runConvertToFloat32(out.data(), in.data(), in.numElements(), stream);
}

template <int Dim>
DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
                                        cudaStream_t stream,
                                        Tensor<half, Dim, true>& in) {
  DeviceTensor<float, Dim, true> out;
  if (resources) {
    out = std::move(DeviceTensor<float, Dim, true>(
                      resources->getMemoryManagerCurrentDevice(),
                      in.sizes(),
                      stream));
  } else {
    out = std::move(DeviceTensor<float, Dim, true>(in.sizes()));
  }

  fromHalf<Dim>(stream, in, out);
  return out;
}
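
// Example (illustrative; converting an intermediate half result back to
// float32 on the same stream; `distancesHalf` is a placeholder name):
//
//   DeviceTensor<float, 2, true> distances =
//     fromHalf<2>(resources, stream, distancesHalf);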

__half hostFloat2Half(float v);
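
// hostFloat2Half performs the float32 -> float16 conversion on the host (CPU)
// rather than on the device. Example (illustrative), preparing a scalar
// constant before copying it to the GPU:
//
//   __half one = hostFloat2Half(1.0f);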

#endif // FAISS_USE_FLOAT16

} } // namespace