14 #include "../GpuResources.h"
15 #include "DeviceTensor.cuh"
19 #if CUDA_VERSION >= 7050
20 #define FAISS_USE_FLOAT16 1
23 #if __CUDA_ARCH__ >= 530
24 #define FAISS_USE_FULL_FLOAT16 1
25 #endif // __CUDA_ARCH__ types
27 #endif // CUDA_VERSION
29 #ifdef FAISS_USE_FLOAT16
30 #include <cuda_fp16.h>
33 namespace faiss {
namespace gpu {
35 #ifdef FAISS_USE_FLOAT16
// Unpacks a Half4 (two half2 values, i.e. 4 half floats) into a float4.
// v.a supplies (x, y) and v.b supplies (z, w).
inline __device__ float4 half4ToFloat4(Half4 v) {
  float2 a = __half22float2(v.a);
  float2 b = __half22float2(v.b);

  float4 out;
  out.x = a.x;
  out.y = a.y;
  out.z = b.x;
  out.w = b.y;

  return out;
}
// Packs a float4 into a Half4 (two half2 values), the inverse of
// half4ToFloat4. Conversion uses round-to-nearest (__float22half2_rn).
inline __device__ Half4 float4ToHalf4(float4 v) {
  float2 a;
  a.x = v.x;
  a.y = v.y;

  float2 b;
  b.x = v.z;
  b.y = v.w;

  Half4 out;
  out.a = __float22half2_rn(a);
  out.b = __float22half2_rn(b);

  return out;
}
// Returns true if the given device supports native float16 arithmetic.
// NOTE(review): presumably keyed off compute capability >= 5.3 (see the
// FAISS_USE_FULL_FLOAT16 guard above) — confirm against the .cu implementation.
bool getDeviceSupportsFloat16Math(int device);
// Converts `num` float32 values in `in` to float16 in `out`,
// asynchronously on `stream`.
void runConvertToFloat16(half* out,
                         const float* in,
                         size_t num,
                         cudaStream_t stream);
// Converts `num` float16 values in `in` to float32 in `out`,
// asynchronously on `stream`.
void runConvertToFloat32(float* out,
                         const half* in,
                         size_t num,
                         cudaStream_t stream);
// Converts `in` (float32) into `out` (float16) element-wise,
// asynchronously on `stream`. The tensors must contain the same number
// of elements; both are contiguous (the `true` template argument), so a
// flat pointwise conversion over the underlying storage is valid.
template <int Dim>
void toHalf(cudaStream_t stream,
            Tensor<float, Dim, true>& in,
            Tensor<half, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous, so apply a pointwise kernel to convert
  runConvertToFloat16(out.data(), in.data(), in.numElements(), stream);
}
// Allocates a new float16 tensor with the same sizes as `in` and fills
// it with the converted contents of `in`, asynchronously on `stream`.
// If `resources` is non-null the tensor is drawn from its current-device
// memory manager (temporary allocation); otherwise a plain device
// allocation is made.
template <int Dim>
DeviceTensor<half, Dim, true> toHalf(GpuResources* resources,
                                     cudaStream_t stream,
                                     Tensor<float, Dim, true>& in) {
  DeviceTensor<half, Dim, true> out;
  if (resources) {
    out = std::move(DeviceTensor<half, Dim, true>(
        resources->getMemoryManagerCurrentDevice(),
        in.sizes(),
        stream));
  } else {
    out = std::move(DeviceTensor<half, Dim, true>(in.sizes()));
  }

  toHalf<Dim>(stream, in, out);
  return out;
}
// Converts `in` (float16) into `out` (float32) element-wise,
// asynchronously on `stream`. The tensors must contain the same number
// of elements; both are contiguous (the `true` template argument), so a
// flat pointwise conversion over the underlying storage is valid.
template <int Dim>
void fromHalf(cudaStream_t stream,
              Tensor<half, Dim, true>& in,
              Tensor<float, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous, so apply a pointwise kernel to convert
  runConvertToFloat32(out.data(), in.data(), in.numElements(), stream);
}
// Allocates a new float32 tensor with the same sizes as `in` and fills
// it with the converted contents of `in`, asynchronously on `stream`.
// If `resources` is non-null the tensor is drawn from its current-device
// memory manager (temporary allocation); otherwise a plain device
// allocation is made.
template <int Dim>
DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
                                        cudaStream_t stream,
                                        Tensor<half, Dim, true>& in) {
  DeviceTensor<float, Dim, true> out;
  if (resources) {
    out = std::move(DeviceTensor<float, Dim, true>(
        resources->getMemoryManagerCurrentDevice(),
        in.sizes(),
        stream));
  } else {
    out = std::move(DeviceTensor<float, Dim, true>(in.sizes()));
  }

  fromHalf<Dim>(stream, in, out);
  return out;
}
// Converts a single float to half on the host (CPU-side conversion; no
// device code involved).
half hostFloat2Half(float v);
154 #endif // FAISS_USE_FLOAT16