15 #include "../GpuResources.h"
16 #include "DeviceTensor.cuh"
20 #if CUDA_VERSION >= 7050
21 #define FAISS_USE_FLOAT16 1
24 #if __CUDA_ARCH__ >= 530
25 #define FAISS_USE_FULL_FLOAT16 1
26 #endif // __CUDA_ARCH__ types
28 #endif // CUDA_VERSION
30 #ifdef FAISS_USE_FLOAT16
31 #include <cuda_fp16.h>
34 namespace faiss {
namespace gpu {
36 #ifdef FAISS_USE_FLOAT16
// Unpacks a Half4 (two packed half2 values, `a` and `b`) into a float4
// using the device-side half2 -> float2 conversion intrinsic.
// NOTE(review): the tail of this definition — assembling the float4 from
// `a`/`b` and the `return` — is missing from this chunk; restore before use.
44 inline __device__ float4 half4ToFloat4(Half4 v) {
45 float2 a = __half22float2(v.a);
46 float2 b = __half22float2(v.b);
// Packs a float4 into a Half4 (two half2 values) using the round-to-nearest
// float2 -> half2 conversion intrinsic.
57 inline __device__ Half4 float4ToHalf4(float4 v) {
// NOTE(review): the lines declaring `out` and splitting `v` into the two
// float2 temporaries `a`/`b` are missing from this chunk, as is the
// `return out;` — restore before use.
67 out.a = __float22half2_rn(a);
68 out.b = __float22half2_rn(b);
// Returns whether the given CUDA device supports native float16 arithmetic.
// NOTE(review): presumably tied to compute capability >= 5.3, matching the
// FAISS_USE_FULL_FLOAT16 guard above — confirm against the .cu definition.
80 bool getDeviceSupportsFloat16Math(
int device);
// Launches a conversion of float32 data to float16 into `out`.
// NOTE(review): the remaining parameter lines (input pointer, element count,
// stream — judging by the call site in toHalf below) are missing from this
// chunk; restore before use.
83 void runConvertToFloat16(half* out,
// Launches a conversion of float16 data to float32 into `out`.
// NOTE(review): the remaining parameter lines (input pointer, element count,
// stream — judging by the call site in fromHalf below) are missing from this
// chunk; restore before use.
90 void runConvertToFloat32(
float* out,
// Converts `in` (float32) element-wise into the caller-provided `out`
// (float16) on `stream`. Element counts must match (asserted below).
// NOTE(review): the enclosing `template <int Dim>` line and the function's
// closing brace are not visible in this chunk; restore before use.
96 void toHalf(cudaStream_t stream,
97 Tensor<float, Dim, true>& in,
98 Tensor<half, Dim, true>& out) {
99 FAISS_ASSERT(in.numElements() == out.numElements());
103 runConvertToFloat16(out.data(), in.data(), in.numElements(), stream);
// Allocates a new float16 tensor of the same sizes as `in` — via the
// resources' current-device memory manager when available, otherwise via a
// plain DeviceTensor allocation — then converts `in` into it with toHalf().
// NOTE(review): the if/else branching on `resources`, the remaining
// constructor arguments, the `stream` acquisition, the `return out;`, and
// the closing brace appear truncated in this chunk; restore before use.
107 DeviceTensor<half, Dim, true> toHalf(GpuResources* resources,
109 Tensor<float, Dim, true>& in) {
110 DeviceTensor<half, Dim, true> out;
112 out = std::move(DeviceTensor<half, Dim, true>(
113 resources->getMemoryManagerCurrentDevice(),
117 out = std::move(DeviceTensor<half, Dim, true>(in.sizes()));
120 toHalf<Dim>(stream, in, out);
// Converts `in` (float16) element-wise into the caller-provided `out`
// (float32) on `stream`. Element counts must match (asserted below).
// NOTE(review): the enclosing `template <int Dim>` line and the function's
// closing brace are not visible in this chunk; restore before use.
125 void fromHalf(cudaStream_t stream,
126 Tensor<half, Dim, true>& in,
127 Tensor<float, Dim, true>& out) {
128 FAISS_ASSERT(in.numElements() == out.numElements());
132 runConvertToFloat32(out.data(), in.data(), in.numElements(), stream);
// Allocates a new float32 tensor of the same sizes as `in` — via the
// resources' current-device memory manager when available, otherwise via a
// plain DeviceTensor allocation — then converts `in` into it with fromHalf().
// NOTE(review): the if/else branching on `resources`, the remaining
// constructor arguments, the `stream` acquisition, the `return out;`, and
// the closing brace appear truncated in this chunk; restore before use.
136 DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
138 Tensor<half, Dim, true>& in) {
139 DeviceTensor<float, Dim, true> out;
141 out = std::move(DeviceTensor<float, Dim, true>(
142 resources->getMemoryManagerCurrentDevice(),
146 out = std::move(DeviceTensor<float, Dim, true>(in.sizes()));
149 fromHalf<Dim>(stream, in, out);
// Converts a single float to half precision on the host (CPU-side; no
// device intrinsics involved).
153 half hostFloat2Half(
float v);
155 #endif // FAISS_USE_FLOAT16