12 #include "../GpuResources.h"
13 #include "DeviceTensor.cuh"
17 #if CUDA_VERSION >= 7050
18 #define FAISS_USE_FLOAT16 1
21 #if __CUDA_ARCH__ >= 530
22 #define FAISS_USE_FULL_FLOAT16 1
23 #endif // __CUDA_ARCH__ types
25 #endif // CUDA_VERSION
27 #ifdef FAISS_USE_FLOAT16
28 #include <cuda_fp16.h>
31 namespace faiss {
namespace gpu {
33 #ifdef FAISS_USE_FLOAT16
// Widens a Half4 (a pair of packed half2 values) to a float4 on the device.
// Uses the CUDA fp16 intrinsic __half22float2 to convert each half2 lane
// pair to a float2.
// NOTE(review): extraction gap — the lines that pack `a`/`b` into the
// returned float4 and the closing brace are not visible in this view.
41 inline __device__ float4 half4ToFloat4(Half4 v) {
42 float2 a = __half22float2(v.a);
43 float2 b = __half22float2(v.b);
// Narrows a float4 to a Half4 on the device, converting each float2 half
// of the input with round-to-nearest-even (__float22half2_rn).
// NOTE(review): extraction gap — the lines declaring `out` and splitting
// `v` into the float2 values `a` and `b`, plus the return/closing brace,
// are not visible in this view.
54 inline __device__ Half4 float4ToHalf4(float4 v) {
64 out.a = __float22half2_rn(a);
65 out.b = __float22half2_rn(b);
// Returns whether the given CUDA device ordinal supports native float16
// arithmetic (presumably the SM 5.3+ gate mirrored by the
// FAISS_USE_FULL_FLOAT16 define above — confirm against the .cu definition).
77 bool getDeviceSupportsFloat16Math(
int device);
// Launches a device-side float32 -> float16 conversion into `out`.
// Used by toHalf() below with (out, in, numElements, stream) arguments.
// NOTE(review): extraction gap — the remaining parameters of this
// declaration are not visible in this view.
80 void runConvertToFloat16(half* out,
// Launches a device-side float16 -> float32 conversion into `out`.
// Used by fromHalf() below with (out, in, numElements, stream) arguments.
// NOTE(review): extraction gap — the remaining parameters of this
// declaration are not visible in this view.
87 void runConvertToFloat32(
float* out,
// Converts a float tensor into a preallocated half tensor of the same
// element count, asynchronously on `stream`. Asserts (FAISS_ASSERT) that
// the element counts match; the conversion itself is a flat pass over
// numElements() elements via runConvertToFloat16.
// NOTE(review): extraction gap — the enclosing `template <int Dim>` header
// and the function's closing brace are not visible in this view.
93 void toHalf(cudaStream_t stream,
94 Tensor<float, Dim, true>& in,
95 Tensor<half, Dim, true>& out) {
96 FAISS_ASSERT(in.numElements() == out.numElements());
100 runConvertToFloat16(out.data(), in.data(), in.numElements(), stream);
// Allocating overload: builds a new DeviceTensor<half> shaped like `in`,
// fills it by delegating to the in-place toHalf<Dim>() above, and
// (presumably) returns it. Two construction paths are visible: one using
// resources->getMemoryManagerCurrentDevice() and one plain in.sizes()
// allocation — presumably selected by whether `resources` is non-null;
// the branch itself is not visible here (extraction gap).
// NOTE(review): extraction gaps — the `template <int Dim>` header, the
// acquisition of `stream`, the if/else around the two allocations, and the
// return/closing brace are all missing from this view.
104 DeviceTensor<half, Dim, true> toHalf(GpuResources* resources,
106 Tensor<float, Dim, true>& in) {
107 DeviceTensor<half, Dim, true> out;
109 out = std::move(DeviceTensor<half, Dim, true>(
110 resources->getMemoryManagerCurrentDevice(),
114 out = std::move(DeviceTensor<half, Dim, true>(in.sizes()));
117 toHalf<Dim>(stream, in, out);
// Converts a half tensor into a preallocated float tensor of the same
// element count, asynchronously on `stream`. Mirror of toHalf() above:
// asserts matching element counts, then runs a flat conversion over
// numElements() elements via runConvertToFloat32.
// NOTE(review): extraction gap — the enclosing `template <int Dim>` header
// and the function's closing brace are not visible in this view.
122 void fromHalf(cudaStream_t stream,
123 Tensor<half, Dim, true>& in,
124 Tensor<float, Dim, true>& out) {
125 FAISS_ASSERT(in.numElements() == out.numElements());
129 runConvertToFloat32(out.data(), in.data(), in.numElements(), stream);
// Allocating overload: builds a new DeviceTensor<float> shaped like `in`,
// fills it via the in-place fromHalf<Dim>() above, and (presumably)
// returns it. As with the toHalf overload, the memory-manager-backed and
// plain-sizes constructions are presumably the two arms of a null-check on
// `resources`; the branch is not visible here (extraction gap).
// NOTE(review): extraction gaps — the `template <int Dim>` header, the
// acquisition of `stream`, the if/else around the two allocations, and the
// return/closing brace are all missing from this view.
133 DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
135 Tensor<half, Dim, true>& in) {
136 DeviceTensor<float, Dim, true> out;
138 out = std::move(DeviceTensor<float, Dim, true>(
139 resources->getMemoryManagerCurrentDevice(),
143 out = std::move(DeviceTensor<float, Dim, true>(in.sizes()));
146 fromHalf<Dim>(stream, in, out);
// Host-side scalar float -> __half conversion (CPU code path; device code
// can use the fp16 intrinsics directly).
150 __half hostFloat2Half(
float v);
152 #endif // FAISS_USE_FLOAT16