/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <cuda.h>
#include "../GpuResources.h"
#include "DeviceTensor.cuh"

// For float16, we use the half datatype, expecting it to be a struct
// as in CUDA 7.5.
#if CUDA_VERSION >= 7050
#define FAISS_USE_FLOAT16 1

// Some compute capabilities have full float16 ALUs.
#if __CUDA_ARCH__ >= 530
#define FAISS_USE_FULL_FLOAT16 1
#endif // __CUDA_ARCH__ types

#endif // CUDA_VERSION

#ifdef FAISS_USE_FLOAT16
#include <cuda_fp16.h>
#endif

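// Usage sketch: device code can branch on these macros to use native
// float16 arithmetic where it exists. A hypothetical helper (not part of
// this header) might look like:
//
//   inline __device__ half2 addHalf2(half2 a, half2 b) {
//   #ifdef FAISS_USE_FULL_FLOAT16
//     return __hadd2(a, b);             // native float16 ALU path
//   #else
//     float2 af = __half22float2(a);    // fall back to float32 math
//     float2 bf = __half22float2(b);
//     af.x += bf.x;
//     af.y += bf.y;
//     return __float22half2_rn(af);
//   #endif
//   }
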
namespace faiss { namespace gpu {

#ifdef FAISS_USE_FLOAT16

// 64 bits containing 4 half (float16) values
struct Half4 {
  half2 a;
  half2 b;
};

inline __device__ float4 half4ToFloat4(Half4 v) {
  float2 a = __half22float2(v.a);
  float2 b = __half22float2(v.b);

  float4 out;
  out.x = a.x;
  out.y = a.y;
  out.z = b.x;
  out.w = b.y;

  return out;
}

inline __device__ Half4 float4ToHalf4(float4 v) {
  float2 a;
  a.x = v.x;
  a.y = v.y;

  float2 b;
  b.x = v.z;
  b.y = v.w;

  Half4 out;
  out.a = __float22half2_rn(a);
  out.b = __float22half2_rn(b);

  return out;
}

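// Usage sketch: half4ToFloat4 and float4ToHalf4 are inverses up to float16
// rounding. A hypothetical kernel (not part of this header) expanding packed
// half data to float4, assuming `src` and `dst` are device pointers and `n`
// is the number of Half4 elements:
//
//   __global__ void half4ToFloat4Kernel(const Half4* src, float4* dst, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) {
//       dst[i] = half4ToFloat4(src[i]);
//     }
//   }
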
// 128 bits containing 8 half (float16) values
struct Half8 {
  Half4 a;
  Half4 b;
};

/// Returns true if the given device supports native float16 math
bool getDeviceSupportsFloat16Math(int device);

/// Copies `in` to `out` while performing a float32 -> float16 conversion
void runConvertToFloat16(half* out,
                         const float* in,
                         size_t num,
                         cudaStream_t stream);

/// Copies `in` to `out` while performing a float16 -> float32
/// conversion
void runConvertToFloat32(float* out,
                         const half* in,
                         size_t num,
                         cudaStream_t stream);

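// Usage sketch for the raw-pointer interface, assuming `devFloat` points to
// `num` float32 values and `devHalf` to `num` float16 values, both in device
// memory (hypothetical names); the conversions are enqueued on `stream`:
//
//   runConvertToFloat16(devHalf, devFloat, num, stream);
//   runConvertToFloat32(devFloat, devHalf, num, stream);
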
/// Converts the contents of `in` (float32) into `out` (float16); both
/// tensors must contain the same number of elements
template <int Dim>
void toHalf(cudaStream_t stream,
            Tensor<float, Dim, true>& in,
            Tensor<half, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous (the `true`), so apply a pointwise
  // kernel to convert
  runConvertToFloat16(out.data(), in.data(), in.numElements(), stream);
}

/// Allocates a float16 tensor (via `resources` if provided) and fills it
/// with a converted copy of `in`
template <int Dim>
DeviceTensor<half, Dim, true> toHalf(GpuResources* resources,
                                     cudaStream_t stream,
                                     Tensor<float, Dim, true>& in) {
  DeviceTensor<half, Dim, true> out;
  if (resources) {
    out = std::move(DeviceTensor<half, Dim, true>(
                      resources->getMemoryManagerCurrentDevice(),
                      in.sizes(),
                      stream));
  } else {
    out = std::move(DeviceTensor<half, Dim, true>(in.sizes()));
  }

  toHalf<Dim>(stream, in, out);
  return out;
}

/// Converts the contents of `in` (float16) into `out` (float32); both
/// tensors must contain the same number of elements
template <int Dim>
void fromHalf(cudaStream_t stream,
              Tensor<half, Dim, true>& in,
              Tensor<float, Dim, true>& out) {
  FAISS_ASSERT(in.numElements() == out.numElements());

  // The memory is contiguous (the `true`), so apply a pointwise
  // kernel to convert
  runConvertToFloat32(out.data(), in.data(), in.numElements(), stream);
}

/// Allocates a float32 tensor (via `resources` if provided) and fills it
/// with a converted copy of `in`
template <int Dim>
DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
                                        cudaStream_t stream,
                                        Tensor<half, Dim, true>& in) {
  DeviceTensor<float, Dim, true> out;
  if (resources) {
    out = std::move(DeviceTensor<float, Dim, true>(
                      resources->getMemoryManagerCurrentDevice(),
                      in.sizes(),
                      stream));
  } else {
    out = std::move(DeviceTensor<float, Dim, true>(in.sizes()));
  }

  fromHalf<Dim>(stream, in, out);
  return out;
}

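// Usage sketch for the tensor-level conversions, assuming `res` is a valid
// GpuResources* and `stream` is the stream the caller is working on
// (hypothetical names):
//
//   Tensor<float, 2, true> vecs = ...;   // contiguous float32 data
//   DeviceTensor<half, 2, true> vecsHalf = toHalf<2>(res, stream, vecs);
//   DeviceTensor<float, 2, true> vecsBack = fromHalf<2>(res, stream, vecsHalf);
//
// Each value is rounded to the nearest representable float16, so `vecsBack`
// matches `vecs` only up to float16 precision.
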
/// Converts a single float32 value to its float16 representation on the host
__half hostFloat2Half(float v);

#endif // FAISS_USE_FLOAT16

} } // namespace