/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <faiss/impl/ScalarQuantizer.h>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/HostTensor.cuh>

namespace faiss { namespace gpu {

inline bool isSQSupported(ScalarQuantizer::QuantizerType qtype) {
  switch (qtype) {
    case ScalarQuantizer::QuantizerType::QT_8bit:
    case ScalarQuantizer::QuantizerType::QT_8bit_uniform:
    case ScalarQuantizer::QuantizerType::QT_8bit_direct:
    case ScalarQuantizer::QuantizerType::QT_4bit:
    case ScalarQuantizer::QuantizerType::QT_4bit_uniform:
    case ScalarQuantizer::QuantizerType::QT_fp16:
      return true;
    default:
      return false;
  }
}

// Wrapper around the CPU ScalarQuantizer that allows storage of parameters in
// GPU memory
struct GpuScalarQuantizer : public ScalarQuantizer {
  GpuScalarQuantizer(const ScalarQuantizer& sq)
      : ScalarQuantizer(sq),
        gpuTrained(DeviceTensor<float, 1, true>({(int) sq.trained.size()})) {
    HostTensor<float, 1, true>
      cpuTrained((float*) sq.trained.data(), {(int) sq.trained.size()});

    // Just use the default stream, as we're allocating memory above in any case
    gpuTrained.copyFrom(cpuTrained, 0);
    CUDA_VERIFY(cudaStreamSynchronize(0));
  }

  // ScalarQuantizer::trained copied to GPU memory
  DeviceTensor<float, 1, true> gpuTrained;
};
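// A minimal host-side sketch (illustrative, not part of the original header)
// of how the wrapper above is meant to be used: train a CPU ScalarQuantizer
// as usual, then construct the GPU wrapper so that ScalarQuantizer::trained
// is resident in device memory. The function and variable names here are
// hypothetical.
inline void exampleBuildGpuScalarQuantizer(int dim,
                                           size_t numTrain,
                                           const float* trainData) {
  ScalarQuantizer sq(dim, ScalarQuantizer::QuantizerType::QT_8bit);
  sq.train(numTrain, trainData);

  if (isSQSupported(sq.qtype)) {
    // copies sq.trained into gpuSq.gpuTrained (default stream, synchronized)
    GpuScalarQuantizer gpuSq(sq);

    // gpuSq can now be handed to GPU index / kernel code
    (void) gpuSq;
  }
}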
//
// Quantizer codecs
//

// QT is the quantizer type implemented
// DimMultiple is the minimum guaranteed dimension multiple of the vectors
// encoded (used for ensuring alignment for memory load/stores)
template <int QT, int DimMultiple>
struct Codec { };

/////
//
// 32 bit encodings
// (does not use qtype)
//
/////

struct CodecFloat {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 1;

  CodecFloat(int vecBytes) : bytesPerVec(vecBytes) { }

  size_t getSmemSize(int dim) { return 0; }
  inline __device__ void setSmem(float* smem, int dim) { }

  inline __device__ void decode(void* data, int vec, int d, float* out) const {
    float* p = (float*) &((uint8_t*) data)[vec * bytesPerVec];
    out[0] = p[d];
  }

  inline __device__ float decodePartial(void* data, int vec, int d,
                                        int subD) const {
    // doesn't need implementing (kDimPerIter == 1)
    return 0.0f;
  }

  inline __device__ void encode(void* data, int vec, int d,
                                float v[kDimPerIter]) const {
    float* p = (float*) &((uint8_t*) data)[vec * bytesPerVec];
    p[d] = v[0];
  }

  inline __device__ void encodePartial(void* data, int vec, int d,
                                       int remaining,
                                       float v[kDimPerIter]) const {
    // doesn't need implementing (kDimPerIter == 1)
  }

  int bytesPerVec;
};

/////
//
// 16 bit encodings
//
/////

// Arbitrary dimension fp16
template <>
struct Codec<ScalarQuantizer::QuantizerType::QT_fp16, 1> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 1;

  Codec(int vecBytes) : bytesPerVec(vecBytes) { }

  size_t getSmemSize(int dim) { return 0; }
  inline __device__ void setSmem(float* smem, int dim) { }

  inline __device__ void decode(void* data, int vec, int d, float* out) const {
    half* p = (half*) &((uint8_t*) data)[vec * bytesPerVec];
    out[0] = Convert<half, float>()(p[d]);
  }

  inline __device__ float decodePartial(void* data, int vec, int d,
                                        int subD) const {
    // doesn't need implementing (kDimPerIter == 1)
    return 0.0f;
  }

  inline __device__ void encode(void* data, int vec, int d,
                                float v[kDimPerIter]) const {
    half* p = (half*) &((uint8_t*) data)[vec * bytesPerVec];
    p[d] = Convert<float, half>()(v[0]);
  }

  inline __device__ void encodePartial(void* data, int vec, int d,
                                       int remaining,
                                       float v[kDimPerIter]) const {
    // doesn't need implementing (kDimPerIter == 1)
  }

  int bytesPerVec;
};

// dim % 2 == 0, ensures uint32 alignment
template <>
struct Codec<ScalarQuantizer::QuantizerType::QT_fp16, 2> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 2;

  Codec(int vecBytes) : bytesPerVec(vecBytes) { }

  size_t getSmemSize(int dim) { return 0; }
  inline __device__ void setSmem(float* smem, int dim) { }

  inline __device__ void decode(void* data, int vec, int d, float* out) const {
    half2* p = (half2*) &((uint8_t*) data)[vec * bytesPerVec];
    half2 pd = p[d];

    out[0] = Convert<half, float>()(__low2half(pd));
    out[1] = Convert<half, float>()(__high2half(pd));
  }

  inline __device__ float decodePartial(void* data, int vec, int d,
                                        int subD) const {
    // should not be called
    assert(false);
    return 0;
  }

  inline __device__ void encode(void* data, int vec, int d,
                                float v[kDimPerIter]) const {
    half2* p = (half2*) &((uint8_t*) data)[vec * bytesPerVec];
    half h0 = Convert<float, half>()(v[0]);
    half h1 = Convert<float, half>()(v[1]);

    p[d] = __halves2half2(h0, h1);
  }

  inline __device__ void encodePartial(void* data, int vec, int d,
                                       int remaining,
                                       float v[kDimPerIter]) const {
    // should not be called
    assert(false);
  }

  int bytesPerVec;
};

/////
//
// 8 bit encodings
//
/////

template <int N>
struct Get8BitType { };

template <>
struct Get8BitType<1> { using T = uint8_t; };

template <>
struct Get8BitType<2> { using T = uint16_t; };

template <>
struct Get8BitType<4> { using T = uint32_t; };

// Uniform quantization across all dimensions
template <int DimMultiple>
struct Codec<ScalarQuantizer::QuantizerType::QT_8bit_uniform, DimMultiple> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = DimMultiple;
  using MemT = typename Get8BitType<DimMultiple>::T;

  Codec(int vecBytes, float min, float diff)
      : bytesPerVec(vecBytes), vmin(min), vdiff(diff) { }

  size_t getSmemSize(int dim) { return 0; }
  inline __device__ void setSmem(float* smem, int dim) { }

  inline __device__ float decodeHelper(uint8_t v) const {
    float x = (((float) v) + 0.5f) / 255.0f;
    return vmin + x * vdiff;
  }

  inline __device__ void decode(void* data, int vec, int d, float* out) const {
    MemT* p = (MemT*) &((uint8_t*) data)[vec * bytesPerVec];
    MemT pv = p[d];

    uint8_t x[kDimPerIter];
#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      x[i] = (uint8_t) ((pv >> (i * 8)) & 0xffU);
    }

    float xDec[kDimPerIter];
#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      xDec[i] = decodeHelper(x[i]);
    }

#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      out[i] = xDec[i];
    }
  }

  inline __device__ float decodePartial(void* data, int vec, int d,
                                        int subD) const {
    if (DimMultiple > 1) {
      // should not be called
      assert(false);
    }

    // otherwise does not need implementing
    return 0;
  }

  inline __device__ uint8_t encodeHelper(float v) const {
    float x = (v - vmin) / vdiff;
    x = fminf(1.0f, fmaxf(0.0f, x));
    return (uint8_t) (255 * x);
  }

  inline __device__ void encode(void* data, int vec, int d,
                                float v[kDimPerIter]) const {
    MemT* p = (MemT*) &((uint8_t*) data)[vec * bytesPerVec];

    MemT x[kDimPerIter];
#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      x[i] = encodeHelper(v[i]);
    }

    MemT out = 0;
#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      out |= (x[i] << (i * 8));
    }

    p[d] = out;
  }

  inline __device__ void encodePartial(void* data, int vec, int d,
                                       int remaining,
                                       float v[kDimPerIter]) const {
    if (DimMultiple > 1) {
      // should not be called
      assert(false);
    }

    // otherwise does not need implementing
  }

  int bytesPerVec;
  const float vmin;
  const float vdiff;
};
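// Worked example (illustrative, not from the original header) of the uniform
// 8-bit round trip above, assuming vmin = 0.0f and vdiff = 1.0f:
//
//   encodeHelper(0.5f): x = (0.5 - 0.0) / 1.0 = 0.5 -> (uint8_t) (255 * 0.5) = 127
//   decodeHelper(127):  x = (127 + 0.5) / 255 = 0.5 -> 0.0 + 0.5 * 1.0 = 0.5
//
// The +0.5f in decodeHelper reconstructs the center of the quantization bin,
// bounding the round-trip error by half a bin width (vdiff / 510 here).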
// Uniform quantization per each dimension
template <int DimMultiple>
struct Codec<ScalarQuantizer::QuantizerType::QT_8bit, DimMultiple> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = DimMultiple;
  using MemT = typename Get8BitType<DimMultiple>::T;

  Codec(int vecBytes, float* min, float* diff)
      : bytesPerVec(vecBytes), vmin(min), vdiff(diff),
        smemVmin(nullptr), smemVdiff(nullptr) { }

  size_t getSmemSize(int dim) { return sizeof(float) * dim * 2; }

  inline __device__ void setSmem(float* smem, int dim) {
    smemVmin = smem;
    smemVdiff = smem + dim;

    for (int i = threadIdx.x; i < dim; i += blockDim.x) {
      smemVmin[i] = vmin[i];
      smemVdiff[i] = vdiff[i];
    }
  }

  inline __device__ float decodeHelper(uint8_t v, int realDim) const {
    float x = (((float) v) + 0.5f) / 255.0f;
    return smemVmin[realDim] + x * smemVdiff[realDim];
  }

  inline __device__ void decode(void* data, int vec, int d, float* out) const {
    MemT* p = (MemT*) &((uint8_t*) data)[vec * bytesPerVec];
    MemT pv = p[d];

    int realDim = d * kDimPerIter;

    uint8_t x[kDimPerIter];
#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      x[i] = (uint8_t) ((pv >> (i * 8)) & 0xffU);
    }

    float xDec[kDimPerIter];
#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      xDec[i] = decodeHelper(x[i], realDim + i);
    }

#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      out[i] = xDec[i];
    }
  }

  inline __device__ float decodePartial(void* data, int vec, int d,
                                        int subD) const {
    if (DimMultiple > 1) {
      // should not be called
      assert(false);
    }

    // otherwise does not need implementing
    return 0;
  }

  inline __device__ uint8_t encodeHelper(float v, int realDim) const {
    float x = (v - vmin[realDim]) / vdiff[realDim];
    x = fminf(1.0f, fmaxf(0.0f, x));
    return (uint8_t) (255 * x);
  }

  inline __device__ void encode(void* data, int vec, int d,
                                float v[kDimPerIter]) const {
    MemT* p = (MemT*) &((uint8_t*) data)[vec * bytesPerVec];

    int realDim = d * kDimPerIter;

    MemT x[kDimPerIter];
#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      x[i] = encodeHelper(v[i], realDim + i);
    }

    MemT out = 0;
#pragma unroll
    for (int i = 0; i < kDimPerIter; ++i) {
      out |= (x[i] << (i * 8));
    }

    p[d] = out;
  }

  inline __device__ void encodePartial(void* data, int vec, int d,
                                       int remaining,
                                       float v[kDimPerIter]) const {
    if (DimMultiple > 1) {
      // should not be called
      assert(false);
    }

    // otherwise does not need implementing
  }

  int bytesPerVec;

  // gmem pointers
  const float* vmin;
  const float* vdiff;

  // smem pointers (configured in the kernel)
  float* smemVmin;
  float* smemVdiff;
};

// Direct 8-bit encoding: the stored byte is used as the value itself
template <>
struct Codec<ScalarQuantizer::QuantizerType::QT_8bit_direct, 1> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 1;

  Codec(int vecBytes) : bytesPerVec(vecBytes) { }

  size_t getSmemSize(int dim) { return 0; }
  inline __device__ void setSmem(float* smem, int dim) { }

  inline __device__ void decode(void* data, int vec, int d, float* out) const {
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    out[0] = (float) p[d];
  }

  inline __device__ float decodePartial(void* data, int vec, int d,
                                        int subD) const {
    // doesn't need implementing (kDimPerIter == 1)
    return 0.0f;
  }

  inline __device__ void encode(void* data, int vec, int d,
                                float v[kDimPerIter]) const {
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    p[d] = (uint8_t) v[0];
  }

  inline __device__ void encodePartial(void* data, int vec, int d,
                                       int remaining,
                                       float v[kDimPerIter]) const {
    // doesn't need implementing (kDimPerIter == 1)
  }

  int bytesPerVec;
};
/////
//
// 4 bit encodings
//
/////

// Uniform quantization across all dimensions
template <>
struct Codec<ScalarQuantizer::QuantizerType::QT_4bit_uniform, 1> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 2;

  Codec(int vecBytes, float min, float diff)
      : bytesPerVec(vecBytes), vmin(min), vdiff(diff) { }

  size_t getSmemSize(int dim) { return 0; }
  inline __device__ void setSmem(float* smem, int dim) { }

  inline __device__ float decodeHelper(uint8_t v) const {
    float x = (((float) v) + 0.5f) / 15.0f;
    return vmin + x * vdiff;
  }

  inline __device__ void decode(void* data, int vec, int d, float* out) const {
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    uint8_t pv = p[d];

    out[0] = decodeHelper(pv & 0xf);
    out[1] = decodeHelper(pv >> 4);
  }

  inline __device__ float decodePartial(void* data, int vec, int d,
                                        int subD /* unused */) const {
    // We can only be called for a single input
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    uint8_t pv = p[d];

    return decodeHelper(pv & 0xf);
  }

  inline __device__ uint8_t encodeHelper(float v) const {
    float x = (v - vmin) / vdiff;
    x = fminf(1.0f, fmaxf(0.0f, x));
    return (uint8_t) (x * 15.0f);
  }

  inline __device__ void encode(void* data, int vec, int d,
                                float v[kDimPerIter]) const {
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    p[d] = encodeHelper(v[0]) | (encodeHelper(v[1]) << 4);
  }

  inline __device__ void encodePartial(void* data, int vec, int d,
                                       int remaining, /* unused */
                                       float v[kDimPerIter]) const {
    // We can only be called for a single output
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    p[d] = encodeHelper(v[0]);
  }

  int bytesPerVec;
  const float vmin;
  const float vdiff;
};

// Uniform quantization per each dimension
template <>
struct Codec<ScalarQuantizer::QuantizerType::QT_4bit, 1> {
  /// How many dimensions per iteration we are handling for encoding or decoding
  static constexpr int kDimPerIter = 2;

  Codec(int vecBytes, float* min, float* diff)
      : bytesPerVec(vecBytes), vmin(min), vdiff(diff),
        smemVmin(nullptr), smemVdiff(nullptr) { }

  size_t getSmemSize(int dim) { return sizeof(float) * dim * 2; }

  inline __device__ void setSmem(float* smem, int dim) {
    smemVmin = smem;
    smemVdiff = smem + dim;

    for (int i = threadIdx.x; i < dim; i += blockDim.x) {
      smemVmin[i] = vmin[i];
      smemVdiff[i] = vdiff[i];
    }
  }

  inline __device__ float decodeHelper(uint8_t v, int realDim) const {
    float x = (((float) v) + 0.5f) / 15.0f;
    return smemVmin[realDim] + x * smemVdiff[realDim];
  }

  inline __device__ void decode(void* data, int vec, int d, float* out) const {
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    uint8_t pv = p[d];
    int realDim = d * kDimPerIter;

    out[0] = decodeHelper(pv & 0xf, realDim);
    out[1] = decodeHelper(pv >> 4, realDim + 1);
  }

  inline __device__ float decodePartial(void* data, int vec, int d,
                                        int subD /* unused */) const {
    // We can only be called for a single input
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    uint8_t pv = p[d];
    int realDim = d * kDimPerIter;

    return decodeHelper(pv & 0xf, realDim);
  }

  inline __device__ uint8_t encodeHelper(float v, int realDim) const {
    float x = (v - vmin[realDim]) / vdiff[realDim];
    x = fminf(1.0f, fmaxf(0.0f, x));
    return (uint8_t) (x * 15.0f);
  }

  inline __device__ void encode(void* data, int vec, int d,
                                float v[kDimPerIter]) const {
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    int realDim = d * kDimPerIter;

    p[d] = encodeHelper(v[0], realDim) | (encodeHelper(v[1], realDim + 1) << 4);
  }

  inline __device__ void encodePartial(void* data, int vec, int d,
                                       int remaining, /* unused */
                                       float v[kDimPerIter]) const {
    // We can only be called for a single output
    uint8_t* p = &((uint8_t*) data)[vec * bytesPerVec];
    int realDim = d * kDimPerIter;

    p[d] = encodeHelper(v[0], realDim);
  }

  int bytesPerVec;

  // gmem pointers
  const float* vmin;
  const float* vdiff;

  // smem pointers
  float* smemVmin;
  float* smemVdiff;
};

} } // namespace
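// A minimal sketch (not part of the original header) of how a kernel is
// expected to drive one of the codecs above: reserve codec.getSmemSize(dim)
// bytes of dynamic shared memory at launch, let the codec stage its
// parameters there via setSmem(), then decode kDimPerIter dimensions per
// call. The kernel name, launch shape, and arguments are hypothetical, and
// the sketch assumes dim is a multiple of CodecT::kDimPerIter (a real caller
// would handle the tail with decodePartial() / encodePartial()).
template <typename CodecT>
__global__ void exampleDecodeVectors(void* encoded,
                                     int dim,
                                     float* decoded,
                                     CodecT codec) {
  extern __shared__ float smem[];

  // per-dimension codecs (QT_8bit / QT_4bit) copy vmin/vdiff into smem here;
  // for the other codecs this is a no-op
  codec.setSmem(smem, dim);
  __syncthreads();

  int vec = blockIdx.x; // one vector per block

  for (int d = threadIdx.x; d < dim / CodecT::kDimPerIter; d += blockDim.x) {
    float v[CodecT::kDimPerIter];
    codec.decode(encoded, vec, d, v);

#pragma unroll
    for (int i = 0; i < CodecT::kDimPerIter; ++i) {
      decoded[vec * dim + d * CodecT::kDimPerIter + i] = v[i];
    }
  }
}

// Hypothetical launch, one block per vector:
//   exampleDecodeVectors<<<numVecs, 128, codec.getSmemSize(dim), stream>>>(
//       encodedData, dim, decodedData, codec);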