/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Tensor.cuh>

#include <algorithm>
#include <limits>

//
// Shared utilities for brute-force distance calculations
//

namespace faiss { namespace gpu {
struct IPDistance {
  __host__ __device__ IPDistance() : dist(0) {}

  static constexpr bool kDirection = true; // maximize
  static constexpr float kIdentityData = 0;
  static constexpr float kMaxDistance = -std::numeric_limits<float>::max();

  __host__ __device__ void handle(float a, float b) {
    dist += a * b;
  }

  __host__ __device__ float reduce() {
    return dist;
  }

  __host__ __device__ void combine(const IPDistance& v) {
    dist += v.dist;
  }

  __host__ __device__ IPDistance zero() const {
    return IPDistance();
  }

  float dist;
};
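// Each distance functor in this file follows the same accumulation contract
// used by the brute-force kernels: handle() folds in one dimension of a
// (query, centroid) pair, combine() merges partial accumulators (e.g. across
// threads), and reduce() yields the final distance. kDirection indicates
// whether the metric is maximized (inner product) or minimized (all others),
// and kMaxDistance is the worst possible value, suitable for initializing
// k-selection. Note that reduce() returns the accumulated value as-is; for
// example, L2Distance below yields *squared* L2, matching the usual FAISS
// convention for L2 distances.
//
// A minimal host-side sketch of the contract (illustrative only; the real
// kernels accumulate across threads and tiles):
//
//   float squaredL2(const float* x, const float* y, int dim) {
//     L2Distance acc;
//     for (int i = 0; i < dim; ++i) {
//       acc.handle(x[i], y[i]);
//     }
//     return acc.reduce(); // squared L2 between x and y
//   }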
struct L1Distance {
  __host__ __device__ L1Distance() : dist(0) {}

  static constexpr bool kDirection = false; // minimize
  static constexpr float kIdentityData = 0;
  static constexpr float kMaxDistance = std::numeric_limits<float>::max();

  __host__ __device__ void handle(float a, float b) {
    dist += fabsf(a - b);
  }

  __host__ __device__ float reduce() {
    return dist;
  }

  __host__ __device__ void combine(const L1Distance& v) {
    dist += v.dist;
  }

  __host__ __device__ L1Distance zero() const {
    return L1Distance();
  }

  float dist;
};
struct L2Distance {
  __host__ __device__ L2Distance() : dist(0) {}

  static constexpr bool kDirection = false; // minimize
  static constexpr float kIdentityData = 0;
  static constexpr float kMaxDistance = std::numeric_limits<float>::max();

  __host__ __device__ void handle(float a, float b) {
    float v = a - b;
    dist += v * v;
  }

  __host__ __device__ float reduce() {
    return dist;
  }

  __host__ __device__ void combine(const L2Distance& v) {
    dist += v.dist;
  }

  __host__ __device__ L2Distance zero() const {
    return L2Distance();
  }

  float dist;
};
struct LpDistance {
  __host__ __device__ LpDistance()
      : p(2), dist(0) {}

  __host__ __device__ LpDistance(float arg)
      : p(arg), dist(0) {}

  __host__ __device__ LpDistance(const LpDistance& v)
      : p(v.p), dist(v.dist) {}

  __host__ __device__ LpDistance& operator=(const LpDistance& v) {
    p = v.p;
    dist = v.dist;
    return *this;
  }

  static constexpr bool kDirection = false; // minimize
  static constexpr float kIdentityData = 0;
  static constexpr float kMaxDistance = std::numeric_limits<float>::max();

  __host__ __device__ void handle(float a, float b) {
    dist += powf(fabsf(a - b), p);
  }

  __host__ __device__ float reduce() {
    return dist;
  }

  __host__ __device__ void combine(const LpDistance& v) {
    dist += v.dist;
  }

  __host__ __device__ LpDistance zero() const {
    return LpDistance(p);
  }

  float p;
  float dist;
};
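// Note that LpDistance accumulates sum_i |a_i - b_i|^p and reduce() does not
// apply the final 1/p root, in the same way that L2Distance reports squared
// L2. An illustrative host-side sketch (hypothetical values):
//
//   LpDistance acc(3.0f);    // p = 3
//   acc.handle(1.0f, 2.0f);  // adds |1 - 2|^3 = 1
//   acc.handle(0.0f, 2.0f);  // adds |0 - 2|^3 = 8
//   float d = acc.reduce();  // 9, not 9^(1/3)
//
// Also note that zero() returns LpDistance(p) rather than a
// default-constructed value, so the exponent p survives accumulator resets.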
struct LinfDistance {
  __host__ __device__ LinfDistance() : dist(0) {}

  static constexpr bool kDirection = false; // minimize
  static constexpr float kIdentityData = 0;
  static constexpr float kMaxDistance = std::numeric_limits<float>::max();

  __host__ __device__ void handle(float a, float b) {
    dist = fmaxf(dist, fabsf(a - b));
  }

  __host__ __device__ float reduce() {
    return dist;
  }

  __host__ __device__ void combine(const LinfDistance& v) {
    dist = fmaxf(dist, v.dist);
  }

  __host__ __device__ LinfDistance zero() const {
    return LinfDistance();
  }

  float dist;
};
struct CanberraDistance {
  __host__ __device__ CanberraDistance() : dist(0) {}

  static constexpr bool kDirection = false; // minimize
  static constexpr float kIdentityData = 0;
  static constexpr float kMaxDistance = std::numeric_limits<float>::max();

  __host__ __device__ void handle(float a, float b) {
    float denom = fabsf(a) + fabsf(b);
    // The Canberra distance conventionally defines the 0 / 0 term
    // (a == b == 0) as 0; guard the division so it does not produce NaN
    dist += denom > 0.0f ? fabsf(a - b) / denom : 0.0f;
  }

  __host__ __device__ float reduce() {
    return dist;
  }

  __host__ __device__ void combine(const CanberraDistance& v) {
    dist += v.dist;
  }

  __host__ __device__ CanberraDistance zero() const {
    return CanberraDistance();
  }

  float dist;
};
struct BrayCurtisDistance {
  __host__ __device__ BrayCurtisDistance()
      : numerator(0), denominator(0) {}

  static constexpr bool kDirection = false; // minimize
  static constexpr float kIdentityData = 0;
  static constexpr float kMaxDistance = std::numeric_limits<float>::max();

  __host__ __device__ void handle(float a, float b) {
    numerator += fabsf(a - b);
    denominator += fabsf(a + b);
  }

  __host__ __device__ float reduce() {
    return (numerator / denominator);
  }

  __host__ __device__ void combine(const BrayCurtisDistance& v) {
    numerator += v.numerator;
    denominator += v.denominator;
  }

  __host__ __device__ BrayCurtisDistance zero() const {
    return BrayCurtisDistance();
  }

  float numerator;
  float denominator;
};
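// Bray-Curtis is conventionally defined over non-negative inputs; if every
// |a + b| term is zero, the accumulated denominator is 0 and reduce()
// returns NaN (0 / 0), so callers presumably must avoid all-zero vectors.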
struct JensenShannonDistance {
  __host__ __device__ JensenShannonDistance()
      : dist(0) {}

  static constexpr bool kDirection = false; // minimize
  static constexpr float kIdentityData = 0;
  static constexpr float kMaxDistance = std::numeric_limits<float>::max();

  __host__ __device__ void handle(float a, float b) {
    // Assumes a and b are strictly positive probability values; a zero
    // entry makes m / a or m / b infinite and the result NaN
    float m = 0.5f * (a + b);

    float x = m / a;
    float y = m / b;

    float kl1 = -a * logf(x);
    float kl2 = -b * logf(y);

    dist += kl1 + kl2;
  }

  __host__ __device__ float reduce() {
    return 0.5f * dist;
  }

  __host__ __device__ void combine(const JensenShannonDistance& v) {
    dist += v.dist;
  }

  __host__ __device__ JensenShannonDistance zero() const {
    return JensenShannonDistance();
  }

  float dist;
};
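// For reference, the Jensen-Shannon divergence between distributions a and b
// is JS(a, b) = 0.5 * KL(a || m) + 0.5 * KL(b || m), with m = (a + b) / 2.
// Per dimension, KL(a || m) contributes a * log(a / m) = -a * log(m / a),
// which is the kl1 term above (and symmetrically kl2 for b), so dist
// accumulates KL(a || m) + KL(b || m) and reduce() applies the factor 0.5.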
template <typename T, bool InnerContig>
Tensor<T, 2, InnerContig> sliceCentroids(Tensor<T, 2, InnerContig>& centroids,
                                         bool centroidsRowMajor,
                                         int startCentroid,
                                         int num) {
  // Row major is (num, dim)
  // Col major is (dim, num)
  if (startCentroid == 0 &&
      num == centroids.getSize(centroidsRowMajor ? 0 : 1)) {
    return centroids;
  }

  return centroids.narrow(centroidsRowMajor ? 0 : 1, startCentroid, num);
}
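// Illustrative use (hypothetical sizes): with 10000 row-major centroids of
// dimension 128, taking the second tile of 4096 centroids would be
//
//   // centroids is a Tensor<float, 2, true> of size (10000, 128)
//   auto tile = sliceCentroids(centroids, true, 4096, 4096);
//   // tile views rows [4096, 8192) without copying
//
// For column-major centroids of size (dim, num), the same call narrows along
// dimension 1 instead.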
// For each chunk of k indices, increment the index by chunk * increment
template <typename T>
__global__ void incrementIndex(Tensor<T, 2, true> indices,
                               int k,
                               int increment) {
  for (int i = threadIdx.x; i < k; i += blockDim.x) {
    indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment;
  }
}
// Used to update result indices in distance computation where the number of
// centroids is high, and is tiled
template <typename T>
void runIncrementIndex(Tensor<T, 2, true>& indices,
                       int k,
                       int increment,
                       cudaStream_t stream) {
  dim3 grid(indices.getSize(1) / k, indices.getSize(0));
  int block = std::min(k, 512);

  // should be exact
  FAISS_ASSERT(grid.x * k == indices.getSize(1));

  incrementIndex<<<grid, block, 0, stream>>>(indices, k, increment);
}
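// Illustrative example (hypothetical sizes): if 20000 centroids are processed
// in tiles of 8192 and we keep k = 10 results per tile, each query row of
// `indices` holds ceil(20000 / 8192) = 3 chunks of 10 indices, each local to
// its tile. Calling runIncrementIndex(indices, 10, 8192, stream) adds 0 to
// chunk 0, 8192 to chunk 1, and 16384 to chunk 2, converting the tile-local
// indices into global centroid ids before the final k-selection.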
// If the inner size (dim) of the vectors is small, we want a larger query tile
// size, like 1024
inline void chooseTileSize(int numQueries,
                           int numCentroids,
                           int dim,
                           int elementSize,
                           size_t tempMemAvailable,
                           int& tileRows,
                           int& tileCols) {
  // The matrix multiplication should be large enough to be efficient, but if
  // it is too large, we seem to lose efficiency as opposed to
  // double-streaming. Each tile size here defines 1/2 of the memory use due
  // to double streaming. We ignore available temporary memory, as that is
  // adjusted independently by the user and can thus meet these requirements
  // (or not).
  // For <= 4 GB GPUs, prefer 512 MB of usage.
  // For <= 8 GB GPUs, prefer 768 MB of usage.
  // Otherwise, prefer 1 GB of usage.
  auto totalMem = getCurrentDeviceProperties().totalGlobalMem;

  int targetUsage = 0;

  if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) {
    targetUsage = 512 * 1024 * 1024;
  } else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) {
    targetUsage = 768 * 1024 * 1024;
  } else {
    targetUsage = 1024 * 1024 * 1024;
  }

  // Convert the byte budget into elements per tile, halved for double
  // streaming
  targetUsage /= 2 * elementSize;

  // 512 seems to be a batch size sweet spot for float32.
  // If the k size (vec dim) of the matrix multiplication is small (<= 32),
  // increase to 1024.
  int preferredTileRows = 512;
  if (dim <= 32) {
    preferredTileRows = 1024;
  }

  tileRows = std::min(preferredTileRows, numQueries);

  // tileCols is the remainder size
  tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
}
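// Worked example (hypothetical device): on a 16 GB GPU with float32 data
// (elementSize = 4) and dim = 128, targetUsage starts at 1 GB = 1073741824
// bytes, then becomes 1073741824 / (2 * 4) = 134217728 elements per tile.
// preferredTileRows stays at 512 (since dim > 32), so for numQueries >= 512
// and a large centroid set, tileRows = 512 and
// tileCols = min(134217728 / 512, numCentroids) = min(262144, numCentroids).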
} } // namespace