/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/gpu/impl/IVFPQ.cuh>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/impl/BroadcastSum.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/FlatIndex.cuh>
#include <faiss/gpu/impl/IVFAppend.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/impl/PQCodeDistances.cuh>
#include <faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh>
#include <faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh>
#include <faiss/gpu/impl/RemapIndices.h>
#include <faiss/gpu/impl/VectorResidual.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/DeviceDefs.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/HostTensor.cuh>
#include <faiss/gpu/utils/MatrixMult.cuh>
#include <faiss/gpu/utils/NoTypeTensor.cuh>
#include <faiss/gpu/utils/Transpose.cuh>
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {

IVFPQ::IVFPQ(GpuResources* resources,
             faiss::MetricType metric,
             float metricArg,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables,
             MemorySpace space) :
    IVFBase(resources,
            metric,
            metricArg,
            quantizer,
            numSubQuantizers,
            indicesOptions,
            space),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);

  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
  FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));
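  // Since each sub-quantizer code uses at most 8 bits, every code occupies
  // one byte; bytesPerVector_ (set to numSubQuantizers in the IVFBase
  // constructor call above) is therefore the encoded length checked here.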

  setPQCentroids_(pqCentroidData);
}
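
// A rough usage sketch (illustrative only; the real wiring lives in
// GpuIndexIVFPQ, and the variable names below are hypothetical):
//
//   // coarseQuantizer: FlatIndex holding the trained IVF centroids
//   // pqCentroids: (numSubQuantizers, 2^bits, dim / numSubQuantizers)
//   //              row-major float data, as consumed by setPQCentroids_
//   IVFPQ index(resources, faiss::METRIC_L2, 0, &coarseQuantizer,
//               numSubQuantizers, bits, pqCentroids,
//               INDICES_64_BIT, false /* float32 lookup tables */,
//               MemorySpace::Device);
//   index.setPrecomputedCodes(true);          // optional; L2 only
//   index.classifyAndAddVectors(vecs, ids);   // encode and append to lists
//   index.query(queries, nprobe, k, outDistances, outIndices);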

IVFPQ::~IVFPQ() {
}

bool
IVFPQ::isSupportedPQCodeLength(int size) {
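  // The sizes listed below are the code lengths (bytes per encoded vector)
  // for which the PQ scan and append kernels are specialized; other sizes
  // have no kernel instantiation. The sizes above 48 bytes are limited to
  // float16 lookup tables, presumably because only at half precision do the
  // per-query distance tables still fit in shared memory.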
  switch (size) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 8:
    case 12:
    case 16:
    case 20:
    case 24:
    case 28:
    case 32:
    case 40:
    case 48:
    case 56: // only supported with float16
    case 64: // only supported with float16
    case 96: // only supported with float16
      return true;
    default:
      return false;
  }
}

bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}

void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (enable && metric_ == MetricType::METRIC_INNER_PRODUCT) {
    FAISS_THROW_MSG("Precomputed codes are not needed for GpuIndexIVFPQ "
                    "with METRIC_INNER_PRODUCT");
  }

  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
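      // The precomputed term-2 table built here has shape
      // (#coarse centroids, numSubQuantizers_, numSubQuantizerCodes_),
      // stored as float32 (or float16 when useFloat16LookupTables_), so
      // enabling precomputed codes trades this extra memory for less
      // per-query work.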
    } else {
      // Clear out old precomputed code data
      precomputedCode_ = std::move(DeviceTensor<float, 3, true>());
      precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
    }
  }
}

int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
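  // For each input vector: find its closest coarse centroid, PQ-encode the
  // residual against that centroid, and append the resulting code plus the
  // user-provided index to the corresponding inverted list. Returns the
  // number of vectors actually added (vectors whose coarse assignment is
  // invalid are skipped).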
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs,
                    1,
                    metric_,
                    metricArg_,
                    listDistance,
                    listIds2d,
                    false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Calculate the residual for each closest centroid
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  if (quantizer_->getUseFloat16()) {
    auto& coarseCentroids = quantizer_->getVectorsFloat16Ref();
    runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
  } else {
    auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
    runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
  }

  // Residuals are in the form
  // (vec x numSubQuantizer x dimPerSubQuantizer)
  // transpose to
  // (numSubQuantizer x vec x dimPerSubQuantizer)
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);

  // Get the product quantizer centroids in the form
  // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer)
  // which is pqCentroidsMiddleCode_

  // We now have a batch operation to find the top-1 distances:
  // batch size: numSubQuantizer
  // centroids: (numSubQuantizerCodes x dimPerSubQuantizer)
  // residuals: (vec x dimPerSubQuantizer)
  // => (numSubQuantizer x vec x 1)

  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  true, // pqCentroidsMiddleCodeView is row major
                  nullptr, // no precomputed norms
                  residualsTransposeView,
                  true, // residualsTransposeView is row major
                  1,
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // We don't care about distances
                  true);
  }

  // Now, we have the nearest sub-q centroid for each slice of the
  // residual vector.
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  // Transpose this for easy use
  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);
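
  // encodings is now (vec)(numSubQuantizers_), holding each vector's selected
  // code per sub-quantizer as an int32; the append kernel below stores one
  // byte per code in the inverted list data.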

  // Now we add the encoded vectors to the individual lists
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // Add vector could be invalid (contains NaNs etc)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }

  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    // Resize all of the lists that we are appending to
    for (auto& counts : assignCounts) {
      auto& codes = deviceListData_[counts.first];
      codes->resize(codes->size() + counts.second * bytesPerVector_,
                    stream);
      int newNumVecs = (int) (codes->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize, stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers and sizes on the device for lists that we
    // appended to
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // Add vector could be invalid (contains NaNs etc)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new encoded vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // This kernel will handle appending each encoded vector + index to
    // the appropriate list
    runIVFPQInvertedListAppend(listIds,
                               listOffset,
                               encodings,
                               indices,
                               deviceListDataPointers_,
                               deviceListIndexPointers_,
                               indicesOptions_,
                               stream);
  }

  return numAdded;
}

void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // We only have int32 length representations on the GPU per each
  // list; the length is in sizeof(char)
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  // And our size has changed too
  int listLength = listCodes->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // device_vector add is potentially happening on a different stream
  // than our default stream
  if (resources_->getDefaultStreamCurrentDevice() != 0) {
    streamWait({stream}, {0});
  }
}

void
IVFPQ::setPQCentroids_(float* data) {
  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  // Make sure the data is on the host
  // FIXME: why are we doing this?
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(
    pqHost,
    resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also maintain the PQ centroids in the form
  // (sub q)(code id)(sub dim)
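  // (pqCentroidsMiddleCode_ is used when encoding vectors and when building
  // the precomputed term tables; the innermost-code layout above,
  // (sub q)(sub dim)(code id), feeds the no-precomputed-codes scan)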
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}

template <typename CentroidT>
void
IVFPQ::precomputeCodesT_() {
  FAISS_ASSERT(metric_ == MetricType::METRIC_L2);

  //
  // d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
  //     ---------------   ---------------------------       -------
  //         term 1                  term 2                   term 3
  //

  // Terms 1 and 3 are available only at query time. We compute term 2
  // here.
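  //
  // This follows from writing a database vector's reconstruction as
  // y = y_C + y_R (coarse centroid plus PQ residual reconstruction):
  //   || x - y ||^2 = || (x - y_C) - y_R ||^2
  //                 = || x - y_C ||^2 + || y_R ||^2 - 2 * (x - y_C|y_R)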

  // Compute ||y_R||^2 by treating
  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
  auto pqCentroidsMiddleCodeView =
    pqCentroidsMiddleCode_.view<2>(
      {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, true,
            subQuantizerNorms, true,
            resources_->getDefaultStreamCurrentDevice());

  // Compute 2 * (y_C|y_R) via batch matrix multiplication
  // batch size (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'}
  // => (sub q) x {(centroid id)(code id)}
  // => (sub q)(centroid id)(code id)

  // View (centroid id)(dim) as
  // (centroid id)(sub q)(dim)
  // Transpose (centroid id)(sub q)(sub dim) to
  // (sub q)(centroid id)(sub dim)
  auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
  auto centroidView = coarseCentroids.template view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<CentroidT, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        resources_->getDefaultStreamCurrentDevice());

  // Transpose (sub q)(centroid id)(code id) to
  // (centroid id)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  // View (centroid id)(sub q)(code id) as
  // (centroid id)(sub q * code id)
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});

  // Sum || y_R ||^2 + 2 * (y_C|y_R)
  // i.e., add norms (sub q * code id)
  // along columns of inner product (centroid id)(sub q * code id)
  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     resources_->getDefaultStreamCurrentDevice());

  // We added into the view, so `coarsePQProductTransposed` is now our
  // precomputed term 2.
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ =
      convertTensor<float, half, 3>(resources_,
                                    resources_->getDefaultStreamCurrentDevice(),
                                    coarsePQProductTransposed);
  } else {
    precomputedCode_ = std::move(coarsePQProductTransposed);
  }
}

void
IVFPQ::precomputeCodes_() {
  if (quantizer_->getUseFloat16()) {
    precomputeCodesT_<half>();
  } else {
    precomputeCodesT_<float>();
  }
}

void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // These are caught at a higher level
  FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K);
  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the closest coarse centroids
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest coarse centroids; we can use int
  // indices both internally and externally
  quantizer_->query(queries,
                    nprobe,
                    metric_,
                    metricArg_,
                    coarseDistances,
                    coarseIndices,
                    true);
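
  // coarseDistances / coarseIndices now hold, per query, the nprobe closest
  // coarse centroids. For L2 with precomputed codes, coarseDistances is used
  // directly as term 1 of the decomposition below.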

  if (precomputedCodes_) {
    FAISS_ASSERT(metric_ == MetricType::METRIC_L2);

    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}

std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}

Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
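  // Returned layout is (sub q)(code id)(sub dim)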
  return pqCentroidsMiddleCode_;
}

void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  FAISS_ASSERT(metric_ == MetricType::METRIC_L2);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute precomputed code term 3, - 2 * (x|y_R)
  // This is done via batch MM
  // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'} =>
  // {sub q} x {(query id)(code id)}
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  // These allocations within are only temporary, so release them when
  // we're done to maximize free space
  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);
    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }
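
  // term3Transposed is now (query id)(sub q)(code id): for each query and
  // sub-quantizer, -2 * (x|y_R) evaluated for every possible code y_R.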

  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half =
      convertTensor<float, half, 3>(resources_, stream, term3Transposed);

    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  } else {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2, // term 2
                                term3, // term 3
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}

template <typename CentroidT>
void
IVFPQ::runPQNoPrecomputedCodesT_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
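
  // Without precomputed tables, the scan computes the per-(query, probed
  // centroid) code distance tables on the fly from the query's residual
  // against each probed coarse centroid, using pqCentroidsInnermostCode_.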

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  metric_,
                                  outDistances,
                                  outIndices,
                                  resources_);
}

void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  if (quantizer_->getUseFloat16()) {
    runPQNoPrecomputedCodesT_<half>(queries,
                                    coarseDistances,
                                    coarseIndices,
                                    k,
                                    outDistances,
                                    outIndices);
  } else {
    runPQNoPrecomputedCodesT_<float>(queries,
                                     coarseDistances,
                                     coarseIndices,
                                     k,
                                     outDistances,
                                     outIndices);
  }
}

} } // namespace