/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "IVFPQ.cuh"
#include "../GpuResources.h"
#include "BroadcastSum.cuh"
#include "Distance.cuh"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "L2Norm.cuh"
#include "PQCodeDistances.cuh"
#include "PQScanMultiPassNoPrecomputed.cuh"
#include "PQScanMultiPassPrecomputed.cuh"
#include "RemapIndices.h"
#include "VectorResidual.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/HostTensor.cuh"
#include "../utils/MatrixMult.cuh"
#include "../utils/NoTypeTensor.cuh"
#include "../utils/Transpose.cuh"
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {

IVFPQ::IVFPQ(GpuResources* resources,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables,
             MemorySpace space) :
    IVFBase(resources,
            quantizer,
            numSubQuantizers,
            indicesOptions,
            space),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);

  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
  FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));

#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16LookupTables_);
#endif

  setPQCentroids_(pqCentroidData);
}

IVFPQ::~IVFPQ() {
}
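
// Note: each encoded vector occupies numSubQuantizers bytes (one 8-bit
// code per sub-quantizer, since bitsPerSubQuantizer <= 8), so the sizes
// accepted below are bytes per vector. The whitelist presumably exists
// because the scanning kernels are specialized per code size; sizes 56
// and up are only handled by the float16 lookup-table path (see the
// case labels below).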
bool
IVFPQ::isSupportedPQCodeLength(int size) {
  switch (size) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 8:
    case 12:
    case 16:
    case 20:
    case 24:
    case 28:
    case 32:
    case 40:
    case 48:
    case 56: // only supported with float16
    case 64: // only supported with float16
    case 96: // only supported with float16
      return true;
    default:
      return false;
  }
}

bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}
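
// Sizing note: enabling precomputed codes materializes term 2 of the
// distance expansion (see precomputeCodes_() below) as roughly
// numLists x numSubQuantizers_ x numSubQuantizerCodes_ floats (or half
// that with float16 lookup tables), trading GPU memory for less
// per-query arithmetic.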
void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
    } else {
      // Clear out old precomputed code data; assigning an empty tensor
      // releases the allocation
      precomputedCode_ = DeviceTensor<float, 3, true>();

#ifdef FAISS_USE_FLOAT16
      precomputedCodeHalf_ = DeviceTensor<half, 3, true>();
#endif
    }
  }
}

int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Calculate the residual for each closest centroid
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);

  // Residuals are in the form
  // (vec x numSubQuantizer x dimPerSubQuantizer)
  // transpose to
  // (numSubQuantizer x vec x dimPerSubQuantizer)
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);

  // Get the product quantizer centroids in the form
  // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer)
  // which is pqCentroidsMiddleCode_

  // We now have a batch operation to find the top-1 distances:
  // batch size: numSubQuantizer
  // centroids: (numSubQuantizerCodes x dimPerSubQuantizer)
  // residuals: (vec x dimPerSubQuantizer)
  // => (numSubQuantizer x vec x 1)
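
  // For example, with dim_ = 64, numSubQuantizers_ = 8 and 8-bit codes,
  // each residual splits into 8 slices of 8 dims, and we run 8
  // independent top-1 L2 searches of n residual slices against 256
  // sub-centroids each.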

  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  nullptr, // no transposed storage
                  nullptr, // no precomputed norms
                  residualsTransposeView,
                  1,
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // We don't care about distances
                  true);
  }

  // Now, we have the nearest sub-q centroid for each slice of the
  // residual vector.
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  // Transpose this for easy use
  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);
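
  // `encodings` now holds one int32 code index per sub-quantizer per
  // vector. Each inverted list stores bytesPerVector_ bytes per vector,
  // so the append kernel below presumably narrows each index to an
  // 8-bit code when packing (bitsPerSubQuantizer_ <= 8 guarantees the
  // values fit).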

  // Now we add the encoded vectors to the individual lists
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The vector being added could be invalid (e.g., it contains NaNs)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }
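
  // For example, if list 5 already holds 10 encoded vectors and three
  // of the new vectors map to it, they receive offsets 10, 11 and 12,
  // and assignCounts[5] ends up as 3.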

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }

  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    // Resize all of the lists that we are appending to
    for (auto& counts : assignCounts) {
      auto& codes = deviceListData_[counts.first];
      codes->resize(codes->size() + counts.second * bytesPerVector_,
                    stream);
      int newNumVecs = (int) (codes->size() / bytesPerVector_);

      auto& listIndices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        listIndices->resize(
          listIndices->size() + counts.second * indexSize, stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers and sizes on the device for lists that we
    // appended to
    {
      std::vector<int> listIdsToUpdate(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIdsToUpdate[i++] = counts.first;
      }

      updateDeviceListInfo_(listIdsToUpdate, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // The vector being added could be invalid (e.g., it contains NaNs)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new encoded vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // This kernel will handle appending each encoded vector + index to
    // the appropriate list
    runIVFPQInvertedListAppend(listIds,
                               listOffset,
                               encodings,
                               indices,
                               deviceListDataPointers_,
                               deviceListIndexPointers_,
                               indicesOptions_,
                               stream);
  }

  return numAdded;
}

void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // We only have int32 length representations on the GPU per each
  // list; the length is in units of sizeof(char)
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  // And our size has changed too
  int listLength = listCodes->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // The device_vector append may be happening on a different stream
  // than our default stream
  if (resources_->getDefaultStreamCurrentDevice() != 0) {
    streamWait({stream}, {0});
  }
}

void
IVFPQ::setPQCentroids_(float* data) {
  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  // Make sure the data is on the host
  // FIXME: why are we doing this?
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(
    pqHost,
    resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also maintain the PQ centroids in the form
  // (sub q)(code id)(sub dim)
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}
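
// Layout summary (inferred from the transposes above):
// pqCentroidsInnermostCode_ is (sub q)(sub dim)(code id), code id
// innermost, which is the layout the no-precomputed-codes scan consumes;
// pqCentroidsMiddleCode_ is (sub q)(code id)(sub dim), which is what
// residual encoding and the GEMM-based precomputed-code paths consume.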

void
IVFPQ::precomputeCodes_() {
  //
  //    d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
  //        ---------------   ---------------------------   -----------
  //            term 1                  term 2                 term 3
  //
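  // (Derivation, for reference: write the reconstructed point as
  //  y = y_C + y_R. Then
  //    || x - y ||^2 = || (x - y_C) - y_R ||^2
  //                  = || x - y_C ||^2 + || y_R ||^2 - 2 * (x - y_C | y_R)
  //                  = || x - y_C ||^2 + || y_R ||^2
  //                    + 2 * (y_C|y_R) - 2 * (x|y_R).)
  //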

  // Terms 1 and 3 are available only at query time. We compute term 2
  // here.
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  // Compute ||y_R||^2 by treating
  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
  auto pqCentroidsMiddleCodeView =
    pqCentroidsMiddleCode_.view<2>(
      {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms, true,
            resources_->getDefaultStreamCurrentDevice());

  // Compute 2 * (y_C|y_R) via batch matrix multiplication
  // batch size (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'}
  // => (sub q) x {(centroid id)(code id)}
  // => (sub q)(centroid id)(code id)

  // View (centroid id)(dim) as
  // (centroid id)(sub q)(sub dim)
  // Transpose (centroid id)(sub q)(sub dim) to
  // (sub q)(centroid id)(sub dim)
  auto centroidView = coarseCentroids.view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        resources_->getDefaultStreamCurrentDevice());

  // Transpose (sub q)(centroid id)(code id) to
  // (centroid id)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  // View (centroid id)(sub q)(code id) as
  // (centroid id)(sub q * code id)
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});

  // Sum || y_R ||^2 + 2 * (y_C|y_R)
  // i.e., add norms (sub q * code id)
  // along columns of inner product (centroid id)(sub q * code id)
  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     resources_->getDefaultStreamCurrentDevice());

#ifdef FAISS_USE_FLOAT16
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ = toHalf(resources_,
                                  resources_->getDefaultStreamCurrentDevice(),
                                  coarsePQProductTransposed);
    return;
  }
#endif

  // We added into the view, so `coarsePQProductTransposed` is now our
  // precomputed term 2.
  precomputedCode_ = std::move(coarsePQProductTransposed);
}

void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the closest coarse centroids
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest coarse centroids; we can use int
  // indices both internally and externally
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    true);

  if (precomputedCodes_) {
    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}
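
// A minimal usage sketch (hypothetical caller code; in the library this
// class is driven by the GpuIndexIVFPQ wrapper rather than called
// directly):
//
//   IVFPQ ivfpq(resources, quantizer,
//               /*numSubQuantizers=*/8, /*bitsPerSubQuantizer=*/8,
//               pqCentroidData, INDICES_64_BIT,
//               /*useFloat16LookupTables=*/false, MemorySpace::Device);
//   ivfpq.setPrecomputedCodes(true);
//   ivfpq.classifyAndAddVectors(vecs, vecIds);
//   ivfpq.query(queries, /*nprobe=*/32, /*k=*/10, outDistances, outIndices);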

std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}

Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
  return pqCentroidsMiddleCode_;
}

void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute precomputed code term 3, - 2 * (x|y_R)
  // This is done via batch MM
  // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'} =>
  // {sub q} x {(query id)(code id)}
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  // These allocations within are only temporary, so release them when
  // we're done to maximize free space
  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);
    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }
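
  // At this point term3Transposed holds -2 * (x|y_R) laid out as
  // (query id)(sub q)(code id), matching the per-centroid layout of
  // the term 2 table computed in precomputeCodes_().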

  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
#ifdef FAISS_USE_FLOAT16
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half = toHalf(resources_, stream, term3Transposed);
    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  }
#endif

  if (!useFloat16LookupTables_) {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2, // term 2
                                term3, // term 3
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}

void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  outDistances,
                                  outIndices,
                                  resources_);
}

} } // namespace