#include "IVFPQ.cuh"
#include "../GpuResources.h"
#include "BroadcastSum.cuh"
#include "Distance.cuh"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "L2Norm.cuh"
#include "PQCodeDistances.cuh"
#include "PQScanMultiPassNoPrecomputed.cuh"
#include "PQScanMultiPassPrecomputed.cuh"
#include "RemapIndices.h"
#include "VectorResidual.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/HostTensor.cuh"
#include "../utils/MatrixMult.cuh"
#include "../utils/NoTypeTensor.cuh"
#include "../utils/Transpose.cuh"

#include <algorithm>
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {

IVFPQ::IVFPQ(GpuResources* resources,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables,
             MemorySpace space) :
    IVFBase(resources,
            quantizer,
            numSubQuantizers,
            indicesOptions,
            space),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);

  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);

#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16LookupTables_);
#endif

  setPQCentroids_(pqCentroidData);
}

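// Worked sizing example (added commentary, values illustrative): with
// dim_ = 64, numSubQuantizers = 8 and bitsPerSubQuantizer = 8, each
// vector is split into 8 sub-vectors of dimPerSubQuantizer_ = 8 floats,
// and each sub-vector is encoded as one of numSubQuantizerCodes_ =
// 2^8 = 256 codewords. An encoded vector thus occupies 8 bytes instead
// of 64 * sizeof(float) = 256 bytes.
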
bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}

void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
    } else {
      // Release the old precomputed code data
      precomputedCode_ = std::move(DeviceTensor<float, 3, true>());

#ifdef FAISS_USE_FLOAT16
      precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
#endif
    }
  }
}

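// Added commentary: precomputed codes trade memory for query-time
// work. The table built by precomputeCodes_() stores one value per
// (coarse centroid, sub-quantizer, code) triple, i.e.
// numLists_ * numSubQuantizers_ * numSubQuantizerCodes_ entries, so
// enabling it is only worthwhile when that table fits comfortably in
// GPU memory.
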
int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);
  FAISS_ASSERT(!quantizer_->getUseFloat16());

  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We only need the closest coarse centroid (top-1) for each vector;
  // the distance itself is unused
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the assigned list ids back to the CPU; they are needed below
  // to compute per-list append offsets
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Calculate the residual of each vector against its assigned coarse
  // centroid
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);

  // Residuals are in the form
  // (vec x numSubQuantizer x dimPerSubQuantizer); transpose to
  // (numSubQuantizer x vec x dimPerSubQuantizer)
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);

  // The PQ centroids in pqCentroidsMiddleCode_ are in the form
  // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer).
  // For each sub-quantizer, find the top-1 (closest) codeword for each
  // residual sub-vector.
  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  nullptr, // no precomputed centroid norms
                  residualsTransposeView,
                  1,
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // We don't care about the distances, only the indices
                  true);
  }

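  // Added commentary: the loop above is, in effect, the PQ encoder.
  // For sub-vector j of a residual r, the code is
  // argmin_c || r_j - C_j[c] ||^2 over the numSubQuantizerCodes_
  // codewords C_j[c] of sub-quantizer j, and that argmin index is what
  // arrives in closestSubQIndex.
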
  // The top-1 codeword index per (sub-quantizer, vector) is the PQ
  // encoding itself
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  // Transpose to (vec x numSubQuantizer) for appending to the lists
  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);

  // Determine the offset within each inverted list where each new
  // vector will be appended
  std::unordered_map<int, int> assignCounts;

  // The append offset for each vector being added
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The vector may be invalid (e.g., it contains NaNs), in which
    // case it was not assigned to a list
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    // Current length of the list, in vectors
    int offset = (int) (deviceListData_[listId]->size() / bytesPerVector_);

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If all vectors were invalid, there is nothing to append
  if (numAdded == 0) {
    return 0;
  }

  // Resize the device-side storage of every list we append to; the
  // reallocation may change base addresses, which are updated on the
  // device in a batch below
  for (auto& counts : assignCounts) {
    auto& codes = deviceListData_[counts.first];
    codes->resize(codes->size() + counts.second * bytesPerVector_, stream);

    int newNumVecs = (int) (codes->size() / bytesPerVector_);

    auto& indices = deviceListIndices_[counts.first];
    if ((indicesOptions_ == INDICES_32_BIT) ||
        (indicesOptions_ == INDICES_64_BIT)) {
      size_t indexSize =
        (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

      indices->resize(indices->size() + counts.second * indexSize, stream);
    } else if (indicesOptions_ == INDICES_CPU) {
      // User indices are stored on the CPU side
      auto& userIndices = listOffsetToUserIndex_[counts.first];
      userIndices.resize(newNumVecs);
    } else {
      // User indices are not stored at all
      FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
    }

    // The multi-pass query kernels size their intermediate results
    // from this
    maxListLength_ = std::max(maxListLength_, newNumVecs);
  }

  // Update the device-side pointer and size information for the lists
  // we appended to
  {
    std::vector<int> listIds(assignCounts.size());
    int i = 0;
    for (auto& counts : assignCounts) {
      listIds[i++] = counts.first;
    }

    updateDeviceListInfo_(listIds, stream);
  }

  // If user indices are maintained on the CPU side, update our map;
  // the lists were already resized above
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // Invalid vectors were skipped above
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];
      FAISS_ASSERT(offset >= 0);

      auto& userIndices = listOffsetToUserIndex_[listId];
      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // Actually append the encoded vectors and indices to the device
  // lists
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    runIVFPQInvertedListAppend(listIds,
                               listOffset,
                               encodings,
                               indices,
                               deviceListDataPointers_,
                               deviceListIndexPointers_,
                               indicesOptions_,
                               stream);
  }

  return numAdded;
}

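// Added commentary: classifyAndAddVectors is the whole add-time
// pipeline: assign each vector to a coarse list, compute its residual,
// PQ-encode the residual, grow the target lists, then append codes and
// user indices on the device. The caller is expected to have already
// placed `vecs` and `indices` on the current device.
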
void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // Device-side list lengths are int32, measured in bytes
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the user indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // The list base address may have changed due to the resize; only
  // update the device-side pointer if it did
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  // Our list length (in vectors) has changed as well
  int listLength = (int) (listCodes->size() / bytesPerVector_);
  deviceListLengths_[listId] = listLength;
  maxListLength_ = std::max(maxListLength_, listLength);

  // The append may have happened on a non-default stream; have the
  // default stream wait on it
  if (stream != 0) {
    streamWait({stream}, {0});
  }
}

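// Minimal usage sketch (hypothetical; `gpuIvfpq`, `cpuIndex` and its
// per-list `codes`/`ids` layout are assumptions along the lines of a
// trained CPU faiss::IndexIVFPQ, not part of this file):
//
//   for (int i = 0; i < numLists; ++i) {
//     gpuIvfpq.addCodeVectorsFromCpu(i,
//                                    cpuIndex.codes[i].data(),
//                                    cpuIndex.ids[i].data(),
//                                    cpuIndex.ids[i].size());
//   }
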
void
IVFPQ::setPQCentroids_(float* data) {
  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Make sure the data is on the host before copying to the device
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(pqHost, stream);

  // Transpose to (sub q)(sub dim)(code id), with the code id innermost
  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose, stream);

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also maintain the centroids in (sub q)(code id)(sub dim) form
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  stream);

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}

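// Added commentary: two layouts of the same codebook are kept.
// pqCentroidsInnermostCode_ is (sub q)(sub dim)(code id), with the
// code id innermost for the no-precomputed-code scan, while
// pqCentroidsMiddleCode_ is (sub q)(code id)(sub dim), the shape
// needed as a GEMM operand when encoding and when precomputing lookup
// tables.
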
void
IVFPQ::precomputeCodes_() {
  // For a query x, a coarse centroid y_C and a PQ codeword y_R, the
  // distance decomposes as
  //   d = || x - y_C - y_R ||^2
  //     = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
  //       ---------------   ---------------------------   -----------
  //           term 1                  term 2                term 3
  // Terms 1 and 3 depend on the query, so only term 2 can be
  // precomputed here.
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  // Compute ||y_R||^2 by treating the PQ centroids
  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
  auto pqCentroidsMiddleCodeView =
    pqCentroidsMiddleCode_.view<2>(
      {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms, true,
            resources_->getDefaultStreamCurrentDevice());

  // Compute 2 * (y_C|y_R) for all (coarse centroid, sub-quantizer,
  // code) triples via batched GEMM, one batch per sub-quantizer
  auto centroidView = coarseCentroids.view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        resources_->getDefaultStreamCurrentDevice());

  // Transpose the result to (coarse centroid)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  // View as (coarse centroid)(sub q * code id) and add in ||y_R||^2,
  // broadcast along the rows, completing term 2
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});

  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     resources_->getDefaultStreamCurrentDevice());

#ifdef FAISS_USE_FLOAT16
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ = toHalf(resources_,
                                  resources_->getDefaultStreamCurrentDevice(),
                                  coarsePQProductTransposed);
    return;
  }
#endif

  precomputedCode_ = std::move(coarsePQProductTransposed);
}

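// Added commentary: the stored table is term 2 evaluated for every
// (coarse centroid, sub-quantizer, code) triple, laid out as
// (numLists_, numSubQuantizers_, numSubQuantizerCodes_). At query time
// only the query-dependent terms 1 and 3 remain to be computed.
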
void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the closest coarse centroids
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the nprobe closest coarse centroids
  quantizer_->query(queries, nprobe, coarseDistances, coarseIndices, true);

  if (precomputedCodes_) {
    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }

  // If the GPU isn't storing user indices (they are on the CPU side),
  // remap the (list id, offset) results to user indices here
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy the remapped indices back to the GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}

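// Minimal usage sketch (added commentary; tensor construction details
// elided, names hypothetical):
//
//   Tensor<float, 2, true> queries = ...;    // (numQueries, dim_)
//   Tensor<float, 2, true> distances = ...;  // (numQueries, k)
//   Tensor<long, 2, true> labels = ...;      // (numQueries, k)
//   ivfpq.query(queries, nprobe, k, distances, labels);
//
// Both nprobe and k must be <= 1024, as asserted above.
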
std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}

Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
  return pqCentroidsMiddleCode_;
}

void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute term 3, -2 * (x|y_R), via batched GEMM:
  // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'}
  //   => {sub q} x {(query id)(code id)}
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  // The allocations within this scope are temporary; release them as
  // soon as we are done with them
  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);

    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }

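  // Added commentary: shape check for the batched GEMM above. For each
  // of the numSubQuantizers_ batches,
  // (numQueries x dimPerSubQuantizer_) times the transpose of
  // (numSubQuantizerCodes_ x dimPerSubQuantizer_) yields
  // (numQueries x numSubQuantizerCodes_), scaled by alpha = -2.0f to
  // produce term 3 = -2 * (x|y_R) directly.
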
  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
#ifdef FAISS_USE_FLOAT16
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half = toHalf(resources_, stream, term3Transposed);
    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  }
#endif

  if (!useFloat16LookupTables_) {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2, // term 2
                                term3, // term 3
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}

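// Added commentary: with useFloat16LookupTables_, terms 2 and 3 are
// handed to the scan kernel as half tensors via NoTypeTensor, halving
// lookup-table memory traffic at some precision cost; the
// FAISS_USE_FLOAT16 guard keeps this path out of builds without half
// support.
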
void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  outDistances,
                                  outIndices,
                                  resources_);
}

} } // namespace