#include "IVFPQ.cuh"
#include "../GpuResources.h"
#include "BroadcastSum.cuh"
#include "Distance.cuh"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "L2Norm.cuh"
#include "PQCodeDistances.cuh"
#include "PQScanMultiPassNoPrecomputed.cuh"
#include "PQScanMultiPassPrecomputed.cuh"
#include "RemapIndices.h"
#include "VectorResidual.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/HostTensor.cuh"
#include "../utils/MatrixMult.cuh"
#include "../utils/NoTypeTensor.cuh"
#include "../utils/Transpose.cuh"

#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>
namespace faiss { namespace gpu {
IVFPQ::IVFPQ(GpuResources* resources,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables,
             MemorySpace space) :
    IVFBase(resources,
            quantizer,
            numSubQuantizers, // bytes per encoded vector
            indicesOptions,
            space),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);

  // Each sub-quantizer code must fit in one byte
  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  // The dimension must split evenly across the sub-quantizers
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);

#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16LookupTables_);
#endif

  setPQCentroids_(pqCentroidData);
}
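// With bitsPerSubQuantizer_ <= 8 (asserted above), each sub-quantizer
// code occupies exactly one byte, so an encoded vector takes
// numSubQuantizers_ bytes of inverted list storage; this is the
// bytesPerVector_ figure used throughout the append and scan paths.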
bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}
void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
    } else {
      // Release the precomputed term 2 tables
      precomputedCode_ = std::move(DeviceTensor<float, 3, true>());

#ifdef FAISS_USE_FLOAT16
      precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
#endif
    }
  }
}
int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // Assign each vector to its single closest coarse centroid
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the assigned list ids back to the CPU
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Compute each vector's residual versus its closest coarse centroid
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
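  // Each residual is now encoded sub-quantizer by sub-quantizer: its
  // dimPerSubQuantizer_-dim slices are matched against the
  // numSubQuantizerCodes_ centroids of the corresponding sub-quantizer,
  // and the index of the closest centroid becomes that slice's code.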
  // View the residuals as (vec)(sub q)(sub dim), then transpose to
  // (sub q)(vec)(sub dim) so each sub-quantizer's slices are contiguous
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);
  // For each sub-quantizer, find the closest code for every residual slice
  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  nullptr, // no transposed vectors
                  nullptr, // no precomputed norms
                  residualsTransposeView,
                  1, // k = 1: we only want the single closest code
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // we don't care about the distance values
                  true);
  }
  // The closest code indices are (sub q)(vec); transpose to (vec)(sub q)
  // to obtain the byte layout of an encoded vector
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);
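  // `encodings` now holds the complete PQ code for each input vector;
  // decoding an approximation of vector i amounts to concatenating
  // pqCentroidsMiddleCode_[subQ][encodings[i][subQ]] across subQ and
  // adding back the coarse centroid.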
  // list id -> number of vectors being added to that list
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset within its list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The vector may be invalid (e.g., it contains NaNs), in which case
    // it was not assigned a list
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    // Start past the current end of the list
    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }
  // Resize the device-side inverted list data and indices to make room
  // for the new vectors; reallocation may move the base addresses
  for (auto& counts : assignCounts) {
    auto& codes = deviceListData_[counts.first];
    codes->resize(codes->size() + counts.second * bytesPerVector_, stream);

    int newNumVecs = (int) (codes->size() / bytesPerVector_);

    auto& indices = deviceListIndices_[counts.first];
    if ((indicesOptions_ == INDICES_32_BIT) ||
        (indicesOptions_ == INDICES_64_BIT)) {
      size_t indexSize =
        (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

      indices->resize(indices->size() + counts.second * indexSize, stream);
    } else if (indicesOptions_ == INDICES_CPU) {
      // User indices are maintained on the CPU side
      auto& userIndices = listOffsetToUserIndex_[counts.first];
      userIndices.resize(newNumVecs);
    }

    maxListLength_ = std::max(maxListLength_, newNumVecs);
  }

  // Update the device-side pointer and size info for the lists we touched
  {
    std::vector<int> listIds(assignCounts.size());
    int i = 0;
    for (auto& counts : assignCounts) {
      listIds[i++] = counts.first;
    }

    updateDeviceListInfo_(listIds, stream);
  }
  // If user indices are maintained on the CPU side, update our map;
  // the lists were already resized above
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // Invalid vectors were not assigned a list
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }
  // Append the encoded vectors and indices to the individual lists on
  // the device
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    runIVFPQInvertedListAppend(listIds,
                               listOffset,
                               encodings,
                               indices,
                               deviceListDataPointers_,
                               deviceListIndexPointers_,
                               indicesOptions_,
                               stream);
  }

  return numAdded;
}
void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  auto stream = resources_->getDefaultStreamCurrentDevice();
  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // List lengths are stored on the GPU as int32; the length is in bytes
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the user indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // The base address may have changed due to the resize; update the
  // device-side pointer info
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  // The append may have run on a stream other than the null stream;
  // order the null stream after it
  streamWait({stream}, {0});
}
void
IVFPQ::setPQCentroids_(float* data) {
  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  // Make sure that the data is on the host
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(
    pqHost,
    resources_->getDefaultStreamCurrentDevice());

  // Transpose to the (sub q)(sub dim)(code id) layout used by the
  // no-precomputed-code scan kernels
  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also maintain the centroids in (sub q)(code id)(sub dim) form
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}
void
IVFPQ::precomputeCodes_() {
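  // For L2 distance with precomputed codes, the distance to an encoded
  // vector decomposes as
  //
  //   d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C | y_R) - 2 * (x | y_R)
  //       ---------------   ---------------------------    -------------
  //             term 1                term 2                   term 3
  //
  // where x is the query, y_C the coarse centroid and y_R the residual
  // (PQ) code. Terms 1 and 3 depend on the query and are computed at
  // query time; term 2 depends only on the index and is computed here,
  // once per (coarse centroid, sub-quantizer, code) triple.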
  // Compute ||y_R||^2 by treating the (sub q)(code id)(sub dim) centroids
  // as a single (sub q * code id)(sub dim) matrix
  auto pqCentroidsMiddleCodeView =
    pqCentroidsMiddleCode_.view<2>(
      {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms, true,
            resources_->getDefaultStreamCurrentDevice());
  // Compute 2 * (y_C | y_R) via batched matrix multiplication: for each
  // sub-quantizer, (centroid id)(sub dim) x (code id)(sub dim)^T
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  // View the coarse centroids as (centroid id)(sub q)(sub dim) and
  // transpose to (sub q)(centroid id)(sub dim)
  auto centroidView = coarseCentroids.view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
                  resources_->getDefaultStreamCurrentDevice());
  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        resources_->getDefaultStreamCurrentDevice());
  // Transpose (sub q)(centroid id)(code id) to
  // (centroid id)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  // Add ||y_R||^2 into 2 * (y_C | y_R): view the product as
  // (centroid id)(sub q * code id) and sum the norms along the columns
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});

  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     resources_->getDefaultStreamCurrentDevice());
#ifdef FAISS_USE_FLOAT16
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ = toHalf(resources_,
                                  resources_->getDefaultStreamCurrentDevice(),
                                  coarsePQProductTransposed);
    return;
  }
#endif

  // We added into the view, so `coarsePQProductTransposed` now holds the
  // full term 2
  precomputedCode_ = std::move(coarsePQProductTransposed);
}
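// Note that the table built above stores one value per (coarse centroid,
// sub-quantizer, code) triple, i.e. numLists_ * numSubQuantizers_ *
// numSubQuantizerCodes_ entries; the float16 path exists to halve that
// footprint at some cost in precision.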
void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // Validate these at a top level; they are the upper bounds that the
  // scan kernels support
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  nprobe = std::min(nprobe, quantizer_->getSize());

  // Reserve space for the closest coarse centroids per query
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the nprobe closest coarse centroids
  quantizer_->query(queries, nprobe, coarseDistances, coarseIndices, true);
  if (precomputedCodes_) {
    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }
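  // Precomputed codes trade memory for speed: with the term 2 table
  // resident on the GPU, per-query work reduces to the term 3
  // multiplication plus table lookups during the list scans.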
  // If the user indices are stored on the CPU side, remap the list
  // offsets that the scan produced back to user indices there
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to the GPU, since the output of this function is
    // expected there
    outIndices.copyFrom(hostOutIndices, stream);
  }
}
std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}

Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
  return pqCentroidsMiddleCode_;
}
void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute term 3, -2 * (x | y_R), for each query and code via
  // batched matrix multiplication
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  // The allocations in this scope are temporary; release them as soon
  // as the transposition is done
  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);
    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }
  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
#ifdef FAISS_USE_FLOAT16
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half = toHalf(resources_, stream, term3Transposed);
    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  }
#endif

  if (!useFloat16LookupTables_) {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2,
                                term3,
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}
void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  outDistances,
                                  outIndices,
                                  resources_);
}

} } // namespace faiss::gpu
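// A rough usage sketch (not part of this file; in faiss, the
// GpuIndexIVFPQ wrapper performs these steps): given a trained coarse
// quantizer and PQ codebook,
//
//   IVFPQ index(resources, quantizer,
//               numSubQuantizers, bitsPerSubQuantizer, pqCentroidData,
//               indicesOptions, useFloat16LookupTables, space);
//   index.setPrecomputedCodes(true);          // optional; see above
//   index.classifyAndAddVectors(vecs, ids);   // encode + append to lists
//   index.query(queries, nprobe, k, outDistances, outIndices);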