#include "IVFPQ.cuh"
#include "../GpuResources.h"
#include "BroadcastSum.cuh"
#include "Distance.cuh"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "L2Norm.cuh"
#include "PQCodeDistances.cuh"
#include "PQScanMultiPassNoPrecomputed.cuh"
#include "PQScanMultiPassPrecomputed.cuh"
#include "RemapIndices.h"
#include "VectorResidual.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/HostTensor.cuh"
#include "../utils/MatrixMult.cuh"
#include "../utils/NoTypeTensor.cuh"
#include "../utils/Transpose.cuh"

#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>
namespace faiss { namespace gpu {
IVFPQ::IVFPQ(GpuResources* resources,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables) :
    IVFBase(resources,
            quantizer,
            numSubQuantizers,
            indicesOptions),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);
  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);

#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16LookupTables_);
#endif

  setPQCentroids_(pqCentroidData);
}
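// A hedged usage sketch (the resource and quantizer setup is assumed,
// not taken from this file): an IVFPQ over 64-dim vectors with 8
// sub-quantizers of 8 bits each, so each encoded vector takes 8 bytes.
//
//   GpuResources* res = ...;   // e.g. owned by a StandardGpuResources
//   FlatIndex* coarse = ...;   // trained coarse quantizer
//   // PQ centroids laid out (sub q)(code id)(sub dim):
//   std::vector<float> pqCentroids(8 * 256 * 8);
//   IVFPQ ivfpq(res, coarse, 8, 8, pqCentroids.data(),
//               INDICES_64_BIT, false);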
bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}
void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
    } else {
      // Release the precomputed term-2 table when disabling
      precomputedCode_ = std::move(DeviceTensor<float, 3, true>());

#ifdef FAISS_USE_FLOAT16
      precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
#endif
    }
  }
}
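// Usage note: precomputed codes only affect search; encoding is
// unchanged. Enabling them materializes the term-2 table built in
// precomputeCodes_() below, trading GPU memory for per-query work.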
int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // Find the single closest coarse centroid (list) for each vector
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Calculate the residual of each vector from its assigned coarse
  // centroid; the residual is what gets PQ-encoded
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
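  // Descriptive note: encoding the residual r = x - c(q(x)) rather than
  // x itself lets all lists share one set of sub-quantizer codebooks;
  // each of the numSubQuantizers_ slices of r (dimPerSubQuantizer_
  // dimensions apiece) is quantized independently below.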
  // Residuals are in the form
  // (vec x numSubQuantizer x dimPerSubQuantizer); transpose to
  // (numSubQuantizer x vec x dimPerSubQuantizer)
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);
  // For each sub-quantizer, find the closest codebook entry in L2 for
  // every residual slice
  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  nullptr, // no transposed centroids
                  nullptr, // no precomputed centroid norms
                  residualsTransposeView,
                  1, // k = 1: we only want the nearest code
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // we don't care about the distances, only the indices
                  true);
  }
  // The encodings are in the form (numSubQuantizers x vec); transpose
  // to (vec x numSubQuantizers) for appending to the inverted lists
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);
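  // At this point each vector is represented by numSubQuantizers_
  // codebook indices, one per sub-quantizer; with bitsPerSubQuantizer_
  // <= 8 each index fits in one byte, so an encoded vector occupies
  // bytesPerVector_ = numSubQuantizers_ bytes in its inverted list.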
  // Determine the number of vectors being added to each list, and the
  // offset within each list at which every new vector will be written
  std::unordered_map<int, int> assignCounts;

  HostTensor<int, 1, true> listIdsHost(listIds, stream);
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The vector may be invalid (e.g., it contains NaNs); such vectors
    // receive a list id of -1 and are skipped
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }
  // Resize the device-side inverted list storage to make room for the
  // new vectors; reallocation may change a list's base address
  for (auto& counts : assignCounts) {
    auto& codes = deviceListData_[counts.first];
    codes->resize(codes->size() + counts.second * bytesPerVector_, stream);

    int newNumVecs = (int) (codes->size() / bytesPerVector_);

    auto& indices = deviceListIndices_[counts.first];
    if ((indicesOptions_ == INDICES_32_BIT) ||
        (indicesOptions_ == INDICES_64_BIT)) {
      size_t indexSize =
        (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

      indices->resize(indices->size() + counts.second * indexSize, stream);
    } else if (indicesOptions_ == INDICES_CPU) {
      // User indices are kept on the host side
      auto& userIndices = listOffsetToUserIndex_[counts.first];
      userIndices.resize(newNumVecs);
    } else {
      // indices are not stored at all
      FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
    }

    maxListLength_ = std::max(maxListLength_, newNumVecs);
  }
  // Update all pointers and sizes on the device for the lists that we
  // appended to
  {
    std::vector<int> listIds(assignCounts.size());
    int i = 0;
    for (auto& counts : assignCounts) {
      listIds[i++] = counts.first;
    }

    updateDeviceListInfo_(listIds, stream);
  }
  // If we're maintaining the user indices on the CPU side, update our
  // host-side map (already resized above)
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // Skip vectors which were not assigned to a list
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      auto& userIndices = listOffsetToUserIndex_[listId];
      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }
  // Actually append the encoded vectors and indices to the lists in a
  // single batched kernel
  DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

  runIVFPQInvertedListAppend(listIds,
                             listOffset,
                             encodings,
                             indices,
                             deviceListDataPointers_,
                             deviceListIndexPointers_,
                             indicesOptions_,
                             stream);

  return numAdded;
}
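// A hedged usage sketch (tensor construction elided): adding n vectors
// with caller-supplied ids; the return value may be less than n if some
// vectors could not be assigned to a list (e.g., they contain NaNs).
//
//   DeviceTensor<float, 2, true> vecs(...);  // (n x dim_)
//   DeviceTensor<long, 1, true> ids(...);    // (n)
//   int numAdded = ivfpq.classifyAndAddVectors(vecs, ids);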
void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // Device-side list lengths are int32, measured in bytes
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the user indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // The list may have been reallocated during the append; only update
  // the device-side pointer if the base address changed
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  int listLength = listCodes->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;
  maxListLength_ = std::max(maxListLength_, listLength);

  // The device_vector updates run on stream 0; make our stream wait on
  // that work if it isn't stream 0 itself
  if (resources_->getDefaultStreamCurrentDevice() != 0) {
    streamWait({stream}, {0});
  }
}
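// Note: this entry point bulk-loads lists that were already encoded on
// the CPU (e.g., when copying a CPU IVFPQ index to the GPU); `codes`
// must hold numVecs * bytesPerVector_ bytes of packed PQ codes.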
void
IVFPQ::setPQCentroids_(float* data) {
  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  // Make sure the data is on the host
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(
    pqHost,
    resources_->getDefaultStreamCurrentDevice());

  // Transpose to the (sub q)(sub dim)(code id) layout used by the
  // no-precomputed-codes scan kernels
  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also keep the (sub q)(code id)(sub dim) layout, used for encoding
  // and for precomputed codes
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}
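// Layout note (illustrative): for sub-quantizer q, code c, sub-dim d,
//   pqCentroidsMiddleCode_[q][c][d]    holds a centroid component
//   pqCentroidsInnermostCode_[q][d][c] holds the same value, transposed
// The innermost-code layout gives coalesced reads when all codes of a
// single sub-dimension are scanned together.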
void
IVFPQ::precomputeCodes_() {
  // Precompute term 2 of the distance decomposition
  //
  //    d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
  //        ---------------   ---------------------------   -----------
  //            term 1                  term 2                 term 3
  //
  // where x is the query, y_C the coarse centroid and y_R the PQ
  // codeword for the residual. Terms 1 and 3 are only available at
  // query time; term 2 depends only on (y_C, y_R) and is computed here.
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  // Compute || y_R ||^2 by treating
  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
  auto pqCentroidsMiddleCodeView =
    pqCentroidsMiddleCode_.view<2>(
      {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms, true,
            resources_->getDefaultStreamCurrentDevice());

  // Compute 2 * (y_C|y_R) as a batched matrix multiplication:
  // (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'}
  //   => (sub q)(centroid id)(code id)
  auto centroidView = coarseCentroids.view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        resources_->getDefaultStreamCurrentDevice());

  // Transpose (sub q)(centroid id)(code id) to
  // (centroid id)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  // View as (centroid id)(sub q * code id) and add || y_R ||^2 along
  // the columns, completing term 2
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});

  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     resources_->getDefaultStreamCurrentDevice());

#ifdef FAISS_USE_FLOAT16
  // If we're using float16 lookup tables, store term 2 in float16
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ = toHalf(resources_,
                                  resources_->getDefaultStreamCurrentDevice(),
                                  coarsePQProductTransposed);
    return;
  }
#endif

  precomputedCode_ = std::move(coarsePQProductTransposed);
}
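// Size note: the term-2 table stores one float per (coarse centroid,
// sub-quantizer, code) triple. With, say, 4096 lists, 8 sub-quantizers
// and 256 codes, that is 4096 * 8 * 256 = ~8.4M entries: about 32 MiB
// in float32, halved when useFloat16LookupTables_ is set.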
void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  nprobe = std::min(nprobe, quantizer_->getSize());

  // Reserve space for the closest coarse centroids
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the nprobe closest coarse centroids for each query
  quantizer_->query(queries, nprobe, coarseDistances, coarseIndices, true);

  if (precomputedCodes_) {
    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }

  // If user indices live on the CPU side, remap the (list, offset)
  // results to user indices there
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to the GPU, since the output tensor is on the GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}
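// A hedged usage sketch (tensor construction elided): retrieving the 10
// nearest neighbors of n queries while probing 32 lists per query.
//
//   DeviceTensor<float, 2, true> queryVecs(...);   // (n x dim_)
//   DeviceTensor<float, 2, true> distances(...);   // (n x 10)
//   DeviceTensor<long, 2, true> labels(...);       // (n x 10)
//   ivfpq.query(queryVecs, 32, 10, distances, labels);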
std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}

Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
  return pqCentroidsMiddleCode_;
}
void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute term 3, -2 * (x|y_R), as a batched matrix multiplication
  // per sub-quantizer: (query id)(sub dim) x (code id)(sub dim)', with
  // the -2 folded in as the GEMM scaling factor
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);

    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }
  // Select float32 or float16 views of term 2 (precomputed) and term 3
  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
#ifdef FAISS_USE_FLOAT16
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half = toHalf(resources_, stream, term3Transposed);
    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  }
#endif

  if (!useFloat16LookupTables_) {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2,
                                term3,
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}
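// Without precomputed codes, the per-(query, list) lookup tables are
// instead built from the query-to-centroid residual at scan time; this
// saves the term-2 table's memory at the cost of extra per-query work.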
void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  outDistances,
                                  outIndices,
                                  resources_);
}

} } // namespace faiss::gpu