#include "IVFPQ.cuh"
#include "../GpuResources.h"
#include "BroadcastSum.cuh"
#include "Distance.cuh"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "PQCodeDistances.cuh"
#include "PQScanMultiPassNoPrecomputed.cuh"
#include "PQScanMultiPassPrecomputed.cuh"
#include "RemapIndices.h"
#include "VectorResidual.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/HostTensor.cuh"
#include "../utils/MatrixMult.cuh"
#include "../utils/NoTypeTensor.cuh"
#include "../utils/Transpose.cuh"

#include <thrust/host_vector.h>
#include <algorithm>
#include <limits>
#include <unordered_map>
namespace faiss { namespace gpu {
IVFPQ::IVFPQ(GpuResources* resources,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables,
             MemorySpace space) :
    IVFBase(resources,
            quantizer,
            numSubQuantizers, // bytes per encoded vector
            indicesOptions,
            space),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);
  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);

#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16LookupTables_);
#endif

  setPQCentroids_(pqCentroidData);
}
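// Worked example (added commentary, not in the original source): with
// dim_ = 64, numSubQuantizers = 8 and bitsPerSubQuantizer = 8, each
// sub-quantizer covers dimPerSubQuantizer_ = 8 dimensions and has
// numSubQuantizerCodes_ = 2^8 = 256 centroids, so every encoded vector
// is stored as 8 one-byte codes.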
bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}
void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
    } else {
      // Release the old precomputed code data
      precomputedCode_ = std::move(DeviceTensor<float, 3, true>());

#ifdef FAISS_USE_FLOAT16
      precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
#endif
    }
  }
}
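// Note (added commentary): precomputed codes trade memory for query
// speed; the table built by precomputeCodes_() holds one float (or
// half) entry per (coarse centroid, sub-quantizer, code), i.e.
// numLists x numSubQuantizers_ x numSubQuantizerCodes_ values.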
int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // Assign each input vector to its closest coarse centroid (top-1
  // quantizer query); the distances themselves are not needed
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the assigned list ids back to the host; we need them below to
  // compute per-list offsets
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Compute the residual of each vector against its coarse centroid
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
  // Residuals are in the form
  // (vec)(numSubQuantizer)(dimPerSubQuantizer); transpose to
  // (numSubQuantizer)(vec)(dimPerSubQuantizer) for batched distance
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);
  // For each sub-quantizer, find the single closest PQ centroid for
  // each residual sub-vector
  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  nullptr, // no precomputed norms
                  residualsTransposeView,
                  1,
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // We don't care about the distance values
                  true);
  }
  // The chosen codes are in the form (numSubQuantizer)(vec); transpose
  // to (vec)(numSubQuantizer), the layout the append kernel expects
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);
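  // Illustrative note (added commentary): each row of `encodings` now
  // holds the numSubQuantizers_ selected code ids for one vector; since
  // bitsPerSubQuantizer_ <= 8, the append kernel below can store each
  // id as a single byte, giving bytesPerVector_ = numSubQuantizers_.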
  // Determine where each new vector will be placed within its list

  // list id -> number of vectors being added to that list
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset within its list
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // A vector may be reported as invalid (e.g., it contains NaNs)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    // The new vector goes after the list's current contents and after
    // any vectors assigned to the same list earlier in this batch
    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If all vectors were invalid, there is nothing to do
  if (numAdded == 0) {
    return 0;
  }
  // Resize each list we are appending to; reallocation may change base
  // addresses, so the device-side pointer table is updated below
  for (auto& counts : assignCounts) {
    auto& codes = deviceListData_[counts.first];
    codes->resize(codes->size() + counts.second * bytesPerVector_, stream);

    int newNumVecs = (int) (codes->size() / bytesPerVector_);

    auto& indices = deviceListIndices_[counts.first];
    if ((indicesOptions_ == INDICES_32_BIT) ||
        (indicesOptions_ == INDICES_64_BIT)) {
      size_t indexSize =
        (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

      indices->resize(indices->size() + counts.second * indexSize, stream);
    } else if (indicesOptions_ == INDICES_CPU) {
      // User indices are maintained on the CPU side
      auto& userIndices = listOffsetToUserIndex_[counts.first];
      userIndices.resize(newNumVecs);
    } else {
      FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
    }

    // The multi-pass query uses this to size its scratch space
    maxListLength_ = std::max(maxListLength_, newNumVecs);
  }

  // Update device-side pointer and length info for the changed lists
  {
    std::vector<int> listIds(assignCounts.size());
    int i = 0;
    for (auto& counts : assignCounts) {
      listIds[i++] = counts.first;
    }

    updateDeviceListInfo_(listIds, stream);
  }
  // If user indices are maintained on the CPU side, update our map;
  // the per-list vectors were already resized above
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // Invalid vectors were skipped above
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }
  // Finally, append the encoded vectors and indices to the lists
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // This kernel appends each encoded vector + index to the
    // appropriate list
    runIVFPQInvertedListAppend(listIds,
                               listOffset,
                               encodings,
                               indices,
                               deviceListDataPointers_,
                               deviceListIndexPointers_,
                               indicesOptions_,
                               stream);
  }

  return numAdded;
}
void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, there is nothing to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // We only have int32 length representations on the GPU per each
  // list; the length is in sizeof(char)
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // The list address may have changed due to reallocation
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  // And our size has changed as well
  int listLength = listCodes->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // The device_vector append may happen on a different stream than our
  // default stream; order the default stream behind it
  streamWait({stream}, {0});
}
void
IVFPQ::setPQCentroids_(float* data) {
  auto stream = resources_->getDefaultStreamCurrentDevice();

  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  // Make sure the data is on the host
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(
    pqHost,
    stream);

  // Transpose to the (sub q)(sub dim)(code) layout
  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose, stream);

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also maintain the (sub q)(code)(sub dim) layout
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  stream);

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}
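// Note (added commentary): both layouts of the PQ centroids are kept.
// pqCentroidsInnermostCode_, (sub q)(sub dim)(code), is what the
// no-precomputed-table scan consumes, while pqCentroidsMiddleCode_,
// (sub q)(code)(sub dim), is the convenient GEMM operand used for
// encoding and for building the precomputed tables.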
void
IVFPQ::precomputeCodes_() {
  auto stream = resources_->getDefaultStreamCurrentDevice();
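  // Precomputed codes implement the usual IVFADC distance
  // decomposition. For a query x, coarse centroid y_C, and PQ
  // reconstruction y_R of the residual:
  //
  //   d = || x - y_C - y_R ||^2
  //     = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
  //       ---------------   ---------------------------   -----------
  //           term 1                  term 2                 term 3
  //
  // term 2 depends only on (list, sub-quantizer, code) and is what we
  // precompute here: 2 * (y_C|y_R) via a batched GEMM, with
  // || y_R ||^2 then summed in along the columns.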
  // Compute || y_R ||^2 by treating
  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
  auto pqCentroidsMiddleCodeView =
    pqCentroidsMiddleCode_.view<2>(
      {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms,
            true, stream);
  // Compute 2 * (y_C|y_R) via batched GEMM:
  // (sub q) x {(centroid id)(sub dim) x (sub dim)(code id)}
  // => (sub q)(centroid id)(code id)
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
  auto centroidView = coarseCentroids.view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed, stream);

  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        stream);

  // Transpose the result to (centroid id)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  stream);

  // View as (centroid id)(sub q * code id) so that the code norms can
  // be added along the columns
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});
  // Sum || y_R ||^2 into the product; the view now contains term 2
  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     stream);

#ifdef FAISS_USE_FLOAT16
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ = toHalf(resources_, stream,
                                  coarsePQProductTransposed);
    return;
  }
#endif

  // We added into the view, so `coarsePQProductTransposed` is now our
  // precomputed term 2 table
  precomputedCode_ = std::move(coarsePQProductTransposed);
}
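// Note (added commentary): the query-dependent term 3, -2 * (x|y_R),
// is computed per query batch in runPQPrecomputedCodes_() below, and
// term 1, || x - y_C ||^2, falls out of the coarse quantizer query.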
void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // These are caught at a higher level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Reserve space for the closest coarse centroids per query
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the nprobe closest coarse centroids for each query
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    true);

  if (precomputedCodes_) {
    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }

  // If the user's indices live on the CPU side, the scan produced
  // (list id, offset) pairs; remap them to user indices on the host
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy the remapped indices back to the caller's GPU tensor
    outIndices.copyFrom(hostOutIndices, stream);
  }
}
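// Illustrative usage (added sketch, not part of the original source):
// searching the 8 closest lists for the top-10 neighbors of a batch of
// queries already resident on the GPU might look like
//
//   // queries:      (numQueries x dim_) float
//   // outDistances: (numQueries x 10)   float
//   // outIndices:   (numQueries x 10)   long
//   ivfpq.query(queries, 8, 10, outDistances, outIndices);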
std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}

Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
  return pqCentroidsMiddleCode_;
}
void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute precomputed code term 3, -2 * (x|y_R), via batched GEMM:
  // (sub q) x {(query id)(sub dim) x (sub dim)(code id)}
  // => (sub q)(query id)(code id)
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);
    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    // Transpose to (query id)(sub q)(code id)
    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }

  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
#ifdef FAISS_USE_FLOAT16
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half = toHalf(resources_, stream, term3Transposed);
    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  }
#endif

  if (!useFloat16LookupTables_) {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2,           // term 2
                                term3,           // term 3
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}
void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  outDistances,
                                  outIndices,
                                  resources_);
}

} } // namespace