/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "IVFPQ.cuh"
#include "../GpuResources.h"
#include "BroadcastSum.cuh"
#include "Distance.cuh"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "L2Norm.cuh"
#include "PQCodeDistances.cuh"
#include "PQScanMultiPassNoPrecomputed.cuh"
#include "PQScanMultiPassPrecomputed.cuh"
#include "RemapIndices.h"
#include "VectorResidual.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/HostTensor.cuh"
#include "../utils/MatrixMult.cuh"
#include "../utils/NoTypeTensor.cuh"
#include "../utils/Transpose.cuh"
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {

IVFPQ::IVFPQ(GpuResources* resources,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables,
             MemorySpace space) :
    IVFBase(resources,
            quantizer,
            numSubQuantizers,
            indicesOptions,
            space),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);

  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
  FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));

#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16LookupTables_);
#endif

  setPQCentroids_(pqCentroidData);
}

IVFPQ::~IVFPQ() {
}

bool
IVFPQ::isSupportedPQCodeLength(int size) {
  switch (size) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 8:
    case 12:
    case 16:
    case 20:
    case 24:
    case 28:
    case 32:
    case 40:
    case 48:
    case 56: // only supported with float16
    case 64: // only supported with float16
    case 96: // only supported with float16
      return true;
    default:
      return false;
  }
}

bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}

void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
    } else {
      // Clear out old precomputed code data
      precomputedCode_ = std::move(DeviceTensor<float, 3, true>());

#ifdef FAISS_USE_FLOAT16
      precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
#endif
    }
  }
}

int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);
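
  // At this point every input vector has been assigned to its closest
  // coarse centroid c(v). What follows is standard PQ encoding of the
  // residual r = v - c(v): r is split into numSubQuantizers_ sub-vectors
  // of dimPerSubQuantizer_ dimensions, and each sub-vector is quantized
  // independently to its nearest sub-quantizer centroid. For illustration
  // only (this scalar host-side sketch is not part of the build, and
  // `l2DistanceSquared`, `centroid` and `encoding` are hypothetical
  // helpers), the per-vector computation is equivalent to:
  //
  //   for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
  //     const float* sub = residual + subQ * dimPerSubQuantizer_;
  //     int best = 0;
  //     for (int code = 1; code < numSubQuantizerCodes_; ++code) {
  //       if (l2DistanceSquared(sub, centroid(subQ, code)) <
  //           l2DistanceSquared(sub, centroid(subQ, best))) {
  //         best = code;
  //       }
  //     }
  //     encoding[subQ] = best;
  //   }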
  // Calculate the residual for each closest centroid
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);

  // Residuals are in the form
  // (vec x numSubQuantizer x dimPerSubQuantizer)
  // transpose to
  // (numSubQuantizer x vec x dimPerSubQuantizer)
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);

  // Get the product quantizer centroids in the form
  // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer)
  // which is pqCentroidsMiddleCode_

  // We now have a batch operation to find the top-1 distances:
  // batch size: numSubQuantizer
  // centroids: (numSubQuantizerCodes x dimPerSubQuantizer)
  // residuals: (vec x dimPerSubQuantizer)
  // => (numSubQuantizer x vec x 1)

  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  nullptr, // no transposed storage
                  nullptr, // no precomputed norms
                  residualsTransposeView,
                  1,
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // We don't care about distances
                  true,
                  // Much larger tile size, since these vectors are a
                  // lot smaller than query vectors
                  1024);
  }

  // Now, we have the nearest sub-q centroid for each slice of the
  // residual vector.
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  // Transpose this for easy use
  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);

  // Now we add the encoded vectors to the individual lists
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The vector being added could be invalid (contains NaNs etc.)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }
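
  // Worked example of the bookkeeping above (illustrative values only):
  // if vectors 0, 1 and 2 were assigned to lists {5, 5, 7}, list 5
  // already holds 10 vectors and list 7 holds 3, then after the loop
  //   assignCounts   == {5: 2, 7: 1}
  //   listOffsetHost == [10, 11, 3]
  // i.e., each vector knows the slot it will occupy once the lists are
  // resized below.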
  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    // Resize all of the lists that we are appending to
    for (auto& counts : assignCounts) {
      auto& codes = deviceListData_[counts.first];
      codes->resize(codes->size() + counts.second * bytesPerVector_,
                    stream);
      int newNumVecs = (int) (codes->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize,
                        stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers and sizes on the device for lists that we
    // appended to
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // The vector being added could be invalid (contains NaNs etc.)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new encoded vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // This kernel will handle appending each encoded vector + index to
    // the appropriate list
    runIVFPQInvertedListAppend(listIds,
                               listOffset,
                               encodings,
                               indices,
                               deviceListDataPointers_,
                               deviceListIndexPointers_,
                               indicesOptions_,
                               stream);
  }

  return numAdded;
}
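
// A minimal sketch of how the add path above is driven (illustrative
// only; real callers construct these tensors via GpuIndexIVFPQ, and
// `index` here is a hypothetical IVFPQ*):
//
//   Tensor<float, 2, true> vecs = ...; // (n x dim_), on the device
//   Tensor<long, 1, true> ids   = ...; // n user-supplied indices
//   int added = index->classifyAndAddVectors(vecs, ids);
//   // `added` may be less than n if some vectors received an invalid
//   // (negative) coarse assignment, e.g. vectors containing NaNs.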
void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // We only have int32 length representations on the GPU per each
  // list; the length is in sizeof(char)
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  // And our size has changed too
  int listLength = listCodes->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // device_vector add is potentially happening on a different stream
  // than our default stream
  if (resources_->getDefaultStreamCurrentDevice() != 0) {
    streamWait({stream}, {0});
  }
}

void
IVFPQ::setPQCentroids_(float* data) {
  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  // Make sure the data is on the host
  // FIXME: why are we doing this?
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(
    pqHost,
    resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also maintain the PQ centroids in the form
  // (sub q)(code id)(sub dim)
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}
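
// Layout summary for the two centroid copies maintained above, shown
// with illustrative sizes (numSubQuantizers_ = 2,
// numSubQuantizerCodes_ = 256, dimPerSubQuantizer_ = 4):
//   pqCentroidsMiddleCode_    : (2 x 256 x 4)  (sub q)(code id)(sub dim)
//   pqCentroidsInnermostCode_ : (2 x 4 x 256)  (sub q)(sub dim)(code id)
// The middle-code layout feeds the batched GEMMs in precomputeCodes_()
// and runPQPrecomputedCodes_(); the innermost-code layout is what the
// no-precomputed-codes scan consumes.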
void
IVFPQ::precomputeCodes_() {
  //
  //    d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
  //        ---------------   ---------------------------   -----------
  //            term 1                 term 2                  term 3
  //
  // Terms 1 and 3 are available only at query time. We compute term 2
  // here.
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  // Compute ||y_R||^2 by treating
  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
  auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_.view<2>(
    {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms, true,
            resources_->getDefaultStreamCurrentDevice());

  // Compute 2 * (y_C|y_R) via batch matrix multiplication
  // batch size (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'}
  // => (sub q) x {(centroid id)(code id)}
  // => (sub q)(centroid id)(code id)

  // View (centroid id)(dim) as
  // (centroid id)(sub q)(dim)
  // Transpose (centroid id)(sub q)(sub dim) to
  // (sub q)(centroid id)(sub dim)
  auto centroidView = coarseCentroids.view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        resources_->getDefaultStreamCurrentDevice());

  // Transpose (sub q)(centroid id)(code id) to
  // (centroid id)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  // View (centroid id)(sub q)(code id) as
  // (centroid id)(sub q * code id)
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});

  // Sum || y_R ||^2 + 2 * (y_C|y_R)
  // i.e., add norms (sub q * code id)
  // along columns of inner product (centroid id)(sub q * code id)
  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     resources_->getDefaultStreamCurrentDevice());

#ifdef FAISS_USE_FLOAT16
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ = toHalf(resources_,
                                  resources_->getDefaultStreamCurrentDevice(),
                                  coarsePQProductTransposed);
    return;
  }
#endif

  // We added into the view, so `coarsePQProductTransposed` is now our
  // precomputed term 2.
  precomputedCode_ = std::move(coarsePQProductTransposed);
}

void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the closest coarse centroids
  DeviceTensor<float, 2, true> coarseDistances(
    mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true> coarseIndices(
    mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest coarse centroids; we can use int
  // indices both internally and externally
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    true);

  if (precomputedCodes_) {
    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}
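
// Illustrative call into the query path above (names hypothetical; in
// practice this is reached through GpuIndexIVFPQ::search):
//
//   DeviceTensor<float, 2, true> distances(mem, {numQueries, k}, stream);
//   DeviceTensor<long, 2, true> labels(mem, {numQueries, k}, stream);
//   ivfpq->query(queries, nprobe, k, distances, labels);
//   // With precomputed codes enabled, only terms 1 and 3 of the
//   // distance decomposition are computed per query; term 2 was built
//   // once in precomputeCodes_().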
std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}

Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
  return pqCentroidsMiddleCode_;
}

void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute precomputed code term 3, - 2 * (x|y_R)
  // This is done via batch MM
  // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'} =>
  // {sub q} x {(query id)(code id)}
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  // These allocations within are only temporary, so release them when
  // we're done to maximize free space
  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);
    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }

  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
#ifdef FAISS_USE_FLOAT16
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half = toHalf(resources_, stream, term3Transposed);
    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  }
#endif

  if (!useFloat16LookupTables_) {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2, // term 2
                                term3, // term 3
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}

void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  outDistances,
                                  outIndices,
                                  resources_);
}

} } // namespace