/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "IVFFlat.cuh"
#include "../GpuResources.h"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "IVFFlatScan.cuh"
#include "RemapIndices.h"
#include "../utils/CopyUtils.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Float16.cuh"
#include "../utils/HostTensor.cuh"
#include "../utils/Transpose.cuh"
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {

IVFFlat::IVFFlat(GpuResources* resources,
                 FlatIndex* quantizer,
                 bool l2Distance,
                 bool useFloat16,
                 IndicesOptions indicesOptions,
                 MemorySpace space) :
    IVFBase(resources,
            quantizer,
#ifdef FAISS_USE_FLOAT16
            useFloat16 ?
            sizeof(half) * quantizer->getDim() :
            sizeof(float) * quantizer->getDim(),
#else
            sizeof(float) * quantizer->getDim(),
#endif
            indicesOptions,
            space),
    l2Distance_(l2Distance),
    useFloat16_(useFloat16) {
#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT_MSG(!useFloat16, "float16 unsupported");
  useFloat16_ = false;
#endif
}

IVFFlat::~IVFFlat() {
}

void
IVFFlat::addCodeVectorsFromCpu(int listId,
                               const float* vecs,
                               const long* indices,
                               size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listData = deviceListData_[listId];
  auto prevData = listData->data();

  // We only have int32 length representations on the GPU per list;
  // the length is in units of sizeof(char)
  FAISS_ASSERT(listData->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());
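  // Note: when useFloat16_ is set, bytesPerVector_ was initialized in the
  // constructor as sizeof(half) * dim_, so lengthInBytes above already
  // measures the encoded (half-precision) size of the incoming vectors,
  // not the size of their float32 source representation.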
  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    // We have to convert the data to the half format.
    // Make sure the source data is on our device first; it is not
    // guaranteed to be on the device before function entry, to avoid
    // unnecessary h2d copies
    auto floatData =
      toDevice<float, 1>(resources_,
                         getCurrentDevice(),
                         (float*) vecs,
                         stream,
                         {(int) numVecs * dim_});
    auto halfData = toHalf<1>(resources_, stream, floatData);

    listData->append((unsigned char*) halfData.data(),
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
#endif
  } else {
    listData->append((unsigned char*) vecs,
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
  }

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevData != listData->data()) {
    deviceListDataPointers_[listId] = listData->data();
  }

  // And our size has changed too
  int listLength = listData->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // device_vector add is potentially happening on a different stream
  // than our default stream
  if (stream != 0) {
    streamWait({stream}, {0});
  }
}

int
IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                               Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Now we add the encoded vectors to the individual lists.
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIds.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The added vector could be invalid (contains NaNs etc)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }
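  // At this point listOffsetHost[i] holds the final write position of
  // vector i within its assigned list. Illustrative example only: if
  // list 7 already holds 10 vectors and three incoming vectors are
  // assigned to it, they receive offsets 10, 11 and 12, and
  // assignCounts[7] == 3, so list 7 is grown by exactly 3 slots below.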
  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    for (auto& counts : assignCounts) {
      auto& data = deviceListData_[counts.first];
      data->resize(data->size() + counts.second * bytesPerVector_,
                   stream);
      int newNumVecs = (int) (data->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize,
                        stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers to the lists on the device that may have
    // changed
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // The added vector could be invalid (contains NaNs etc)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // Now, for each list to which a vector is being assigned, write it
    runIVFFlatInvertedListAppend(listIds,
                                 listOffset,
                                 vecs,
                                 indices,
                                 useFloat16_,
                                 deviceListDataPointers_,
                                 deviceListIndexPointers_,
                                 indicesOptions_,
                                 stream);
  }

  return numAdded;
}

void
IVFFlat::query(Tensor<float, 2, true>& queries,
               int nprobe,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the quantized information
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest lists; we can use int indices both
  // internally and externally
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    false);
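  // The scan below walks the selected inverted lists and k-selects the
  // best candidates per query; maxListLength_, maintained on every add,
  // is what allows the multi-pass kernels to size their intermediate
  // scratch space up front.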
  runIVFFlatScan(queries,
                 coarseIndices,
                 deviceListDataPointers_,
                 deviceListIndexPointers_,
                 indicesOptions_,
                 deviceListLengths_,
                 maxListLength_,
                 k,
                 l2Distance_,
                 useFloat16_,
                 outDistances,
                 outIndices,
                 resources_);

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, in which case these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}

std::vector<float>
IVFFlat::getListVectors(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());
  auto& encVecs = *deviceListData_[listId];

  auto stream = resources_->getDefaultStreamCurrentDevice();

  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    size_t num = encVecs.size() / sizeof(half);

    Tensor<half, 1, true> devHalf((half*) encVecs.data(), {(int) num});
    auto devFloat = fromHalf(resources_, stream, devHalf);

    std::vector<float> out(num);
    HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
    hostFloat.copyFrom(devFloat, stream);

    return out;
#endif
  }

  size_t num = encVecs.size() / sizeof(float);

  Tensor<float, 1, true> devFloat((float*) encVecs.data(), {(int) num});

  std::vector<float> out(num);
  HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
  hostFloat.copyFrom(devFloat, stream);

  return out;
}

} } // namespace
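// Usage sketch (illustrative only; `res`, `quantizer`, and the tensor
// arguments are assumed to be a ready GpuResources instance, a trained
// coarse quantizer, and appropriately-sized device tensors):
//
//   faiss::gpu::IVFFlat index(&res, &quantizer,
//                             true,  /* L2 distance */
//                             false, /* store lists as float32 */
//                             INDICES_64_BIT,
//                             MemorySpace::Device);
//   int numAdded = index.classifyAndAddVectors(vecs, indices);
//   index.query(queries, nprobe, k, outDistances, outIndices);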