Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
IVFFlat.cu
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 
#include "IVFFlat.cuh"
#include "../GpuResources.h"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "IVFFlatScan.cuh"
#include "RemapIndices.h"
#include "../utils/CopyUtils.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Float16.cuh"
#include "../utils/HostTensor.cuh"
#include "../utils/Transpose.cuh"
#include <algorithm>
#include <limits>
#include <unordered_map>
#include <vector>
#include <thrust/host_vector.h>
24 
25 namespace faiss { namespace gpu {
26 
28  FlatIndex* quantizer,
29  bool l2Distance,
30  bool useFloat16,
31  IndicesOptions indicesOptions,
32  MemorySpace space) :
33  IVFBase(resources,
34  quantizer,
35 #ifdef FAISS_USE_FLOAT16
36  useFloat16 ?
37  sizeof(half) * quantizer->getDim()
38  : sizeof(float) * quantizer->getDim(),
39 #else
40  sizeof(float) * quantizer->getDim(),
41 #endif
42  indicesOptions,
43  space),
44  l2Distance_(l2Distance),
45  useFloat16_(useFloat16) {
46 }
47 
48 IVFFlat::~IVFFlat() {
49 }
50 
51 void
53  const float* vecs,
54  const long* indices,
55  size_t numVecs) {
56  // This list must already exist
57  FAISS_ASSERT(listId < deviceListData_.size());
59 
60  // If there's nothing to add, then there's nothing we have to do
61  if (numVecs == 0) {
62  return;
63  }
64 
65  size_t lengthInBytes = numVecs * bytesPerVector_;
66 
67  auto& listData = deviceListData_[listId];
68  auto prevData = listData->data();
69 
70  // We only have int32 length representations on the GPU per each
71  // list; the length is in sizeof(char)
72  FAISS_ASSERT(listData->size() + lengthInBytes <=
73  (size_t) std::numeric_limits<int>::max());
74 
75  if (useFloat16_) {
76 #ifdef FAISS_USE_FLOAT16
77  // We have to convert data to the half format.
78  // Make sure the source data is on our device first; it is not
79  // guaranteed before function entry to avoid unnecessary h2d copies
80  auto floatData =
81  toDevice<float, 1>(resources_,
82  getCurrentDevice(),
83  (float*) vecs,
84  stream,
85  {(int) numVecs * dim_});
86  auto halfData = toHalf<1>(resources_, stream, floatData);
87 
88  listData->append((unsigned char*) halfData.data(),
89  lengthInBytes,
90  stream,
91  true /* exact reserved size */);
92 #else
93  // we are not compiling with float16 support
94  FAISS_ASSERT(false);
95 #endif
96  } else {
97  listData->append((unsigned char*) vecs,
98  lengthInBytes,
99  stream,
100  true /* exact reserved size */);
101  }
102 
103  // Handle the indices as well
104  addIndicesFromCpu_(listId, indices, numVecs);
105 
106  // This list address may have changed due to vector resizing, but
107  // only bother updating it on the device if it has changed
108  if (prevData != listData->data()) {
109  deviceListDataPointers_[listId] = listData->data();
110  }
111 
112  // And our size has changed too
113  int listLength = listData->size() / bytesPerVector_;
114  deviceListLengths_[listId] = listLength;
115 
116  // We update this as well, since the multi-pass algorithm uses it
117  maxListLength_ = std::max(maxListLength_, listLength);
118 
119  // device_vector add is potentially happening on a different stream
120  // than our default stream
121  if (stream != 0) {
122  streamWait({stream}, {0});
123  }
124 }
125 
126 int
128  Tensor<long, 1, true>& indices) {
129  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
130  FAISS_ASSERT(vecs.getSize(1) == dim_);
131 
133  auto stream = resources_->getDefaultStreamCurrentDevice();
134 
135  // Number of valid vectors that we actually add; we return this
136  int numAdded = 0;
137 
138  // We don't actually need this
139  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
140  // We use this
141  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
142  auto listIds = listIds2d.view<1>({vecs.getSize(0)});
143 
144  quantizer_->query(vecs, 1, listDistance, listIds2d, false);
145 
146  // Copy the lists that we wish to append to back to the CPU
147  // FIXME: really this can be into pinned memory and a true async
148  // copy on a different stream; we can start the copy early, but it's
149  // tiny
150  HostTensor<int, 1, true> listIdsHost(listIds, stream);
151 
152  // Now we add the encoded vectors to the individual lists
153  // First, make sure that there is space available for adding the new
154  // encoded vectors and indices
155 
156  // list id -> # being added
157  std::unordered_map<int, int> assignCounts;
158 
159  // vector id -> offset in list
160  // (we already have vector id -> list id in listIds)
161  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});
162 
163  for (int i = 0; i < listIds.getSize(0); ++i) {
164  int listId = listIdsHost[i];
165 
166  // Add vector could be invalid (contains NaNs etc)
167  if (listId < 0) {
168  listOffsetHost[i] = -1;
169  continue;
170  }
171 
172  FAISS_ASSERT(listId < numLists_);
173  ++numAdded;
174 
175  int offset = deviceListData_[listId]->size() / bytesPerVector_;
176 
177  auto it = assignCounts.find(listId);
178  if (it != assignCounts.end()) {
179  offset += it->second;
180  it->second++;
181  } else {
182  assignCounts[listId] = 1;
183  }
184 
185  listOffsetHost[i] = offset;
186  }
187 
188  // If we didn't add anything (all invalid vectors), no need to
189  // continue
190  if (numAdded == 0) {
191  return 0;
192  }
193 
194  // We need to resize the data structures for the inverted lists on
195  // the GPUs, which means that they might need reallocation, which
196  // means that their base address may change. Figure out the new base
197  // addresses, and update those in a batch on the device
198  {
199  for (auto& counts : assignCounts) {
200  auto& data = deviceListData_[counts.first];
201  data->resize(data->size() + counts.second * bytesPerVector_,
202  stream);
203  int newNumVecs = (int) (data->size() / bytesPerVector_);
204 
205  auto& indices = deviceListIndices_[counts.first];
206  if ((indicesOptions_ == INDICES_32_BIT) ||
207  (indicesOptions_ == INDICES_64_BIT)) {
208  size_t indexSize =
209  (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);
210 
211  indices->resize(indices->size() + counts.second * indexSize, stream);
212  } else if (indicesOptions_ == INDICES_CPU) {
213  // indices are stored on the CPU side
214  FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());
215 
216  auto& userIndices = listOffsetToUserIndex_[counts.first];
217  userIndices.resize(newNumVecs);
218  } else {
219  // indices are not stored on the GPU or CPU side
220  FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
221  }
222 
223  // This is used by the multi-pass query to decide how much scratch
224  // space to allocate for intermediate results
225  maxListLength_ = std::max(maxListLength_, newNumVecs);
226  }
227 
228  // Update all pointers to the lists on the device that may have
229  // changed
230  {
231  std::vector<int> listIds(assignCounts.size());
232  int i = 0;
233  for (auto& counts : assignCounts) {
234  listIds[i++] = counts.first;
235  }
236 
237  updateDeviceListInfo_(listIds, stream);
238  }
239  }
240 
241  // If we're maintaining the indices on the CPU side, update our
242  // map. We already resized our map above.
243  if (indicesOptions_ == INDICES_CPU) {
244  // We need to maintain the indices on the CPU side
245  HostTensor<long, 1, true> hostIndices(indices, stream);
246 
247  for (int i = 0; i < hostIndices.getSize(0); ++i) {
248  int listId = listIdsHost[i];
249 
250  // Add vector could be invalid (contains NaNs etc)
251  if (listId < 0) {
252  continue;
253  }
254 
255  int offset = listOffsetHost[i];
256 
257  FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
258  auto& userIndices = listOffsetToUserIndex_[listId];
259 
260  FAISS_ASSERT(offset < userIndices.size());
261  userIndices[offset] = hostIndices[i];
262  }
263  }
264 
265  // We similarly need to actually append the new vectors
266  {
267  DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);
268 
269  // Now, for each list to which a vector is being assigned, write it
270  runIVFFlatInvertedListAppend(listIds,
271  listOffset,
272  vecs,
273  indices,
274  useFloat16_,
278  stream);
279  }
280 
281  return numAdded;
282 }
283 
284 void
286  int nprobe,
287  int k,
288  Tensor<float, 2, true>& outDistances,
289  Tensor<long, 2, true>& outIndices) {
291  auto stream = resources_->getDefaultStreamCurrentDevice();
292 
293  // These are caught at a higher level
294  FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K);
295  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);
296  nprobe = std::min(nprobe, quantizer_->getSize());
297 
298  FAISS_ASSERT(queries.getSize(1) == dim_);
299 
300  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
301  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
302 
303  // Reserve space for the quantized information
305  coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
307  coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
308 
309  // Find the `nprobe` closest lists; we can use int indices both
310  // internally and externally
311  quantizer_->query(queries,
312  nprobe,
313  coarseDistances,
314  coarseIndices,
315  false);
316 
317  runIVFFlatScan(queries,
318  coarseIndices,
324  k,
325  l2Distance_,
326  useFloat16_,
327  outDistances,
328  outIndices,
329  resources_);
330 
331  // If the GPU isn't storing indices (they are on the CPU side), we
332  // need to perform the re-mapping here
333  // FIXME: we might ultimately be calling this function with inputs
334  // from the CPU, these are unnecessary copies
335  if (indicesOptions_ == INDICES_CPU) {
336  HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
337 
338  ivfOffsetToUserIndex(hostOutIndices.data(),
339  numLists_,
340  hostOutIndices.getSize(0),
341  hostOutIndices.getSize(1),
343 
344  // Copy back to GPU, since the input to this function is on the
345  // GPU
346  outIndices.copyFrom(hostOutIndices, stream);
347  }
348 }
349 
350 std::vector<float>
351 IVFFlat::getListVectors(int listId) const {
352  FAISS_ASSERT(listId < deviceListData_.size());
353  auto& encVecs = *deviceListData_[listId];
354 
355  auto stream = resources_->getDefaultStreamCurrentDevice();
356 
357  if (useFloat16_) {
358 #ifdef FAISS_USE_FLOAT16
359  size_t num = encVecs.size() / sizeof(half);
360 
361  Tensor<half, 1, true> devHalf((half*) encVecs.data(), {(int) num});
362  auto devFloat = fromHalf(resources_, stream, devHalf);
363 
364  std::vector<float> out(num);
365  HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
366  hostFloat.copyFrom(devFloat, stream);
367 
368  return out;
369 #endif
370  }
371 
372  size_t num = encVecs.size() / sizeof(float);
373 
374  Tensor<float, 1, true> devFloat((float*) encVecs.data(), {(int) num});
375 
376  std::vector<float> out(num);
377  HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
378  hostFloat.copyFrom(devFloat, stream);
379 
380  return out;
381 }
382 
383 } } // namespace
const int numLists_
Number of inverted lists we maintain.
Definition: IVFBase.cuh:89
int maxListLength_
Maximum list length seen.
Definition: IVFBase.cuh:113
cudaStream_t getDefaultStreamCurrentDevice()
Calls getDefaultStream with the current device.
int getSize() const
Returns the number of vectors we contain.
Definition: FlatIndex.cu:45
std::vector< std::vector< long > > listOffsetToUserIndex_
Definition: IVFBase.cuh:125
Holder of GPU resources for a particular flat index.
Definition: FlatIndex.cuh:21
__host__ __device__ Tensor< T, SubDim, InnerContig, IndexT, PtrTraits > view(DataPtrType at)
Definition: Tensor-inl.cuh:632
Base inverted list functionality for IVFFlat and IVFPQ.
Definition: IVFBase.cuh:25
IVFFlat(GpuResources *resources, FlatIndex *quantizer, bool l2Distance, bool useFloat16, IndicesOptions indicesOptions, MemorySpace space)
Construct from a coarse quantizer; per-vector storage size is derived from the quantizer's dimension (float16 or float32, per useFloat16).
Definition: IVFFlat.cu:27
thrust::device_vector< int > deviceListLengths_
Definition: IVFBase.cuh:110
thrust::device_vector< void * > deviceListIndexPointers_
Definition: IVFBase.cuh:106
int classifyAndAddVectors(Tensor< float, 2, true > &vecs, Tensor< long, 1, true > &indices)
Definition: IVFFlat.cu:127
DeviceMemory & getMemoryManagerCurrentDevice()
Calls getMemoryManager for the current device.
__host__ void copyFrom(Tensor< T, Dim, InnerContig, IndexT, PtrTraits > &t, cudaStream_t stream)
Copies a tensor into ourselves; sizes must match.
Definition: Tensor-inl.cuh:130
FlatIndex * quantizer_
Quantizer object.
Definition: IVFBase.cuh:83
__host__ __device__ IndexT getSize(int i) const
Definition: Tensor.cuh:222
thrust::device_vector< void * > deviceListDataPointers_
Definition: IVFBase.cuh:102
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
Definition: Tensor.cuh:174
GpuResources * resources_
Collection of GPU resources that we use.
Definition: IVFBase.cuh:80
Our tensor type.
Definition: Tensor.cuh:28
void addCodeVectorsFromCpu(int listId, const float *vecs, const long *indices, size_t numVecs)
Definition: IVFFlat.cu:52
const int bytesPerVector_
Number of bytes per vector in the list.
Definition: IVFBase.cuh:92
void query(Tensor< float, 2, true > &queries, int nprobe, int k, Tensor< float, 2, true > &outDistances, Tensor< long, 2, true > &outIndices)
Definition: IVFFlat.cu:285
std::vector< float > getListVectors(int listId) const
Return the vectors of a particular list back to the CPU.
Definition: IVFFlat.cu:351
void updateDeviceListInfo_(cudaStream_t stream)
Update all device-side list pointer and size information.
Definition: IVFBase.cu:136
const IndicesOptions indicesOptions_
How are user indices stored on the GPU?
Definition: IVFBase.cuh:95
std::vector< std::unique_ptr< DeviceVector< unsigned char > > > deviceListData_
Definition: IVFBase.cuh:119
const int dim_
Expected dimensionality of the vectors.
Definition: IVFBase.cuh:86
void addIndicesFromCpu_(int listId, const long *indices, size_t numVecs)
Shared function to copy indices from CPU to GPU.
Definition: IVFBase.cu:243