Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
IVFFlat.cu
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 
#include "IVFFlat.cuh"
#include "../GpuResources.h"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "IVFFlatScan.cuh"
#include "RemapIndices.h"
#include "../utils/CopyUtils.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Float16.cuh"
#include "../utils/HostTensor.cuh"
#include "../utils/Transpose.cuh"
#include <algorithm>
#include <limits>
#include <unordered_map>
#include <vector>
#include <thrust/host_vector.h>
24 
25 namespace faiss { namespace gpu {
26 
28  FlatIndex* quantizer,
29  bool l2Distance,
30  bool useFloat16,
31  IndicesOptions indicesOptions,
32  MemorySpace space) :
33  IVFBase(resources,
34  quantizer,
35 #ifdef FAISS_USE_FLOAT16
36  useFloat16 ?
37  sizeof(half) * quantizer->getDim()
38  : sizeof(float) * quantizer->getDim(),
39 #else
40  sizeof(float) * quantizer->getDim(),
41 #endif
42  indicesOptions,
43  space),
44  l2Distance_(l2Distance),
45  useFloat16_(useFloat16) {
46 }
47 
48 IVFFlat::~IVFFlat() {
49 }
50 
51 void
53  const float* vecs,
54  const long* indices,
55  size_t numVecs) {
56  // This list must already exist
57  FAISS_ASSERT(listId < deviceListData_.size());
59 
60  // If there's nothing to add, then there's nothing we have to do
61  if (numVecs == 0) {
62  return;
63  }
64 
65  size_t lengthInBytes = numVecs * bytesPerVector_;
66 
67  auto& listData = deviceListData_[listId];
68  auto prevData = listData->data();
69 
70  // We only have int32 length representations on the GPU per each
71  // list; the length is in sizeof(char)
72  FAISS_ASSERT(listData->size() + lengthInBytes <=
73  (size_t) std::numeric_limits<int>::max());
74 
75  if (useFloat16_) {
76 #ifdef FAISS_USE_FLOAT16
77  // We have to convert data to the half format.
78  // Make sure the source data is on our device first; it is not
79  // guaranteed before function entry to avoid unnecessary h2d copies
80  auto floatData =
81  toDevice<float, 1>(resources_,
82  getCurrentDevice(),
83  (float*) vecs,
84  stream,
85  {(int) numVecs * dim_});
86  auto halfData = toHalf<1>(resources_, stream, floatData);
87 
88  listData->append((unsigned char*) halfData.data(),
89  lengthInBytes,
90  stream,
91  true /* exact reserved size */);
92 #else
93  // we are not compiling with float16 support
94  FAISS_ASSERT(false);
95 #endif
96  } else {
97  listData->append((unsigned char*) vecs,
98  lengthInBytes,
99  stream,
100  true /* exact reserved size */);
101  }
102 
103  // Handle the indices as well
104  addIndicesFromCpu_(listId, indices, numVecs);
105 
106  // This list address may have changed due to vector resizing, but
107  // only bother updating it on the device if it has changed
108  if (prevData != listData->data()) {
109  deviceListDataPointers_[listId] = listData->data();
110  }
111 
112  // And our size has changed too
113  int listLength = listData->size() / bytesPerVector_;
114  deviceListLengths_[listId] = listLength;
115 
116  // We update this as well, since the multi-pass algorithm uses it
117  maxListLength_ = std::max(maxListLength_, listLength);
118 
119  // device_vector add is potentially happening on a different stream
120  // than our default stream
121  if (stream != 0) {
122  streamWait({stream}, {0});
123  }
124 }
125 
126 int
128  Tensor<long, 1, true>& indices) {
129  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
130  FAISS_ASSERT(vecs.getSize(1) == dim_);
131 
133  auto stream = resources_->getDefaultStreamCurrentDevice();
134 
135  // Number of valid vectors that we actually add; we return this
136  int numAdded = 0;
137 
138  // We don't actually need this
139  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
140  // We use this
141  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
142  auto listIds = listIds2d.view<1>({vecs.getSize(0)});
143 
144  quantizer_->query(vecs, 1, listDistance, listIds2d, false);
145 
146  // Copy the lists that we wish to append to back to the CPU
147  // FIXME: really this can be into pinned memory and a true async
148  // copy on a different stream; we can start the copy early, but it's
149  // tiny
150  HostTensor<int, 1, true> listIdsHost(listIds, stream);
151 
152  // Now we add the encoded vectors to the individual lists
153  // First, make sure that there is space available for adding the new
154  // encoded vectors and indices
155 
156  // list id -> # being added
157  std::unordered_map<int, int> assignCounts;
158 
159  // vector id -> offset in list
160  // (we already have vector id -> list id in listIds)
161  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});
162 
163  for (int i = 0; i < listIds.getSize(0); ++i) {
164  int listId = listIdsHost[i];
165 
166  // Add vector could be invalid (contains NaNs etc)
167  if (listId < 0) {
168  listOffsetHost[i] = -1;
169  continue;
170  }
171 
172  FAISS_ASSERT(listId < numLists_);
173  ++numAdded;
174 
175  int offset = deviceListData_[listId]->size() / bytesPerVector_;
176 
177  auto it = assignCounts.find(listId);
178  if (it != assignCounts.end()) {
179  offset += it->second;
180  it->second++;
181  } else {
182  assignCounts[listId] = 1;
183  }
184 
185  listOffsetHost[i] = offset;
186  }
187 
188  // If we didn't add anything (all invalid vectors), no need to
189  // continue
190  if (numAdded == 0) {
191  return 0;
192  }
193 
194  // We need to resize the data structures for the inverted lists on
195  // the GPUs, which means that they might need reallocation, which
196  // means that their base address may change. Figure out the new base
197  // addresses, and update those in a batch on the device
198  {
199  for (auto& counts : assignCounts) {
200  auto& data = deviceListData_[counts.first];
201  data->resize(data->size() + counts.second * bytesPerVector_,
202  stream);
203  int newNumVecs = (int) (data->size() / bytesPerVector_);
204 
205  auto& indices = deviceListIndices_[counts.first];
206  if ((indicesOptions_ == INDICES_32_BIT) ||
207  (indicesOptions_ == INDICES_64_BIT)) {
208  size_t indexSize =
209  (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);
210 
211  indices->resize(indices->size() + counts.second * indexSize, stream);
212  } else if (indicesOptions_ == INDICES_CPU) {
213  // indices are stored on the CPU side
214  FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());
215 
216  auto& userIndices = listOffsetToUserIndex_[counts.first];
217  userIndices.resize(newNumVecs);
218  } else {
219  // indices are not stored on the GPU or CPU side
220  FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
221  }
222 
223  // This is used by the multi-pass query to decide how much scratch
224  // space to allocate for intermediate results
225  maxListLength_ = std::max(maxListLength_, newNumVecs);
226  }
227 
228  // Update all pointers to the lists on the device that may have
229  // changed
230  {
231  std::vector<int> listIds(assignCounts.size());
232  int i = 0;
233  for (auto& counts : assignCounts) {
234  listIds[i++] = counts.first;
235  }
236 
237  updateDeviceListInfo_(listIds, stream);
238  }
239  }
240 
241  // If we're maintaining the indices on the CPU side, update our
242  // map. We already resized our map above.
243  if (indicesOptions_ == INDICES_CPU) {
244  // We need to maintain the indices on the CPU side
245  HostTensor<long, 1, true> hostIndices(indices, stream);
246 
247  for (int i = 0; i < hostIndices.getSize(0); ++i) {
248  int listId = listIdsHost[i];
249 
250  // Add vector could be invalid (contains NaNs etc)
251  if (listId < 0) {
252  continue;
253  }
254 
255  int offset = listOffsetHost[i];
256 
257  FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
258  auto& userIndices = listOffsetToUserIndex_[listId];
259 
260  FAISS_ASSERT(offset < userIndices.size());
261  userIndices[offset] = hostIndices[i];
262  }
263  }
264 
265  // We similarly need to actually append the new vectors
266  {
267  DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);
268 
269  // Now, for each list to which a vector is being assigned, write it
270  runIVFFlatInvertedListAppend(listIds,
271  listOffset,
272  vecs,
273  indices,
274  useFloat16_,
278  stream);
279  }
280 
281  return numAdded;
282 }
283 
284 void
286  int nprobe,
287  int k,
288  Tensor<float, 2, true>& outDistances,
289  Tensor<long, 2, true>& outIndices) {
291  auto stream = resources_->getDefaultStreamCurrentDevice();
292 
293  // These are caught at a higher level
294  FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K);
295  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);
296  nprobe = std::min(nprobe, quantizer_->getSize());
297 
298  FAISS_ASSERT(queries.getSize(1) == dim_);
299 
300  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
301  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
302 
303  // Reserve space for the quantized information
305  coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
307  coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
308 
309  // Find the `nprobe` closest lists; we can use int indices both
310  // internally and externally
311  quantizer_->query(queries,
312  nprobe,
313  coarseDistances,
314  coarseIndices,
315  false);
316 
317  runIVFFlatScan(queries,
318  coarseIndices,
324  k,
325  l2Distance_,
326  useFloat16_,
327  outDistances,
328  outIndices,
329  resources_);
330 
331  // If the GPU isn't storing indices (they are on the CPU side), we
332  // need to perform the re-mapping here
333  // FIXME: we might ultimately be calling this function with inputs
334  // from the CPU, these are unnecessary copies
335  if (indicesOptions_ == INDICES_CPU) {
336  HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
337 
338  ivfOffsetToUserIndex(hostOutIndices.data(),
339  numLists_,
340  hostOutIndices.getSize(0),
341  hostOutIndices.getSize(1),
343 
344  // Copy back to GPU, since the input to this function is on the
345  // GPU
346  outIndices.copyFrom(hostOutIndices, stream);
347  }
348 }
349 
350 std::vector<float>
351 IVFFlat::getListVectors(int listId) const {
352  FAISS_ASSERT(listId < deviceListData_.size());
353  auto& encVecs = *deviceListData_[listId];
354 
355  auto stream = resources_->getDefaultStreamCurrentDevice();
356 
357  if (useFloat16_) {
358 #ifdef FAISS_USE_FLOAT16
359  size_t num = encVecs.size() / sizeof(half);
360 
361  Tensor<half, 1, true> devHalf((half*) encVecs.data(), {(int) num});
362  auto devFloat = fromHalf(resources_, stream, devHalf);
363 
364  std::vector<float> out(num);
365  HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
366  hostFloat.copyFrom(devFloat, stream);
367 
368  return out;
369 #endif
370  }
371 
372  size_t num = encVecs.size() / sizeof(float);
373 
374  Tensor<float, 1, true> devFloat((float*) encVecs.data(), {(int) num});
375 
376  std::vector<float> out(num);
377  HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
378  hostFloat.copyFrom(devFloat, stream);
379 
380  return out;
381 }
382 
383 } } // namespace
const int numLists_
Number of inverted lists we maintain.
Definition: IVFBase.cuh:89
int maxListLength_
Maximum list length seen.
Definition: IVFBase.cuh:113
cudaStream_t getDefaultStreamCurrentDevice()
Calls getDefaultStream with the current device.
int getSize() const
Returns the number of vectors we contain.
Definition: FlatIndex.cu:45
std::vector< std::vector< long > > listOffsetToUserIndex_
Definition: IVFBase.cuh:125
Holder of GPU resources for a particular flat index.
Definition: FlatIndex.cuh:21
__host__ __device__ Tensor< T, SubDim, InnerContig, IndexT, PtrTraits > view(DataPtrType at)
Definition: Tensor-inl.cuh:632
Base inverted list functionality for IVFFlat and IVFPQ.
Definition: IVFBase.cuh:25
IVFFlat(GpuResources *resources, FlatIndex *quantizer, bool l2Distance, bool useFloat16, IndicesOptions indicesOptions, MemorySpace space)
Construct from a coarse quantizer; per-vector storage size is derived from the quantizer's dimension (float16 or float32, per useFloat16).
Definition: IVFFlat.cu:27
thrust::device_vector< int > deviceListLengths_
Definition: IVFBase.cuh:110
thrust::device_vector< void * > deviceListIndexPointers_
Definition: IVFBase.cuh:106
int classifyAndAddVectors(Tensor< float, 2, true > &vecs, Tensor< long, 1, true > &indices)
Definition: IVFFlat.cu:127
DeviceMemory & getMemoryManagerCurrentDevice()
Calls getMemoryManager for the current device.
__host__ void copyFrom(Tensor< T, Dim, InnerContig, IndexT, PtrTraits > &t, cudaStream_t stream)
Copies a tensor into ourselves; sizes must match.
Definition: Tensor-inl.cuh:130
FlatIndex * quantizer_
Quantizer object.
Definition: IVFBase.cuh:83
__host__ __device__ IndexT getSize(int i) const
Definition: Tensor.cuh:222
thrust::device_vector< void * > deviceListDataPointers_
Definition: IVFBase.cuh:102
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
Definition: Tensor.cuh:174
GpuResources * resources_
Collection of GPU resources that we use.
Definition: IVFBase.cuh:80
Our tensor type.
Definition: Tensor.cuh:28
void addCodeVectorsFromCpu(int listId, const float *vecs, const long *indices, size_t numVecs)
Definition: IVFFlat.cu:52
const int bytesPerVector_
Number of bytes per vector in the list.
Definition: IVFBase.cuh:92
void query(Tensor< float, 2, true > &queries, int nprobe, int k, Tensor< float, 2, true > &outDistances, Tensor< long, 2, true > &outIndices)
Definition: IVFFlat.cu:285
std::vector< float > getListVectors(int listId) const
Return the vectors of a particular list back to the CPU.
Definition: IVFFlat.cu:351
void updateDeviceListInfo_(cudaStream_t stream)
Update all device-side list pointer and size information.
Definition: IVFBase.cu:136
const IndicesOptions indicesOptions_
How are user indices stored on the GPU?
Definition: IVFBase.cuh:95
std::vector< std::unique_ptr< DeviceVector< unsigned char > > > deviceListData_
Definition: IVFBase.cuh:119
const int dim_
Expected dimensionality of the vectors.
Definition: IVFBase.cuh:86
void addIndicesFromCpu_(int listId, const long *indices, size_t numVecs)
Shared function to copy indices from CPU to GPU.
Definition: IVFBase.cu:243