Faiss
IVFFlat.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */


#include "IVFFlat.cuh"
#include "../GpuResources.h"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "IVFFlatScan.cuh"
#include "RemapIndices.h"
#include "../utils/CopyUtils.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Float16.cuh"
#include "../utils/HostTensor.cuh"
#include "../utils/Transpose.cuh"
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {

IVFFlat::IVFFlat(GpuResources* resources,
                 FlatIndex* quantizer,
                 bool l2Distance,
                 bool useFloat16,
                 IndicesOptions indicesOptions,
                 MemorySpace space) :
    IVFBase(resources,
            quantizer,
#ifdef FAISS_USE_FLOAT16
            useFloat16 ?
              sizeof(half) * quantizer->getDim()
              : sizeof(float) * quantizer->getDim(),
#else
            sizeof(float) * quantizer->getDim(),
#endif
            indicesOptions,
            space),
    l2Distance_(l2Distance),
    useFloat16_(useFloat16) {
}
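
// Note on the member initializer above: the per-vector byte size handed
// to IVFBase is sizeof(half) * dim when storing float16, otherwise
// sizeof(float) * dim; e.g. dim = 64 gives 128 vs. 256 bytes per vector.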

IVFFlat::~IVFFlat() {
}

void
IVFFlat::addCodeVectorsFromCpu(int listId,
                               const float* vecs,
                               const long* indices,
                               size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listData = deviceListData_[listId];
  auto prevData = listData->data();

  // We only have int32 length representations on the GPU per each
  // list; the length is in sizeof(char)
  FAISS_ASSERT(listData->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    // We have to convert data to the half format.
    // Make sure the source data is on our device first; it is not
    // guaranteed before function entry to avoid unnecessary h2d copies
    auto floatData =
      toDevice<float, 1>(resources_,
                         getCurrentDevice(),
                         (float*) vecs,
                         stream,
                         {(int) numVecs * dim_});
    auto halfData = toHalf<1>(resources_, stream, floatData);

    listData->append((unsigned char*) halfData.data(),
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
#else
    // we are not compiling with float16 support
    FAISS_ASSERT(false);
#endif
  } else {
    listData->append((unsigned char*) vecs,
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
  }
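
  // Size arithmetic for the appends above: bytesPerVector_ was fixed at
  // construction (sizeof(half) * dim_ or sizeof(float) * dim_), so with
  // dim_ = 64 and numVecs = 100, lengthInBytes is 12,800 bytes for
  // float16 storage versus 25,600 bytes for float32.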

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevData != listData->data()) {
    deviceListDataPointers_[listId] = listData->data();
  }

  // And our size has changed too
  int listLength = listData->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // device_vector add is potentially happening on a different stream
  // than our default stream
  if (stream != 0) {
    streamWait({stream}, {0});
  }
}
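
// A minimal usage sketch for the above (hypothetical caller, not part of
// this file): append 100 CPU-resident float vectors and their user ids
// to existing list 3 of an IVFFlat instance `ivf`:
//
//   ivf.addCodeVectorsFromCpu(3, cpuVecs, cpuIds, 100);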

int
IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                               Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Now we add the encoded vectors to the individual lists
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIds.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The vector being added could be invalid (contains NaNs, etc.)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }
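
  // Bookkeeping example for the loop above: if list 5 currently holds
  // 10 vectors and input vectors i = 0 and i = 2 both map to it, they
  // receive offsets 10 and 11, and assignCounts[5] ends up as 2.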

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }

  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    for (auto& counts : assignCounts) {
      auto& data = deviceListData_[counts.first];
      data->resize(data->size() + counts.second * bytesPerVector_,
                   stream);
      int newNumVecs = (int) (data->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize, stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers to the lists on the device that may have
    // changed
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // The vector being added could be invalid (contains NaNs, etc.)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // Now, for each list to which a vector is being assigned, write it
    runIVFFlatInvertedListAppend(listIds,
                                 listOffset,
                                 vecs,
                                 indices,
                                 useFloat16_,
                                 deviceListDataPointers_,
                                 deviceListIndexPointers_,
                                 indicesOptions_,
                                 stream);
  }

  return numAdded;
}

void
IVFFlat::query(Tensor<float, 2, true>& queries,
               int nprobe,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);

  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the quantized information
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest lists; we can use int indices both
  // internally and externally
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    false);
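
  // At this point coarseIndices is a [numQueries x nprobe] tensor of
  // inverted-list ids (and coarseDistances the corresponding centroid
  // distances); the scan below visits exactly those lists per query.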

  runIVFFlatScan(queries,
                 coarseIndices,
                 deviceListDataPointers_,
                 deviceListIndexPointers_,
                 indicesOptions_,
                 deviceListLengths_,
                 maxListLength_,
                 k,
                 l2Distance_,
                 useFloat16_,
                 outDistances,
                 outIndices,
                 resources_);

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}
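
// A minimal usage sketch for query() (hypothetical caller, not part of
// this file): with device tensors `queries` [n x dim_], `outDistances`
// [n x k] and `outIndices` [n x k] already allocated, probing 32 lists:
//
//   ivf.query(queries, 32, k, outDistances, outIndices);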

std::vector<float>
IVFFlat::getListVectors(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());
  auto& encVecs = *deviceListData_[listId];

  auto stream = resources_->getDefaultStreamCurrentDevice();

  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    size_t num = encVecs.size() / sizeof(half);

    Tensor<half, 1, true> devHalf((half*) encVecs.data(), {(int) num});
    auto devFloat = fromHalf(resources_, stream, devHalf);

    std::vector<float> out(num);
    HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
    hostFloat.copyFrom(devFloat, stream);

    return out;
#endif
  }

  size_t num = encVecs.size() / sizeof(float);

  Tensor<float, 1, true> devFloat((float*) encVecs.data(), {(int) num});

  std::vector<float> out(num);
  HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
  hostFloat.copyFrom(devFloat, stream);

  return out;
}
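
// The vector returned above holds listLength * dim_ floats; float16
// encoded lists are converted back to float32 on the device before the
// device-to-host copy.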

} } // namespace