Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
IVFFlat.cu
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
#include "IVFFlat.cuh"
#include "../GpuResources.h"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "IVFFlatScan.cuh"
#include "RemapIndices.h"
#include "../utils/CopyUtils.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Float16.cuh"
#include "../utils/HostTensor.cuh"
#include "../utils/Transpose.cuh"
#include <algorithm>
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>
26 
27 namespace faiss { namespace gpu {
28 
/// Construct an IVF flat-storage index over the given coarse quantizer.
/// `l2Distance` selects L2 vs. inner-product scanning; `useFloat16`
/// requests half-precision storage for encoded vectors and is only
/// honored when compiled with FAISS_USE_FLOAT16.
IVFFlat::IVFFlat(GpuResources* resources,
                 FlatIndex* quantizer,
                 bool l2Distance,
                 bool useFloat16,
                 IndicesOptions indicesOptions,
                 MemorySpace space) :
    IVFBase(resources,
            quantizer,
            // Bytes per encoded vector depend on storage precision
#ifdef FAISS_USE_FLOAT16
            useFloat16 ?
            sizeof(half) * quantizer->getDim()
            : sizeof(float) * quantizer->getDim(),
#else
            sizeof(float) * quantizer->getDim(),
#endif
            indicesOptions,
            space),
    l2Distance_(l2Distance),
    useFloat16_(useFloat16) {
#ifndef FAISS_USE_FLOAT16
  // Half support was not compiled in; a float16 request cannot be honored
  FAISS_ASSERT_MSG(!useFloat16, "float16 unsupported");
  useFloat16_ = false;
#endif
}
53 
// Nothing to release here beyond what IVFBase tears down.
IVFFlat::~IVFFlat() {
}
56 
/// Appends `numVecs` pre-assigned vectors (and their user indices) from
/// host memory to the inverted list `listId`. The list must already
/// exist. Converts to half precision on the device when float16 storage
/// is enabled, then updates the device-side list pointer/length tables.
void
IVFFlat::addCodeVectorsFromCpu(int listId,
                               const float* vecs,
                               const long* indices,
                               size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listData = deviceListData_[listId];
  auto prevData = listData->data();

  // We only have int32 length representations on the GPU per each
  // list; the length is in sizeof(char)
  FAISS_ASSERT(listData->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    // We have to convert data to the half format.
    // Make sure the source data is on our device first; it is not
    // guaranteed before function entry to avoid unnecessary h2d copies
    auto floatData =
      toDevice<float, 1>(resources_,
                         getCurrentDevice(),
                         (float*) vecs,
                         stream,
                         {(int) numVecs * dim_});
    auto halfData = toHalf<1>(resources_, stream, floatData);

    listData->append((unsigned char*) halfData.data(),
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
#endif
  } else {
    listData->append((unsigned char*) vecs,
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
  }

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevData != listData->data()) {
    deviceListDataPointers_[listId] = listData->data();
  }

  // And our size has changed too
  int listLength = listData->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // device_vector add is potentially happening on a different stream
  // than our default stream
  if (stream != 0) {
    streamWait({stream}, {0});
  }
}
128 
/// Assigns each input vector to its closest inverted list via the
/// coarse quantizer, resizes the affected lists, and appends the
/// vectors and their user indices on the device. Vectors that the
/// quantizer marks invalid (negative list id, e.g. NaN inputs) are
/// skipped. Returns the number of vectors actually added.
int
IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                               Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Now we add the encoded vectors to the individual lists
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIds.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // Add vector could be invalid (contains NaNs etc)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    // Offset of this vector within its list: current list size plus
    // however many earlier vectors in this batch went to the same list
    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }

  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    for (auto& counts : assignCounts) {
      auto& data = deviceListData_[counts.first];
      data->resize(data->size() + counts.second * bytesPerVector_,
                   stream);
      int newNumVecs = (int) (data->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize, stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers to the lists on the device that may have
    // changed
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // Add vector could be invalid (contains NaNs etc)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // Now, for each list to which a vector is being assigned, write it
    runIVFFlatInvertedListAppend(listIds,
                                 listOffset,
                                 vecs,
                                 indices,
                                 useFloat16_,
                                 deviceListDataPointers_,
                                 deviceListIndexPointers_,
                                 indicesOptions_,
                                 stream);
  }

  return numAdded;
}
286 
/// Performs a k-NN search: finds the `nprobe` closest inverted lists
/// per query via the coarse quantizer, scans those lists, and writes
/// the top-k distances/indices into the output tensors. When user
/// indices live on the CPU (INDICES_CPU), remaps IVF offsets to user
/// ids after the scan. Both nprobe and k are capped at 1024.
void
IVFFlat::query(Tensor<float, 2, true>& queries,
               int nprobe,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);
  // Cannot probe more lists than the quantizer contains
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);

  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the quantized information
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest lists; we can use int indices both
  // internally and externally
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    false);

  runIVFFlatScan(queries,
                 coarseIndices,
                 deviceListDataPointers_,
                 deviceListIndexPointers_,
                 indicesOptions_,
                 deviceListLengths_,
                 maxListLength_,
                 k,
                 l2Distance_,
                 useFloat16_,
                 outDistances,
                 outIndices,
                 resources_);

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}
352 
// Returns the encoded vectors of inverted list `listId`, decoded to
// float and copied back to the CPU.
std::vector<float>
IVFFlat::getListVectors(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());
  auto& encVecs = *deviceListData_[listId];

  auto stream = resources_->getDefaultStreamCurrentDevice();

  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    // Stored as half; decode on the device before copying back
    size_t numElems = encVecs.size() / sizeof(half);

    Tensor<half, 1, true> encoded((half*) encVecs.data(), {(int) numElems});
    auto decoded = fromHalf(resources_, stream, encoded);

    std::vector<float> result(numElems);
    HostTensor<float, 1, true> hostResult(result.data(), {(int) numElems});
    hostResult.copyFrom(decoded, stream);

    return result;
#endif
  }

  // Stored as float; copy back directly
  size_t numElems = encVecs.size() / sizeof(float);

  Tensor<float, 1, true> deviceVecs((float*) encVecs.data(), {(int) numElems});

  std::vector<float> result(numElems);
  HostTensor<float, 1, true> hostResult(result.data(), {(int) numElems});
  hostResult.copyFrom(deviceVecs, stream);

  return result;
}
385 
386 } } // namespace
const int numLists_
Number of inverted lists we maintain.
Definition: IVFBase.cuh:91
int maxListLength_
Maximum list length seen.
Definition: IVFBase.cuh:115
cudaStream_t getDefaultStreamCurrentDevice()
Calls getDefaultStream with the current device.
int getSize() const
Returns the number of vectors we contain.
Definition: FlatIndex.cu:47
std::vector< std::vector< long > > listOffsetToUserIndex_
Definition: IVFBase.cuh:127
Holder of GPU resources for a particular flat index.
Definition: FlatIndex.cuh:23
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
Definition: Tensor.cuh:173
Base inverted list functionality for IVFFlat and IVFPQ.
Definition: IVFBase.cuh:27
IVFFlat(GpuResources *resources, FlatIndex *quantizer, bool l2Distance, bool useFloat16, IndicesOptions indicesOptions, MemorySpace space)
Construct from a quantizer that has elements resident on the GPU.
Definition: IVFFlat.cu:29
thrust::device_vector< int > deviceListLengths_
Definition: IVFBase.cuh:112
thrust::device_vector< void * > deviceListIndexPointers_
Definition: IVFBase.cuh:108
int classifyAndAddVectors(Tensor< float, 2, true > &vecs, Tensor< long, 1, true > &indices)
Definition: IVFFlat.cu:130
DeviceMemory & getMemoryManagerCurrentDevice()
Calls getMemoryManager for the current device.
FlatIndex * quantizer_
Quantizer object.
Definition: IVFBase.cuh:85
thrust::device_vector< void * > deviceListDataPointers_
Definition: IVFBase.cuh:104
GpuResources * resources_
Collection of GPU resources that we use.
Definition: IVFBase.cuh:82
Our tensor type.
Definition: Tensor.cuh:30
__host__ __device__ Tensor< T, SubDim, Contig, IndexT, PtrTraits > view(DataPtrType at)
Definition: Tensor-inl.cuh:578
void addCodeVectorsFromCpu(int listId, const float *vecs, const long *indices, size_t numVecs)
Definition: IVFFlat.cu:58
const int bytesPerVector_
Number of bytes per vector in the list.
Definition: IVFBase.cuh:94
void query(Tensor< float, 2, true > &queries, int nprobe, int k, Tensor< float, 2, true > &outDistances, Tensor< long, 2, true > &outIndices)
Definition: IVFFlat.cu:288
std::vector< float > getListVectors(int listId) const
Return the vectors of a particular list back to the CPU.
Definition: IVFFlat.cu:354
void updateDeviceListInfo_(cudaStream_t stream)
Update all device-side list pointer and size information.
Definition: IVFBase.cu:138
__host__ __device__ IndexT getSize(int i) const
Definition: Tensor.cuh:221
__host__ void copyFrom(Tensor< T, Dim, Contig, IndexT, PtrTraits > &t, cudaStream_t stream)
Copies a tensor into ourselves; sizes must match.
Definition: Tensor-inl.cuh:101
const IndicesOptions indicesOptions_
How are user indices stored on the GPU?
Definition: IVFBase.cuh:97
std::vector< std::unique_ptr< DeviceVector< unsigned char > > > deviceListData_
Definition: IVFBase.cuh:121
const int dim_
Expected dimensionality of the vectors.
Definition: IVFBase.cuh:88
void addIndicesFromCpu_(int listId, const long *indices, size_t numVecs)
Shared function to copy indices from CPU to GPU.
Definition: IVFBase.cu:245