IVFFlat.cu

/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "IVFFlat.cuh"
#include "../GpuResources.h"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "IVFFlatScan.cuh"
#include "RemapIndices.h"
#include "../utils/CopyUtils.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Float16.cuh"
#include "../utils/HostTensor.cuh"
#include "../utils/Transpose.cuh"
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {
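
// IVFFlat stores the inverted lists as raw vectors on the GPU, either
// as float32 or (if enabled) as float16 values; the shared inverted-list
// machinery lives in IVFBase.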

IVFFlat::IVFFlat(GpuResources* resources,
                 FlatIndex* quantizer,
                 bool l2Distance,
                 bool useFloat16,
                 IndicesOptions indicesOptions) :
    IVFBase(resources,
            quantizer,
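            // Per-vector storage size: dim half values if float16
            // storage is enabled, otherwise dim floats; this becomes
            // bytesPerVector_ in IVFBase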
#ifdef FAISS_USE_FLOAT16
            useFloat16 ?
            sizeof(half) * quantizer->getDim()
            : sizeof(float) * quantizer->getDim(),
#else
            sizeof(float) * quantizer->getDim(),
#endif
            indicesOptions),
    l2Distance_(l2Distance),
    useFloat16_(useFloat16) {
#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16 || !"float16 unsupported");
  useFloat16_ = false;
#endif
}

IVFFlat::~IVFFlat() {
}

void
IVFFlat::addCodeVectorsFromCpu(int listId,
                               const float* vecs,
                               const long* indices,
                               size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

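  // bytesPerVector_ already reflects the encoded (float32 or float16)
  // size of each vector, so this is the total encoded size of the batch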
  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listData = deviceListData_[listId];
  auto prevData = listData->data();

  // We only have an int32 length representation on the GPU per each
  // list; the length is in bytes (sizeof(char))
  FAISS_ASSERT(listData->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    // We have to convert the data to the half format.
    // Make sure the source data is on our device first; this is not
    // guaranteed before function entry, in order to avoid unnecessary
    // h2d copies
    auto floatData =
      toDevice<float, 1>(resources_,
                         getCurrentDevice(),
                         (float*) vecs,
                         stream,
                         {(int) numVecs * dim_});
    auto halfData = toHalf<1>(resources_, stream, floatData);

    listData->append((unsigned char*) halfData.data(),
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
#endif
  } else {
    listData->append((unsigned char*) vecs,
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
  }

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // The list base address may have changed due to the resize; only
  // bother updating it on the device if it has changed
  if (prevData != listData->data()) {
    deviceListDataPointers_[listId] = listData->data();
  }

  // And our size has changed too
  int listLength = listData->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // The device_vector updates potentially happen on a different stream
  // than our default stream
  if (stream != 0) {
    streamWait({stream}, {0});
  }
}

int
IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                               Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need the coarse distances, but the quantizer
  // query computes them
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We do use the assigned list ids
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

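  // Assign each vector to its single closest coarse centroid (a 1-NN
  // query against the quantizer); invalid inputs get a list id of -1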
  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Now we add the encoded vectors to the individual lists.
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIds.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The vector being added could be invalid (contains NaNs etc)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

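    // The new vector is appended at the current end of the list, offset
    // by however many vectors this batch has already assigned to the
    // same list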
    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }

  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    for (auto& counts : assignCounts) {
      auto& data = deviceListData_[counts.first];
      data->resize(data->size() + counts.second * bytesPerVector_,
                   stream);
      int newNumVecs = (int) (data->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize, stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers to the lists on the device that may have
    // changed
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // The vector being added could be invalid (contains NaNs etc)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // Now, for each list to which a vector is being assigned, write it
    runIVFFlatInvertedListAppend(listIds,
                                 listOffset,
                                 vecs,
                                 indices,
                                 useFloat16_,
                                 deviceListDataPointers_,
                                 deviceListIndexPointers_,
                                 indicesOptions_,
                                 stream);
  }

  return numAdded;
}

void
IVFFlat::query(Tensor<float, 2, true>& queries,
               int nprobe,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);
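  // We cannot probe more lists than the quantizer has centroids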
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);

  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the quantized information
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest lists; we can use int indices both
  // internally and externally
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    false);

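  // Scan the selected inverted lists and k-select the best candidates
  // per query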
  runIVFFlatScan(queries,
                 coarseIndices,
                 deviceListDataPointers_,
                 deviceListIndexPointers_,
                 indicesOptions_,
                 deviceListLengths_,
                 maxListLength_,
                 k,
                 l2Distance_,
                 useFloat16_,
                 outDistances,
                 outIndices,
                 resources_);

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}

std::vector<float>
IVFFlat::getListVectors(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());
  auto& encVecs = *deviceListData_[listId];

  auto stream = resources_->getDefaultStreamCurrentDevice();

  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
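    // Convert the float16 codes back to float32 on the device before
    // copying them to the host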
    size_t num = encVecs.size() / sizeof(half);

    Tensor<half, 1, true> devHalf((half*) encVecs.data(), {(int) num});
    auto devFloat = fromHalf(resources_, stream, devHalf);

    std::vector<float> out(num);
    HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
    hostFloat.copyFrom(devFloat, stream);

    return out;
#endif
  }

  size_t num = encVecs.size() / sizeof(float);

  Tensor<float, 1, true> devFloat((float*) encVecs.data(), {(int) num});

  std::vector<float> out(num);
  HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
  hostFloat.copyFrom(devFloat, stream);

  return out;
}

} } // namespace