#include "GpuIndexFlat.h"
#include "../IndexFlat.h"
#include "GpuResources.h"
#include "impl/FlatIndex.cuh"
#include "utils/CopyUtils.cuh"
#include "utils/DeviceUtils.h"
#include "utils/StaticUtils.h"

#include <thrust/execution_policy.h>
#include <thrust/transform.h>

#include <algorithm>
#include <cstring>
#include <limits>
#include <memory>

namespace faiss { namespace gpu {
/// Default CPU data size at and above which we page copies to the GPU
constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;

/// Page size used for CPU -> GPU copies when no pinned memory is available
constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
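// For intuition, a worked example (d = 128 is an assumed value, not one from
// this file): a float32 vector occupies 128 * sizeof(float) = 512 bytes, so
// the 256 MiB threshold is crossed at 268435456 / 512 = 524288 query vectors,
// past which search() takes the paged path below.
static_assert(kMinPageSize / (128 * sizeof(float)) == 524288,
              "worked example of the paging threshold");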
GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           const faiss::IndexFlat* index,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, config.device, index->d, index->metric_type),
    minPagedSize_(kMinPageSize),
    config_(config) {
  this->is_trained = true; // a flat index requires no training
  copyFrom(index);
}

GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           int dims,
                           faiss::MetricType metric,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, config.device, dims, metric),
    minPagedSize_(kMinPageSize),
    config_(config) {
  this->is_trained = true;

  // Construct the flat storage on our device
  DeviceScope scope(device_);
  data_ = new FlatIndex(resources,
                        dims,
                        metric == faiss::METRIC_L2,
                        config_.useFloat16,
                        config_.storeTransposed);
}
GpuIndexFlat::~GpuIndexFlat() {
  delete data_;
}
bool
GpuIndexFlat::getUseFloat16() const {
  return config_.useFloat16;
}
void
GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  // ... (copies d and metric_type from `index`)

  // GPU kernels use 32-bit indices
  FAISS_ASSERT(index->ntotal <=
               (faiss::Index::idx_t) std::numeric_limits<int>::max());
  this->ntotal = index->ntotal;

  delete data_;
  data_ = new FlatIndex(resources_,
                        this->d,
                        index->metric_type == faiss::METRIC_L2,
                        config_.useFloat16,
                        config_.storeTransposed);
  // ... (adds index->xb to data_ if the source index is non-empty)
}
void
GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  // ... (copies d, ntotal and metric_type back to `index`)
  index->xb.resize(this->ntotal * this->d);

  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    // Vectors are stored as float16; produce a float32 copy first
    auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
    fromDevice(vecFloat32, index->xb.data(), stream);
  } else {
    fromDevice(data_->getVectorsFloat32Ref(), index->xb.data(), stream);
  }
}
void
GpuIndexFlat::addImpl_(faiss::Index::idx_t n, const float* x,
                       const faiss::Index::idx_t* ids) {
  // ...
  // With 32-bit GPU indexing we cannot hold more than INT_MAX vectors
  FAISS_ASSERT(this->ntotal + n <=
               (faiss::Index::idx_t) std::numeric_limits<int>::max());

  data_->add(x, n, resources_->getDefaultStream(device_));
  this->ntotal += n;
}
// Convert the int indices produced by FlatIndex to the wider
// faiss::Index::idx_t expected by callers
struct IntToLong {
  __device__ long operator()(int v) const { return (long) v; }
};
void
GpuIndexFlat::search(faiss::Index::idx_t n, const float* x,
                     faiss::Index::idx_t k,
                     float* distances, faiss::Index::idx_t* labels) const {
  // ...
  FAISS_ASSERT(k <= 1024); // k-selection kernel limitation

  DeviceScope scope(device_);
  auto stream = resources_->getDefaultStream(device_);

  // Make space for the output distances on the GPU, or wrap `distances`
  // directly if it is already device-resident
  auto outDistances = toDevice<float, 2>(
    resources_, device_, distances, stream, {(int) n, (int) k});

  // FlatIndex only returns int indices
  DeviceTensor<int, 2> outIntIndices(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) n, (int) k}, stream);

  bool usePaged = false;

  if (getDeviceForAddress(x) == -1) {
    // The input vectors are on the CPU; take the paged path if they are
    // large enough to be worth pipelining
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(),
                          outIntIndices.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(),
                    outIntIndices.data());
  }

  // Widen the int indices to idx_t on the device
  auto outIndices = toDevice<faiss::Index::idx_t, 2>(
    resources_, device_, labels, stream, {(int) n, (int) k});

  thrust::transform(thrust::cuda::par.on(stream),
                    outIntIndices.data(), outIntIndices.end(),
                    outIndices.data(), IntToLong());

  // Copy back if the caller's buffers were host pointers
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<faiss::Index::idx_t, 2>(outIndices, labels, stream);
}
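// A hedged sketch (not part of the original file) of calling search() from
// host code; exampleSearch and the sizes are illustrative, and <vector> is
// assumed. Because xq, distances and labels are host pointers, search()
// stages them through the device automatically, paging the queries once
// they exceed minPagedSize_.
void exampleSearch(GpuIndexFlat& index, int d) {
  faiss::Index::idx_t nq = 8, k = 10;

  std::vector<float> xq(nq * d, 0.0f);             // queries (host)
  std::vector<float> distances(nq * k);            // output distances (host)
  std::vector<faiss::Index::idx_t> labels(nq * k); // output ids (host)

  index.search(nq, xq.data(), k, distances.data(), labels.data());
}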
void
GpuIndexFlat::searchImpl_(faiss::Index::idx_t n, const float* x,
                          faiss::Index::idx_t k,
                          float* distances,
                          faiss::Index::idx_t* labels) const {
  // search() above implements this itself
  FAISS_ASSERT(!"Should not be called");
}
void
GpuIndexFlat::searchNonPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  auto stream = resources_->getDefaultStream(device_);

  // Ensure the query vectors are resident on our device, staging them
  // through temporary memory if necessary
  auto vecs = toDevice<float, 2>(resources_,
                                 device_,
                                 const_cast<float*>(x),
                                 stream,
                                 {n, (int) this->d});

  data_->query(vecs, k, outDistances, outIndices, true);
}
void
GpuIndexFlat::searchFromCpuPaged_(int n,
                                  const float* x,
                                  int k,
                                  float* outDistancesData,
                                  int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  // Is pinned memory available?
  auto pinnedAlloc = resources_->getPinnedMemory();
  int pageSizeInVecs =
    (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // No pinned memory: batch the queries without overlapping copy
    // with compute
    int batchSize = utils::nextHighestPowerOf2(
      (int) ((size_t) kNonPinnedPageSize / (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }
  // Pinned memory is available: overlap the host-side copy, the
  // host -> device copy, and the GPU compute by double-buffering
  // through the two halves of the pinned region
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};

  // Reserve GPU space for the destination of each pinned buffer copy
  DeviceTensor<float, 2> bufGpuA(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d}, defaultStream);
  DeviceTensor<float, 2> bufGpuB(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d}, defaultStream);
  DeviceTensor<float, 2>* bufGpus[2] = {&bufGpuA, &bufGpuB};

  // Copy-completion events for the pinned buffers
  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];

  // Execute-completion events for the GPU buffers
  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];

  // Stage 1: CPU -> pinned copy cursor
  int cur1 = 0;
  int cur1BufIndex = 0;

  // Stage 2: pinned -> GPU copy cursor
  int cur2 = -1;
  int cur2BufIndex = 0;

  // Stage 3: GPU compute cursor
  int cur3 = -1;
  int cur3BufIndex = 0;
  while (cur3 < n) {
    // Stage 2: issue the async pinned -> GPU copy first
    if (cur2 != -1 && cur2 < n) {
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Any compute still reading this GPU buffer must finish first
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Mark the copy's completion in the copy stream
      eventPinnedCopyDone[cur2BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));

      // Stage 3 picks up from here
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }
    // Stage 3: run the query on the device-resident page
    if (cur3 != -1 && cur3 < n) {
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // The pinned -> GPU copy into this buffer must have completed
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      Tensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
                                   {numToProcess, this->d});
      auto outDistancesSlice =
        outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice =
        outIndices.narrowOutermost(cur3, numToProcess);

      data_->query(input, k,
                   outDistancesSlice,
                   outIndicesSlice, true);

      // Mark the compute's completion in the default stream
      eventGpuExecuteDone[cur3BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));

      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }
    // Stage 1: copy CPU memory into the next pinned buffer
    if (cur1 < n) {
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // The previous pinned -> GPU copy out of this buffer must be done
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // Stage 2 picks up from here
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}
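// A minimal standalone sketch (not part of the original file) of the same
// three-stage pattern in plain CUDA: raw cudaEvent_t replaces the CudaEvent
// wrapper above, and pagedApply/process are hypothetical names. Each
// pinned/device buffer pair alternates, so the host memcpy of page i+1
// overlaps the H2D copy and the kernel work of page i.
void pagedApply(const float* hostSrc, size_t totalElems, size_t pageElems,
                float* pinned[2], float* devBuf[2],
                cudaStream_t copyStream, cudaStream_t computeStream,
                void (*process)(const float*, size_t, cudaStream_t)) {
  cudaEvent_t copyDone[2], computeDone[2];
  for (int i = 0; i < 2; ++i) {
    cudaEventCreate(&copyDone[i]);
    cudaEventCreate(&computeDone[i]);
  }

  for (size_t cur = 0; cur < totalElems; cur += pageElems) {
    int buf = (int) ((cur / pageElems) & 1);
    size_t num = std::min(pageElems, totalElems - cur);

    // Stage 1: host -> pinned; wait until the prior H2D copy has drained
    // this pinned buffer (waiting on a never-recorded event is a no-op)
    cudaEventSynchronize(copyDone[buf]);
    memcpy(pinned[buf], hostSrc + cur, num * sizeof(float));

    // Stage 2: pinned -> device on the copy stream, once the prior
    // compute pass has released this device buffer
    cudaStreamWaitEvent(copyStream, computeDone[buf], 0);
    cudaMemcpyAsync(devBuf[buf], pinned[buf], num * sizeof(float),
                    cudaMemcpyHostToDevice, copyStream);
    cudaEventRecord(copyDone[buf], copyStream);

    // Stage 3: compute on its own stream, once this page's copy is done
    cudaStreamWaitEvent(computeStream, copyDone[buf], 0);
    process(devBuf[buf], num, computeStream);
    cudaEventRecord(computeDone[buf], computeStream);
  }

  cudaStreamSynchronize(computeStream);
  for (int i = 0; i < 2; ++i) {
    cudaEventDestroy(copyDone[i]);
    cudaEventDestroy(computeDone[i]);
  }
}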
void
GpuIndexFlat::reconstruct(faiss::Index::idx_t key, float* out) const {
  DeviceScope scope(device_);

  FAISS_ASSERT(key < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    // Vectors are stored as float16; produce a float32 copy first
    auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
    fromDevice(vec.data(), out, this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[key];
    fromDevice(vec.data(), out, this->d, stream);
  }
}
void
GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0,
                            faiss::Index::idx_t num,
                            float* out) const {
  DeviceScope scope(device_);

  FAISS_ASSERT(i0 < this->ntotal);
  FAISS_ASSERT(i0 + num - 1 < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
    fromDevice(vec.data(), out, num * this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref().narrowOutermost(i0, num);
    fromDevice(vec.data(), out, this->d * num, stream);
  }
}
void
GpuIndexFlat::set_typename() {
  if (this->metric_type == faiss::METRIC_L2) {
    this->index_typename = "GpuL2";
  } else {
    this->index_typename = "GpuIP";
  }
}
GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {
}

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {
}

} } // namespace faiss::gpu
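// A hedged usage sketch (not part of the original file): mirroring a CPU
// IndexFlatL2 onto the GPU and copying it back. It assumes <vector> and
// "StandardGpuResources.h" are available; exampleRoundTrip and all sizes
// are illustrative.
void exampleRoundTrip() {
  int d = 64;
  faiss::IndexFlatL2 cpuIndex(d);

  std::vector<float> xb(1000 * (size_t) d, 0.0f); // placeholder vectors
  cpuIndex.add(1000, xb.data());

  faiss::gpu::StandardGpuResources resources;
  faiss::gpu::GpuIndexFlatConfig config;
  config.device = 0;

  // Clones d, metric_type, ntotal and the stored vectors onto the GPU
  faiss::gpu::GpuIndexFlatL2 gpuIndex(&resources, &cpuIndex, config);

  // ... add / search on the GPU ...

  // Copy the (possibly modified) contents back to the CPU index
  gpuIndex.copyTo(&cpuIndex);
}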