#include "GpuIndexFlat.h"
#include "../IndexFlat.h"
#include "GpuResources.h"
#include "impl/FlatIndex.cuh"
#include "utils/CopyUtils.cuh"
#include "utils/DeviceUtils.h"
#include "utils/StaticUtils.h"

#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include <limits>

namespace faiss { namespace gpu {

/// Size (in bytes) of host query data at which we switch to paged copies
/// from the CPU to the GPU
constexpr size_t kMinPagedQuerySize = (size_t) 256 * 1024 * 1024;

/// Page size (in bytes) used when paging without pinned memory
constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;

GpuIndexFlat::GpuIndexFlat(GpuResources* resources, int device,
                           bool useFloat16, const faiss::IndexFlat* index) :
    GpuIndex(resources, device, index->d, index->metric_type),
    minPagedSize_(kMinPagedQuerySize),
    useFloat16_(useFloat16) { /* ... */ }

GpuIndexFlat::GpuIndexFlat(GpuResources* resources, int device, int dims,
                           bool useFloat16, faiss::MetricType metric) :
    GpuIndex(resources, device, dims, metric),
    minPagedSize_(kMinPagedQuerySize),
    useFloat16_(useFloat16) {
  data_ = new FlatIndex(resources, dims, metric == faiss::METRIC_L2, useFloat16);
}

GpuIndexFlat::~GpuIndexFlat() { /* ... */ }
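
// The two constructors above cover the two ways a GpuIndexFlat is built: by
// copying an existing faiss::IndexFlat onto the GPU, or as an empty index of
// a given dimension and metric, backed by a FlatIndex that optionally stores
// its vectors in float16.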

void
GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  // The GPU code indexes vectors with int; the source index must fit
  FAISS_ASSERT(index->ntotal <=
               (faiss::Index::idx_t) std::numeric_limits<int>::max());
  // ...
}

void
GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  index->xb.resize(this->ntotal * this->d);
  // ...
  fromDevice(vecFloat32, index->xb.data(), stream);
  // ...
}

void
GpuIndexFlat::add(Index::idx_t n, const float* x) {
  FAISS_ASSERT(this->ntotal + n <=
               (faiss::Index::idx_t) std::numeric_limits<int>::max());
  // ...
}

// Device functor (enclosing struct definition elided in this listing) used
// with thrust::transform in search() below to widen the int indices produced
// on the GPU to faiss::Index::idx_t:
__device__ long operator()(int v) const { return (long) v; }

void
GpuIndexFlat::search(faiss::Index::idx_t n, const float* x, faiss::Index::idx_t k,
                     float* distances, faiss::Index::idx_t* labels) const {
  FAISS_ASSERT(k <= 1024);
  auto stream = resources_->getDefaultStream(device_);

  // Wrap (or allocate) the output distances on the GPU
  auto outDistances = toDevice<float, 2>(resources_, /* ... */);

  // FlatIndex returns int indices; stage them in a temporary device tensor
  DeviceTensor<int, 2, true> outIntIndices(
      resources_->getMemoryManagerCurrentDevice(),
      {(int) n, (int) k}, stream);

  bool usePaged = false;

  if (getDeviceForAddress(x) == -1) {
    // The query vectors live on the host; page them in if they are large
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(),
                          outIntIndices.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(),
                    outIntIndices.data());
  }

  // Widen the int indices to faiss::Index::idx_t for the caller
  auto outIndices = toDevice<faiss::Index::idx_t, 2>(resources_, /* ... */);

  thrust::transform(thrust::cuda::par.on(stream),
                    outIntIndices.data(),
                    /* ... conversion functor defined above */);

  // Copy results back if the caller's buffers live on the host
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<faiss::Index::idx_t, 2>(outIndices, labels, stream);
}
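
// In short: search() stages its outputs on the GPU, dispatches either to the
// direct path (searchNonPaged_) or, for host-resident query sets of at least
// minPagedSize_ bytes, to the paged path (searchFromCpuPaged_), widens
// FlatIndex's int results to idx_t with thrust::transform, and finally copies
// distances and labels back to the caller's buffers when those are on the host.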

void
GpuIndexFlat::searchNonPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  // Make sure the query data is resident on our device
  auto vecs = toDevice<float, 2>(resources_,
                                 /* ... */
                                 const_cast<float*>(x),
                                 /* ... */);

  data_->query(vecs, k, outDistances, outIndices, true);
}

void
GpuIndexFlat::searchFromCpuPaged_(int n,
                                  const float* x,
                                  int k,
                                  float* outDistancesData,
                                  int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  // Is there a pinned memory arena we can stage host -> GPU copies in?
  auto pinnedAlloc = resources_->getPinnedMemory();
  int pageSizeInVecs =
      (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // No usable pinned memory: fall back to fixed-size non-pinned batches
    int batchSize = utils::nextHighestPowerOf2(
        (int) ((size_t) kNonPinnedPageSize / (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }
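
  // Note: this fallback simply runs the batches back to back in non-pinned
  // pages of kNonPinnedPageSize bytes; unlike the pinned path below, it does
  // not overlap host-to-GPU copies with GPU execution.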

  // Pinned-memory path: overlap host -> pinned -> GPU copies with execution
  // by double-buffering both the pinned pages and the GPU staging tensors
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};

  // GPU-side staging buffers, one per pipeline slot
  DeviceTensor<float, 2, true> bufGpuA(
      resources_->getMemoryManagerCurrentDevice(),
      {(int) pageSizeInVecs, (int) this->d},
      defaultStream);
  DeviceTensor<float, 2, true> bufGpuB(
      resources_->getMemoryManagerCurrentDevice(),
      {(int) pageSizeInVecs, (int) this->d},
      defaultStream);
  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};

  // Events coordinating the copy stream and the execution stream
  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];

  // Cursors for the three pipeline stages; -1 means "not started yet"
  int cur1 = 0;          // stage 1: host -> pinned memcpy
  int cur1BufIndex = 0;
  int cur2 = -1;         // stage 2: pinned -> GPU async copy
  int cur2BufIndex = 0;
  int cur3 = -1;         // stage 3: query execution on the GPU
  int cur3BufIndex = 0;

  while (cur3 < n) {
    // Stage 2: asynchronously copy a filled pinned buffer to the GPU
    if (cur2 != -1 && cur2 < n) {
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Make sure the GPU is done with the previous contents of this buffer
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Record completion of the copy so stage 3 can wait on it
      eventPinnedCopyDone[cur2BufIndex] =
          std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));

      // Hand this batch off to stage 3 and advance
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }

    // Stage 3: run the query on the batch that has finished copying to the GPU
    if (cur3 != -1 && cur3 < n) {
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // Make sure the pinned -> GPU copy for this buffer has completed
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      // Wrap the filled portion of the GPU buffer for this batch
      Tensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
                                   {numToProcess, this->d});
      auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);

      data_->query(input, k,
                   outDistancesSlice,
                   outIndicesSlice,
                   true);

      // Record completion so stage 2 can safely reuse this GPU buffer
      eventGpuExecuteDone[cur3BufIndex] =
          std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));

      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }

    // Stage 1: copy the next batch of host queries into the pinned buffer
    if (cur1 < n) {
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // Make sure the previous pinned -> GPU copy of this buffer has finished
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // Hand this batch off to stage 2 and advance
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}
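
// The loop above overlaps three stages across the two buffer slots: stage 1
// memcpy()s host queries into pinned memory, stage 2 issues cudaMemcpyAsync on
// copyStream to move a pinned page onto the GPU, and stage 3 runs data_->query
// on defaultStream. The CudaEvents (eventPinnedCopyDone / eventGpuExecuteDone)
// keep a buffer from being overwritten while a downstream stage still needs it.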

void
GpuIndexFlat::reconstruct(faiss::Index::idx_t key, float* out) const {
  FAISS_ASSERT(key < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);

  if (useFloat16_) {
    // ... (vec: a float32 copy converted from the float16 storage)
    fromDevice(vec.data(), out, this->d, stream);
  } else {
    // ... (vec: a view of the float32 storage)
    fromDevice(vec.data(), out, this->d, stream);
  }
}

void
GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0, faiss::Index::idx_t num,
                            float* out) const {
  FAISS_ASSERT(i0 < this->ntotal);
  FAISS_ASSERT(i0 + num - 1 < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);

  if (useFloat16_) {
    // ...
    fromDevice(vec.data(), out, num * this->d, stream);
  } else {
    // ...
    fromDevice(vec.data(), out, this->d * num, stream);
  }
}

void
GpuIndexFlat::set_typename() {
  if (this->metric_type == faiss::METRIC_L2) {
    this->index_typename = "GpuL2";
  } else {
    this->index_typename = "GpuIP";
  }
}

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources, int device, int dims, bool useFloat16) :
    GpuIndexFlat(resources, device, dims, useFloat16, faiss::METRIC_L2) {}

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources, int device, int dims, bool useFloat16) :
    GpuIndexFlat(resources, device, dims, useFloat16, faiss::METRIC_INNER_PRODUCT) {}

} } // namespace gpu, namespace faiss
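
// A minimal usage sketch (not part of this file). It assumes the standard
// faiss::gpu::StandardGpuResources provider and the constructor arguments
// shown above; `dims`, `numVecs`, `vectors`, `queries`, `distances`, and
// `labels` are illustrative placeholders:
//
//   faiss::gpu::StandardGpuResources res;
//   faiss::gpu::GpuIndexFlatL2 index(&res, 0 /* device */, dims,
//                                    false /* useFloat16 */);
//   index.add(numVecs, vectors);                 // copies vectors to the GPU
//   index.search(numQueries, queries, k,         // host queries are paged in
//                distances, labels);             //   automatically when large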