#include "GpuIndexFlat.h"
#include "../IndexFlat.h"
#include "GpuResources.h"
#include "impl/FlatIndex.cuh"
#include "utils/ConversionOperators.cuh"
#include "utils/CopyUtils.cuh"
#include "utils/DeviceUtils.h"
#include "utils/Float16.cuh"
#include "utils/StaticUtils.h"

#include <thrust/execution_policy.h>
#include <thrust/transform.h>

#include <algorithm>
#include <cstring>
#include <limits>
#include <memory>
namespace faiss { namespace gpu {
// Default size (in bytes) of host-side query data above which search pages
// copies from the CPU to the GPU
constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;

// Batch size (in bytes) used for paged copies when no pinned memory is
// available
constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
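// For example (illustrative arithmetic): with d = 128, a host-side query set
// only triggers the paged search path once it exceeds
// 256 MiB / (128 * sizeof(float)) = 524,288 vectors.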
GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           const faiss::IndexFlat* index,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, index->d, index->metric_type, config),
    minPagedSize_(kMinPageSize),
    config_(std::move(config)),
    data_(nullptr) {
  verifySettings_();

  // Flat indices do not require training
  this->is_trained = true;

  copyFrom(index);
}
GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           int dims,
                           faiss::MetricType metric,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, dims, metric, config),
    minPagedSize_(kMinPageSize),
    config_(std::move(config)),
    data_(nullptr) {
  verifySettings_();
  this->is_trained = true;

  // Construct the flat storage; it performs L2 lookups iff the metric is L2
  DeviceScope scope(device_);
  data_ = new FlatIndex(resources,
                        dims,
                        metric == faiss::METRIC_L2,
                        config_.useFloat16,
                        // ... remaining config options elided in this listing
                        memorySpace_);
}
GpuIndexFlat::~GpuIndexFlat() {
  delete data_;
}
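// A minimal usage sketch (illustrative, not part of this file; the variable
// names are made up, but StandardGpuResources is the stock GpuResources
// implementation shipped with the GPU code):
//
//   faiss::IndexFlatL2 cpuIndex(128);            // 128-dim CPU index
//   // ... add vectors to cpuIndex ...
//   faiss::gpu::StandardGpuResources res;
//   faiss::gpu::GpuIndexFlatConfig config;
//   config.device = 0;                           // which GPU to use
//   faiss::gpu::GpuIndexFlat gpuIndex(&res, &cpuIndex, config);
//   gpuIndex.search(numQueries, queries, 10, distances, labels);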
void
GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  // ...
  // The GPU kernels index with 32-bit ints
  FAISS_THROW_IF_NOT_FMT(index->ntotal <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices; "
                         "attempting to copy CPU index with %zu parameters",
                         (size_t) std::numeric_limits<int>::max(),
                         (size_t) index->ntotal);
  // ... (re-creates data_ and copies index->xb to the GPU)
}
void
GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  // ...
  index->xb.resize(this->ntotal * this->d);

  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    // float16 storage must first be widened to a float32 copy on the GPU
    auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
    fromDevice(vecFloat32, index->xb.data(), stream);
  }
  // ... (the float32 case copies getVectorsFloat32Ref() out directly)
}
void
GpuIndexFlat::addImpl_(faiss::Index::idx_t n,
                       const float* x,
                       const faiss::Index::idx_t* ids) {
  // We do not support add_with_ids
  FAISS_THROW_IF_NOT_MSG(!ids, "add_with_ids not supported");
  FAISS_THROW_IF_NOT(n > 0);

  // Due to 32-bit GPU indexing, we cannot exceed this many vectors
  FAISS_THROW_IF_NOT_FMT(this->ntotal + n <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices",
                         (size_t) std::numeric_limits<int>::max());

  data_->add(x, n, resources_->getDefaultStream(device_));
  this->ntotal += n;
}
void
GpuIndexFlat::search(faiss::Index::idx_t n,
                     const float* x,
                     faiss::Index::idx_t k,
                     float* distances,
                     faiss::Index::idx_t* labels) const {
  // For now, only support result sets addressable by int
  FAISS_THROW_IF_NOT_FMT(n <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices",
                         (size_t) std::numeric_limits<int>::max());
  FAISS_THROW_IF_NOT_FMT(k <= 1024,
                         "GPU only supports k <= 1024 (requested %d)",
                         (int) k);

  DeviceScope scope(device_);
  auto stream = resources_->getDefaultStream(device_);

  // Make space for the outputs on the GPU (a no-op if the caller's pointers
  // are already resident on this device)
  auto outDistances = toDevice<float, 2>(resources_,
                                         device_,
                                         distances,
                                         stream,
                                         {(int) n, (int) k});

  // FlatIndex only exposes an interface returning int indices
  DeviceTensor<int, 2> outIntIndices(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) n, (int) k}, stream);

  bool usePaged = false;
  if (getDeviceForAddress(x) == -1) {
    // The query vectors live on the host; if they are large, page them in
    // rather than copying everything at once
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(),
                          outIntIndices.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(),
                    outIntIndices.data());
  }
  // Make space for the idx_t labels on the GPU
  auto outIndices = toDevice<faiss::Index::idx_t, 2>(resources_,
                                                     device_,
                                                     labels,
                                                     stream,
                                                     {(int) n, (int) k});

  // Widen int -> idx_t on the GPU; IntToIdxType stands in for the conversion
  // functor defined in ConversionOperators.cuh
  thrust::transform(thrust::cuda::par.on(stream),
                    outIntIndices.data(),
                    outIntIndices.end(),
                    outIndices.data(),
                    IntToIdxType());

  // Copy back to the host if the caller's pointers were host-resident
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<faiss::Index::idx_t, 2>(outIndices, labels, stream);
}
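// A minimal sketch of what such a widening functor could look like (the real
// one lives in ConversionOperators.cuh; this body is assumed):
//
//   struct IntToIdxType {
//     __host__ __device__ faiss::Index::idx_t operator()(const int v) const {
//       return (faiss::Index::idx_t) v;
//     }
//   };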
void
GpuIndexFlat::searchImpl_(faiss::Index::idx_t n,
                          const float* x,
                          faiss::Index::idx_t k,
                          float* distances,
                          faiss::Index::idx_t* labels) const {
  // search() above performs its own paging and never routes through here
  FAISS_ASSERT_MSG(false, "Should not be called");
}
void
GpuIndexFlat::searchNonPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  auto stream = resources_->getDefaultStream(device_);

  // Ensure the query vectors are resident on our device
  auto vecs = toDevice<float, 2>(resources_,
                                 device_,
                                 const_cast<float*>(x),
                                 stream,
                                 {n, (int) this->d});

  data_->query(vecs, k, outDistances, outIndices, true);
}
void
GpuIndexFlat::searchFromCpuPaged_(int n,
                                  const float* x,
                                  int k,
                                  float* outDistancesData,
                                  int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  // Is pinned memory available? Each of the two staging buffers gets half of
  // the pinned region.
  auto pinnedAlloc = resources_->getPinnedMemory();
  int pageSizeInVecs =
    (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // No pinned memory: page in fixed-size batches, without overlapping the
    // copies with compute
    int batchSize = utils::nextHighestPowerOf2(
      (int) ((size_t) kNonPinnedPageSize /
             (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }
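  //
  // Pinned memory is available, so host -> GPU copies can overlap with GPU
  // compute. The code below runs a three-stage pipeline over two staging
  // buffers:
  //
  //   stage 1: memcpy a page of `x` into a pinned host buffer
  //   stage 2: async pinned -> GPU copy of that page on copyStream
  //   stage 3: FlatIndex::query of the staged page on defaultStream
  //
  // CUDA events gate each buffer so no stage overwrites data that a
  // downstream stage is still using.
  //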
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  // Split the pinned region into two staging buffers
  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};

  // GPU-side destinations of the pinned buffer copies
  DeviceTensor<float, 2> bufGpuA(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2> bufGpuB(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2>* bufGpus[2] = {&bufGpuA, &bufGpuB};

  // Copy completion events for the pinned buffers
  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];

  // Execute completion events for the GPU buffers
  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];

  // All offsets below are in numbers of vectors; -1 marks a stage that has
  // no page to work on yet
  int cur1 = 0;          // offset of the CPU -> pinned copy
  int cur1BufIndex = 0;

  int cur2 = -1;         // offset of the pinned -> GPU copy
  int cur2BufIndex = 0;

  int cur3 = -1;         // offset of the GPU execution
  int cur3BufIndex = 0;
  while (cur3 < n) {
    // Stage 2: kick off the next async pinned -> GPU copy first
    if (cur2 != -1 && cur2 < n) {
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Ensure any compute still reading this GPU buffer has finished
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Mark the copy's completion in the copy stream
      eventPinnedCopyDone[cur2BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));

      // Hand this page over to the compute stage
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }
    // Stage 3: run the query on a page whose GPU copy has been issued
    if (cur3 != -1 && cur3 < n) {
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // The copy into this GPU buffer must complete before we read it
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      // Wrap the staged page and the matching output slices as tensors
      Tensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
                                   {numToProcess, this->d});
      auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);

      data_->query(input, k,
                   outDistancesSlice,
                   outIndicesSlice, true);

      // Mark compute completion so stage 2 can safely reuse this buffer
      eventGpuExecuteDone[cur3BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));

      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }
    // Stage 1: stage the next page of host memory into a pinned buffer
    if (cur1 < n) {
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // Ensure the previous pinned -> GPU copy out of this buffer is done
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // Hand this page over to the pinned -> GPU copy stage
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}
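// In steady state the three stages overlap in wall-clock time; with pages
// P1, P2, ... the picture is roughly:
//
//   CPU:            memcpy of P3 into a pinned buffer
//   copyStream:     pinned -> GPU copy of P2 in flight
//   defaultStream:  query of P1 executing
//
// The events recorded above prevent a buffer from being reused before its
// consumer has finished with it.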
void
GpuIndexFlat::reconstruct(faiss::Index::idx_t key, float* out) const {
  DeviceScope scope(device_);

  FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds");
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    // float16 storage: fetch a widened float32 copy of the vector (range
    // overload of getVectorsFloat32Copy assumed)
    auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
    fromDevice(vec.data(), out, this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[key];
    fromDevice(vec.data(), out, this->d, stream);
  }
}
void
GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0,
                            faiss::Index::idx_t num,
                            float* out) const {
  DeviceScope scope(device_);

  FAISS_THROW_IF_NOT_MSG(i0 < this->ntotal, "index out of bounds");
  FAISS_THROW_IF_NOT_MSG(i0 + num - 1 < this->ntotal, "num out of bounds");
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
    fromDevice(vec.data(), out, num * this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref().narrowOutermost(i0, num);
    fromDevice(vec.data(), out, this->d * num, stream);
  }
}
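// Example (illustrative): copying the first 10 stored vectors back to the
// host:
//
//   std::vector<float> buf(10 * gpuIndex.d);
//   gpuIndex.reconstruct_n(0, 10, buf.data());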
void
GpuIndexFlat::verifySettings_() const {
  // If we want Hgemm, ensure that it is supported on this device
  if (config_.useFloat16Accumulator) {
#ifdef FAISS_USE_FLOAT16
    FAISS_THROW_IF_NOT_MSG(config_.useFloat16,
                           "useFloat16Accumulator can only be enabled "
                           "with useFloat16");
    FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device),
                           "Device %d does not support Hgemm "
                           "(useFloat16Accumulator)",
                           config_.device);
#else
    FAISS_THROW_IF_NOT_MSG(false, "not compiled with float16 support");
#endif
  }
}
GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {
}

// ... (GpuIndexFlatL2 copyFrom/copyTo and the GpuIndexFlatIP equivalents
// are elided in this listing)

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {
}

} } // namespace faiss::gpu
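// A direct-construction sketch (illustrative; the dimension and variable
// names are made up): an empty 64-dim inner-product index on the GPU:
//
//   faiss::gpu::StandardGpuResources res;
//   faiss::gpu::GpuIndexFlatIP index(&res, 64);
//   index.add(numVecs, vectors);  // vectors: numVecs * 64 floats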