11 #include "GpuIndexFlat.h"
12 #include "../IndexFlat.h"
13 #include "GpuResources.h"
14 #include "impl/FlatIndex.cuh"
15 #include "utils/CopyUtils.cuh"
16 #include "utils/DeviceUtils.h"
17 #include "utils/Float16.cuh"
18 #include "utils/StaticUtils.h"
20 #include <thrust/execution_policy.h>
21 #include <thrust/transform.h>
namespace faiss { namespace gpu {
/// Minimum host data size (in bytes) above which search input is paged
/// from the CPU to the GPU
constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;

/// Page size (in bytes) to use when no pinned memory is available
constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
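// Typical usage (illustrative sketch, not part of this translation unit;
// nb/xb/nq/xq/distances/labels are placeholder host buffers managed by the
// caller):
//
//   faiss::gpu::StandardGpuResources res;
//   faiss::gpu::GpuIndexFlatL2 index(&res, 128);    // 128-dim, L2 metric
//   index.add(nb, xb);                              // xb: nb * 128 floats
//   index.search(nq, xq, 10, distances, labels);    // 10-NN per query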
GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           const faiss::IndexFlat* index,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, index->d, index->metric_type, config),
    minPagedSize_(kMinPageSize),
    config_(config),
    data_(nullptr) {
  verifySettings_();

  // Flat index doesn't need training
  this->is_trained = true;
  copyFrom(index);
}

GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           int dims,
                           faiss::MetricType metric,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, dims, metric, config),
    minPagedSize_(kMinPageSize),
    config_(config),
    data_(nullptr) {
  verifySettings_();

  // Flat index doesn't need training
  this->is_trained = true;

  // Construct the underlying flat storage
  DeviceScope scope(device_);
  data_ = new FlatIndex(resources,
                        dims,
                        metric == faiss::METRIC_L2,
                        config_.useFloat16,
                        config_.useFloat16Accumulator,
                        config_.storeTransposed,
                        memorySpace_);
}

GpuIndexFlat::~GpuIndexFlat() {
  delete data_;
}
void
GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  DeviceScope scope(device_);

  this->d = index->d;
  this->metric_type = index->metric_type;

  // GPU code stores indices as 32-bit ints
  FAISS_THROW_IF_NOT_FMT(index->ntotal <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices; "
                         "attempting to copy CPU index with %zu parameters",
                         (size_t) std::numeric_limits<int>::max(),
                         (size_t) index->ntotal);
  this->ntotal = index->ntotal;

  delete data_;
  data_ = new FlatIndex(resources_,
                        this->d,
                        index->metric_type == faiss::METRIC_L2,
                        config_.useFloat16,
                        config_.useFloat16Accumulator,
                        config_.storeTransposed,
                        memorySpace_);

  // The index could be empty
  if (index->ntotal > 0) {
    data_->add(index->xb.data(),
               index->ntotal,
               resources_->getDefaultStream(device_));
  }
}
void
GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  DeviceScope scope(device_);

  index->d = this->d;
  index->ntotal = this->ntotal;
  index->metric_type = this->metric_type;

  FAISS_ASSERT(data_->getSize() == this->ntotal);
  index->xb.resize(this->ntotal * this->d);

  auto stream = resources_->getDefaultStream(device_);

  if (this->ntotal > 0) {
    if (config_.useFloat16) {
      // Data is stored as float16 on the GPU; convert back to float32
      auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
      fromDevice(vecFloat32, index->xb.data(), stream);
    } else {
      fromDevice(data_->getVectorsFloat32Ref(), index->xb.data(), stream);
    }
  }
}
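// Round-trip sketch: copyFrom / copyTo move index contents between CPU and
// GPU (assumes a populated faiss::IndexFlat cpuIndex and a
// faiss::gpu::StandardGpuResources res):
//
//   faiss::gpu::GpuIndexFlat gpuIndex(&res, &cpuIndex);   // CPU -> GPU
//   faiss::IndexFlat back(cpuIndex.d, cpuIndex.metric_type);
//   gpuIndex.copyTo(&back);                               // GPU -> CPU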
void
GpuIndexFlat::addImpl_(faiss::Index::idx_t n,
                       const float* x,
                       const faiss::Index::idx_t* ids) {
  // A flat index stores vectors contiguously by position
  FAISS_THROW_IF_NOT_MSG(!ids, "add_with_ids not supported");
  FAISS_THROW_IF_NOT(n > 0);

  // Due to GPU indexing in int32, we can't store more than this many vectors
  FAISS_THROW_IF_NOT_FMT(this->ntotal + n <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices",
                         (size_t) std::numeric_limits<int>::max());

  data_->add(x, n, resources_->getDefaultStream(device_));
  this->ntotal += n;
}
// FlatIndex returns int32 indices, but the faiss::Index interface expects
// int64 (idx_t); this functor performs the widening conversion on the GPU
struct IntToLong {
  __device__ long operator()(int v) const { return (long) v; }
};
void
GpuIndexFlat::search(faiss::Index::idx_t n,
                     const float* x,
                     faiss::Index::idx_t k,
                     float* distances,
                     faiss::Index::idx_t* labels) const {
  if (n == 0) {
    return;
  }

  // For now, only support up to max int results
  FAISS_THROW_IF_NOT_FMT(n <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices",
                         (size_t) std::numeric_limits<int>::max());
  FAISS_THROW_IF_NOT_FMT(k <= 1024,
                         "GPU only supports k <= 1024 (requested %d)",
                         (int) k); // select limitation

  DeviceScope scope(device_);
  auto stream = resources_->getDefaultStream(device_);

  // Make space for output distances on the GPU; if the user passed host
  // pointers, this becomes a temporary device allocation
  auto outDistances = toDevice<float, 2>(resources_,
                                         device_,
                                         distances,
                                         stream,
                                         {(int) n, (int) k});

  // FlatIndex only supports an interface returning int indices
  DeviceTensor<int, 2, true> outIntIndices(
      resources_->getMemoryManagerCurrentDevice(),
      {(int) n, (int) k}, stream);

  bool usePaged = false;

  if (getDeviceForAddress(x) == -1) {
    // The input vectors are on the host; if they are large, page the copy
    // to the GPU and overlap it with compute
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(),
                          outIntIndices.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(),
                    outIntIndices.data());
  }

  auto outIndices = toDevice<faiss::Index::idx_t, 2>(resources_,
                                                     device_,
                                                     labels,
                                                     stream,
                                                     {(int) n, (int) k});

  // Convert the int32 indices to int64
  thrust::transform(thrust::cuda::par.on(stream),
                    outIntIndices.data(),
                    outIntIndices.end(),
                    outIndices.data(),
                    IntToLong());

  // Copy back to the host if the caller passed host pointers
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<faiss::Index::idx_t, 2>(outIndices, labels, stream);
}
void
GpuIndexFlat::searchImpl_(faiss::Index::idx_t n,
                          const float* x,
                          faiss::Index::idx_t k,
                          float* distances,
                          faiss::Index::idx_t* labels) const {
  // We override search() directly, so this entry point should be unreachable
  FAISS_ASSERT_MSG(false, "Should not be called");
}
void
GpuIndexFlat::searchNonPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  auto stream = resources_->getDefaultStream(device_);

  // Make sure arguments are on the device we desire; use temporary
  // memory allocations to move them if necessary
  auto vecs = toDevice<float, 2>(resources_,
                                 device_,
                                 const_cast<float*>(x),
                                 stream,
                                 {n, (int) this->d});

  data_->query(vecs, k, outDistances, outIndices, true);
}
void
GpuIndexFlat::searchFromCpuPaged_(int n,
                                  const float* x,
                                  int k,
                                  float* outDistancesData,
                                  int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  // Is pinned memory available?
  auto pinnedAlloc = resources_->getPinnedMemory();
  int pageSizeInVecs =
      (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // No pinned memory: just page the input, without overlapping copy
    // with compute
    int batchSize = utils::nextHighestPowerOf2(
        (int) ((size_t) kNonPinnedPageSize / (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }
  //
  // Pinned memory is available, so we can overlap copy with compute.
  // We use two pinned memory buffers, and triple-buffer the procedure:
  //
  // 1 CPU copy -> pinned
  // 2 pinned copy -> GPU
  // 3 GPU compute
  //
  // 1 2 3 1 2 3 ...   (pinned buf A)
  //   1 2 3 1 2 ...   (pinned buf B)
  //        1 2 3 ...  (pinned buf A)
  // time ->
  //
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};

  // Reserve space on the GPU for the destination of the pinned buffer copies
  DeviceTensor<float, 2, true> bufGpuA(
      resources_->getMemoryManagerCurrentDevice(),
      {(int) pageSizeInVecs, (int) this->d},
      defaultStream);
  DeviceTensor<float, 2, true> bufGpuB(
      resources_->getMemoryManagerCurrentDevice(),
      {(int) pageSizeInVecs, (int) this->d},
      defaultStream);
  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};

  // Copy completion events for the pinned buffers
  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];

  // Execute completion events for the GPU buffers
  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
  // All offsets are in terms of number of vectors; each pipeline stage
  // tracks the batch it is currently working on

  // Stage 1: host -> pinned copy
  int cur1 = 0;
  int cur1BufIndex = 0;

  // Stage 2: pinned -> GPU copy
  int cur2 = -1;
  int cur2BufIndex = 0;

  // Stage 3: GPU compute
  int cur3 = -1;
  int cur3BufIndex = 0;

  while (cur3 < n) {
    // Start the async pinned -> GPU copy first (stage 2)
    if (cur2 != -1 && cur2 < n) {
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Make sure any previous execution against this GPU buffer has
      // completed before overwriting it
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Mark a completion event in the copy stream
      eventPinnedCopyDone[cur2BufIndex].reset(new CudaEvent(copyStream));

      // Stage 3 picks up from here
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }
    // Process the current batch on the GPU (stage 3)
    if (cur3 != -1 && cur3 < n) {
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // Make sure the pinned -> GPU copy for this buffer has completed
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      // Create tensor wrappers around this batch
      Tensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
                                   {numToProcess, this->d});
      auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);

      data_->query(input, k,
                   outDistancesSlice,
                   outIndicesSlice, true);

      // Mark a completion event in the execution stream
      eventGpuExecuteDone[cur3BufIndex].reset(new CudaEvent(defaultStream));

      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }
    // Copy CPU memory into CPU pinned memory (stage 1)
    if (cur1 < n) {
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // Make sure any previous pinned -> GPU copy from this buffer has
      // completed before we overwrite it
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // Stage 2 picks up from here
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}
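// Sizing illustration (assumes the default 256 MB pinned allocation of
// StandardGpuResources; the actual default may differ by version): for
// d = 128, each half of the pinned buffer holds
//   (256 MB / 2) / (4 B * 128) = 262,144 vectors,
// so a 10M-vector host query set streams through in ~39 pages of 128 MB,
// with the host -> pinned, pinned -> GPU, and compute stages overlapped.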
void
GpuIndexFlat::reconstruct(faiss::Index::idx_t key,
                          float* out) const {
  DeviceScope scope(device_);

  FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds");
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    // Stored as float16; convert this single vector back to float32
    auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
    fromDevice(vec.data(), out, this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[key];
    fromDevice(vec.data(), out, this->d, stream);
  }
}
void
GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0,
                            faiss::Index::idx_t num,
                            float* out) const {
  DeviceScope scope(device_);

  FAISS_THROW_IF_NOT_MSG(i0 < this->ntotal, "index out of bounds");
  FAISS_THROW_IF_NOT_MSG(i0 + num - 1 < this->ntotal, "num out of bounds");
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
    fromDevice(vec.data(), out, num * this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref().narrowOutermost(i0, num);
    fromDevice(vec.data(), out, this->d * num, stream);
  }
}
void
GpuIndexFlat::verifySettings_() const {
  // If we want Hgemm, ensure that it is supported on this device
  if (config_.useFloat16Accumulator) {
#ifdef FAISS_USE_FLOAT16
    FAISS_THROW_IF_NOT_MSG(config_.useFloat16,
                           "useFloat16Accumulator can only be enabled "
                           "with useFloat16");

    FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device),
                           "Device %d does not support Hgemm "
                           "(useFloat16Accumulator)",
                           config_.device);
#else
    FAISS_THROW_IF_NOT_MSG(false, "not compiled with float16 support");
#endif
  }
}
//
// GpuIndexFlatL2
//

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               faiss::IndexFlatL2* index,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, index, config) {
}

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {
}

//
// GpuIndexFlatIP
//

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               faiss::IndexFlatIP* index,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, index, config) {
}

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {
}

} } // namespace