/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <limits>
#include <memory>

namespace faiss { namespace gpu {

/// Default CPU search size for which we use paged copies
constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;

/// Size above which we page copies from the CPU to GPU (non-paged
/// memory usage)
constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;

// Default size for which we page add or search
constexpr size_t kAddPageSize = (size_t) 256 * 1024 * 1024;

// Or, maximum number of vectors to consider per page of add or search
constexpr size_t kAddVecSize = (size_t) 512 * 1024;

// Use a smaller search size, as precomputed code usage on IVFPQ
// requires substantial amounts of memory
// FIXME: parameterize based on algorithm need
constexpr size_t kSearchVecSize = (size_t) 32 * 1024;

GpuIndex::GpuIndex(GpuResources* resources,
                   int dims,
                   faiss::MetricType metric,
                   float metricArg,
                   GpuIndexConfig config) :
    Index(dims, metric),
    resources_(resources),
    device_(config.device),
    memorySpace_(config.memorySpace),
    minPagedSize_(kMinPageSize) {
  FAISS_THROW_IF_NOT_FMT(device_ < getNumDevices(),
                         "Invalid GPU device %d", device_);

  FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");

#ifdef FAISS_UNIFIED_MEM
  FAISS_THROW_IF_NOT_FMT(
    memorySpace_ == MemorySpace::Device ||
    (memorySpace_ == MemorySpace::Unified &&
     getFullUnifiedMemSupport(device_)),
    "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
    config.device);
#else
  FAISS_THROW_IF_NOT_MSG(memorySpace_ == MemorySpace::Device,
                         "Must compile with CUDA 8+ for Unified Memory support");
#endif

  metric_arg = metricArg;

  FAISS_ASSERT(resources_);
  resources_->initializeForDevice(device_);
}

void
GpuIndex::copyFrom(const faiss::Index* index) {
  d = index->d;
  metric_type = index->metric_type;
  metric_arg = index->metric_arg;
  ntotal = index->ntotal;
  is_trained = index->is_trained;
}

void
GpuIndex::copyTo(faiss::Index* index) const {
  index->d = d;
  index->metric_type = metric_type;
  index->metric_arg = metric_arg;
  index->ntotal = ntotal;
  index->is_trained = is_trained;
}

void
GpuIndex::setMinPagingSize(size_t size) {
  minPagedSize_ = size;
}

size_t
GpuIndex::getMinPagingSize() const {
  return minPagedSize_;
}

void
GpuIndex::add(Index::idx_t n, const float* x) {
  // Pass to add_with_ids
  add_with_ids(n, x, nullptr);
}

void
GpuIndex::add_with_ids(Index::idx_t n,
                       const float* x,
                       const Index::idx_t* ids) {
  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");

  // For now, only support <= max int results
  FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %d indices",
                         std::numeric_limits<int>::max());

  if (n == 0) {
    // nothing to add
    return;
  }

  std::vector<Index::idx_t> generatedIds;

  // Generate IDs if we need them
  if (!ids && addImplRequiresIDs_()) {
    generatedIds = std::vector<Index::idx_t>(n);

    for (Index::idx_t i = 0; i < n; ++i) {
      generatedIds[i] = this->ntotal + i;
    }
  }

  DeviceScope scope(device_);
  addPaged_((int) n, x, ids ? ids : generatedIds.data());
}

void
GpuIndex::addPaged_(int n,
                    const float* x,
                    const Index::idx_t* ids) {
  if (n > 0) {
    size_t totalSize = (size_t) n * this->d * sizeof(float);

    if (totalSize > kAddPageSize || n > kAddVecSize) {
      // How many vectors fit into kAddPageSize?
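      // Example (illustrative values only): with d = 128 and 4-byte floats,
      // each vector is 512 bytes, so the 256 MiB kAddPageSize holds
      // 512 * 1024 vectors; the kAddVecSize cap below gives the same
      // 512K-vector limit per tile.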
      size_t maxNumVecsForPageSize =
        kAddPageSize / ((size_t) this->d * sizeof(float));

      // Always add at least 1 vector, if we have huge vectors
      maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t) 1);

      size_t tileSize = std::min((size_t) n, maxNumVecsForPageSize);
      tileSize = std::min(tileSize, kAddVecSize);

      for (size_t i = 0; i < (size_t) n; i += tileSize) {
        size_t curNum = std::min(tileSize, n - i);

        addPage_(curNum,
                 x + i * (size_t) this->d,
                 ids ? ids + i : nullptr);
      }
    } else {
      addPage_(n, x, ids);
    }
  }
}

void
GpuIndex::addPage_(int n,
                   const float* x,
                   const Index::idx_t* ids) {
  // At this point, `x` can be resident on CPU or GPU, and `ids` may be resident
  // on CPU, GPU or may be null.
  //
  // Before continuing, we guarantee that all data will be resident on the GPU.
  auto stream = resources_->getDefaultStreamCurrentDevice();

  auto vecs = toDevice<float, 2>(resources_,
                                 device_,
                                 const_cast<float*>(x),
                                 stream,
                                 {n, this->d});

  if (ids) {
    auto indices = toDevice<Index::idx_t, 1>(resources_,
                                             device_,
                                             const_cast<Index::idx_t*>(ids),
                                             stream,
                                             {n});

    addImpl_(n, vecs.data(), ids ? indices.data() : nullptr);
  } else {
    addImpl_(n, vecs.data(), nullptr);
  }
}

void
GpuIndex::search(Index::idx_t n,
                 const float* x,
                 Index::idx_t k,
                 float* distances,
                 Index::idx_t* labels) const {
  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");

  // For now, only support <= max int results
  FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %d indices",
                         std::numeric_limits<int>::max());

  // Maximum k-selection supported is based on the CUDA SDK
  FAISS_THROW_IF_NOT_FMT(k <= (Index::idx_t) getMaxKSelection(),
                         "GPU index only supports k <= %d (requested %d)",
                         getMaxKSelection(),
                         (int) k); // select limitation

  if (n == 0 || k == 0) {
    // nothing to search
    return;
  }

  DeviceScope scope(device_);
  auto stream = resources_->getDefaultStream(device_);

  // We guarantee that the searchImpl_ will be called with device-resident
  // pointers.

  // The input vectors may be too large for the GPU, but we still
  // assume that the output distances and labels are not.
  // Go ahead and make space for output distances and labels on the
  // GPU.
  // If we reach a point where all inputs are too big, we can add
  // another level of tiling.
  auto outDistances = toDevice<float, 2>(resources_, device_, distances, stream,
                                         {(int) n, (int) k});

  auto outLabels = toDevice<Index::idx_t, 2>(resources_, device_, labels, stream,
                                             {(int) n, (int) k});

  bool usePaged = false;

  if (getDeviceForAddress(x) == -1) {
    // It is possible that the user is querying for a vector set size
    // `x` that won't fit on the GPU.
    // In this case, we will have to handle paging of the data from CPU
    // -> GPU.
    // Currently, we don't handle the case where the output data won't
    // fit on the GPU (e.g., n * k is too large for the GPU memory).
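    // Example (illustrative values only): with the default minPagedSize_ of
    // 256 MiB and d = 64, a query set of 1M float vectors
    // (1M * 64 * 4 bytes) is exactly at the threshold, so query sets of that
    // size or larger take the paged path below.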
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(),
                          outLabels.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(),
                    outLabels.data());
  }

  // Copy back if necessary
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<Index::idx_t, 2>(outLabels, labels, stream);
}

void
GpuIndex::searchNonPaged_(int n,
                          const float* x,
                          int k,
                          float* outDistancesData,
                          Index::idx_t* outIndicesData) const {
  auto stream = resources_->getDefaultStream(device_);

  // Make sure arguments are on the device we desire; use temporary
  // memory allocations to move it if necessary
  auto vecs = toDevice<float, 2>(resources_,
                                 device_,
                                 const_cast<float*>(x),
                                 stream,
                                 {n, (int) this->d});

  searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
}

void
GpuIndex::searchFromCpuPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              Index::idx_t* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});

  // Is pinned memory available?
  auto pinnedAlloc = resources_->getPinnedMemory();
  int pageSizeInVecs =
    (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // Just page without overlapping copy with compute
    int batchSize = utils::nextHighestPowerOf2(
      (int) ((size_t) kNonPinnedPageSize / (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }

  //
  // Pinned memory is available, so we can overlap copy with compute.
  // We use two pinned memory buffers, and triple-buffer the
  // procedure:
  //
  // 1 CPU copy -> pinned
  // 2 pinned copy -> GPU
  // 3 GPU compute
  //
  // 1 2 3 1 2 3 ...   (pinned buf A)
  //    1 2 3 1 2 ...  (pinned buf B)
  //       1 2 3 1 ... (pinned buf A)
  // time ->
  //
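  // In the loop below, stage 1 is a host memcpy into one of the two pinned
  // buffers, stage 2 is a cudaMemcpyAsync from that pinned buffer into the
  // matching GPU buffer on copyStream, and stage 3 is searchImpl_ on
  // defaultStream. CudaEvents enforce the 1 -> 2 -> 3 ordering per buffer,
  // while the two buffers let successive pages overlap.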
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};

  // Reserve space on the GPU for the destination of the pinned buffer
  // copy
  DeviceTensor<float, 2, true> bufGpuA(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true> bufGpuB(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};

  // Copy completion events for the pinned buffers
  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];

  // Execute completion events for the GPU buffers
  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];

  // All offsets are in terms of number of vectors; they remain within
  // int bounds (as this function only handles max int vectors)

  // Current start offset for buffer 1
  int cur1 = 0;
  int cur1BufIndex = 0;

  // Current start offset for buffer 2
  int cur2 = -1;
  int cur2BufIndex = 0;

  // Current start offset for buffer 3
  int cur3 = -1;
  int cur3BufIndex = 0;

  while (cur3 < n) {
    // Start async pinned -> GPU copy first (buf 2)
    if (cur2 != -1 && cur2 < n) {
      // Copy pinned to GPU
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Make sure any previous execution has completed before continuing
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Mark a completion event in this stream
      eventPinnedCopyDone[cur2BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));

      // We pick up from here
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }

    if (cur3 != -1 && cur3 < n) {
      // Process on GPU
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // Make sure the previous copy has completed before continuing
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      // Create tensor wrappers
      // DeviceTensor<float, 2> input(bufGpus[cur3BufIndex]->data(),
      //                              {numToProcess, this->d});
      auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);

      searchImpl_(numToProcess,
                  bufGpus[cur3BufIndex]->data(),
                  k,
                  outDistancesSlice.data(),
                  outIndicesSlice.data());

      // Create completion event
      eventGpuExecuteDone[cur3BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));

      // We pick up from here
      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }

    if (cur1 < n) {
      // Copy CPU mem to CPU pinned
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // Make sure any previous copy has completed before continuing
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // We pick up from here
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}
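// Note on the virtual hooks used above: addImpl_ / searchImpl_ (and
// addImplRequiresIDs_) are implemented by the concrete GPU index types
// (e.g., GpuIndexFlat or GpuIndexIVFPQ) and are always invoked here with
// device-resident vectors and output buffers, so subclasses do not need to
// repeat the paging or residency handling themselves.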
void
GpuIndex::compute_residual(const float* x,
                           float* residual,
                           Index::idx_t key) const {
  FAISS_THROW_MSG("compute_residual not implemented for this type of index");
}

void
GpuIndex::compute_residual_n(Index::idx_t n,
                             const float* xs,
                             float* residuals,
                             const Index::idx_t* keys) const {
  FAISS_THROW_MSG("compute_residual_n not implemented for this type of index");
}

} } // namespace
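/*
  Usage sketch (illustrative only; not part of this translation unit). The
  paging and residency logic above is exercised through a concrete subclass
  such as GpuIndexFlat together with StandardGpuResources; the dimension of
  128, k = 10, and the vector counts below are arbitrary example values.

    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuIndexFlatConfig config;
    config.device = 0;

    faiss::gpu::GpuIndexFlat index(&res, 128, faiss::METRIC_L2, config);

    // Lower the threshold above which CPU search inputs are paged to the GPU
    index.setMinPagingSize(64 * 1024 * 1024);

    std::vector<float> vecs(1000 * 128);             // database vectors
    std::vector<float> queries(16 * 128);            // query vectors
    std::vector<float> distances(16 * 10);
    std::vector<faiss::Index::idx_t> labels(16 * 10);

    index.add(1000, vecs.data());                    // forwards to add_with_ids
    index.search(16, queries.data(), 10, distances.data(), labels.data());
*/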