/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "../../FaissAssert.h"
#include "DeviceUtils.h"
#include "StaticUtils.h"
#include <algorithm>
#include <cuda.h>
#include <vector>

namespace faiss { namespace gpu {

/// A simple version of thrust::device_vector<T>, but has more control
/// over whether resize() initializes new space with T() (which we
/// don't want), and control on how much the reserved space grows by
/// upon resize/reserve. It is also meant for POD types only.
template <typename T>
class DeviceVector {
 public:
  /// Constructs an empty vector owning no device memory.
  DeviceVector()
      : data_(nullptr),
        num_(0),
        capacity_(0) {
  }

  /// Releases all device memory we own.
  ~DeviceVector() {
    clear();
  }

  /// Clear all allocated memory; reset to zero size.
  /// Safe on an already-empty vector (cudaFree(nullptr) is a no-op).
  void clear() {
    CUDA_VERIFY(cudaFree(data_));
    data_ = nullptr;
    num_ = 0;
    capacity_ = 0;
  }

  /// Number of elements currently held (not the reserved capacity)
  size_t size() const { return num_; }
  /// Number of elements for which device memory is reserved
  size_t capacity() const { return capacity_; }
  /// Raw device pointer to our storage
  T* data() { return data_; }
  const T* data() const { return data_; }

  /// Copies our device data to the host, reinterpreting the bytes as
  /// OutT. The total byte size of our contents must be an exact
  /// multiple of sizeof(OutT).
  /// NOTE(review): the copy is enqueued with cudaMemcpyAsync into
  /// pageable host memory and we return without synchronizing `stream`
  /// — presumably callers rely on the runtime's synchronous behavior
  /// for pageable destinations; confirm before reading `out` early.
  template <typename OutT>
  std::vector<OutT> copyToHost(cudaStream_t stream) const {
    FAISS_ASSERT(num_ * sizeof(T) % sizeof(OutT) == 0);

    std::vector<OutT> out((num_ * sizeof(T)) / sizeof(OutT));
    CUDA_VERIFY(cudaMemcpyAsync(out.data(), data_, num_ * sizeof(T),
                                cudaMemcpyDeviceToHost, stream));

    return out;
  }

  /// Appends `n` elements from `d` (a host or device pointer; we
  /// detect which via getDeviceForAddress) on `stream`.
  /// Returns true if we actually reallocated memory.
  /// If `reserveExact` is true, then we reserve only the memory that
  /// we need for what we're appending; otherwise capacity grows to the
  /// next power of 2.
  bool append(const T* d,
              size_t n,
              cudaStream_t stream,
              bool reserveExact = false) {
    bool mem = false;

    if (n > 0) {
      size_t reserveSize = num_ + n;
      if (!reserveExact) {
        reserveSize = getNewCapacity_(reserveSize);
      }

      mem = reserve(reserveSize, stream);

      // getDeviceForAddress returns -1 for host pointers
      int dev = getDeviceForAddress(d);
      if (dev == -1) {
        CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T),
                                    cudaMemcpyHostToDevice, stream));
      } else {
        CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T),
                                    cudaMemcpyDeviceToDevice, stream));
      }
      num_ += n;
    }

    return mem;
  }

  /// Resizes to `newSize` elements; returns true if we actually
  /// reallocated memory. Shrinking only adjusts the logical size and
  /// keeps the allocation.
  bool resize(size_t newSize, cudaStream_t stream) {
    bool mem = false;

    if (num_ < newSize) {
      mem = reserve(getNewCapacity_(newSize), stream);
    }

    // Don't bother zero initializing the newly accessible memory
    // (unlike thrust::device_vector)
    num_ = newSize;

    return mem;
  }

  /// Clean up after oversized allocations, while leaving some space to
  /// remain for subsequent allocations (if `exact` false) or to
  /// exactly the space we need (if `exact` true); returns space
  /// reclaimed in bytes
  size_t reclaim(bool exact, cudaStream_t stream) {
    size_t free = capacity_ - num_;

    if (exact) {
      realloc_(num_, stream);
      return free * sizeof(T);
    }

    // If more than 1/4th of the space is free, then we want to
    // truncate to only having 1/8th of the space free; this still
    // preserves some space for new elements, but won't force us to
    // double our size right away
    if (free > (capacity_ / 4)) {
      size_t newFree = capacity_ / 8;
      size_t newCapacity = num_ + newFree;

      size_t oldCapacity = capacity_;
      FAISS_ASSERT(newCapacity < oldCapacity);

      realloc_(newCapacity, stream);

      return (oldCapacity - newCapacity) * sizeof(T);
    }

    return 0;
  }

  /// Ensures capacity for at least `newCapacity` elements.
  /// Returns true if we actually reallocated memory.
  bool reserve(size_t newCapacity, cudaStream_t stream) {
    if (newCapacity <= capacity_) {
      return false;
    }

    // Otherwise, we need new space.
    realloc_(newCapacity, stream);
    return true;
  }

 private:
  /// Allocates a new buffer of `newCapacity` elements, copies our
  /// current contents into it on `stream`, and frees the old buffer.
  void realloc_(size_t newCapacity, cudaStream_t stream) {
    FAISS_ASSERT(num_ <= newCapacity);

    T* newData = nullptr;
    CUDA_VERIFY(cudaMalloc(&newData, newCapacity * sizeof(T)));
    CUDA_VERIFY(cudaMemcpyAsync(newData, data_, num_ * sizeof(T),
                                cudaMemcpyDeviceToDevice, stream));
    // FIXME: keep on reclamation queue to avoid hammering cudaFree?
    CUDA_VERIFY(cudaFree(data_));

    data_ = newData;
    capacity_ = newCapacity;
  }

  /// Growth policy: round the requested size up to the next highest
  /// power of 2, so repeated appends amortize reallocation cost.
  size_t getNewCapacity_(size_t preferredSize) {
    return utils::nextHighestPowerOf2(preferredSize);
  }

  T* data_;          // device pointer to our storage (nullptr if empty)
  size_t num_;       // current logical size, in elements
  size_t capacity_;  // allocated size, in elements
};

} } // namespace