236 lines
6.1 KiB
C++
236 lines
6.1 KiB
C++
/**
|
|
* Copyright (c) 2015-present, Facebook, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under the CC-by-NC license found in the
|
|
* LICENSE file in the root directory of this source tree.
|
|
*/
|
|
|
|
// Copyright 2004-present Facebook. All Rights Reserved.
|
|
|
|
#include "StackDeviceMemory.h"
|
|
#include "DeviceUtils.h"
|
|
#include "StaticUtils.h"
|
|
#include "../../FaissAssert.h"
|
|
#include <stdio.h>
|
|
#include <sstream>
|
|
|
|
namespace faiss { namespace gpu {
|
|
|
|
// Builds a stack that owns its backing memory: `sz` bytes are requested from
// cudaMalloc while a DeviceScope for device `d` is alive, and the stack
// initially spans the whole region with nothing handed out.
//
// d  - device id stored in device_
// sz - number of bytes to reserve for the stack
//
// Aborts via FAISS_ASSERT_FMT if cudaMalloc does not return cudaSuccess.
StackDeviceMemory::Stack::Stack(int d, size_t sz)
    : device_(d),
      isOwner_(true),
      start_(nullptr),
      end_(nullptr),
      size_(sz),
      head_(nullptr),
      mallocCurrent_(0),
      highWaterMemoryUsed_(0),
      highWaterMalloc_(0) {
  // NOTE(review): DeviceScope presumably makes `device_` the current CUDA
  // device for the lifetime of `s` -- confirm in DeviceUtils.h
  DeviceScope s(device_);

  cudaError_t err = cudaMalloc(&start_, size_);
  // Report the error code and requested size, matching the diagnostics
  // produced for the fallback cudaMalloc in getAlloc
  FAISS_ASSERT_FMT(err == cudaSuccess,
                   "cudaMalloc error %d on alloc size %zu",
                   (int) err, size_);

  // Nothing allocated yet: the head sits at the start of [start_, end_)
  head_ = start_;
  end_ = start_ + size_;
}
|
|
|
|
// Builds a stack over caller-provided memory [p, p + sz) associated with
// device `d`. No allocation is performed here.
//
// d       - device id stored in device_
// p       - start of the region used as stack storage
// sz      - size of the region in bytes
// isOwner - if true, the destructor passes `p` to cudaFree
StackDeviceMemory::Stack::Stack(int d, void* p, size_t sz, bool isOwner)
    : device_(d),
      isOwner_(isOwner),
      start_((char*) p),
      end_(((char*) p) + sz),
      size_(sz),
      head_((char*) p),
      // The usage counters must start at zero exactly as in the size-only
      // constructor; getAlloc/returnAlloc/toString read them, so leaving
      // them uninitialized yields indeterminate statistics and can trip the
      // mallocCurrent_ >= size assertion in returnAlloc.
      mallocCurrent_(0),
      highWaterMemoryUsed_(0),
      highWaterMalloc_(0) {
}
|
|
|
|
// Releases the backing region with cudaFree, but only when this stack owns
// it; a non-owning stack leaves the memory untouched.
StackDeviceMemory::Stack::~Stack() {
  if (!isOwner_) {
    // Memory belongs to someone else
    return;
  }

  DeviceScope scope(device_);

  cudaError_t freeStatus = cudaFree(start_);
  FAISS_ASSERT(freeStatus == cudaSuccess);
}
|
|
|
|
size_t
|
|
StackDeviceMemory::Stack::getSizeAvailable() const {
|
|
return (end_ - head_);
|
|
}
|
|
|
|
char*
|
|
StackDeviceMemory::Stack::getAlloc(size_t size, cudaStream_t stream) {
|
|
if (size > (end_ - head_)) {
|
|
// Too large for our stack
|
|
DeviceScope s(device_);
|
|
|
|
// Print our requested size before we attempt the allocation
|
|
fprintf(stderr, "WARN: increase temp memory to avoid cudaMalloc, "
|
|
"or decrease query/add size (alloc %zu B, highwater %zu B)\n",
|
|
size, highWaterMalloc_);
|
|
|
|
char* p = nullptr;
|
|
auto err = cudaMalloc(&p, size);
|
|
FAISS_ASSERT_FMT(err == cudaSuccess,
|
|
"cudaMalloc error %d on alloc size %zu",
|
|
(int) err, size);
|
|
|
|
mallocCurrent_ += size;
|
|
highWaterMalloc_ = std::max(highWaterMalloc_, mallocCurrent_);
|
|
|
|
return p;
|
|
} else {
|
|
// We can make the allocation out of our stack
|
|
// Find all the ranges that we overlap that may have been
|
|
// previously allocated; our allocation will be [head, endAlloc)
|
|
char* startAlloc = head_;
|
|
char* endAlloc = head_ + size;
|
|
|
|
while (lastUsers_.size() > 0) {
|
|
auto& prevUser = lastUsers_.back();
|
|
|
|
// Because there is a previous user, we must overlap it
|
|
FAISS_ASSERT(prevUser.start_ <= endAlloc && prevUser.end_ >= startAlloc);
|
|
|
|
if (stream != prevUser.stream_) {
|
|
// Synchronization required
|
|
// FIXME
|
|
FAISS_ASSERT(false);
|
|
}
|
|
|
|
if (endAlloc < prevUser.end_) {
|
|
// Update the previous user info
|
|
prevUser.start_ = endAlloc;
|
|
|
|
break;
|
|
}
|
|
|
|
// If we're the exact size of the previous request, then we
|
|
// don't need to continue
|
|
bool done = (prevUser.end_ == endAlloc);
|
|
|
|
lastUsers_.pop_back();
|
|
|
|
if (done) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
head_ = endAlloc;
|
|
FAISS_ASSERT(head_ <= end_);
|
|
|
|
highWaterMemoryUsed_ = std::max(highWaterMemoryUsed_,
|
|
(size_t) (head_ - start_));
|
|
return startAlloc;
|
|
}
|
|
}
|
|
|
|
void
|
|
StackDeviceMemory::Stack::returnAlloc(char* p,
|
|
size_t size,
|
|
cudaStream_t stream) {
|
|
if (p < start_ || p >= end_) {
|
|
// This is not on our stack; it was a one-off allocation
|
|
DeviceScope s(device_);
|
|
|
|
auto err = cudaFree(p);
|
|
FAISS_ASSERT_FMT(err == cudaSuccess,
|
|
"cudaFree error %d (addr %p size %zu)",
|
|
(int) err, p, size);
|
|
|
|
FAISS_ASSERT(mallocCurrent_ >= size);
|
|
mallocCurrent_ -= size;
|
|
} else {
|
|
// This is on our stack
|
|
// Allocations should be freed in the reverse order they are made
|
|
FAISS_ASSERT(p + size == head_);
|
|
|
|
head_ = p;
|
|
lastUsers_.push_back(Range(p, p + size, stream));
|
|
}
|
|
}
|
|
|
|
std::string
|
|
StackDeviceMemory::Stack::toString() const {
|
|
std::stringstream s;
|
|
|
|
s << "SDM device " << device_ << ": Total memory " << size_ << " ["
|
|
<< (void*) start_ << ", " << (void*) end_ << ")\n";
|
|
s << " Available memory " << (size_t) (end_ - head_)
|
|
<< " [" << (void*) head_ << ", " << (void*) end_ << ")\n";
|
|
s << " High water temp alloc " << highWaterMemoryUsed_ << "\n";
|
|
s << " High water cudaMalloc " << highWaterMalloc_ << "\n";
|
|
|
|
int i = lastUsers_.size();
|
|
for (auto it = lastUsers_.rbegin(); it != lastUsers_.rend(); ++it) {
|
|
s << i-- << ": size " << (size_t) (it->end_ - it->start_)
|
|
<< " stream " << it->stream_
|
|
<< " [" << (void*) it->start_ << ", " << (void*) it->end_ << ")\n";
|
|
}
|
|
|
|
return s.str();
|
|
}
|
|
|
|
size_t
|
|
StackDeviceMemory::Stack::getHighWaterCudaMalloc() const {
|
|
return highWaterMalloc_;
|
|
}
|
|
|
|
// Creates a memory manager for `device` whose internal stack is constructed
// with (device, allocPerDevice), i.e. the stack allocates and owns
// `allocPerDevice` bytes itself.
StackDeviceMemory::StackDeviceMemory(int device, size_t allocPerDevice)
    : device_(device), stack_(device, allocPerDevice) {}
|
|
|
|
// Creates a memory manager for `device` whose internal stack is laid over the
// caller-provided region [p, p + size). `isOwner` is forwarded to the stack
// and decides whether the stack frees the region on destruction.
StackDeviceMemory::StackDeviceMemory(
    int device, void* p, size_t size, bool isOwner)
    : device_(device), stack_(device, p, size, isOwner) {}
|
|
|
|
// Nothing to do explicitly; members (including stack_) clean up through
// their own destructors.
StackDeviceMemory::~StackDeviceMemory() = default;
|
|
|
|
int
|
|
StackDeviceMemory::getDevice() const {
|
|
return device_;
|
|
}
|
|
|
|
// Reserves memory for use on `stream`.
//
// The requested `size` is rounded up to the next multiple of 16 bytes before
// being passed to the stack, and that rounded size is what the returned
// reservation records.
DeviceMemoryReservation
StackDeviceMemory::getMemory(cudaStream_t stream, size_t size) {
  // We guarantee 16 byte alignment for allocations, so every request is
  // padded to a multiple of 16
  const size_t paddedSize = utils::roundUp(size, (size_t) 16);

  char* const mem = stack_.getAlloc(paddedSize, stream);

  return DeviceMemoryReservation(this, device_, mem, paddedSize, stream);
}
|
|
|
|
size_t
|
|
StackDeviceMemory::getSizeAvailable() const {
|
|
return stack_.getSizeAvailable();
|
|
}
|
|
|
|
std::string
|
|
StackDeviceMemory::toString() const {
|
|
return stack_.toString();
|
|
}
|
|
|
|
size_t
|
|
StackDeviceMemory::getHighWaterCudaMalloc() const {
|
|
return stack_.getHighWaterCudaMalloc();
|
|
}
|
|
|
|
void
|
|
StackDeviceMemory::returnAllocation(DeviceMemoryReservation& m) {
|
|
FAISS_ASSERT(m.get());
|
|
FAISS_ASSERT(device_ == m.device());
|
|
|
|
stack_.returnAlloc((char*) m.get(), m.size(), m.stream());
|
|
}
|
|
|
|
} } // namespace
|