mmdeploy/csrc/device/cuda/buddy_allocator.h

183 lines
4.7 KiB
C++

// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_
#define MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_
#include <cuda_runtime.h>

#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <list>
#include <mutex>
#include <vector>

#include "core/logger.h"
#include "device/cuda/default_allocator.h"
namespace mmdeploy::cuda {
class BuddyAllocator {
public:
using size_type = std::size_t;
BuddyAllocator(size_type size, size_type block_size) {
block_size_ = block_size;
block_count_ = size / block_size_;
if (!IsPowerOfTwo(block_count_)) {
block_count_ = RoundToPowerOfTwo(block_count_);
WARN("Rounding up block_count to next power of 2 {}", block_count_);
}
base_ = LogPowerOfTwo(block_count_);
size_ = block_size_ * block_count_;
memory_ = gDefaultAllocator().Allocate(size_);
tree_.resize(block_count_ * 2);
free_.resize(base_ + 1);
Build(1, 0);
Add(1, 0);
ERROR("size = {}, block_size = {}, block_count = {}", size_, block_size_, block_count_);
size = size_;
for (int i = 0; i <= base_; ++i) {
ERROR("level {}, size = {}", i, size);
size /= 2;
}
}
~BuddyAllocator() {
for (int i = 0; i < free_.size(); ++i) {
ERROR("free_[{}].size(): {}", i, free_[i].size());
}
gDefaultAllocator().Deallocate(memory_, size_);
}
[[nodiscard]] void* Allocate(size_type n) {
std::lock_guard lock{mutex_};
if (n > size_) {
return nullptr;
}
auto n_level = GetLevel(n);
auto level = n_level;
for (; level >= 0; --level) {
if (!free_[level].empty()) {
break;
}
}
if (level < 0) {
WARN("failed to allocate memory size = {} bytes", n);
return nullptr;
}
for (; level < n_level; ++level) {
auto index = free_[level].front();
Split(index, level);
}
auto index = free_[level].front();
Del(index, level);
auto offset = (index ^ (1 << level)) << (base_ - level);
auto p = static_cast<uint8_t*>(memory_) + offset * block_size_;
return p;
}
void Deallocate(void* p, size_type n) {
std::lock_guard lock{mutex_};
auto offset = static_cast<uint8_t*>(p) - static_cast<uint8_t*>(memory_);
if (offset < 0 || offset % block_size_) {
ERROR("invalid address: {}", p);
}
offset /= static_cast<long>(block_size_);
auto level = GetLevel(n);
auto index = (offset >> (base_ - level)) ^ (1 << level);
Add(index, level);
while (index > 1) {
auto buddy = index ^ 1;
if (tree_[buddy] != free_[level].end()) {
Merge(index, level);
index /= 2;
--level;
} else {
break;
}
}
}
private:
void Add(size_type index, size_type level) {
assert(tree_[index] == free_[level].end());
tree_[index] = free_[level].insert(free_[level].end(), index);
}
void Del(size_type index, size_type level) {
assert(tree_[index] != free_[level].end());
free_[level].erase(tree_[index]);
tree_[index] = free_[level].end();
}
void Split(size_type index, size_type level) {
Del(index, level);
Add(index * 2, level + 1);
Add(index * 2 + 1, level + 1);
}
void Merge(size_type index, size_type level) {
Del(index, level);
Del(index ^ 1, level);
Add(index / 2, level - 1);
}
size_type GetLevel(size_type size) const {
size = RoundToPowerOfTwo((size + block_size_ - 1) / block_size_);
return base_ - LogPowerOfTwo(size);
}
static bool IsPowerOfTwo(size_type n) { return (n & (n - 1)) == 0; }
static size_type RoundToPowerOfTwo(size_type n) {
--n;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
n |= (n >> 32);
return ++n;
}
static size_type LogPowerOfTwo(size_type v) {
size_type r{};
r |= ((v & 0xFFFFFFFF00000000) != 0) << 5;
r |= ((v & 0xFFFF0000FFFF0000) != 0) << 4;
r |= ((v & 0xFF00FF00FF00FF00) != 0) << 3;
r |= ((v & 0xF0F0F0F0F0F0F0F0) != 0) << 2;
r |= ((v & 0xCCCCCCCCCCCCCCCC) != 0) << 1;
r |= ((v & 0xAAAAAAAAAAAAAAAA) != 0);
return r;
}
void Build(size_type index, size_type level) {
if (index < tree_.size()) {
tree_[index] = free_[level].end();
index *= 2;
++level;
Build(index, level);
Build(index + 1, level);
}
}
private:
size_type size_;
size_type block_size_;
size_type block_count_;
size_type base_;
void* memory_;
std::vector<std::list<size_type>::iterator> tree_;
std::vector<std::list<size_type> > free_;
std::mutex mutex_;
};
/**
 * Returns the shared BuddyAllocator instance.
 *
 * The instance is a function-local static, so it is constructed on the first call, with a
 * 1 GiB arena request split into 64 KiB blocks.
 */
inline BuddyAllocator& gBuddyAllocator() {
  constexpr BuddyAllocator::size_type kArenaBytes = BuddyAllocator::size_type{1} << 30;
  constexpr BuddyAllocator::size_type kBlockBytes = 64 * 1024;
  static BuddyAllocator instance(kArenaBytes, kBlockBytes);
  return instance;
}
} // namespace mmdeploy::cuda
#endif // MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_