mmdeploy/csrc/device/cuda/buddy_allocator.h

183 lines
4.7 KiB
C
Raw Normal View History

Merge sdk (#251) * check in cmake * move backend_ops to csrc/backend_ops * check in preprocess, model, some codebase and their c-apis * check in CMakeLists.txt * check in parts of test_csrc * commit everything else * add readme * update core's BUILD_INTERFACE directory * skip codespell on third_party * update trt_net and ort_net's CMakeLists * ignore clion's build directory * check in pybind11 * add onnx.proto. Remove MMDeploy's dependency on ncnn's source code * export MMDeployTargets only when MMDEPLOY_BUILD_SDK is ON * remove useless message * target include directory is wrong * change target name from mmdeploy_ppl_net to mmdeploy_pplnn_net * skip install directory * update project's cmake * remove useless code * set CMAKE_BUILD_TYPE to Release by force if it isn't set by user * update custom ops CMakeLists * pass object target's source lists * fix lint end-of-file * fix lint: trailing whitespace * fix codespell hook * remove bicubic_interpolate to csrc/backend_ops/ * set MMDEPLOY_BUILD_SDK OFF * change custom ops build command * add spdlog installation command * update docs on how to checkout pybind11 * move bicubic_interpolate to backend_ops/tensorrt directory * remove useless code * correct cmake * fix typo * fix typo * fix install directory * correct sdk's readme * set cub dir when cuda version < 11.0 * change directory where clang-format will apply to * fix build command * add .clang-format * change clang-format style from google to file * reformat csrc/backend_ops * format sdk's code * turn off clang-format for some files * add -Xcompiler=-fno-gnu-unique * fix trt topk initialize * check in config for sdk demo * update cmake script and csrc's readme * correct config's path * add cuda include directory, otherwise compile failed in case of tensorrt8.2 * clang-format onnx2ncnn.cpp Co-authored-by: zhangli <lzhang329@gmail.com> Co-authored-by: grimoire <yaoqian@sensetime.com>
2021-12-07 10:57:55 +08:00
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_
#define MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_
#include <cuda_runtime.h>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <list>
#include <mutex>
#include <vector>
#include "core/logger.h"
#include "device/cuda/default_allocator.h"
namespace mmdeploy::cuda {
class BuddyAllocator {
public:
using size_type = std::size_t;
BuddyAllocator(size_type size, size_type block_size) {
block_size_ = block_size;
block_count_ = size / block_size_;
if (!IsPowerOfTwo(block_count_)) {
block_count_ = RoundToPowerOfTwo(block_count_);
WARN("Rounding up block_count to next power of 2 {}", block_count_);
}
base_ = LogPowerOfTwo(block_count_);
size_ = block_size_ * block_count_;
memory_ = gDefaultAllocator().Allocate(size_);
tree_.resize(block_count_ * 2);
free_.resize(base_ + 1);
Build(1, 0);
Add(1, 0);
ERROR("size = {}, block_size = {}, block_count = {}", size_, block_size_, block_count_);
size = size_;
for (int i = 0; i <= base_; ++i) {
ERROR("level {}, size = {}", i, size);
size /= 2;
}
}
~BuddyAllocator() {
for (int i = 0; i < free_.size(); ++i) {
ERROR("free_[{}].size(): {}", i, free_[i].size());
}
gDefaultAllocator().Deallocate(memory_, size_);
}
[[nodiscard]] void* Allocate(size_type n) {
std::lock_guard lock{mutex_};
if (n > size_) {
return nullptr;
}
auto n_level = GetLevel(n);
auto level = n_level;
for (; level >= 0; --level) {
if (!free_[level].empty()) {
break;
}
}
if (level < 0) {
WARN("failed to allocate memory size = {} bytes", n);
return nullptr;
}
for (; level < n_level; ++level) {
auto index = free_[level].front();
Split(index, level);
}
auto index = free_[level].front();
Del(index, level);
auto offset = (index ^ (1 << level)) << (base_ - level);
auto p = static_cast<uint8_t*>(memory_) + offset * block_size_;
return p;
}
void Deallocate(void* p, size_type n) {
std::lock_guard lock{mutex_};
auto offset = static_cast<uint8_t*>(p) - static_cast<uint8_t*>(memory_);
if (offset < 0 || offset % block_size_) {
ERROR("invalid address: {}", p);
}
offset /= static_cast<long>(block_size_);
auto level = GetLevel(n);
auto index = (offset >> (base_ - level)) ^ (1 << level);
Add(index, level);
while (index > 1) {
auto buddy = index ^ 1;
if (tree_[buddy] != free_[level].end()) {
Merge(index, level);
index /= 2;
--level;
} else {
break;
}
}
}
private:
void Add(size_type index, size_type level) {
assert(tree_[index] == free_[level].end());
tree_[index] = free_[level].insert(free_[level].end(), index);
}
void Del(size_type index, size_type level) {
assert(tree_[index] != free_[level].end());
free_[level].erase(tree_[index]);
tree_[index] = free_[level].end();
}
void Split(size_type index, size_type level) {
Del(index, level);
Add(index * 2, level + 1);
Add(index * 2 + 1, level + 1);
}
void Merge(size_type index, size_type level) {
Del(index, level);
Del(index ^ 1, level);
Add(index / 2, level - 1);
}
size_type GetLevel(size_type size) const {
size = RoundToPowerOfTwo((size + block_size_ - 1) / block_size_);
return base_ - LogPowerOfTwo(size);
}
static bool IsPowerOfTwo(size_type n) { return (n & (n - 1)) == 0; }
static size_type RoundToPowerOfTwo(size_type n) {
--n;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
n |= (n >> 32);
return ++n;
}
static size_type LogPowerOfTwo(size_type v) {
size_type r{};
r |= ((v & 0xFFFFFFFF00000000) != 0) << 5;
r |= ((v & 0xFFFF0000FFFF0000) != 0) << 4;
r |= ((v & 0xFF00FF00FF00FF00) != 0) << 3;
r |= ((v & 0xF0F0F0F0F0F0F0F0) != 0) << 2;
r |= ((v & 0xCCCCCCCCCCCCCCCC) != 0) << 1;
r |= ((v & 0xAAAAAAAAAAAAAAAA) != 0);
return r;
}
void Build(size_type index, size_type level) {
if (index < tree_.size()) {
tree_[index] = free_[level].end();
index *= 2;
++level;
Build(index, level);
Build(index + 1, level);
}
}
private:
size_type size_;
size_type block_size_;
size_type block_count_;
size_type base_;
void* memory_;
std::vector<std::list<size_type>::iterator> tree_;
std::vector<std::list<size_type> > free_;
std::mutex mutex_;
};
// Accessor for the shared BuddyAllocator instance. The instance is a function-local static,
// so it is constructed the first time this function is called.
inline BuddyAllocator& gBuddyAllocator() {
  constexpr unsigned kPoolBytes = 1U << 30;  // 1 GiB pool
  constexpr int kBlockBytes = 1024 * 64;     // 64 KiB blocks
  static BuddyAllocator pool(kPoolBytes, kBlockBytes);
  return pool;
}
} // namespace mmdeploy::cuda
#endif // MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_