来源
说明
这段代码定义了一个 BaseMemory 类,主要用于管理 CPU 和 GPU 内存的分配和释放。以下是其主要功能和操作:
- GPU 内存重新分配:gpu_realloc 方法用于重新分配 GPU 内存,如果现有 GPU 内存容量小于请求的大小。
- CPU 内存重新分配:cpu_realloc 方法用于重新分配 CPU 内存,如果现有 CPU 内存容量小于请求的大小。
- 释放 CPU 内存:release_cpu 方法用于释放 CPU 内存,如果 cpu_ 不为空且当前对象是该内存的所有者。
- 释放 GPU 内存:release_gpu 方法用于释放 GPU 内存,如果 gpu_ 不为空且当前对象是该内存的所有者。
- 释放所有内存:release 方法用于同时释放 CPU 和 GPU 内存。
- 内存对齐函数:upbound 函数用于将给定大小 n 向上对齐到 align 的最小倍数。
内存对齐是我修改后加入的功能,后续分配内存时无需再手动考虑内存对齐。
头文件
memory.hpp
#ifndef __MEMORY_HPP__
#define __MEMORY_HPP__
#include <cstddef>
#include <initializer_list>
#include <memory>
#include <string>
#include <vector>
namespace tensor
{
// Manages a paired CPU (pinned host) buffer and GPU (device) buffer.
// Buffers allocated by this class are freed on destruction; buffers
// installed via reference() stay owned by the caller and are never freed.
class BaseMemory
{
public:
    BaseMemory() = default;
    // Wraps externally owned buffers without taking ownership of them.
    BaseMemory(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes);
    virtual ~BaseMemory();
    // Non-copyable: this class holds raw owning pointers, so a shallow
    // copy would double-free cpu_/gpu_ when both objects are destroyed.
    BaseMemory(const BaseMemory &) = delete;
    BaseMemory &operator=(const BaseMemory &) = delete;
    // Grow-only (re)allocation; previous contents are NOT preserved
    // when the buffer is reallocated.
    virtual void *gpu_realloc(size_t bytes);
    virtual void *cpu_realloc(size_t bytes);
    void release_gpu();
    void release_cpu();
    void release();
    // true when this object allocated (and must free) the buffer.
    inline bool owner_gpu() const { return owner_gpu_; }
    inline bool owner_cpu() const { return owner_cpu_; }
    // Logical (requested) sizes in bytes; capacity may be larger.
    inline size_t cpu_bytes() const { return cpu_bytes_; }
    inline size_t gpu_bytes() const { return gpu_bytes_; }
    virtual inline void *get_gpu() const { return gpu_; }
    virtual inline void *get_cpu() const { return cpu_; }
    // Drops current buffers and adopts the given external ones (non-owning).
    void reference(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes);
protected:
    void *cpu_ = nullptr;
    size_t cpu_bytes_ = 0;     // logical size requested by the caller
    size_t cpu_capacity_ = 0;  // actual (aligned) allocation size
    bool owner_cpu_ = true;
    void *gpu_ = nullptr;
    size_t gpu_bytes_ = 0;
    size_t gpu_capacity_ = 0;
    bool owner_gpu_ = true;
};
template <typename _DT> class Memory : public BaseMemory
{
public:
Memory() = default;
Memory(const Memory &other) = delete;
Memory &operator=(const Memory &other) = delete;
virtual _DT *gpu(size_t size) { return (_DT *)BaseMemory::gpu_realloc(size * sizeof(_DT)); }
virtual _DT *cpu(size_t size) { return (_DT *)BaseMemory::cpu_realloc(size * sizeof(_DT)); }
inline size_t cpu_size() const { return cpu_bytes_ / sizeof(_DT); }
inline size_t gpu_size() const { return gpu_bytes_ / sizeof(_DT); }
virtual inline _DT *gpu() const { return (_DT *)gpu_; }
virtual inline _DT *cpu() const { return (_DT *)cpu_; }
};
} // namespace tensor
实现代码
memory.cu
#include "common/check.hpp"
#include "common/memory.hpp"
#include <cuda_runtime.h>
namespace tensor
{
using namespace std;
// Round n up to the nearest multiple of align (align must be non-zero).
static size_t upbound(size_t n, size_t align)
{
    const size_t remainder = n % align;
    return remainder == 0 ? n : n + (align - remainder);
}
// Constructs a memory object that references (does not own) the given
// host/device buffers; see reference() for the ownership rules.
BaseMemory::BaseMemory(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes)
{
    reference(cpu, cpu_bytes, gpu, gpu_bytes);
}
// Drops whatever this object currently holds and adopts the caller's
// buffers without taking ownership. A null pointer or a zero size is
// treated as "no buffer on that side".
void BaseMemory::reference(void *cpu, size_t cpu_bytes, void *gpu, size_t gpu_bytes)
{
    release();

    const bool has_cpu = (cpu != nullptr && cpu_bytes > 0);
    const bool has_gpu = (gpu != nullptr && gpu_bytes > 0);

    this->cpu_ = has_cpu ? cpu : nullptr;
    this->cpu_bytes_ = this->cpu_capacity_ = has_cpu ? cpu_bytes : 0;
    // Externally supplied memory must never be freed by this object.
    this->owner_cpu_ = !has_cpu;

    this->gpu_ = has_gpu ? gpu : nullptr;
    this->gpu_bytes_ = this->gpu_capacity_ = has_gpu ? gpu_bytes : 0;
    this->owner_gpu_ = !has_gpu;
}
// Frees any buffers this object owns (see release()).
BaseMemory::~BaseMemory() { release(); }
// Ensures at least `bytes` of device memory and returns the pointer.
// Grow-only: shrinking requests reuse the existing allocation. Previous
// contents are NOT preserved when a larger buffer must be allocated.
void *BaseMemory::gpu_realloc(size_t bytes)
{
    // Align the actual allocation to 32 bytes.
    size_t size = upbound(bytes, 32);
    if (gpu_capacity_ < size)
    {
        release_gpu();
        checkRuntime(cudaMalloc(&gpu_, size));
        // Record capacity only after the allocation succeeded.
        gpu_capacity_ = size;
        // BUGFIX: we allocated this buffer ourselves, so we must own it —
        // even if the previous buffer came from reference() (owner_gpu_
        // was false), otherwise this allocation would leak on release.
        owner_gpu_ = true;
        // checkRuntime(cudaMemset(gpu_, 0, size));
    }
    gpu_bytes_ = bytes;
    return gpu_;
}
// Ensures at least `bytes` of pinned host memory and returns the pointer.
// Grow-only: shrinking requests reuse the existing allocation. Previous
// contents are NOT preserved when a larger buffer must be allocated.
void *BaseMemory::cpu_realloc(size_t bytes)
{
    // Align the actual allocation to 32 bytes.
    size_t size = upbound(bytes, 32);
    if (cpu_capacity_ < size)
    {
        release_cpu();
        checkRuntime(cudaMallocHost(&cpu_, size));
        Assert(cpu_ != nullptr);
        // Record capacity only after the allocation succeeded.
        cpu_capacity_ = size;
        // BUGFIX: we allocated this buffer ourselves, so we must own it —
        // even if the previous buffer came from reference() (owner_cpu_
        // was false), otherwise this allocation would leak on release.
        owner_cpu_ = true;
        // memset(cpu_, 0, size);
    }
    cpu_bytes_ = bytes;
    return cpu_;
}
// Releases the host buffer. Memory is freed only if this object owns it;
// buffers installed via reference() belong to the caller.
void BaseMemory::release_cpu()
{
    if (cpu_)
    {
        if (owner_cpu_)
        {
            checkRuntime(cudaFreeHost(cpu_));
        }
        cpu_ = nullptr;
    }
    // BUGFIX: restore ownership so a subsequent cpu_realloc() owns (and
    // later frees) its own allocation even after referencing external memory.
    owner_cpu_ = true;
    cpu_capacity_ = 0;
    cpu_bytes_ = 0;
}
// Releases the device buffer. Memory is freed only if this object owns it;
// buffers installed via reference() belong to the caller.
void BaseMemory::release_gpu()
{
    if (gpu_)
    {
        if (owner_gpu_)
        {
            checkRuntime(cudaFree(gpu_));
        }
        gpu_ = nullptr;
    }
    // BUGFIX: restore ownership so a subsequent gpu_realloc() owns (and
    // later frees) its own allocation even after referencing external memory.
    owner_gpu_ = true;
    gpu_capacity_ = 0;
    gpu_bytes_ = 0;
}
// Releases both the device and the host buffer (each only if owned).
void BaseMemory::release()
{
    release_gpu();
    release_cpu();
}
} // namespace tensor