From ab0c393935f1669858357706e84f040ab0437b5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BB=98=E8=B1=AA?=
Date: Thu, 31 Jul 2025 10:26:08 +0800
Subject: [PATCH] =?UTF-8?q?[feat]=20persist=E5=86=85=E5=AD=98=E7=A9=BF?=
 =?UTF-8?q?=E5=88=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 third_party/acl/inc/acl/acl_rt.h              |  4 +
 third_party/acl/libs/acl.cpp                  |  1 +
 .../csrc/core/npu/NPUCachingAllocator.cpp     | 82 +++++++++++++++++-
 torch_npu/csrc/core/npu/NPUCachingAllocator.h | 11 ++-
 .../csrc/core/npu/interface/AclInterface.cpp  |  7 +
 .../csrc/core/npu/interface/AclInterface.h    |  2 +
 torch_npu/csrc/npu/MemPool.cpp                |  8 +-
 torch_npu/npu/memory.py                       |  9 +-
 8 files changed, 115 insertions(+), 9 deletions(-)

diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h
index ecc36f3812..d898d283e2 100755
--- a/third_party/acl/inc/acl/acl_rt.h
+++ b/third_party/acl/inc/acl/acl_rt.h
@@ -743,6 +743,10 @@ ACL_FUNC_VISIBILITY aclError aclrtMalloc(void **devPtr,
                                          size_t size,
                                          aclrtMemMallocPolicy policy);
 
+ACL_FUNC_VISIBILITY aclError aclrtMemAdvise(void *devPtr,
+                                            uint64_t count,
+                                            uint32_t advise);
+
 /**
  * @ingroup AscendCL
  * @brief alloc memory on device, real alloc size is aligned to 32 bytes with no padding
diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp
index 9bb32581dd..d5d1b96b40 100644
--- a/third_party/acl/libs/acl.cpp
+++ b/third_party/acl/libs/acl.cpp
@@ -45,6 +45,7 @@ aclError aclrtEventElapsedTime(float *ms, aclrtEvent start, aclrtEvent end){retu
 // memory相关操作
 aclError aclrtMalloc(void **devPtr, size_t size, aclrtMemMallocPolicy policy){return 0;}
 aclError aclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy){return 0;}
+aclError aclrtMemAdvise(void *devPtr, uint64_t count, uint32_t advise){return 0;}
 aclError aclrtMemcpy(void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind){return 0;}
 aclError aclrtMemcpyAsync(void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind, aclrtStream stream){return 0;}
 
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index ddd89c9117..f48a878822 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -226,6 +226,7 @@ struct Block {
     // whereas context_when_allocated records the last time we handed this
     // memory out from our cache.
     std::shared_ptr<c10::GatheredContext> context_when_segment_allocated;
+    c10_npu::MemoryMode memory_mode{ c10_npu::MemoryMode::Normal }; // mode of the MemPool active at allocation
 
     Block(int device, aclrtStream stream, size_t size, BlockPool *pool, void *ptr)
         : device(device),
@@ -240,7 +241,24 @@ struct Block {
           next(nullptr),
           event_count(0),
           gc_count(0)
     {}
+
+    // constructor for a real block that carries an explicit memory mode
+    Block(int device, aclrtStream stream, size_t size, BlockPool *pool, void *ptr, c10_npu::MemoryMode memory_mode)
+        : device(device),
+          stream(stream),
+          stream_uses(),
+          size(size),
+          requested_size(0),
+          pool(pool),
+          ptr(ptr),
+          allocated(0),
+          prev(nullptr),
+          next(nullptr),
+          event_count(0),
+          gc_count(0),
+          memory_mode(memory_mode)
+    {}
 
     // constructor for search key
     Block(int device, aclrtStream stream, size_t size)
@@ -256,7 +274,24 @@ struct Block {
           next(nullptr),
           event_count(0),
           gc_count(0)
     {}
+
+    // constructor for a search key that also matches on the memory mode
+    Block(int device, aclrtStream stream, size_t size, c10_npu::MemoryMode memory_mode)
+        : device(device),
+          stream(stream),
+          stream_uses(),
+          size(size),
+          requested_size(0),
+          pool(nullptr),
+          ptr(nullptr),
+          allocated(0),
+          prev(nullptr),
+          next(nullptr),
+          event_count(0),
+          gc_count(0),
+          memory_mode(memory_mode)
+    {}
 
     bool is_split() const
     {
@@ -711,6 +746,12 @@ static bool BlockComparatorSize(const Block *a, const Block *b)
     if (a->stream != b->stream) {
         return reinterpret_cast<uintptr_t>(a->stream) < reinterpret_cast<uintptr_t>(b->stream);
     }
+    // NOTE(review): any lower_bound lookup on this ordering presumably has to verify memory_mode on the
+    // block it finds, since the first block not less than the key may belong to another mode and be
+    // smaller than requested - confirm in get_free_block.
+    if (a->memory_mode != b->memory_mode) {
+        return a->memory_mode < b->memory_mode;
+    }
     if (a->size != b->size) {
         return a->size < b->size;
     }
@@ -722,6 +763,9 @@ static bool BlockComparatorAddress(const Block *a, const Block *b)
     if (a->stream != b->stream) {
         return reinterpret_cast<uintptr_t>(a->stream) < reinterpret_cast<uintptr_t>(b->stream);
     }
+    if (a->memory_mode != b->memory_mode) {
+        return a->memory_mode < b->memory_mode;
+    }
     return reinterpret_cast<uintptr_t>(a->ptr) < reinterpret_cast<uintptr_t>(b->ptr);
 }
 
@@ -730,6 +774,10 @@ struct AllocParams {
         : search_key(device, stream, size), pool(pool), alloc_size(alloc_size), block(nullptr), err(ACL_ERROR_NONE)
     {}
 
+    AllocParams(int device, size_t size, aclrtStream stream, BlockPool *pool, size_t alloc_size, DeviceStats &stats, c10_npu::MemoryMode memory_mode)
+        : search_key(device, stream, size, memory_mode), pool(pool), alloc_size(alloc_size), block(nullptr), err(ACL_ERROR_NONE)
+    {}
+
     int device() const
     {
         return search_key.device;
@@ -1351,7 +1399,14 @@ public:
         const size_t alloc_size = IsMallocPage1GMem(pool.is_small) ?
             kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) :
             get_allocation_size(size);
-        AllocParams params(device, size, stream, &pool, alloc_size, stats);
+
+        c10_npu::MemoryMode memory_mode = c10_npu::MemoryMode::Normal;
+        auto active_pool = MemPoolContext::getActiveMemPool();
+        if (active_pool) {
+            memory_mode = active_pool->pool_mode();
+        }
+        AllocParams params(device, size, stream, &pool, alloc_size, stats, memory_mode);
+
         params.stat_types = get_stat_types_for_pool(pool);
 
         // First, try to get a block from the existing pool.
@@ -1491,7 +1546,7 @@ public:
 
         if (split_remainder) {
             remaining = block;
-            block = new Block(device, stream, size, pool, block->ptr);
+            block = new Block(device, stream, size, pool, block->ptr, block->memory_mode);
             block->expandable_segment_ = remaining->expandable_segment_;
             block->prev = remaining->prev;
             if (block->prev) {
@@ -2695,6 +2750,7 @@ private:
             }
             return bool(p.block);
         } else {
+            const bool is_persist = p.search_key.memory_mode == c10_npu::MemoryMode::L2_persist;
             auto active_pool = MemPoolContext::getActiveMemPool();
             if (active_pool && active_pool->allocator() && p.pool->owner_PrivatePool) {
                 ptr = active_pool->allocator()->raw_alloc(size);
@@ -2709,6 +2765,14 @@ private:
                 if (p.err != ACL_ERROR_NONE) {
                     return false;
                 }
+                if (is_persist) {
+                    ASCEND_LOGD("pta_memory aclrtMemWithAdvise: size = %zu", size);
+                    // Keep the segment even if the advise fails, but make the failure visible in the log.
+                    const aclError advise_err = c10_npu::acl::aclrtMemWithAdvise(ptr, static_cast<uint64_t>(size));
+                    if (advise_err != ACL_ERROR_NONE) {
+                        ASCEND_LOGW("pta_memory aclrtMemWithAdvise failed: size = %zu, ret = %d", size, advise_err);
+                    }
+                }
             }
 
             ASCEND_LOGD("NPUCachingAllocator malloc by AclrtMallocAlign32: size=%zu", size);
@@ -2719,7 +2783,7 @@ private:
         }
 
         total_allocated_memory += size;
-        p.block = new Block(p.device(), p.stream(), size, p.pool, (char *)ptr);
+        p.block = new Block(p.device(), p.stream(), size, p.pool, (char *)ptr, p.search_key.memory_mode);
         for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
             update_stat(stats.segment[stat_type], 1);
             update_stat(stats.reserved_bytes[stat_type], size);
@@ -3810,7 +3874,7 @@ std::atomic<CaptureId_t> MemPool::uid_{ 1 };
 
 std::atomic<CaptureId_t> MemPool::uuid_{ 1 };
 
-MemPool::MemPool(NPUCachingAllocator::NPUAllocator *allocator, bool is_user_created)
+MemPool::MemPool(NPUCachingAllocator::NPUAllocator *allocator, bool is_user_created, std::string pool_mode)
     : allocator_(allocator), is_user_created_(is_user_created)
 {
     if (is_user_created_) {
@@ -3818,6 +3882,11 @@ MemPool::MemPool(NPUCachingAllocator::NPUAllocator *allocator, bool is_user_crea
     } else {
         id_ = { uuid_++, 0 };
     }
+    // Reject unknown mode names instead of silently falling back to Normal.
+    TORCH_CHECK(pool_mode == "Normal" || pool_mode == "L2_persist",
+        "Unsupported MemPool pool_mode '", pool_mode, "', expected \"Normal\" or \"L2_persist\".",
+        PTA_ERROR(ErrCode::PARAM));
+    pool_mode_ = (pool_mode == "L2_persist") ? c10_npu::MemoryMode::L2_persist : c10_npu::MemoryMode::Normal;
 }
 
 MempoolId_t MemPool::id()
@@ -3830,6 +3899,11 @@ NPUCachingAllocator::NPUAllocator *MemPool::allocator()
     return allocator_;
 }
 
+c10_npu::MemoryMode MemPool::pool_mode()
+{
+    return pool_mode_;
+}
+
 // Note that active_mempool_ is a global variable here
 // and not inside MemPoolContext class, because in windows we
 // can't use __declspec(dllexport) and __declspec(thread)
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
index 13c68aa0e3..3c66ed8fb5 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
@@ -473,6 +473,12 @@ bool isConfig1GPageSizeEnable();
 
 namespace c10_npu {
 
+// Memory mode of a MemPool; segments allocated while the pool is active inherit it.
+enum class MemoryMode {
+    Normal,     // no extra handling after allocation
+    L2_persist  // aclrtMemAdvise is applied to every newly allocated segment
+};
+
 // MemPool represents a pool of memory in a caching allocator. Currently,
 // it's just the ID of the pool object maintained in the NPUCachingAllocator.
 //
@@ -482,10 +488,12 @@ namespace c10_npu {
 struct C10_NPU_API MemPool {
     MemPool(
         NPUCachingAllocator::NPUAllocator* allocator = nullptr,
-        bool is_user_created = true);
+        bool is_user_created = true,
+        std::string pool_mode = "Normal");
 
     MempoolId_t id();
     NPUCachingAllocator::NPUAllocator* allocator();
+    c10_npu::MemoryMode pool_mode();
 
 private:
     static std::atomic<CaptureId_t> uid_;
@@ -493,6 +501,7 @@ private:
     NPUCachingAllocator::NPUAllocator* allocator_;
     bool is_user_created_;
     MempoolId_t id_;
+    c10_npu::MemoryMode pool_mode_;
 };
 
 // MemPoolContext holds the currently active pool and stashes the previous
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
index f277456dae..99031e5a75 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
@@ -475,6 +475,13 @@ aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy pol
     return ret;
 }
 
+// Forwards devPtr and count to aclrtMemAdvise with a fixed advise value of 0.
+// NOTE(review): 0 is presumably the L2-persist advise type; confirm against the
+// CANN headers and switch to the named constant if one exists.
+aclError aclrtMemWithAdvise(void *devPtr, uint64_t count) {
+    return aclrtMemAdvise(devPtr, count, 0U);
+}
+
 aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status) {
     typedef aclError (*AclrtStreamQuery)(aclrtStream, aclrtStreamStatus*);
     static AclrtStreamQuery func = nullptr;
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h
index e917e0ab97..3dc29dff5a 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.h
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.h
@@ -145,6 +145,8 @@ aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *u
 
 aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy);
 
+aclError aclrtMemWithAdvise(void *devPtr, uint64_t count);
+
 aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status);
 
 bool can_device_access_peer(c10::DeviceIndex device_id, c10::DeviceIndex peer_device_id);
diff --git a/torch_npu/csrc/npu/MemPool.cpp b/torch_npu/csrc/npu/MemPool.cpp
index 7a14933686..db5a5c4518 100644
--- a/torch_npu/csrc/npu/MemPool.cpp
+++ b/torch_npu/csrc/npu/MemPool.cpp
@@ -11,9 +11,13 @@ using shared_ptr_class_ = py::class_<T, std::shared_ptr<T>>;
 void TORCH_NPU_API THNPMemPool_init(PyObject* module) {
     auto torch_C_m = py::handle(module).cast<py::module>();
     shared_ptr_class_<::c10_npu::MemPool>(torch_C_m, "_MemPool")
-        .def(py::init<c10_npu::NPUCachingAllocator::NPUAllocator*, bool>())
+        .def(py::init<c10_npu::NPUCachingAllocator::NPUAllocator*, bool, std::string>())
         .def_property_readonly("id", &::c10_npu::MemPool::id)
-        .def_property_readonly("allocator", &::c10_npu::MemPool::allocator);
+        .def_property_readonly("allocator", &::c10_npu::MemPool::allocator)
+        // MemoryMode has no Python binding; expose the mode under the names the constructor accepts.
+        .def_property_readonly("pool_mode", [](::c10_npu::MemPool &self) -> std::string {
+            return self.pool_mode() == ::c10_npu::MemoryMode::L2_persist ? "L2_persist" : "Normal";
+        });
     shared_ptr_class_<::c10_npu::MemPoolContext>(torch_C_m, "_MemPoolContext")
         .def(py::init<c10_npu::MemPool*>())
         .def_static(
diff --git a/torch_npu/npu/memory.py b/torch_npu/npu/memory.py
index 7447578782..89c6cfe6a6 100644
--- a/torch_npu/npu/memory.py
+++ b/torch_npu/npu/memory.py
@@ -657,8 +657,8 @@ class MemPool(torch_npu._C._MemPool):
 
     """
 
-    def __init__(self, allocator: Optional[torch_npu._C._npu_NPUAllocator] = None):
-        super().__init__(allocator, True)
+    def __init__(self, allocator: Optional[torch_npu._C._npu_NPUAllocator] = None, pool_mode: str = "Normal"):
+        super().__init__(allocator, True, pool_mode)
 
     @property
     def id(self) -> Tuple[int, int]:
@@ -669,6 +669,11 @@ class MemPool(torch_npu._C._MemPool):
     def allocator(self) -> Optional[torch_npu._C._npu_NPUAllocator]:
         r"""Returns the allocator this MemPool routes allocations to"""
         return super().allocator
+
+    @property
+    def pool_mode(self) -> str:
+        r"""Returns the memory mode of this MemPool, either "Normal" or "L2_persist" """
+        return super().pool_mode
 
 
 class MemPoolContext(torch_npu._C._MemPoolContext):
-- 
Gitee