From ab0c393935f1669858357706e84f040ab0437b5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BB=98=E8=B1=AA?= <fuhao16@huawei.com>
Date: Thu, 31 Jul 2025 10:26:08 +0800
Subject: [PATCH] =?UTF-8?q?[feat]=20persist=E5=86=85=E5=AD=98=E7=A9=BF?=
 =?UTF-8?q?=E5=88=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 third_party/acl/inc/acl/acl_rt.h              |  4 +
 third_party/acl/libs/acl.cpp                  |  1 +
 .../csrc/core/npu/NPUCachingAllocator.cpp     | 81 +++++++++++++++++--
 torch_npu/csrc/core/npu/NPUCachingAllocator.h | 10 ++-
 .../csrc/core/npu/interface/AclInterface.cpp  |  4 +
 .../csrc/core/npu/interface/AclInterface.h    |  2 +
 torch_npu/csrc/npu/MemPool.cpp                |  5 +-
 torch_npu/npu/memory.py                       |  8 +-
 8 files changed, 104 insertions(+), 11 deletions(-)

diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h
index ecc36f3812..d898d283e2 100755
--- a/third_party/acl/inc/acl/acl_rt.h
+++ b/third_party/acl/inc/acl/acl_rt.h
@@ -743,6 +743,10 @@ ACL_FUNC_VISIBILITY aclError aclrtMalloc(void **devPtr,
                                          size_t size,
                                          aclrtMemMallocPolicy policy);
 
+ACL_FUNC_VISIBILITY aclError aclrtMemAdvise(void *devPtr,
+                                            uint64_t count,
+                                            uint32_t advise);
+
 /**
  * @ingroup AscendCL
  * @brief alloc memory on device, real alloc size is aligned to 32 bytes with no padding
diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp
index 9bb32581dd..d5d1b96b40 100644
--- a/third_party/acl/libs/acl.cpp
+++ b/third_party/acl/libs/acl.cpp
@@ -45,6 +45,7 @@ aclError aclrtEventElapsedTime(float *ms, aclrtEvent start, aclrtEvent end){retu
 // memory相关操作
 aclError aclrtMalloc(void **devPtr, size_t size, aclrtMemMallocPolicy policy){return 0;}
 aclError aclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy){return 0;}
+aclError aclrtMemAdvise(void *devPtr, uint64_t count, uint32_t advise){return 0;}
 aclError aclrtMemcpy(void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind){return 0;}
 aclError aclrtMemcpyAsync(void *dst, size_t destMax, const void *src,
                           size_t count, aclrtMemcpyKind kind, aclrtStream stream){return 0;}
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index ddd89c9117..f48a878822 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -226,6 +226,7 @@ struct Block {
     // whereas context_when_allocated records the last time we handed this
     // memory out from our cache.
     std::shared_ptr<c10::GatheredContext> context_when_segment_allocated;
+    c10_npu::MemoryMode memory_mode{ c10_npu::MemoryMode::Normal }; // memory mode tag of this block; Normal unless a constructor sets another mode
 
     Block(int device, aclrtStream stream, size_t size, BlockPool *pool, void *ptr)
         : device(device),
@@ -240,7 +241,25 @@ struct Block {
           next(nullptr),
           event_count(0),
           gc_count(0)
-    {}
+    {
+        memory_mode = c10_npu::MemoryMode::Normal;
+    }
+
+    Block(int device, aclrtStream stream, size_t size, BlockPool *pool, void *ptr, c10_npu::MemoryMode memory_mode)
+        : device(device),
+          stream(stream),
+          stream_uses(),
+          size(size),
+          requested_size(0),
+          pool(pool),
+          ptr(ptr),
+          allocated(0),
+          prev(nullptr),
+          next(nullptr),
+          event_count(0),
+          gc_count(0),
+          memory_mode(memory_mode)
+    {}    
 
     // constructor for search key
     Block(int device, aclrtStream stream, size_t size)
@@ -256,7 +275,25 @@ struct Block {
           next(nullptr),
           event_count(0),
           gc_count(0)
-    {}
+    {
+        memory_mode = c10_npu::MemoryMode::Normal;
+    }
+
+    Block(int device, aclrtStream stream, size_t size, c10_npu::MemoryMode memory_mode)
+        : device(device),
+          stream(stream),
+          stream_uses(),
+          size(size),
+          requested_size(0),
+          pool(nullptr),
+          ptr(nullptr),
+          allocated(0),
+          prev(nullptr),
+          next(nullptr),
+          event_count(0),
+          gc_count(0),
+          memory_mode(memory_mode)
+    {}    
 
     bool is_split() const
     {
@@ -711,6 +748,9 @@ static bool BlockComparatorSize(const Block *a, const Block *b)
     if (a->stream != b->stream) {
         return reinterpret_cast<uintptr_t>(a->stream) < reinterpret_cast<uintptr_t>(b->stream);
     }
+    if (a->memory_mode != b->memory_mode) { // on the same stream, order by memory mode before size
+        return a->memory_mode < b->memory_mode; // NOTE(review): if one pool ever holds both modes, lower_bound(search_key) can return a block of the next mode that is smaller than requested — confirm lookups check memory_mode
+    }
     if (a->size != b->size) {
         return a->size < b->size;
     }
@@ -722,6 +762,9 @@ static bool BlockComparatorAddress(const Block *a, const Block *b)
     if (a->stream != b->stream) {
         return reinterpret_cast<uintptr_t>(a->stream) < reinterpret_cast<uintptr_t>(b->stream);
     }
+    if (a->memory_mode != b->memory_mode) { // on the same stream, order by memory mode before address
+        return a->memory_mode < b->memory_mode;
+    }
     return reinterpret_cast<uintptr_t>(a->ptr) < reinterpret_cast<uintptr_t>(b->ptr);
 }
 
@@ -730,6 +773,10 @@ struct AllocParams {
         : search_key(device, stream, size), pool(pool), alloc_size(alloc_size), block(nullptr), err(ACL_ERROR_NONE)
     {}
 
+    AllocParams(int device, size_t size, aclrtStream stream, BlockPool *pool, size_t alloc_size, DeviceStats &stats, c10_npu::MemoryMode memory_mode)
+        : search_key(device, stream, size, memory_mode), pool(pool), alloc_size(alloc_size), block(nullptr), err(ACL_ERROR_NONE)
+    {} // the search key carries memory_mode; stats is not used by this constructor
+
     int device() const
     {
         return search_key.device;
@@ -1351,7 +1398,14 @@ public:
         const size_t alloc_size = IsMallocPage1GMem(pool.is_small) ?
             kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) :
             get_allocation_size(size);
-        AllocParams params(device, size, stream, &pool, alloc_size, stats);
+
+        // Requests made while a MemPool is active take that pool's memory mode;
+        // every other request uses the Normal mode.
+        const auto active_pool = MemPoolContext::getActiveMemPool();
+        const c10_npu::MemoryMode memory_mode =
+            active_pool ? active_pool->pool_mode() : c10_npu::MemoryMode::Normal;
+        AllocParams params(device, size, stream, &pool, alloc_size, stats, memory_mode);
+
         params.stat_types = get_stat_types_for_pool(pool);
 
         // First, try to get a block from the existing pool.
@@ -1491,7 +1545,7 @@ public:
         if (split_remainder) {
             remaining = block;
 
-            block = new Block(device, stream, size, pool, block->ptr);
+            block = new Block(device, stream, size, pool, block->ptr, block->memory_mode);
             block->expandable_segment_ = remaining->expandable_segment_;
             block->prev = remaining->prev;
             if (block->prev) {
@@ -2695,6 +2749,7 @@ private:
             }
             return bool(p.block);
         } else {
+            bool is_persist = p.search_key.memory_mode == c10_npu::MemoryMode::L2_persist; // read from the search key; used after the allocation below succeeds
             auto active_pool = MemPoolContext::getActiveMemPool();
             if (active_pool && active_pool->allocator() && p.pool->owner_PrivatePool) {
                 ptr = active_pool->allocator()->raw_alloc(size);
@@ -2709,6 +2764,16 @@ private:
             if (p.err != ACL_ERROR_NONE) {
                 return false;
             }
+            if (is_persist) {
+                // The memory itself was allocated successfully, so a failed advise is
+                // treated as non-fatal; it is logged instead of being silently dropped.
+                ASCEND_LOGD("pta_memory aclrtMemWithAdvise: size = %zu", size);
+                const aclError advise_ret = c10_npu::acl::aclrtMemWithAdvise(ptr, static_cast<uint64_t>(size));
+                if (advise_ret != ACL_ERROR_NONE) {
+                    ASCEND_LOGW("pta_memory aclrtMemWithAdvise failed: size = %zu, ret = %d", size,
+                        static_cast<int>(advise_ret));
+                }
+            }
         }
 
         ASCEND_LOGD("NPUCachingAllocator malloc by AclrtMallocAlign32: size=%zu", size);
@@ -2719,7 +2778,7 @@ private:
         }
 
         total_allocated_memory += size;
-        p.block = new Block(p.device(), p.stream(), size, p.pool, (char *)ptr);
+        p.block = new Block(p.device(), p.stream(), size, p.pool, (char *)ptr, p.search_key.memory_mode);
         for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
             update_stat(stats.segment[stat_type], 1);
             update_stat(stats.reserved_bytes[stat_type], size);
@@ -3810,7 +3869,7 @@ std::atomic<CaptureId_t> MemPool::uid_{ 1 };
 std::atomic<CaptureId_t> MemPool::uuid_{ 1 };
 
 
-MemPool::MemPool(NPUCachingAllocator::NPUAllocator *allocator, bool is_user_created)
+MemPool::MemPool(NPUCachingAllocator::NPUAllocator *allocator, bool is_user_created, std::string pool_mode)
     : allocator_(allocator), is_user_created_(is_user_created)
 {
     if (is_user_created_) {
@@ -3818,6 +3877,15 @@ MemPool::MemPool(NPUCachingAllocator::NPUAllocator *allocator, bool is_user_crea
     } else {
         id_ = { uuid_++, 0 };
     }
+    // Map the user-facing mode name onto the enum. Unknown names are rejected
+    // so that a typo cannot silently fall back to Normal.
+    if (pool_mode == "L2_persist") {
+        pool_mode_ = c10_npu::MemoryMode::L2_persist;
+    } else {
+        TORCH_CHECK(pool_mode == "Normal", "Unsupported MemPool pool_mode '", pool_mode,
+            "', expected 'Normal' or 'L2_persist'");
+        pool_mode_ = c10_npu::MemoryMode::Normal;
+    }
 }
 
 MempoolId_t MemPool::id()
@@ -3830,6 +3894,11 @@ NPUCachingAllocator::NPUAllocator *MemPool::allocator()
     return allocator_;
 }
 
+c10_npu::MemoryMode MemPool::pool_mode()
+{
+    return pool_mode_; // set by the constructor from its pool_mode string
+}
+
 // Note that active_mempool_ is a global variable here
 // and not inside MemPoolContext class, because in windows we
 // can't use __declspec(dllexport) and __declspec(thread)
diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
index 13c68aa0e3..3c66ed8fb5 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h
@@ -473,6 +473,11 @@ bool isConfig1GPageSizeEnable();
 
 namespace c10_npu {
 
+enum class MemoryMode {
+    Normal,     // default mode; the allocator makes no advise call for the allocation
+    L2_persist  // the allocator passes the new segment to aclrtMemWithAdvise right after allocating it
+};
+
 // MemPool represents a pool of memory in a caching allocator. Currently,
 // it's just the ID of the pool object maintained in the NPUCachingAllocator.
 //
@@ -482,10 +487,12 @@ namespace c10_npu {
 struct C10_NPU_API MemPool {
     MemPool(
         NPUCachingAllocator::NPUAllocator* allocator = nullptr,
-        bool is_user_created = true);
+        bool is_user_created = true,
+        std::string pool_mode = "Normal");
 
     MempoolId_t id();
     NPUCachingAllocator::NPUAllocator* allocator();
+    c10_npu::MemoryMode pool_mode();
 
 private:
     static std::atomic<CaptureId_t> uid_;
@@ -493,6 +500,7 @@ private:
     NPUCachingAllocator::NPUAllocator* allocator_;
     bool is_user_created_;
     MempoolId_t id_;
+    c10_npu::MemoryMode pool_mode_;
 };
 
 // MemPoolContext holds the currently active pool and stashes the previous
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
index f277456dae..99031e5a75 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
@@ -475,6 +475,10 @@ aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy pol
     return ret;
 }
 
+aclError aclrtMemWithAdvise(void *devPtr, uint64_t count) { // forwards to aclrtMemAdvise with a fixed advise value; NOTE(review): calls the symbol directly, unlike AclrtStreamQuery below which uses a cached function pointer — confirm older runtimes export it
+    return aclrtMemAdvise(devPtr, count, 0U); // NOTE(review): the meaning of advise value 0 is not visible here — confirm against the runtime documentation
+}
+
 aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status) {
     typedef aclError (*AclrtStreamQuery)(aclrtStream, aclrtStreamStatus*);
     static AclrtStreamQuery func = nullptr;
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h
index e917e0ab97..3dc29dff5a 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.h
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.h
@@ -145,6 +145,8 @@ aclError AclrtGetDeviceUtilizationRate(int32_t deviceId, aclrtUtilizationInfo *u
 
 aclError AclrtMallocAlign32(void **devPtr, size_t size, aclrtMemMallocPolicy policy);
 
+aclError aclrtMemWithAdvise(void *devPtr, uint64_t count);
+
 aclError AclrtStreamQuery(aclrtStream stream, aclrtStreamStatus *status);
 
 bool can_device_access_peer(c10::DeviceIndex device_id, c10::DeviceIndex peer_device_id);
diff --git a/torch_npu/csrc/npu/MemPool.cpp b/torch_npu/csrc/npu/MemPool.cpp
index 7a14933686..db5a5c4518 100644
--- a/torch_npu/csrc/npu/MemPool.cpp
+++ b/torch_npu/csrc/npu/MemPool.cpp
@@ -11,9 +11,14 @@ using shared_ptr_class_ = py::class_<T, std::shared_ptr<T>>;
 void TORCH_NPU_API THNPMemPool_init(PyObject* module) {
     auto torch_C_m = py::handle(module).cast<py::module>();
     shared_ptr_class_<::c10_npu::MemPool>(torch_C_m, "_MemPool")
-        .def(py::init<c10_npu::NPUCachingAllocator::NPUAllocator*, bool>())
+        .def(py::init<c10_npu::NPUCachingAllocator::NPUAllocator*, bool, std::string>())
         .def_property_readonly("id", &::c10_npu::MemPool::id)
-        .def_property_readonly("allocator", &::c10_npu::MemPool::allocator);
+        .def_property_readonly("allocator", &::c10_npu::MemPool::allocator)
+        // MemoryMode is not registered with py::enum_ in this binding, so the mode
+        // is handed to Python as the same string that the constructor accepts.
+        .def_property_readonly("pool_mode", [](::c10_npu::MemPool &self) -> std::string {
+            return self.pool_mode() == ::c10_npu::MemoryMode::L2_persist ? "L2_persist" : "Normal";
+        });
     shared_ptr_class_<::c10_npu::MemPoolContext>(torch_C_m, "_MemPoolContext")
         .def(py::init<c10_npu::MemPool*>())
         .def_static(
diff --git a/torch_npu/npu/memory.py b/torch_npu/npu/memory.py
index 7447578782..89c6cfe6a6 100644
--- a/torch_npu/npu/memory.py
+++ b/torch_npu/npu/memory.py
@@ -657,8 +657,8 @@ class MemPool(torch_npu._C._MemPool):
 
     """
 
-    def __init__(self, allocator: Optional[torch_npu._C._npu_NPUAllocator] = None):
-        super().__init__(allocator, True)
+    def __init__(self, allocator: Optional[torch_npu._C._npu_NPUAllocator] = None, pool_mode: str = "Normal"):
+        super().__init__(allocator, True, pool_mode)
 
     @property
     def id(self) -> Tuple[int, int]:
@@ -669,6 +669,11 @@ class MemPool(torch_npu._C._MemPool):
     def allocator(self) -> Optional[torch_npu._C._npu_NPUAllocator]:
         r"""Returns the allocator this MemPool routes allocations to"""
         return super().allocator
+
+    @property
+    def pool_mode(self) -> str:
+        r"""Returns the memory mode this MemPool was created with"""
+        return super().pool_mode
 
 
 class MemPoolContext(torch_npu._C._MemPoolContext):
-- 
Gitee