From 94b0211ca3c1d8510c8a2737dc5aa0c56f0ceaed Mon Sep 17 00:00:00 2001 From: yu-liang-bin Date: Tue, 15 Jul 2025 16:05:52 +0800 Subject: [PATCH] fix memory bug --- .../csrc/core/npu/NPUCachingAllocator.cpp | 6 ++--- .../csrc/core/npu/NPUWorkspaceAllocator.cpp | 10 +++---- .../csrc/core/npu/interface/AclInterface.cpp | 26 ++++++++++++++++++- .../csrc/core/npu/interface/AclInterface.h | 4 +++ torch_npu/csrc/profiler/npu_profiler.cpp | 5 +++- torch_npu/csrc/profiler/npu_profiler.h | 5 +++- 6 files changed, 45 insertions(+), 11 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 1691427e46..b5c34ceb29 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1548,7 +1548,7 @@ public: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif return block; @@ -1613,7 +1613,7 @@ public: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif } @@ -2418,7 +2418,7 @@ private: stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream) }); + block->stream }); #endif } diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 7d5173dec8..1ff15d6f03 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -113,7 +113,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); #endif block->data_ptr = nullptr; @@ -154,7 +154,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); this->last_block = block; this->last_stream = stream; @@ -180,7 +180,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(stream)} + stream } ); this->last_block = block; this->last_stream = stream; @@ -204,7 +204,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(this->last_stream)} + this->last_stream } ); } #endif @@ -254,7 +254,7 @@ public: stats.allocated_bytes.current, stats.reserved_bytes.current, stats.allocated_bytes.current, - reinterpret_cast(block_pair.first)} + block_pair.first} ); #endif } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index f5bf5b9308..94fd8c81b1 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -89,7 +89,8 @@ LOAD_FUNCTION(aclrtIpcMemClose) LOAD_FUNCTION(aclrtMemExportToShareableHandle) LOAD_FUNCTION(aclrtMemSetPidToShareableHandle) LOAD_FUNCTION(aclrtMemImportFromShareableHandle) - +LOAD_FUNCTION(aclrtDeviceGetBareTgid) +LOAD_FUNCTION(aclrtStreamGetId) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -1021,5 +1022,28 @@ aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t dev return func(shareableHandle, deviceId, handle); } +aclError AclrtDeviceGetBareTgid(int32_t *pid) +{ + typedef aclError (*AclrtDeviceGetBareTgid)(int32_t *); + static AclrtDeviceGetBareTgid func = nullptr; + if (func == nullptr) { + func = (AclrtDeviceGetBareTgid) GET_FUNC(aclrtDeviceGetBareTgid); + } + + TORCH_CHECK(func, "Failed to find function aclrtDeviceGetBareTgid", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(pid); +} + +aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id) +{ + typedef aclError(*AclrtStreamGetIdFunc)(aclrtStream, int32_t*); + static AclrtStreamGetIdFunc func = nullptr; + if (func == nullptr) { + func = (AclrtStreamGetIdFunc)GET_FUNC(aclrtStreamGetId); + } + TORCH_CHECK(func, "Failed to find function ", "AclrtStreamGetId", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(stream, stream_id); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index f2c991b19f..e04159afca 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -243,5 +243,9 @@ aclError AclrtMemSetPidToShareableHandle(uint64_t shareableHandle, int32_t *pid, aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t deviceId, aclrtDrvMemHandle *handle); +aclError AclrtDeviceGetBareTgid(int32_t *pid); + +aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/profiler/npu_profiler.cpp b/torch_npu/csrc/profiler/npu_profiler.cpp index 295eda9aea..3678da0755 100644 --- a/torch_npu/csrc/profiler/npu_profiler.cpp +++ b/torch_npu/csrc/profiler/npu_profiler.cpp @@ -6,6 +6,7 @@ #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/interface/AclInterface.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" @@ -380,6 +381,8 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data) if (!ProfilerMgr::GetInstance()->ReportMemEnable().load()) { return; } + int32_t stream_id; + c10_npu::acl::AclrtStreamGetId(data.stream, &stream_id); ProfilerMgr::GetInstance()->UploadWithLock(std::make_unique( data.ptr, static_cast(Utils::GetClockTime()), @@ -387,7 +390,7 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data) data.total_allocated, data.total_reserved, data.total_active, - data.stream_ptr, + stream_id, data.device_type, data.device_index, data.component_type, diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 2127825bc1..854191dfb7 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -7,6 +7,9 @@ #include +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" + #include "torch_npu/csrc/toolkit/profiler/inc/data_reporter.h" #include "torch_npu/csrc/profiler/profiler_mgr.h" #include "torch_npu/csrc/profiler/mstx_mgr.h" @@ -55,7 +58,7 @@ struct MemoryUsage { int64_t total_allocated{0}; int64_t total_reserved{0}; int64_t total_active{0}; - int64_t stream_ptr{0}; + aclrtStream stream{nullptr}; }; struct ExperimentalConfig { -- Gitee