Report NPU stream ids instead of raw stream pointers in memory profiling data.

MemoryUsage now carries the aclrtStream handle (field `stream`) in place of the
int64_t `stream_ptr`. The caching allocator and the workspace allocator pass
their stream handles through without casting. reportMemoryDataToNpuProfiler()
converts the handle to an int32_t id through the new
c10_npu::acl::AclrtStreamGetId() wrapper, which looks up aclrtStreamGetId on
its first call and raises via TORCH_CHECK when the symbol cannot be found.

The id starts at -1 and is reset to -1 whenever the ACL call does not return
ACL_ERROR_NONE, so a failed lookup never uploads an indeterminate value.
NOTE(review): -1 is assumed not to collide with a real stream id - confirm
against the consumer of MemoryData.

NOTE(review): template arguments (`static_cast(...)`, `reinterpret_cast(...)`,
`std::make_unique(`) and one `#include` target look stripped in this copy of
the patch, and context-line indentation is reconstructed; regenerate from the
repository before applying.

diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
index 1691427e4602ce281a79344d3a5af0b4b8db29ec..b5c34ceb292638dd52fc2b3a02207f4ba968b58c 100644
--- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp
@@ -1548,7 +1548,7 @@ public:
                 stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current,
                 stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current,
                 stats.active_bytes[static_cast(StatType::AGGREGATE)].current,
-                reinterpret_cast(block->stream) });
+                block->stream });
 #endif
 
         return block;
@@ -1613,7 +1613,7 @@ public:
                 stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current,
                 stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current,
                 stats.active_bytes[static_cast(StatType::AGGREGATE)].current,
-                reinterpret_cast(block->stream) });
+                block->stream });
 #endif
     }
 
@@ -2418,7 +2418,7 @@ private:
                 stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current,
                 stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current,
                 stats.active_bytes[static_cast(StatType::AGGREGATE)].current,
-                reinterpret_cast(block->stream) });
+                block->stream });
 #endif
     }
 
diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp
index 7d5173dec8d19f2f167609a32c5f02b0c199ae6a..1ff15d6f03662ca27433f98d987717d21055a801 100644
--- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp
+++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp
@@ -113,7 +113,7 @@ public:
                     stats.allocated_bytes.current,
                     stats.reserved_bytes.current,
                     stats.allocated_bytes.current,
-                    reinterpret_cast(stream)}
+                    stream }
                 );
 #endif
                 block->data_ptr = nullptr;
@@ -154,7 +154,7 @@ public:
                     stats.allocated_bytes.current,
                     stats.reserved_bytes.current,
                     stats.allocated_bytes.current,
-                    reinterpret_cast(stream)}
+                    stream }
                 );
                 this->last_block = block;
                 this->last_stream = stream;
@@ -180,7 +180,7 @@ public:
                     stats.allocated_bytes.current,
                     stats.reserved_bytes.current,
                     stats.allocated_bytes.current,
-                    reinterpret_cast(stream)}
+                    stream }
                 );
                 this->last_block = block;
                 this->last_stream = stream;
@@ -204,7 +204,7 @@ public:
                     stats.allocated_bytes.current,
                     stats.reserved_bytes.current,
                     stats.allocated_bytes.current,
-                    reinterpret_cast(this->last_stream)}
+                    this->last_stream }
                 );
             }
 #endif
@@ -254,7 +254,7 @@ public:
                     stats.allocated_bytes.current,
                     stats.reserved_bytes.current,
                     stats.allocated_bytes.current,
-                    reinterpret_cast(block_pair.first)}
+                    block_pair.first}
                 );
 #endif
         }
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
index eb625853ce580d4d88ec8379c1cdfd77c2826e32..94fd8c81b1e8f891376479a74a86e31a7e72d7d8 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
@@ -90,7 +90,7 @@ LOAD_FUNCTION(aclrtMemExportToShareableHandle)
 LOAD_FUNCTION(aclrtMemSetPidToShareableHandle)
 LOAD_FUNCTION(aclrtMemImportFromShareableHandle)
 LOAD_FUNCTION(aclrtDeviceGetBareTgid)
-
+LOAD_FUNCTION(aclrtStreamGetId)
 aclprofStepInfoPtr init_stepinfo()
 {
     typedef aclprofStepInfoPtr(*npdInitFunc)();
@@ -1034,5 +1034,16 @@ aclError AclrtDeviceGetBareTgid(int32_t *pid)
     return func(pid);
 }
 
+aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id)
+{
+    typedef aclError(*AclrtStreamGetIdFunc)(aclrtStream, int32_t*);
+    static AclrtStreamGetIdFunc func = nullptr;
+    if (func == nullptr) {
+        func = (AclrtStreamGetIdFunc)GET_FUNC(aclrtStreamGetId);
+    }
+    TORCH_CHECK(func, "Failed to find function ", "AclrtStreamGetId", PROF_ERROR(ErrCode::NOT_FOUND));
+    return func(stream, stream_id);
+}
+
 } // namespace acl
 } // namespace c10
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h
index 9ecc23ed42f342b971d026e6e812df9e860abacc..e04159afca93d6f02383bdb487d017f4985a50a8 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.h
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.h
@@ -245,5 +245,7 @@ aclError AclrtMemImportFromShareableHandle(uint64_t shareableHandle, int32_t dev
 
 aclError AclrtDeviceGetBareTgid(int32_t *pid);
 
+aclError AclrtStreamGetId(aclrtStream stream, int32_t* stream_id);
+
 } // namespace acl
 } // namespace c10_npu
diff --git a/torch_npu/csrc/profiler/npu_profiler.cpp b/torch_npu/csrc/profiler/npu_profiler.cpp
index 295eda9aea1f2a425c21caaf037e4f60713a463e..3678da07550d017db863d53ff2d7d05cf12f4edd 100644
--- a/torch_npu/csrc/profiler/npu_profiler.cpp
+++ b/torch_npu/csrc/profiler/npu_profiler.cpp
@@ -6,6 +6,7 @@
 
 #include "torch_npu/csrc/core/npu/npu_log.h"
 #include "torch_npu/csrc/core/npu/NPUException.h"
+#include "torch_npu/csrc/core/npu/interface/AclInterface.h"
 #include "torch_npu/csrc/profiler/npu_profiler.h"
 #include "torch_npu/csrc/toolkit/profiler/common/utils.h"
 
@@ -380,6 +381,11 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data)
     if (!ProfilerMgr::GetInstance()->ReportMemEnable().load()) {
         return;
     }
+    // Fall back to -1 when the lookup fails so the record never carries an indeterminate id.
+    int32_t stream_id = -1;
+    if (c10_npu::acl::AclrtStreamGetId(data.stream, &stream_id) != ACL_ERROR_NONE) {
+        stream_id = -1;
+    }
     ProfilerMgr::GetInstance()->UploadWithLock(std::make_unique(
         data.ptr,
         static_cast(Utils::GetClockTime()),
@@ -387,7 +393,7 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data)
         data.total_allocated,
         data.total_reserved,
         data.total_active,
-        data.stream_ptr,
+        stream_id,
         data.device_type,
         data.device_index,
         data.component_type,
diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h
index 2127825bc134e0b49178fe00890cdff58011e62c..854191dfb7669aa5e2c601874e4ad3787e85123b 100644
--- a/torch_npu/csrc/profiler/npu_profiler.h
+++ b/torch_npu/csrc/profiler/npu_profiler.h
@@ -7,6 +7,9 @@
 
 #include
 
+#include "third_party/acl/inc/acl/acl_base.h"
+#include "third_party/acl/inc/acl/acl_rt.h"
+
 #include "torch_npu/csrc/toolkit/profiler/inc/data_reporter.h"
 #include "torch_npu/csrc/profiler/profiler_mgr.h"
 #include "torch_npu/csrc/profiler/mstx_mgr.h"
@@ -55,7 +58,7 @@ struct MemoryUsage {
     int64_t total_allocated{0};
     int64_t total_reserved{0};
     int64_t total_active{0};
-    int64_t stream_ptr{0};
+    aclrtStream stream{nullptr};
 };
 
 struct ExperimentalConfig {