From f467abc2538645bc30b578a4b3f77a33dd61a161 Mon Sep 17 00:00:00 2001 From: c00845255 Date: Wed, 30 Jul 2025 14:53:26 +0800 Subject: [PATCH 1/2] add logs --- torch_npu/__init__.py | 7 ++++++ torch_npu/csrc/InitNpuBindings.cpp | 11 ++++++++- torch_npu/csrc/core/npu/NPUEventManager.cpp | 15 ++++++++++++ torch_npu/csrc/core/npu/NPUFunctions.cpp | 24 +++++++++++++++++++ torch_npu/csrc/core/npu/NPUStream.cpp | 9 +++++++ .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 13 ++++++++++ torch_npu/utils/_error_code.py | 3 +++ 7 files changed, 81 insertions(+), 1 deletion(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index f571d55240..9e09a5118a 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -7,6 +7,7 @@ import atexit import traceback import ctypes import warnings +import time from functools import wraps @@ -272,11 +273,17 @@ del DefaultDeviceType # NPU exit, need to synchronize devices def _npu_shutdown(): + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function _npu_shutdown_synchronize", flush=True) success = torch_npu._C._npu_shutdown_synchronize() + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function _destructor_process_group", flush=True) torch_npu.distributed.distributed_c10d._destructor_process_group() + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function _npu_shutdown", flush=True) torch_npu._C._npu_shutdown(success) + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function handle_exception", flush=True) _except_handler.handle_exception() + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function matmul_check._cleanup", flush=True) torch_npu.asd.asd.matmul_check._cleanup() + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Exiting function matmul_check._cleanup", flush=True) # register npu shutdown hook on exit diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 672b5289f5..570549ea86 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -94,12 +94,15 @@ PyObject* THPModule_npu_shutdown(PyObject* self, PyObject* arg) PyObject* THPModule_npu_shutdown_synchronize(PyObject* /* unused */) { - ASCEND_LOGI("NPU shutdown synchronize begin."); + ASCEND_LOGE("THPModule_npu_shutdown_synchronize begin."); if (!c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + ASCEND_LOGE("NPU not initialized, GetInitFlag() returned false."); Py_RETURN_FALSE; } + ASCEND_LOGE("Stopping StressDetector worker thread..."); StressDetector::stop_worker_thread(); + ASCEND_LOGE("StressDetector worker thread stopped."); // Return aclrtSynchronizeDevice result. If sync device fails, release host // resources forcibly, only record WARN logs when acl interface of stream @@ -108,12 +111,18 @@ PyObject* THPModule_npu_shutdown_synchronize(PyObject* /* unused */) try { ASCEND_LOGI("NPU shutdown synchronize device."); success = c10_npu::npuSynchronizeUsedDevices(false); + if (success) { + ASCEND_LOGE("NPU devices synchronized successfully."); + } else { + ASCEND_LOGE("npuSynchronizeUsedDevices returned false, synchronization may have failed."); + } } catch (std::exception& e) { ASCEND_LOGE("npuSynchronizeDevice failed err=:%s", e.what()); success = false; } if (success) { + ASCEND_LOGE("NPU shutdown synchronize device completed successfully."); Py_RETURN_TRUE; } else { ASCEND_LOGE("NPU shutdown synchronize device failed."); diff --git a/torch_npu/csrc/core/npu/NPUEventManager.cpp b/torch_npu/csrc/core/npu/NPUEventManager.cpp index 2371b9bc79..f9e21b3eb6 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.cpp +++ b/torch_npu/csrc/core/npu/NPUEventManager.cpp @@ -76,27 +76,42 @@ aclError NPUEventManager::LazyDestroy(aclrtEvent npu_event) void NPUEventManager::ClearEvent() { + ASCEND_LOGE("NPUEventManager::ClearEvent begin"); if (thread_pool_ != nullptr) { + ASCEND_LOGE("Thread pool exists, waiting for all work to complete..."); thread_pool_->waitWorkComplete(); + ASCEND_LOGE("All work in thread pool completed."); + } else { + ASCEND_LOGE("Thread pool is nullptr, skipping waitWorkComplete."); } while (!npu_events_.empty()) { aclrtEvent event = npu_events_.front(); + ASCEND_LOGE("Processing event: %p", event); #ifndef BUILD_LIBTORCH const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { + ASCEND_LOGE("Found NPU trace trigger, recording event deletion for event: %p", event); trigger->traceNpuEventDeletion(reinterpret_cast(event)); + } else { + ASCEND_LOGE("No NPU trace trigger found for event: %p", event); } #endif + ASCEND_LOGE("Calling aclrtDestroyEvent for event: %p", event); auto err = aclrtDestroyEvent(event); + ASCEND_LOGE("aclrtDestroyEvent returned: %d", err); + if (err != ACL_ERROR_NONE) { + ASCEND_LOGE("Failed to destroy event: %p, error code: %d", event, err); CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); NPU_CHECK_WARN(err); } else { ASCEND_LOGI("Event: aclrtDestroyEvent is successfully executed, event=%p", event); } npu_events_.pop_front(); + ASCEND_LOGE("Event removed from queue, remaining events: %zu", npu_events_.size()); } + ASCEND_LOGE("All events cleared, npu_events_ is now empty."); } void NPUEventManager::IncreaseUnrecordedCount(aclrtEvent event) { diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index b89d1df45c..87b1ed1d42 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -137,33 +137,55 @@ std::vector GetUsedDevices() aclError ResetUsedDevices() { + ASCEND_LOGE("ResetUsedDevices begin"); std::lock_guard lock(mtx); + + ASCEND_LOGE("Number of used devices to reset: %zu", used_devices.size()); for (const auto it : used_devices) { aclError err = aclrtResetDevice(it.first); + ASCEND_LOGE("aclrtResetDevice returned: %d for device: %d", err, it.first); if (err != ACL_ERROR_NONE) { + ASCEND_LOGE("Failed to reset device: %d, error code: %d", it.first, err); return err; } } + ASCEND_LOGE("Clearing used_devices container"); used_devices.clear(); + + ASCEND_LOGE("ResetUsedDevices completed successfully"); return ACL_ERROR_NONE; } aclError DestroyUsedStreams() { + ASCEND_LOGE("DestroyUsedStreams begin"); int32_t cur_device = 0; NPU_CHECK_ERROR_WITHOUT_UCE(GetDevice(&cur_device)); + ASCEND_LOGE("Current device: %d", cur_device); std::lock_guard lock(mtx); + + ASCEND_LOGE("Number of used devices: %zu", used_devices.size()); for (const auto it : used_devices) { if (c10_npu::StreamInitFlag(it.first)) { NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); + ASCEND_LOGE("Successfully set device to: %d", it.first); + NPUStream stream = getCurrentNPUStream(it.first); + ASCEND_LOGE("Got current NPU stream for device: %d, stream: %p", it.first, stream.stream(false)); + aclError acl_ret = acl::AclrtDestroyStreamForce(stream.stream(false)); + ASCEND_LOGE("AclrtDestroyStreamForce returned: %d", acl_ret); + if (acl_ret != ACL_ERROR_NONE) { + ASCEND_LOGE("Failed to destroy stream for device: %d, error: %d", it.first, acl_ret); return acl_ret; } + } else { + ASCEND_LOGE("Stream not initialized for device: %d, skipping", it.first); } } NPU_CHECK_ERROR_WITHOUT_UCE(MaybeSetDevice(cur_device)); + ASCEND_LOGE("Successfully restored device to: %d", cur_device); return ACL_ERROR_NONE; } @@ -174,7 +196,9 @@ aclError SynchronizeUsedDevices() std::lock_guard lock(mtx); for (const auto it : used_devices) { NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); + ASCEND_LOGE("Calling AclrtSynchronizeDeviceWithTimeout for device: %d", it.first); aclError acl_ret = c10_npu::acl::AclrtSynchronizeDeviceWithTimeout(); + ASCEND_LOGE("AclrtSynchronizeDeviceWithTimeout returned: %d", acl_ret); if (acl_ret != ACL_ERROR_NONE) { CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(acl_ret); return acl_ret; diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index b2ddf38ff9..18070337fb 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -513,14 +513,23 @@ bool npuSynchronizeDevice(bool check_error) bool npuSynchronizeUsedDevices(bool check_error) { + ASCEND_LOGE("npuSynchronizeUsedDevices begin, check_error: %d", check_error); if (c10_npu::option::OptionsManager::GetTaskQueueEnable()) { + ASCEND_LOGE("Task queue is enabled, will empty all NPU streams."); NPUStatus ret = c10_npu::emptyAllNPUStream(check_error); if (ret != NPU_STATUS_SUCCESS) { ASCEND_LOGE("MakeSureQueueEmpty fail, ret: %s", ret.c_str()); + } else { + ASCEND_LOGE("MakeSureQueueEmpty completed successfully."); } + } else { + ASCEND_LOGE("Task queue is disabled, skipping empty all NPU streams."); } + ASCEND_LOGE("Calling SynchronizeUsedDevices..."); auto acl_ret = SynchronizeUsedDevices(); + ASCEND_LOGE("SynchronizeUsedDevices returned: %d", acl_ret); + if (check_error) { NPU_CHECK_ERROR(acl_ret); } else { diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 4b6707b849..008d7427fa 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -223,18 +223,28 @@ NpuSysCtrl::SysStatus NpuSysCtrl::OverflowSwitchEnable() NpuSysCtrl::SysStatus NpuSysCtrl::Finalize() { if (!init_flag_) { + ASCEND_LOGE("NPU already uninitialized, init_flag_ is false, returning FINALIZE_SUCC directly."); return FINALIZE_SUCC; } + ASCEND_LOGE("Registering release functions with PriorityLast..."); this->RegisterReleaseFn( [=]() -> void { + ASCEND_LOGE("Executing release function: Clearing NPU events..."); c10_npu::NPUEventManager::GetInstance().ClearEvent(); + + ASCEND_LOGE("Calling NPU_DestroyUsedStreams..."); NPU_CHECK_WARN(c10_npu::DestroyUsedStreams()); + + ASCEND_LOGE("Calling NPU_ResetUsedDevices..."); NPU_CHECK_WARN(c10_npu::ResetUsedDevices()); // Maintain a basic point of view, who applies for the resource, the resource is released by whom. // If aclInit is not a PTA call, then aclFinalize should not be a PTA call either. if (repeat_init_acl_flag_) { + ASCEND_LOGE("repeat_init_acl_flag_ is true, calling aclFinalize..."); NPU_CHECK_WARN(aclFinalize()); + } else { + ASCEND_LOGE("repeat_init_acl_flag_ is false, skipping aclFinalize..."); } }, ReleasePriority::PriorityLast); @@ -242,7 +252,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Finalize() init_flag_ = false; if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { + ASCEND_LOGE("ACL dump date is enabled, calling aclmdlFinalizeDump..."); NPU_CHECK_WARN(aclmdlFinalizeDump()); + } else { + ASCEND_LOGE("ACL dump date is disabled, skipping aclmdlFinalizeDump..."); } // call release fn by priotity diff --git a/torch_npu/utils/_error_code.py b/torch_npu/utils/_error_code.py index dbcc6aa2a9..a82d8b2176 100644 --- a/torch_npu/utils/_error_code.py +++ b/torch_npu/utils/_error_code.py @@ -125,7 +125,9 @@ class _NPUExceptionHandler(object): def handle_exception(self): # exception raised by other component, such as original PyTorch, third-party library, or application code. if self.exception: + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: exec function handle_exception", flush=True) if self.force_stop_flag: + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: force_stop_flag function handle_exception", flush=True) raise RuntimeError("FORCE STOP." + pta_error(ErrCode.ACL)) if self._is_exception(self.npu_exception): if self._is_exception(self.npu_timeout_exception) or get_env_compact_error_output(): @@ -133,6 +135,7 @@ class _NPUExceptionHandler(object): time.sleep(self.npu_timeout_exit_offset) else: print(_format_error_msg(_SubModuleID.UNKNOWN, ErrCode.EXCEPT).lstrip()) + print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Exiting function handle_exception", flush=True) _except_handler = _NPUExceptionHandler() -- Gitee From c2d7485f204b4f25f859b86d1a0cdc6bff4df444 Mon Sep 17 00:00:00 2001 From: c00845255 Date: Wed, 30 Jul 2025 15:09:45 +0800 Subject: [PATCH 2/2] add log 2 --- torch_npu/csrc/InitNpuBindings.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 570549ea86..36c95a005b 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -58,35 +58,35 @@ PyObject* THPModule_npu_shutdown(PyObject* self, PyObject* arg) // on the current device, while aclrtFree Free device memory immediately. // aclrtSynchronizeDevice should be called before aclrtFree to ensure that // all of op tasks completed before device memory free. - ASCEND_LOGI("NPU shutdown begin."); + ASCEND_LOGE("NPU shutdown begin."); if (!c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { Py_RETURN_NONE; } - ASCEND_LOGI("NPU shutdown ReleaseHcclCommList."); + ASCEND_LOGE("NPU shutdown ReleaseHcclCommList."); torch_npu::data_parallel::ReleaseHcclCommList(); - ASCEND_LOGI("NPU shutdown ReleaseHcclCommList success."); + ASCEND_LOGE("NPU shutdown ReleaseHcclCommList success."); at_npu::native::CachingHostAllocator_emptyCache(); try { - ASCEND_LOGI("NPU shutdown NPUCachingAllocator emptyCache."); + ASCEND_LOGE("NPU shutdown NPUCachingAllocator emptyCache."); c10_npu::NPUCachingAllocator::emptyCache(false); } catch (...) { ASCEND_LOGE("NPUCachingAllocator::emptyCache failed"); } try { - ASCEND_LOGI("NPU shutdown NPUSwappedMemoryAllocator emptyCache."); + ASCEND_LOGE("NPU shutdown NPUSwappedMemoryAllocator emptyCache."); c10_npu::NPUSwappedMemoryAllocator::emptyCache(); } catch (...) { ASCEND_LOGE("NPUSwappedMemoryAllocator::emptyCache failed"); } - ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize."); + ASCEND_LOGE("NPU shutdown NpuSysCtrl Finalize."); c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize(); if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) { ASCEND_LOGE("NPU shutdown failed."); } else { - ASCEND_LOGI("NPU shutdown success."); + ASCEND_LOGE("NPU shutdown success."); } Py_RETURN_NONE; -- Gitee