From f467abc2538645bc30b578a4b3f77a33dd61a161 Mon Sep 17 00:00:00 2001
From: c00845255 <chenkun82@huawei.com>
Date: Wed, 30 Jul 2025 14:53:26 +0800
Subject: [PATCH 1/2] add logs

---
 torch_npu/__init__.py                         |  7 ++++++
 torch_npu/csrc/InitNpuBindings.cpp            | 11 ++++++++-
 torch_npu/csrc/core/npu/NPUEventManager.cpp   | 15 ++++++++++++
 torch_npu/csrc/core/npu/NPUFunctions.cpp      | 24 +++++++++++++++++++
 torch_npu/csrc/core/npu/NPUStream.cpp         |  9 +++++++
 .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp   | 13 ++++++++++
 torch_npu/utils/_error_code.py                |  3 +++
 7 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index f571d55240..9e09a5118a 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -7,6 +7,7 @@ import atexit
 import traceback
 import ctypes
 import warnings
+import time
 
 from functools import wraps
 
@@ -272,11 +273,17 @@ del DefaultDeviceType
 
 # NPU exit, need to synchronize devices
 def _npu_shutdown():
+    print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function _npu_shutdown_synchronize", flush=True)
     success = torch_npu._C._npu_shutdown_synchronize()
+    print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function _destructor_process_group", flush=True)
     torch_npu.distributed.distributed_c10d._destructor_process_group()
+    print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function _npu_shutdown", flush=True)
     torch_npu._C._npu_shutdown(success)
+    print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function handle_exception", flush=True)
     _except_handler.handle_exception()
+    print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Entering function matmul_check._cleanup", flush=True)
     torch_npu.asd.asd.matmul_check._cleanup()
+    print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Exiting function matmul_check._cleanup", flush=True)
 
 
 # register npu shutdown hook on exit
diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp
index 672b5289f5..570549ea86 100644
--- a/torch_npu/csrc/InitNpuBindings.cpp
+++ b/torch_npu/csrc/InitNpuBindings.cpp
@@ -94,12 +94,15 @@ PyObject* THPModule_npu_shutdown(PyObject* self, PyObject* arg)
 
 PyObject* THPModule_npu_shutdown_synchronize(PyObject* /* unused */)
 {
-    ASCEND_LOGI("NPU shutdown synchronize begin.");
+    ASCEND_LOGE("THPModule_npu_shutdown_synchronize begin.");
     if (!c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) {
+        ASCEND_LOGE("NPU not initialized, GetInitFlag() returned false.");
         Py_RETURN_FALSE;
     }
 
+    ASCEND_LOGE("Stopping StressDetector worker thread...");
     StressDetector::stop_worker_thread();
+    ASCEND_LOGE("StressDetector worker thread stopped.");
 
     // Return aclrtSynchronizeDevice result. If sync device fails, release host
     // resources forcibly, only record WARN logs when acl interface of stream
@@ -108,12 +111,18 @@ PyObject* THPModule_npu_shutdown_synchronize(PyObject* /* unused */)
     try {
         ASCEND_LOGI("NPU shutdown synchronize device.");
         success = c10_npu::npuSynchronizeUsedDevices(false);
+        if (success) {
+            ASCEND_LOGE("NPU devices synchronized successfully.");
+        } else {
+            ASCEND_LOGE("npuSynchronizeUsedDevices returned false, synchronization may have failed.");
+        }
     } catch (std::exception& e) {
         ASCEND_LOGE("npuSynchronizeDevice failed err=:%s", e.what());
         success = false;
     }
 
     if (success) {
+        ASCEND_LOGE("NPU shutdown synchronize device completed successfully.");
         Py_RETURN_TRUE;
     } else {
         ASCEND_LOGE("NPU shutdown synchronize device failed.");
diff --git a/torch_npu/csrc/core/npu/NPUEventManager.cpp b/torch_npu/csrc/core/npu/NPUEventManager.cpp
index 2371b9bc79..f9e21b3eb6 100644
--- a/torch_npu/csrc/core/npu/NPUEventManager.cpp
+++ b/torch_npu/csrc/core/npu/NPUEventManager.cpp
@@ -76,27 +76,42 @@ aclError NPUEventManager::LazyDestroy(aclrtEvent npu_event)
 
 void NPUEventManager::ClearEvent()
 {
+    ASCEND_LOGE("NPUEventManager::ClearEvent begin");
     if (thread_pool_ != nullptr) {
+        ASCEND_LOGE("Thread pool exists, waiting for all work to complete...");
         thread_pool_->waitWorkComplete();
+        ASCEND_LOGE("All work in thread pool completed.");
+    } else {
+        ASCEND_LOGE("Thread pool is nullptr, skipping waitWorkComplete.");
     }
 
     while (!npu_events_.empty()) {
         aclrtEvent event = npu_events_.front();
+        ASCEND_LOGE("Processing event: %p", event);
 #ifndef BUILD_LIBTORCH
         const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace();
         if (C10_UNLIKELY(trigger)) {
+            ASCEND_LOGE("Found NPU trace trigger, recording event deletion for event: %p", event);
             trigger->traceNpuEventDeletion(reinterpret_cast<uintptr_t>(event));
+        } else {
+            ASCEND_LOGE("No NPU trace trigger found for event: %p", event);
         }
 #endif
+        ASCEND_LOGE("Calling aclrtDestroyEvent for event: %p", event);
         auto err = aclrtDestroyEvent(event);
+        ASCEND_LOGE("aclrtDestroyEvent returned: %d", err);
+
         if (err != ACL_ERROR_NONE) {
+            ASCEND_LOGE("Failed to destroy event: %p, error code: %d", event, err);
             CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err);
             NPU_CHECK_WARN(err);
         } else {
             ASCEND_LOGI("Event: aclrtDestroyEvent is successfully executed, event=%p", event);
         }
         npu_events_.pop_front();
+        ASCEND_LOGE("Event removed from queue, remaining events: %zu", npu_events_.size());
     }
+    ASCEND_LOGE("All events cleared, npu_events_ is now empty.");
 }
 void NPUEventManager::IncreaseUnrecordedCount(aclrtEvent event)
 {
diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp
index b89d1df45c..87b1ed1d42 100644
--- a/torch_npu/csrc/core/npu/NPUFunctions.cpp
+++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp
@@ -137,33 +137,55 @@ std::vector<int8_t> GetUsedDevices()
 
 aclError ResetUsedDevices()
 {
+    ASCEND_LOGE("ResetUsedDevices begin");
     std::lock_guard<std::recursive_mutex> lock(mtx);
+
+    ASCEND_LOGE("Number of used devices to reset: %zu", used_devices.size());
     for (const auto it : used_devices) {
         aclError err = aclrtResetDevice(it.first);
+        ASCEND_LOGE("aclrtResetDevice returned: %d for device: %d", err, it.first);
         if (err != ACL_ERROR_NONE) {
+            ASCEND_LOGE("Failed to reset device: %d, error code: %d", it.first, err);
             return err;
         }
     }
+    ASCEND_LOGE("Clearing used_devices container");
     used_devices.clear();
+
+    ASCEND_LOGE("ResetUsedDevices completed successfully");
     return ACL_ERROR_NONE;
 }
 
 aclError DestroyUsedStreams()
 {
+    ASCEND_LOGE("DestroyUsedStreams begin");
     int32_t cur_device = 0;
     NPU_CHECK_ERROR_WITHOUT_UCE(GetDevice(&cur_device));
+    ASCEND_LOGE("Current device: %d", cur_device);
     std::lock_guard<std::recursive_mutex> lock(mtx);
+
+    ASCEND_LOGE("Number of used devices: %zu", used_devices.size());
     for (const auto it : used_devices) {
         if (c10_npu::StreamInitFlag(it.first)) {
             NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first));
+            ASCEND_LOGE("Successfully set device to: %d", it.first);
+
             NPUStream stream = getCurrentNPUStream(it.first);
+            ASCEND_LOGE("Got current NPU stream for device: %d, stream: %p", it.first, stream.stream(false));
+
             aclError acl_ret = acl::AclrtDestroyStreamForce(stream.stream(false));
+            ASCEND_LOGE("AclrtDestroyStreamForce returned: %d", acl_ret);
+
             if (acl_ret != ACL_ERROR_NONE) {
+                ASCEND_LOGE("Failed to destroy stream for device: %d, error: %d", it.first, acl_ret);
                 return acl_ret;
             }
+        } else {
+            ASCEND_LOGE("Stream not initialized for device: %d, skipping", it.first);
         }
     }
     NPU_CHECK_ERROR_WITHOUT_UCE(MaybeSetDevice(cur_device));
+    ASCEND_LOGE("Successfully restored device to: %d", cur_device);
     return ACL_ERROR_NONE;
 }
 
@@ -174,7 +196,9 @@ aclError SynchronizeUsedDevices()
     std::lock_guard<std::recursive_mutex> lock(mtx);
     for (const auto it : used_devices) {
         NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first));
+        ASCEND_LOGE("Calling AclrtSynchronizeDeviceWithTimeout for device: %d", it.first);
         aclError acl_ret = c10_npu::acl::AclrtSynchronizeDeviceWithTimeout();
+        ASCEND_LOGE("AclrtSynchronizeDeviceWithTimeout returned: %d", acl_ret);
         if (acl_ret != ACL_ERROR_NONE) {
             CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(acl_ret);
             return acl_ret;
diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp
index b2ddf38ff9..18070337fb 100644
--- a/torch_npu/csrc/core/npu/NPUStream.cpp
+++ b/torch_npu/csrc/core/npu/NPUStream.cpp
@@ -513,14 +513,23 @@ bool npuSynchronizeDevice(bool check_error)
 
 bool npuSynchronizeUsedDevices(bool check_error)
 {
+    ASCEND_LOGE("npuSynchronizeUsedDevices begin, check_error: %d", check_error);
     if (c10_npu::option::OptionsManager::GetTaskQueueEnable()) {
+        ASCEND_LOGE("Task queue is enabled, will empty all NPU streams.");
         NPUStatus ret = c10_npu::emptyAllNPUStream(check_error);
         if (ret != NPU_STATUS_SUCCESS) {
             ASCEND_LOGE("MakeSureQueueEmpty fail, ret: %s", ret.c_str());
+        } else {
+            ASCEND_LOGE("MakeSureQueueEmpty completed successfully.");
         }
+    } else {
+        ASCEND_LOGE("Task queue is disabled, skipping empty all NPU streams.");
     }
 
+    ASCEND_LOGE("Calling SynchronizeUsedDevices...");
     auto acl_ret = SynchronizeUsedDevices();
+    ASCEND_LOGE("SynchronizeUsedDevices returned: %d", acl_ret);
+    
     if (check_error) {
         NPU_CHECK_ERROR(acl_ret);
     } else {
diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp
index 4b6707b849..008d7427fa 100644
--- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp
+++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp
@@ -223,18 +223,28 @@ NpuSysCtrl::SysStatus NpuSysCtrl::OverflowSwitchEnable()
 NpuSysCtrl::SysStatus NpuSysCtrl::Finalize()
 {
     if (!init_flag_) {
+        ASCEND_LOGE("NPU already uninitialized, init_flag_ is false, returning FINALIZE_SUCC directly.");
         return FINALIZE_SUCC;
     }
 
+    ASCEND_LOGE("Registering release functions with PriorityLast...");
     this->RegisterReleaseFn(
         [=]() -> void {
+            ASCEND_LOGE("Executing release function: Clearing NPU events...");
             c10_npu::NPUEventManager::GetInstance().ClearEvent();
+
+            ASCEND_LOGE("Calling NPU_DestroyUsedStreams...");
             NPU_CHECK_WARN(c10_npu::DestroyUsedStreams());
+
+            ASCEND_LOGE("Calling NPU_ResetUsedDevices...");
             NPU_CHECK_WARN(c10_npu::ResetUsedDevices());
             // Maintain a basic point of view, who applies for the resource, the resource is released by whom.
             // If aclInit is not a PTA call, then aclFinalize should not be a PTA call either.
             if (repeat_init_acl_flag_) {
+                ASCEND_LOGE("repeat_init_acl_flag_ is true, calling aclFinalize...");
                 NPU_CHECK_WARN(aclFinalize());
+            } else {
+                ASCEND_LOGE("repeat_init_acl_flag_ is false, skipping aclFinalize...");
             }
         },
         ReleasePriority::PriorityLast);
@@ -242,7 +252,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Finalize()
     init_flag_ = false;
 
     if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) {
+        ASCEND_LOGE("ACL dump date is enabled, calling aclmdlFinalizeDump...");
         NPU_CHECK_WARN(aclmdlFinalizeDump());
+    } else {
+        ASCEND_LOGE("ACL dump date is disabled, skipping aclmdlFinalizeDump...");
     }
 
     // call release fn by priotity
diff --git a/torch_npu/utils/_error_code.py b/torch_npu/utils/_error_code.py
index dbcc6aa2a9..a82d8b2176 100644
--- a/torch_npu/utils/_error_code.py
+++ b/torch_npu/utils/_error_code.py
@@ -125,7 +125,9 @@ class _NPUExceptionHandler(object):
     def handle_exception(self):
         # exception raised by other component, such as original PyTorch, third-party library, or application code.
         if self.exception:
+            print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: exec function handle_exception", flush=True)
             if self.force_stop_flag:
+                print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: force_stop_flag function handle_exception", flush=True)
                 raise RuntimeError("FORCE STOP." + pta_error(ErrCode.ACL))
             if self._is_exception(self.npu_exception):
                 if self._is_exception(self.npu_timeout_exception) or get_env_compact_error_output():
@@ -133,6 +135,7 @@ class _NPUExceptionHandler(object):
                     time.sleep(self.npu_timeout_exit_offset)
             else:
                 print(_format_error_msg(_SubModuleID.UNKNOWN, ErrCode.EXCEPT).lstrip())
+        print(f"[PTA][{time.strftime('%Y-%m-%d %H:%M:%S')}]: Exiting function handle_exception", flush=True)
 
 
 _except_handler = _NPUExceptionHandler()
-- 
Gitee


From c2d7485f204b4f25f859b86d1a0cdc6bff4df444 Mon Sep 17 00:00:00 2001
From: c00845255 <chenkun82@huawei.com>
Date: Wed, 30 Jul 2025 15:09:45 +0800
Subject: [PATCH 2/2] add log 2

---
 torch_npu/csrc/InitNpuBindings.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp
index 570549ea86..36c95a005b 100644
--- a/torch_npu/csrc/InitNpuBindings.cpp
+++ b/torch_npu/csrc/InitNpuBindings.cpp
@@ -58,35 +58,35 @@ PyObject* THPModule_npu_shutdown(PyObject* self, PyObject* arg)
     // on the current device, while aclrtFree Free device memory immediately.
     // aclrtSynchronizeDevice should be called before aclrtFree to ensure that
     // all of op tasks completed before device memory free.
-    ASCEND_LOGI("NPU shutdown begin.");
+    ASCEND_LOGE("NPU shutdown begin.");
     if (!c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) {
         Py_RETURN_NONE;
     }
 
-    ASCEND_LOGI("NPU shutdown ReleaseHcclCommList.");
+    ASCEND_LOGE("NPU shutdown ReleaseHcclCommList.");
     torch_npu::data_parallel::ReleaseHcclCommList();
-    ASCEND_LOGI("NPU shutdown ReleaseHcclCommList success.");
+    ASCEND_LOGE("NPU shutdown ReleaseHcclCommList success.");
 
     at_npu::native::CachingHostAllocator_emptyCache();
     try {
-        ASCEND_LOGI("NPU shutdown NPUCachingAllocator emptyCache.");
+        ASCEND_LOGE("NPU shutdown NPUCachingAllocator emptyCache.");
         c10_npu::NPUCachingAllocator::emptyCache(false);
     } catch (...) {
         ASCEND_LOGE("NPUCachingAllocator::emptyCache failed");
     }
     try {
-        ASCEND_LOGI("NPU shutdown NPUSwappedMemoryAllocator emptyCache.");
+        ASCEND_LOGE("NPU shutdown NPUSwappedMemoryAllocator emptyCache.");
         c10_npu::NPUSwappedMemoryAllocator::emptyCache();
     } catch (...) {
         ASCEND_LOGE("NPUSwappedMemoryAllocator::emptyCache failed");
     }
 
-    ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize.");
+    ASCEND_LOGE("NPU shutdown NpuSysCtrl Finalize.");
     c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize();
     if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) {
         ASCEND_LOGE("NPU shutdown failed.");
     } else {
-        ASCEND_LOGI("NPU shutdown success.");
+        ASCEND_LOGE("NPU shutdown success.");
     }
 
     Py_RETURN_NONE;
-- 
Gitee