From b581b6c8ca0f1b72b1c8d0f967e03e2daed19a93 Mon Sep 17 00:00:00 2001 From: wbigat Date: Mon, 17 Apr 2023 11:47:14 +0800 Subject: [PATCH 1/2] Revert "!2949 [Fix] fix core dump bug after npu shut down." This reverts commit a243de6e20c4579345f83696d4960857c4ad7b83. --- third_party/acl/inc/acl/acl_rt.h | 16 ---------------- third_party/acl/libs/acl.cpp | 1 - .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 7 +------ 3 files changed, 1 insertion(+), 23 deletions(-) diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index dd4748a356..93892ee8df 100644 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -913,22 +913,6 @@ ACL_FUNC_VISIBILITY aclError aclrtCreateStreamWithConfig(aclrtStream *stream, ui */ ACL_FUNC_VISIBILITY aclError aclrtDestroyStream(aclrtStream stream); -/** - * @ingroup AscendCL - * @brief destroy stream instance by force - * - * @par Function - * Can only destroy streams created through the aclrtCreateStream interface - * - * @param stream [IN] the stream to destroy - * - * @retval ACL_SUCCESS The function is successfully executed. - * @retval OtherValues Failure - * - * @see aclrtCreateStream - */ -ACL_FUNC_VISIBILITY aclError aclrtDestroyStreamForce(aclrtStream stream); - /** * @ingroup AscendCL * @brief block the host until all tasks diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp index 2c270d0857..625a9e9218 100644 --- a/third_party/acl/libs/acl.cpp +++ b/third_party/acl/libs/acl.cpp @@ -37,7 +37,6 @@ aclError aclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { return 0 aclError aclrtSetOpWaitTimeout(uint32_t timeout) { return 0; } aclError aclrtCreateStreamWithConfig(aclrtStream *stream, uint32_t priority, uint32_t flag) { return 0; } aclError aclrtDestroyStream(aclrtStream stream){return 0;} -aclError aclrtDestroyStreamForce(aclrtStream stream){return 0;} aclError aclrtSynchronizeStream(aclrtStream stream){return 0;} // Event diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 10d043e38e..5242d60885 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -214,14 +214,9 @@ NpuSysCtrl::NpuSysCtrl() : init_flag_(false), device_id_(0) {} } this->RegisterReleaseFn([=]() ->void { - // ACL relies on aclrtDestroyStream to clean up some host resources. - // If aclrtDestroyStream is not called, a core dump will occur - // during the automatic deconstruction of ACL resources (singleton object) after npu_shut_down. - auto stream = c10_npu::getCurrentNPUStream(); - C10_NPU_CHECK(aclrtDestroyStreamForce(stream)); C10_NPU_CHECK(ge::GEFinalize()); - C10_NPU_CHECK(aclrtResetDevice(device_id_)); C10_NPU_CHECK(aclFinalize()); + C10_NPU_CHECK(aclrtResetDevice(device_id_)); }, ReleasePriority::PriorityLast); init_flag_ = false; -- Gitee From ac336f97d15e13c5d7a59e555390511920a243f7 Mon Sep 17 00:00:00 2001 From: wbigat Date: Mon, 17 Apr 2023 11:49:18 +0800 Subject: [PATCH 2/2] Revert "fixed c180c88 from https://gitee.com/wbigat/pytorch_master_wq/pulls/2905" This reverts commit 7773589d29929600d6fb18bd70d0c974b658c792. --- torch_npu/csrc/InitNpuBindings.cpp | 9 ++++++++- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 556fb38a55..513519c94a 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -64,7 +64,14 @@ PyObject * THPModule_npu_shutdown(PyObject * /* unused */) } catch (std::exception& e) { NPU_LOGE("npuSynchronizeDevice failed err=:%s", e.what()); } - + at_npu::native::GraphExecutor::GetInstance().Finalize(); + at_npu::native::TdtChannelForPrint::GetInstance().Finalize(); + THNPUCachingHostAllocator_emptyCache(); + try { + c10_npu::NPUCachingAllocator::emptyCache(); + } catch (std::exception& e) { + NPU_LOGE("NPUCachingAllocator::emptyCache failed err=:%s", e.what()); + } c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize(); if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) { fprintf(stdout, "THPModule_npu_shutdown failed.\n"); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 5242d60885..942133971c 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -214,9 +214,12 @@ NpuSysCtrl::NpuSysCtrl() : init_flag_(false), device_id_(0) {} } this->RegisterReleaseFn([=]() ->void { + c10_npu::NPUEventManager::GetInstance().ClearEvent(); + auto stream = c10_npu::getCurrentNPUStream(); + (void)aclrtDestroyStream(stream); C10_NPU_CHECK(ge::GEFinalize()); - C10_NPU_CHECK(aclFinalize()); C10_NPU_CHECK(aclrtResetDevice(device_id_)); + C10_NPU_CHECK(aclFinalize()); }, ReleasePriority::PriorityLast); init_flag_ = false; -- Gitee