From 9415c1dcf5688e870f74f38ac80c509acca9f08e Mon Sep 17 00:00:00 2001 From: wbigat Date: Mon, 17 Apr 2023 11:55:13 +0800 Subject: [PATCH 1/2] Revert "fixed 63d8ec5 from https://gitee.com/wbigat/pytorch_master_wq_2/pulls/2948" This reverts commit f2613f58e86daa5023c30eb97e3ea857544ef873. --- third_party/acl/inc/acl/acl_rt.h | 16 ---------------- third_party/acl/libs/acl.cpp | 1 - .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 7 +------ 3 files changed, 1 insertion(+), 23 deletions(-) diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index dd4748a356..93892ee8df 100644 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -913,22 +913,6 @@ ACL_FUNC_VISIBILITY aclError aclrtCreateStreamWithConfig(aclrtStream *stream, ui */ ACL_FUNC_VISIBILITY aclError aclrtDestroyStream(aclrtStream stream); -/** - * @ingroup AscendCL - * @brief destroy stream instance by force - * - * @par Function - * Can only destroy streams created through the aclrtCreateStream interface - * - * @param stream [IN] the stream to destroy - * - * @retval ACL_SUCCESS The function is successfully executed. - * @retval OtherValues Failure - * - * @see aclrtCreateStream - */ -ACL_FUNC_VISIBILITY aclError aclrtDestroyStreamForce(aclrtStream stream); - /** * @ingroup AscendCL * @brief block the host until all tasks diff --git a/third_party/acl/libs/acl.cpp b/third_party/acl/libs/acl.cpp index 2c270d0857..625a9e9218 100644 --- a/third_party/acl/libs/acl.cpp +++ b/third_party/acl/libs/acl.cpp @@ -37,7 +37,6 @@ aclError aclrtSetStreamFailureMode(aclrtStream stream, uint64_t mode) { return 0 aclError aclrtSetOpWaitTimeout(uint32_t timeout) { return 0; } aclError aclrtCreateStreamWithConfig(aclrtStream *stream, uint32_t priority, uint32_t flag) { return 0; } aclError aclrtDestroyStream(aclrtStream stream){return 0;} -aclError aclrtDestroyStreamForce(aclrtStream stream){return 0;} aclError aclrtSynchronizeStream(aclrtStream stream){return 0;} // Event diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 6fa3e5ec25..18576154b1 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -216,14 +216,9 @@ NpuSysCtrl::NpuSysCtrl() : init_flag_(false), device_id_(0) {} } this->RegisterReleaseFn([=]() ->void { - // ACL relies on aclrtDestroyStream to clean up some host resources. - // If aclrtDestroyStream is not called, a core dump will occur - // during the automatic deconstruction of ACL resources (singleton object) after npu_shut_down. - auto stream = c10_npu::getCurrentNPUStream(); - C10_NPU_CHECK(aclrtDestroyStreamForce(stream)); C10_NPU_CHECK(ge::GEFinalize()); - C10_NPU_CHECK(aclrtResetDevice(device_id_)); C10_NPU_CHECK(aclFinalize()); + C10_NPU_CHECK(aclrtResetDevice(device_id_)); }, ReleasePriority::PriorityLast); init_flag_ = false; -- Gitee From bc3b0f667089adb178b332002ed173291787442b Mon Sep 17 00:00:00 2001 From: wbigat Date: Mon, 17 Apr 2023 11:56:06 +0800 Subject: [PATCH 2/2] Revert "fixed c180c88 from https://gitee.com/wbigat/pytorch_master_wq_2/pulls/2905" This reverts commit 30d6c720d34fe63103c511c831da0d2b761e2df9. --- torch_npu/csrc/InitNpuBindings.cpp | 9 ++++++++- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index b908b4ccdf..89e3b6afad 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -64,7 +64,14 @@ PyObject * THPModule_npu_shutdown(PyObject * /* unused */) } catch (std::exception& e) { NPU_LOGE("npuSynchronizeDevice failed err=:%s", e.what()); } - + at_npu::native::GraphExecutor::GetInstance().Finalize(); + at_npu::native::TdtChannelForPrint::GetInstance().Finalize(); + THNPUCachingHostAllocator_emptyCache(); + try { + c10_npu::NPUCachingAllocator::emptyCache(); + } catch (std::exception& e) { + NPU_LOGE("NPUCachingAllocator::emptyCache failed err=:%s", e.what()); + } c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize(); if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) { fprintf(stdout, "THPModule_npu_shutdown failed.\n"); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 18576154b1..ae2f26d601 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -216,9 +216,12 @@ NpuSysCtrl::NpuSysCtrl() : init_flag_(false), device_id_(0) {} } this->RegisterReleaseFn([=]() ->void { + c10_npu::NPUEventManager::GetInstance().ClearEvent(); + auto stream = c10_npu::getCurrentNPUStream(); + (void)aclrtDestroyStream(stream); C10_NPU_CHECK(ge::GEFinalize()); - C10_NPU_CHECK(aclFinalize()); C10_NPU_CHECK(aclrtResetDevice(device_id_)); + C10_NPU_CHECK(aclFinalize()); }, ReleasePriority::PriorityLast); init_flag_ = false; -- Gitee