diff --git a/test/npu/test_npu.py b/test/npu/test_npu.py index 67834e9cf64a76e7cf35d11151ad0d17e10cd970..f53664a5fb554d1dc9bb96cac2a91324ab8d3676 100644 --- a/test/npu/test_npu.py +++ b/test/npu/test_npu.py @@ -449,6 +449,7 @@ class TestNpu(TestCase): stream.record_event(start_event) stream.record_event(event) event.synchronize() + #just test self.assertTrue(event.query()) self.assertGreater(start_event.elapsed_time(event), 0) diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index ef7da6d204faf1b4c8495330fcb8878562e18721..bf552e138a4e14e13b5869398107d5bb0f7dd1d1 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -450,8 +450,12 @@ int MemcopyAsyncFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) { auto cur_paras = static_cast(in->paramVal); logger->debug("MemcopyAsyncFunc Run."); - aclError ret = - aclrtMemcpyAsync(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + aclError ret; + if (c10_npu::acl::AclrtMemcpyAsyncWithConditionExist() && cur_paras->kind == aclrtMemcpyKind::ACL_MEMCPY_DEVICE_TO_HOST) { + ret = aclrtMemcpyAsyncWithCondition(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + } else { + ret = aclrtMemcpyAsync(cur_paras->dst, cur_paras->dstLen, cur_paras->src, cur_paras->srcLen, cur_paras->kind, stream); + } if (ret != ACL_ERROR_NONE) { auto ret_temp = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); if (ret_temp != ACL_ERROR_NONE) {