diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.cpp b/torch_npu/csrc/core/npu/interface/OpInterface.cpp
index e950ee9f931b3a3fe24c5cd9b0ea4abdaa30be63..a2f4c515614b041d50752d367e295b6ea9e33684 100644
--- a/torch_npu/csrc/core/npu/interface/OpInterface.cpp
+++ b/torch_npu/csrc/core/npu/interface/OpInterface.cpp
@@ -1,19 +1,19 @@
 #include "OpInterface.h"
 #include "torch_npu/csrc/core/npu/register/FunctionLoader.h"
+#include "torch_npu/csrc/core/npu/NPUException.h"
 
 namespace c10_npu {
 namespace opapi {
 #undef LOAD_FUNCTION
-#define LOAD_FUNCTION(funcName) \
-    REGISTER_FUNCTION(libopapi, funcName)
+#define LOAD_FUNCTION(funcName) REGISTER_FUNCTION(libopapi, funcName)
 #undef GET_FUNC
-#define GET_FUNC(funcName) \
-    GET_FUNCTION(libopapi, funcName)
+#define GET_FUNC(funcName) GET_FUNCTION(libopapi, funcName)
 
 REGISTER_LIBRARY(libopapi)
 LOAD_FUNCTION(aclnnSilentCheck)
 LOAD_FUNCTION(aclnnSilentCheckV2)
+LOAD_FUNCTION(aclnnReselectStaticKernel)
 
 bool IsExistAclnnSilentCheck()
 {
@@ -24,5 +24,20 @@ bool IsExistAclnnSilentCheck()
     return isExist;
 }
 
+aclnnStatus ReselectStaticKernel()
+{
+    typedef aclnnStatus (*AclnnApiFunc)();
+    static AclnnApiFunc aclnnReselectStaticKernelFunc = nullptr;
+    if (aclnnReselectStaticKernelFunc == nullptr) {
+        aclnnReselectStaticKernelFunc = (AclnnApiFunc)GET_FUNC(aclnnReselectStaticKernel);
+    }
+    TORCH_CHECK(aclnnReselectStaticKernelFunc,
+                "Failed to find function ",
+                "aclnnReselectStaticKernel",
+                PROF_ERROR(ErrCode::NOT_FOUND));
+    auto ret = aclnnReselectStaticKernelFunc();
+    return ret;
+}
+
 } // namespace opapi
 } // namespace c10_npu
diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.h b/torch_npu/csrc/core/npu/interface/OpInterface.h
index 663f9a6144ed52569d2c92780c42e70c9ddff38d..111489a36f66f2bd99316aa3f1764cb19ec087e7 100644
--- a/torch_npu/csrc/core/npu/interface/OpInterface.h
+++ b/torch_npu/csrc/core/npu/interface/OpInterface.h
@@ -1,11 +1,20 @@
 #pragma once
 
+#include <cstdint>
+
 namespace c10_npu {
 namespace opapi {
+typedef int32_t aclnnStatus;
+
 /**
  * This API is used to check whether aclnnSilentCheck exist.
-*/
+ */
 bool IsExistAclnnSilentCheck();
 
+/**
+ * This API is used to reselect the static kernel. It only needs to be called once per process.
+ */
+aclnnStatus ReselectStaticKernel();
+
 } // namespace opapi
 } // namespace c10_npu
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index 040e4754678597ae89ba1776919184cca6d058a6..4f4872bb281c5e4c13f59b82205470ce1efe6f38 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -51,6 +51,7 @@
 #include "torch_npu/csrc/aten/common/from_blob.h"
 #include "torch_npu/csrc/profiler/combined_traceback.h"
 #include "torch_npu/csrc/profiler/python/combined_traceback.h"
+#include "torch_npu/csrc/core/npu/interface/OpInterface.h"
 
 struct NPUDeviceProp {
     std::string name;
@@ -1594,6 +1595,18 @@ PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs)
     END_HANDLE_TH_ERRORS
 }
 
+PyObject* THNPModule_aclnn_reselect_static_kernel(PyObject* self, PyObject* noargs)
+{
+    HANDLE_TH_ERRORS
+    NPUStatus ret = c10_npu::emptyAllNPUStream();
+    if (ret != SUCCESS) {
+        ASCEND_LOGE("Failed to empty NPU task queue, ret: %s", ret.c_str());
+    }
+    c10_npu::opapi::ReselectStaticKernel();
+    Py_RETURN_NONE;
+    END_HANDLE_TH_ERRORS
+}
+
 PyObject* THNPModule_npu_set_fft_plan_cache_max_size(PyObject* self, PyObject* args)
 {
     HANDLE_TH_ERRORS
@@ -1800,6 +1813,7 @@ static struct PyMethodDef THNPModule_methods[] = {
     {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr},
     {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr},
     {"_npu_reset_thread_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr},
+    {"_aclnn_reselect_static_kernel", (PyCFunction)THNPModule_aclnn_reselect_static_kernel, METH_NOARGS, nullptr},
     {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr},
     {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr},
     {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr},
diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py
index 182859d8a5aefc290a702d41fdd36cc33631c72c..855b96643ca22c2dc403f5511a50e7a3ffa67daa 100644
--- a/torch_npu/npu/__init__.py
+++ b/torch_npu/npu/__init__.py
@@ -400,6 +400,10 @@ def _device_count_ascend_hal() -> int:
         return -1
     return len(visible_devices)
 
+def _aclnn_reselect_static_kernel():
+    torch_npu.npu._lazy_init()
+    torch_npu._C._aclnn_reselect_static_kernel()
+
 _cached_device_count: Optional[int] = None
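
Usage sketch: a minimal example of how the new private binding added in torch_npu/npu/__init__.py would be invoked, assuming a torch_npu build that includes this patch; the surrounding imports are the conventional setup, not part of the change.

    # Minimal sketch, assuming torch_npu is built with this patch applied.
    # _aclnn_reselect_static_kernel() runs lazy init, then enters the C++
    # binding, which drains the NPU task queue (emptyAllNPUStream) before
    # calling aclnnReselectStaticKernel from libopapi. Per the new header
    # comment, it only needs to be called once per process.
    import torch
    import torch_npu

    torch_npu.npu._aclnn_reselect_static_kernel()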