Allow TASK_QUEUE_ENABLE to be changed at runtime

* NPUGraph::capture_begin and Repository::ReadQueue call
  OptionsManager::GetTaskQueueEnable() on every invocation instead of caching
  the first result in a function-local static, so a later change is observed.
* OptionsManager gains SetTaskQueueEnable(uint32_t). The mode lives in a
  std::atomic behind a function-local static: the environment is parsed on
  first use (an exception escaping a namespace-scope initializer would call
  std::terminate), and the setter may run while other threads read the mode.
  The setter validates its argument the same way the environment variable is
  validated.
* The Python method table gains _set_task_queue_enable, which rejects
  negative integers before converting to uint32_t.

Note: leading whitespace and blank context lines were restored by convention,
and hunk headers were recomputed from the lines shown. The index lines are
kept from the original patch. Regenerate against the target tree before
applying.

diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp
index a00448bd1cf547cb2f641eb47ff940c902341610..503a6c591cc2d8c1b660ceb903b50d08add9635f 100644
--- a/torch_npu/csrc/core/npu/NPUGraph.cpp
+++ b/torch_npu/csrc/core/npu/NPUGraph.cpp
@@ -97,7 +97,8 @@ NPUGraph::NPUGraph()
 
 void NPUGraph::capture_begin(MempoolId_t pool, aclmdlRICaptureMode capture_mode)
 {
-    static const auto _task_queue_enable = c10_npu::option::OptionsManager::GetTaskQueueEnable();
+    // Not cached in a static: the mode can change through SetTaskQueueEnable.
+    const auto _task_queue_enable = c10_npu::option::OptionsManager::GetTaskQueueEnable();
     TORCH_CHECK(_task_queue_enable != 2,
                 "Do not support TASK_QUEUE_ENABLE = 2 during NPU graph capture, please "
                 "export TASK_QUEUE_ENABLE=1/0.",
diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp
index 579514ab37390f36aa208e7711c6fcec131a9f98..98b807a9a47141437ad64ce53e1aa1e1397190e0 100644
--- a/torch_npu/csrc/core/npu/NPUQueue.cpp
+++ b/torch_npu/csrc/core/npu/NPUQueue.cpp
@@ -388,7 +388,8 @@ void Repository::CheckDeviceError(int ret, std::string& err_msg)
 bool Repository::ReadQueue()
 {
     if (IsEmptyQueue()) {
-        static const auto task_queue_enable = c10_npu::option::OptionsManager::GetTaskQueueEnable();
+        // Not cached in a static: the mode can change through SetTaskQueueEnable.
+        const auto task_queue_enable = c10_npu::option::OptionsManager::GetTaskQueueEnable();
         if (task_queue_enable == 2) {
             // read queue polls for at most 1 ms when queue is empty.
             for (int i = 0; i < READ_QUEUE_POLL_MAX_LOOP; ++i) {
diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp
index 1eedf54559d1171ab270b1288523fdf0295ffa63..ba79866cc063c7efac4ede2e3c5d3798cadd2ce6 100644
--- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp
+++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp
@@ -12,6 +12,8 @@
+#include <atomic>
 #include "torch_npu/csrc/core/npu/register/OptionRegister.h"
 #include "torch_npu/csrc/core/npu/register/OptionsManager.h"
 #include "torch_npu/csrc/core/npu/NPUFunctions.h"
+#include "torch_npu/csrc/core/npu/NPUStream.h"
 #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"
 #include "torch_npu/csrc/npu/memory_snapshot.h"
 
@@ -512,23 +514,56 @@ char* OptionsManager::GetCpuAffinityConf()
     return std::getenv("CPU_AFFINITY_CONF");
 }
 
+// Reads TASK_QUEUE_ENABLE from the environment (unset means 1) and rejects
+// any value that getTaskQueueEnableMode() does not list.
+static uint32_t ParseTaskQueueEnableEnv()
+{
+    char* env_val = std::getenv("TASK_QUEUE_ENABLE");
+    int64_t task_queue_enable = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 1;
+    const auto taskQueueEnableMode = getTaskQueueEnableMode();
+    if (taskQueueEnableMode.find(task_queue_enable) == taskQueueEnableMode.end()) {
+        TORCH_CHECK(false, "TASK_QUEUE_ENABLE should be 0, 1 or 2", PTA_ERROR(ErrCode::VALUE));
+    }
+    return static_cast<uint32_t>(task_queue_enable);
+}
+
+// Current task-queue mode. It is a function-local static so the environment
+// is parsed on first use, where a TORCH_CHECK failure can still be caught
+// (an exception escaping a namespace-scope initializer calls std::terminate).
+// It is atomic because the setter may run while other threads call the getter.
+static std::atomic<uint32_t>& TaskQueueEnableState()
+{
+    static std::atomic<uint32_t> state{ParseTaskQueueEnableEnv()};
+    return state;
+}
+
 uint32_t OptionsManager::GetTaskQueueEnable()
 {
     if (CheckBlockingEnable()) {
         return 0;
     }
-    const static uint32_t task_queue_enable = []() -> uint32_t {
-        char* env_val = std::getenv("TASK_QUEUE_ENABLE");
-        int64_t task_queue_enable = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 1;
-        std::unordered_map<int32_t, std::string> taskQueueEnableMode = getTaskQueueEnableMode();
-        if (taskQueueEnableMode.find(task_queue_enable) == taskQueueEnableMode.end()) {
-            TORCH_CHECK(false, "TASK_QUEUE_ENABLE should be 0, 1 or 2", PTA_ERROR(ErrCode::VALUE));
-        }
-        return static_cast<uint32_t>(task_queue_enable);
-    }();
-    return task_queue_enable;
+    return TaskQueueEnableState().load();
 }
 
+// Changes the task-queue mode at runtime. The value is validated against
+// getTaskQueueEnableMode(), like the TASK_QUEUE_ENABLE environment variable.
+// On an actual change, emptyAllNPUStream() is called before the new mode is
+// stored; a failure there is logged and the mode is still switched.
+void OptionsManager::SetTaskQueueEnable(uint32_t value)
+{
+    const auto taskQueueEnableMode = getTaskQueueEnableMode();
+    TORCH_CHECK(taskQueueEnableMode.find(value) != taskQueueEnableMode.end(),
+                "TASK_QUEUE_ENABLE should be 0, 1 or 2", PTA_ERROR(ErrCode::VALUE));
+    auto& state = TaskQueueEnableState();
+    if (state.load() != value) {
+        NPUStatus ret = c10_npu::emptyAllNPUStream();
+        if (ret != SUCCESS) {
+            ASCEND_LOGE("Failed to empty NPU task queue, ret: %s", ret.c_str());
+        }
+        state.store(value);
+    }
+}
+
 bool OptionsManager::CheckForceUncached()
 {
     const static bool force_uncached = []() -> bool {
diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h
index 73f5dbcb81f9fc268d8ef9122407e66b976dad08..f694f46ca4b651df5d0d5196624d0a075153f865 100644
--- a/torch_npu/csrc/core/npu/register/OptionsManager.h
+++ b/torch_npu/csrc/core/npu/register/OptionsManager.h
@@ -126,6 +126,7 @@ public:
     static uint32_t GetHcclBufferSize();
     static uint32_t GetP2PBufferSize();
     static uint32_t GetTaskQueueEnable();
+    static void SetTaskQueueEnable(uint32_t value);
     static uint32_t GetAclOpInitMode();
     static uint32_t GetStreamsPerDevice();
     static char* GetCpuAffinityConf();
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index 24ed076429f20328e132af42c87346042cecf87a..6bbe1e6537738911b1e59a63aae69ae2e1cbda4b 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -29,6 +29,7 @@
 #include "torch_npu/csrc/core/npu/NPUAffinityController.h"
 #include "torch_npu/csrc/core/npu/NPUGuard.h"
 #include "torch_npu/csrc/core/npu/NpuVariables.h"
+#include "torch_npu/csrc/core/npu/register/OptionsManager.h"
 #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h"
 #include "torch_npu/csrc/core/npu/register/OptionRegister.h"
 #include "torch_npu/csrc/core/OverflowUtils.h"
@@ -1661,5 +1662,21 @@ static PyObject* THNPModule_is_gte_cann_version(PyObject* self, PyObject *args)
     END_HANDLE_TH_ERRORS
 }
 
+// Python binding for OptionsManager::SetTaskQueueEnable: takes one int.
+// Negative values are rejected here because the setter takes uint32_t and the
+// cast would otherwise turn them into large positive numbers.
+static PyObject* THNPModule_set_task_queue_enable(PyObject* self, PyObject* args)
+{
+    HANDLE_TH_ERRORS
+    int value = 0;
+    if (!PyArg_ParseTuple(args, "i", &value)) {
+        throw torch::TypeError("Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE));
+    }
+    TORCH_CHECK(value >= 0, "TASK_QUEUE_ENABLE should be 0, 1 or 2", PTA_ERROR(ErrCode::VALUE));
+    c10_npu::option::OptionsManager::SetTaskQueueEnable(static_cast<uint32_t>(value));
+    Py_RETURN_NONE;
+    END_HANDLE_TH_ERRORS
+}
+
 static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr},
@@ -1723,6 +1740,7 @@ static struct PyMethodDef THNPModule_methods[] = {
     {"_npu_clear_fft_plan_cache", (PyCFunction)THNPModule_npu_clear_fft_plan_cache, METH_NOARGS, nullptr},
     {"_get_cann_version", (PyCFunction)THNPModule_get_cann_version, METH_O, nullptr},
     {"_is_gte_cann_version", (PyCFunction)THNPModule_is_gte_cann_version, METH_VARARGS, nullptr},
+    {"_set_task_queue_enable", (PyCFunction)THNPModule_set_task_queue_enable, METH_VARARGS, nullptr},
     {nullptr}};
 
 TORCH_NPU_API PyMethodDef* THNPModule_get_methods()