From 34af30cce570ab20bb7403cba80a63c45e6b2f64 Mon Sep 17 00:00:00 2001
From: qiuleilei <qiuleilei1@huawei.com>
Date: Mon, 4 Aug 2025 20:51:19 +0800
Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=E4=B8=89=E6=96=B9=E4=BE=9D?=
 =?UTF-8?q?=E8=B5=96onednn?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cmake/external_libs/mkl_dnn.cmake             |  44 ---
 cmake/package_lite.cmake                      |   4 -
 mindspore-lite/CMakeLists.txt                 |   1 -
 mindspore-lite/cmake/ccsrc_module.cmake       |   1 -
 mindspore-lite/cmake/lite_dependences.cmake   |  10 -
 .../providers/siteai/CMakeLists.txt           |   1 -
 mindspore-lite/src/extendrt/CMakeLists.txt    |  15 -
 .../acl/cxx_api_lite/cxx_api/CMakeLists.txt   |   4 +-
 .../onednn/0001-fix-user-threadpool-bug.patch |  20 --
 .../patch/onednn/0002-fix-pool-nthr-bug.patch | 334 ------------------
 ...3-fix-zero-threads-identified-on-AMD.patch |  13 -
 .../patch/onednn/0004-fix-dnnl-limits.patch   |  10 -
 12 files changed, 2 insertions(+), 455 deletions(-)
 delete mode 100644 cmake/external_libs/mkl_dnn.cmake
 delete mode 100644 third_party/patch/onednn/0001-fix-user-threadpool-bug.patch
 delete mode 100644 third_party/patch/onednn/0002-fix-pool-nthr-bug.patch
 delete mode 100644 third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch
 delete mode 100644 third_party/patch/onednn/0004-fix-dnnl-limits.patch

diff --git a/cmake/external_libs/mkl_dnn.cmake b/cmake/external_libs/mkl_dnn.cmake
deleted file mode 100644
index 07256948..00000000
--- a/cmake/external_libs/mkl_dnn.cmake
+++ /dev/null
@@ -1,44 +0,0 @@
-set(onednn_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2")
-set(onednn_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
-set(onednn_LDFLAGS "-s")
-
-if(NOT MINDSPORE_PROJECT_DIR)
-set(MINDSPORE_PROJECT_DIR ${CMAKE_SOURCE_DIR})
-endif()
-
-if(USE_MS_THREADPOOL_FOR_DNNL)
-    set(USE_MS_THREADPOOL "-DDNNL_CPU_RUNTIME=THREADPOOL")
-else()
-    set(USE_MS_THREADPOOL "")
-endif()
-if(ENABLE_GITEE_EULER)
-        set(GIT_REPOSITORY "git@gitee.com:src-openeuler/onednn.git")
-        set(GIT_TAG "0d726f1")
-        set(SHA256 "4d655c0751ee6439584ef5e3d465953fe0c2f4ee2700bc02699bdc1d1572af0d")
-    __download_pkg_with_git(ONEDNN ${GIT_REPOSITORY} ${GIT_TAG} ${SHA256})
-    set(ONE_DNN_SRC "${CMAKE_BINARY_DIR}/_deps/onednn-src")
-    execute_process(COMMAND tar -xf ${ONE_DNN_SRC}/v2.2.tar.gz --strip-components 1 -C ${ONE_DNN_SRC})
-endif()
-
-if(ENABLE_GITEE)
-    set(REQ_URL "https://gitee.com/mirrors/MKL-DNN/repository/archive/v2.2.tar.gz")
-    set(SHA256 "2e809b11727af9d10784a5481b445a14387297161b5cc7f9c969c57fe40752bc")
-else()
-    set(REQ_URL "https://github.com/oneapi-src/oneDNN/archive/v2.2.tar.gz")
-    set(SHA256 "4d655c0751ee6439584ef5e3d465953fe0c2f4ee2700bc02699bdc1d1572af0d")
-endif()
-mindspore_add_pkg(onednn
-    VER 2.2
-    LIBS dnnl mkldnn
-    URL ${REQ_URL}
-    SHA256 ${SHA256}
-    PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/onednn/0001-fix-user-threadpool-bug.patch
-    PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/onednn/0002-fix-pool-nthr-bug.patch
-    PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch
-    PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/onednn/0004-fix-dnnl-limits.patch
-    CMAKE_OPTION -DDNNL_ARCH_OPT_FLAGS='' -DDNNL_BUILD_EXAMPLES=OFF -DDNNL_BUILD_TESTS=OFF
-        ${USE_MS_THREADPOOL} -DDNNL_ENABLE_CONCURRENT_EXEC=ON)
-
-include_directories(${onednn_INC})
-add_library(mindspore::dnnl ALIAS onednn::dnnl)
-add_library(mindspore::mkldnn ALIAS onednn::mkldnn)
diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake
index 8e33fc3d..06cca6c5 100644
--- a/cmake/package_lite.cmake
+++ b/cmake/package_lite.cmake
@@ -867,10 +867,6 @@ else()
         endif()
         install(FILES ${glog_LIBPATH}/${glog_name} DESTINATION ${RUNTIME_LIB_DIR}
                 RENAME libmindspore_glog.so.0 COMPONENT ${RUNTIME_COMPONENT_NAME})
-        if(MSLITE_DEPS_MKLDNN)
-                install(FILES ${onednn_LIBPATH}/libdnnl.so.2.2 DESTINATION ${DNNL_DIR}
-                        RENAME libdnnl.so.2 COMPONENT ${RUNTIME_COMPONENT_NAME})
-        endif()
         install(TARGETS mindspore_core mindspore_ops DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
         install(FILES ${BUILD_DIR}/src/extendrt/convert/libruntime_convert_plugin.so
                 DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
diff --git a/mindspore-lite/CMakeLists.txt b/mindspore-lite/CMakeLists.txt
index c497ee12..ea780917 100644
--- a/mindspore-lite/CMakeLists.txt
+++ b/mindspore-lite/CMakeLists.txt
@@ -947,7 +947,6 @@ endif()
 
 if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE OR MSLITE_ENABLE_CLOUD_INFERENCE)
     if(NOT MSLITE_TARGET_SITEAI)
-        set(MSLITE_DEPS_MKLDNN on)
         set(MSLITE_DEPS_LIBEVENT on)
         set(MSLITE_DEPS_PYBIND11 on)
     endif()
diff --git a/mindspore-lite/cmake/ccsrc_module.cmake b/mindspore-lite/cmake/ccsrc_module.cmake
index 6283cf57..23950e5c 100644
--- a/mindspore-lite/cmake/ccsrc_module.cmake
+++ b/mindspore-lite/cmake/ccsrc_module.cmake
@@ -18,7 +18,6 @@ list(APPEND MINDSPORE_PROTO_LIST ${COMM_PROTO_SRCS})
 
 include(${TOP_DIR}/cmake/external_libs/robin.cmake)
 include(${TOP_DIR}/cmake/external_libs/eigen.cmake)
-include(${TOP_DIR}/cmake/external_libs/mkl_dnn.cmake)
 
 find_package(Python3 COMPONENTS Interpreter Development)
 if(Python3_FOUND)
diff --git a/mindspore-lite/cmake/lite_dependences.cmake b/mindspore-lite/cmake/lite_dependences.cmake
index 295a51cd..c49ad38f 100644
--- a/mindspore-lite/cmake/lite_dependences.cmake
+++ b/mindspore-lite/cmake/lite_dependences.cmake
@@ -39,16 +39,6 @@ if(MSLITE_DEPS_FAST_TRANSFORMERS)
     include(${TOP_DIR}/cmake/external_libs/fast_transformers.cmake)
 endif()
 
-if(MSLITE_DEPS_MKLDNN)
-    if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-        set(USE_MS_THREADPOOL_FOR_DNNL ON)
-    endif()
-    if(USE_MS_THREADPOOL_FOR_DNNL)
-        add_compile_definitions(USE_MS_THREADPOOL_FOR_DNNL)
-    endif()
-    include(${TOP_DIR}/cmake/external_libs/mkl_dnn.cmake)
-endif()
-
 if(MSLITE_DEPS_LIBEVENT)
     include(${TOP_DIR}/cmake/external_libs/libevent.cmake)
 endif()
diff --git a/mindspore-lite/providers/siteai/CMakeLists.txt b/mindspore-lite/providers/siteai/CMakeLists.txt
index dccb9942..05a3f086 100644
--- a/mindspore-lite/providers/siteai/CMakeLists.txt
+++ b/mindspore-lite/providers/siteai/CMakeLists.txt
@@ -4,7 +4,6 @@ project(SiteAi)
 ##disable external libs
 set(MSLITE_DEPS_PYBIND11 on CACHE INTERNAL "setting MSLITE_DEPS_PYBIND11 value")
 set(MSLITE_DEPS_ROBIN_HOOD_HASHING off CACHE INTERNAL "setting MSLITE_DEPS_ROBIN_HOOD_HASHING value")
-set(MSLITE_DEPS_MKLDNN off CACHE INTERNAL "setting MSLITE_DEPS_MKLDNN value")
 set(MSLITE_DEPS_LIBEVENT off CACHE INTERNAL "setting MSLITE_DEPS_LIBEVENT value")
 set(MSLITE_DEPS_OPENSSL off CACHE INTERNAL "setting MSLITE_DEPS_OPENSSL value")
 set(MSLITE_DEPS_CMSIS off CACHE INTERNAL "setting MSLITE_DEPS_CMSIS value")
diff --git a/mindspore-lite/src/extendrt/CMakeLists.txt b/mindspore-lite/src/extendrt/CMakeLists.txt
index afd29018..7084b282 100644
--- a/mindspore-lite/src/extendrt/CMakeLists.txt
+++ b/mindspore-lite/src/extendrt/CMakeLists.txt
@@ -177,21 +177,6 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE OR MSLITE_ENABLE_CLOUD_INFERENCE)
                 mindspore-lite-proto)
         target_link_libraries(mindspore-extendrt_static _mindspore_cpu_kernel_mod_depend_obj
                 mindspore-lite-proto)
-        if(MSLITE_DEPS_MKLDNN)
-            add_dependencies(mindspore-extendrt mindspore::dnnl)
-            target_link_libraries(mindspore-extendrt mindspore::dnnl)
-            add_dependencies(mindspore-extendrt_static mindspore::dnnl)
-            target_link_libraries(mindspore-extendrt_static mindspore::dnnl)
-        endif()
-
-        if(MSLITE_DEPS_MKLDNN)
-            set(CPU_KERNEL_OBJECT_COUNT 0)
-            add_subdirectory(${OPS_DIR}/kernel/cpu lite_kernel_mod)
-            foreach(number RANGE 1 ${CPU_KERNEL_OBJECT_COUNT})
-                target_link_libraries(mindspore-extendrt _mindspore_ops_cpu_kernel_obj)
-                target_link_libraries(mindspore-extendrt_static _mindspore_ops_cpu_kernel_obj)
-            endforeach()
-        endif()
 
     endif()
 
diff --git a/mindspore-lite/tools/converter/adapter/acl/cxx_api_lite/cxx_api/CMakeLists.txt b/mindspore-lite/tools/converter/adapter/acl/cxx_api_lite/cxx_api/CMakeLists.txt
index e1c1a89e..a187f3ae 100644
--- a/mindspore-lite/tools/converter/adapter/acl/cxx_api_lite/cxx_api/CMakeLists.txt
+++ b/mindspore-lite/tools/converter/adapter/acl/cxx_api_lite/cxx_api/CMakeLists.txt
@@ -186,9 +186,9 @@ endif()
 
 if(ENABLE_CPU)
     if(BUILD_LITE)
-        target_link_libraries(mslite_shared_lib PRIVATE mindspore::dnnl mindspore::mkldnn nnacl)
+        target_link_libraries(mslite_shared_lib PRIVATE nnacl)
     else()
-        target_link_libraries(mindspore_shared_lib PRIVATE mindspore::dnnl mindspore::mkldnn nnacl)
+        target_link_libraries(mindspore_shared_lib PRIVATE nnacl)
     endif()
 endif()
 
diff --git a/third_party/patch/onednn/0001-fix-user-threadpool-bug.patch b/third_party/patch/onednn/0001-fix-user-threadpool-bug.patch
deleted file mode 100644
index f5256517..00000000
--- a/third_party/patch/onednn/0001-fix-user-threadpool-bug.patch
+++ /dev/null
@@ -1,20 +0,0 @@
-diff --git a/src/common/dnnl_thread.hpp b/src/common/dnnl_thread.hpp
-index 342bc3b00..0b9190f9c 100644
---- a/src/common/dnnl_thread.hpp
-+++ b/src/common/dnnl_thread.hpp
-@@ -104,10 +104,11 @@ inline int dnnl_get_max_threads() {
-         def_max_threads
-                 = (int)dnnl::impl::cpu::platform::get_max_threads_to_use();
-     assert(def_max_threads > 0);
--    // Use the default value if the threadpool-provided is outside the range
--    // [1, def_max_threads]
--    return tp ? std::min(std::max(1, tp->get_num_threads()), def_max_threads)
--              : def_max_threads;
-+    
-+    // Make user responsible for number of threads provided at execution time.
-+    // This relates to the fact that the library may identify `def_max_threads`
-+    // incorrectly for a platform.
-+    return tp ? std::max(1, tp->get_num_threads()) : def_max_threads;
- }
- inline int dnnl_in_parallel() {
-     using namespace dnnl::impl::threadpool_utils;
diff --git a/third_party/patch/onednn/0002-fix-pool-nthr-bug.patch b/third_party/patch/onednn/0002-fix-pool-nthr-bug.patch
deleted file mode 100644
index d0ecb2f0..00000000
--- a/third_party/patch/onednn/0002-fix-pool-nthr-bug.patch
+++ /dev/null
@@ -1,334 +0,0 @@
-diff --git a/src/cpu/nchw_pooling.cpp b/src/cpu/nchw_pooling.cpp
-index b678200a1..09736ccae 100644
---- a/src/cpu/nchw_pooling.cpp
-+++ b/src/cpu/nchw_pooling.cpp
-@@ -609,10 +609,12 @@ status_t nchw_pooling_bwd_t<data_type::bf16>::execute_backward(
-     int od_end = min(OD, 1 + (padF + ID - 1) / SD);
- 
-     dim_t c_blk = pd()->channel_block_size_;
--    int c_blk_tail = C % c_blk;
-+    dim_t c_blk_tail = C % c_blk;
-+    const int nthr = pd()->nthr_;
-+
-     if (alg == alg_kind::pooling_max) {
--        parallel_nd_ext(0, MB, utils::div_up(C, c_blk),
--                [&](int ithr, int, int mb, int cb) {
-+        parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk),
-+                [&](int ithr, int, dim_t mb, dim_t cb) {
-                     bool is_last_c_block
-                             = c_blk_tail > 0 && (cb + 1) * c_blk > C;
-                     int curr_c_block = is_last_c_block ? c_blk_tail : c_blk;
-@@ -649,8 +651,8 @@ status_t nchw_pooling_bwd_t<data_type::bf16>::execute_backward(
-                             diff_src_fp32, src_sp_size * curr_c_block);
-                 });
-     } else {
--        parallel_nd_ext(0, MB, utils::div_up(C, c_blk),
--                [&](int ithr, int, int mb, int cb) {
-+        parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk),
-+                [&](int ithr, int, dim_t mb, dim_t cb) {
-                     bool is_last_c_block
-                             = c_blk_tail > 0 && (cb + 1) * c_blk > C;
-                     int curr_c_block = is_last_c_block ? c_blk_tail : c_blk;
-diff --git a/src/cpu/nchw_pooling.hpp b/src/cpu/nchw_pooling.hpp
-index 9d649f3f5..2a73f6ae6 100644
---- a/src/cpu/nchw_pooling.hpp
-+++ b/src/cpu/nchw_pooling.hpp
-@@ -139,6 +139,7 @@ struct nchw_pooling_bwd_t : public primitive_t {
-                 ws_md_ = *hint_fwd_pd_->workspace_md();
-             }
- 
-+            nthr_ = dnnl_get_max_threads();
-             calculate_channel_block_size();
-             init_scratchpad();
- 
-@@ -146,6 +147,7 @@ struct nchw_pooling_bwd_t : public primitive_t {
-         }
- 
-         dim_t channel_block_size_;
-+        int nthr_; // To not exceed the limit in execute used for set up.
- 
-     private:
-         void init_scratchpad() {
-@@ -153,13 +155,12 @@ struct nchw_pooling_bwd_t : public primitive_t {
-             if (diff_dst_md()->data_type == data_type::bf16) {
-                 size_t dst_sz_ = OD() * OH() * OW();
-                 size_t src_sz_ = ID() * IH() * IW();
--                size_t nthrs = dnnl_get_max_threads();
-                 auto scratchpad = scratchpad_registry().registrar();
- 
-                 scratchpad.template book<float>(key_pool_src_bf16cvt,
--                        src_sz_ * nthrs * channel_block_size_);
-+                        src_sz_ * nthr_ * channel_block_size_);
-                 scratchpad.template book<float>(key_pool_dst_bf16cvt,
--                        dst_sz_ * nthrs * channel_block_size_);
-+                        dst_sz_ * nthr_ * channel_block_size_);
-             }
-         }
- 
-@@ -169,8 +170,7 @@ struct nchw_pooling_bwd_t : public primitive_t {
-             // spatial
-             dim_t dst_sz_ = OD() * OH() * OW();
-             dim_t src_sz_ = ID() * IH() * IW();
--            dim_t nthrs = dnnl_get_max_threads();
--            dim_t C_per_thr = nstl::min(MB() * C() / nthrs, C());
-+            dim_t C_per_thr = nstl::min(MB() * C() / nthr_, C());
-             const dim_t max_block_size
-                     = platform::get_per_core_cache_size(1) / 2;
-             dim_t data_size_per_ch = (dst_sz_ + src_sz_) * 6; // f32 + bf16
-diff --git a/src/cpu/nhwc_pooling.cpp b/src/cpu/nhwc_pooling.cpp
-index 48d9e1240..efe3083f7 100644
---- a/src/cpu/nhwc_pooling.cpp
-+++ b/src/cpu/nhwc_pooling.cpp
-@@ -378,8 +378,9 @@ status_t nhwc_pooling_fwd_t<data_type::bf16>::execute_forward(
-         return OSP * OC * mb + OSP * oc + SP * od + OW * oh + ow;
-     };
-     const bool are_postops_set = !(pd()->attr()->post_ops_.entry_.empty());
-+    const int nthr = pd()->nthr_;
- 
--    parallel_nd_ext(0, MB, OD, OH, OW,
-+    parallel_nd_ext(nthr, MB, OD, OH, OW,
-             [&](int ithr, int, int mb, int od, int oh, int ow) {
-                 const size_t dst_offset_init = strided_offset(mb, dst_n_stride,
-                         od, dst_d_stride, oh, dst_h_stride, ow, dst_w_stride);
-@@ -682,8 +683,9 @@ status_t nhwc_pooling_bwd_t<data_type::bf16>::execute_backward(
-     auto apply_offset = [=](int index, int offset) {
-         return (index > offset) ? index - offset : 0;
-     };
-+    const int nthr = pd()->nthr_;
- 
--    parallel_nd_ext(0, MB, ID, IH, IW,
-+    parallel_nd_ext(nthr, MB, ID, IH, IW,
-             [&](int ithr, int, int mb, int id, int ih, int iw) {
-                 size_t src_offset_init = strided_offset(mb, diff_src_n_stride,
-                         id, diff_src_d_stride, ih, diff_src_h_stride, iw,
-diff --git a/src/cpu/nhwc_pooling.hpp b/src/cpu/nhwc_pooling.hpp
-index c65196a94..c16e840a2 100644
---- a/src/cpu/nhwc_pooling.hpp
-+++ b/src/cpu/nhwc_pooling.hpp
-@@ -73,16 +73,19 @@ struct nhwc_pooling_fwd_t : public primitive_t {
-                 init_default_ws();
-             }
- 
-+            nthr_ = dnnl_get_max_threads();
-             init_scratchpad();
- 
-             return status::success;
-         }
- 
-+        int nthr_; // To not exceed the limit in execute used for set up.
-+
-     private:
-         void init_scratchpad() {
-             using namespace memory_tracking::names;
-             if (src_md()->data_type == data_type::bf16) {
--                const size_t bf16cvt_sz_ = C() * dnnl_get_max_threads();
-+                const size_t bf16cvt_sz_ = C() * nthr_;
-                 auto scratchpad = scratchpad_registry().registrar();
-                 scratchpad.template book<float>(
-                         key_pool_src_bf16cvt, bf16cvt_sz_);
-@@ -148,16 +151,19 @@ struct nhwc_pooling_bwd_t : public primitive_t {
-                 if (!compare_ws(hint_fwd_pd_)) return status::unimplemented;
-             }
- 
-+            nthr_ = dnnl_get_max_threads();
-             init_scratchpad();
- 
-             return status::success;
-         }
- 
-+        int nthr_; // To not exceed the limit in execute used for set up.
-+
-     private:
-         void init_scratchpad() {
-             using namespace memory_tracking::names;
-             if (diff_src_md()->data_type == data_type::bf16) {
--                size_t bf16cvt_sz_ = C() * dnnl_get_max_threads();
-+                size_t bf16cvt_sz_ = C() * nthr_;
-                 auto scratchpad = scratchpad_registry().registrar();
-                 scratchpad.template book<float>(
-                         key_pool_src_bf16cvt, bf16cvt_sz_);
-diff --git a/src/cpu/x64/jit_primitive_conf.hpp b/src/cpu/x64/jit_primitive_conf.hpp
-index a2a181cfa..5befb81ac 100644
---- a/src/cpu/x64/jit_primitive_conf.hpp
-+++ b/src/cpu/x64/jit_primitive_conf.hpp
-@@ -672,6 +672,7 @@ struct jit_pool_conf_t {
-     bool with_postops;
-     bool with_eltwise;
-     bool with_binary;
-+    int nthr;
- };
- 
- struct jit_pool_call_s {
-diff --git a/src/cpu/x64/jit_uni_pool_kernel.cpp b/src/cpu/x64/jit_uni_pool_kernel.cpp
-index 36d129e6d..ebd4f3af1 100644
---- a/src/cpu/x64/jit_uni_pool_kernel.cpp
-+++ b/src/cpu/x64/jit_uni_pool_kernel.cpp
-@@ -76,8 +76,7 @@ jit_uni_pool_kernel<isa>::jit_uni_pool_kernel(
- 
- template <cpu_isa_t isa>
- status_t jit_uni_pool_kernel<isa>::init_conf(jit_pool_conf_t &jpp,
--        memory_tracking::registrar_t &scratchpad, const pooling_pd_t *ppd,
--        int nthreads) {
-+        memory_tracking::registrar_t &scratchpad, const pooling_pd_t *ppd) {
- 
-     const auto &pd = *ppd->desc();
-     const memory_desc_wrapper src_d(
-@@ -87,6 +86,7 @@ status_t jit_uni_pool_kernel<isa>::init_conf(jit_pool_conf_t &jpp,
- 
-     const int ndims = src_d.ndims();
- 
-+    jpp.nthr = dnnl_get_max_threads();
-     jpp.is_training = pd.prop_kind == prop_kind::forward_training;
-     jpp.is_backward = pd.prop_kind == prop_kind::backward_data;
- 
-@@ -248,7 +248,7 @@ status_t jit_uni_pool_kernel<isa>::init_conf(jit_pool_conf_t &jpp,
-                     ? (ndims == 5 && jpp.simple_alg ? jpp.od : 1)
-                     : (ndims == 5 ? jpp.od : jpp.oh);
-             work *= jpp.mb * nb2_c;
--            auto eff = (float)work / utils::rnd_up(work, nthreads);
-+            auto eff = (float)work / utils::rnd_up(work, jpp.nthr);
-             if (eff > best_eff) {
- 
-                 best_eff = eff;
-diff --git a/src/cpu/x64/jit_uni_pool_kernel.hpp b/src/cpu/x64/jit_uni_pool_kernel.hpp
-index d5d5f25a2..57ce6f43d 100644
---- a/src/cpu/x64/jit_uni_pool_kernel.hpp
-+++ b/src/cpu/x64/jit_uni_pool_kernel.hpp
-@@ -46,8 +46,7 @@ struct jit_uni_pool_kernel : public jit_generator {
-     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_pool_kernel)
- 
-     static status_t init_conf(jit_pool_conf_t &jbp,
--            memory_tracking::registrar_t &scratchpad, const pooling_pd_t *ppd,
--            int nthreads);
-+            memory_tracking::registrar_t &scratchpad, const pooling_pd_t *ppd);
- 
- private:
-     using Xmm = Xbyak::Xmm;
-diff --git a/src/cpu/x64/jit_uni_pooling.cpp b/src/cpu/x64/jit_uni_pooling.cpp
-index b2055f2a9..29987f70c 100644
---- a/src/cpu/x64/jit_uni_pooling.cpp
-+++ b/src/cpu/x64/jit_uni_pooling.cpp
-@@ -612,6 +612,8 @@ void jit_uni_pooling_fwd_t<isa, d_type>::execute_forward(const data_t *src,
-         (*kernel_)(&arg);
-     };
- 
-+    const int nthr = jpp.nthr;
-+
-     if (jpp.tag_kind == jit_memory_tag_kind_t::nspc) {
-         const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc);
-         parallel_nd(jpp.mb, jpp.oh, nb2_c, [&](int n, int oh, int b2_c) {
-@@ -622,7 +624,7 @@ void jit_uni_pooling_fwd_t<isa, d_type>::execute_forward(const data_t *src,
-     } else {
-         if (trans_src || trans_dst) {
-             // ncsp format
--            parallel_nd_ext(0, jpp.mb, jpp.nb_c,
-+            parallel_nd_ext(nthr, jpp.mb, jpp.nb_c,
-                     [&](int ithr, int nthr, int n, int b_c) {
-                         if (trans_src)
-                             transpose_facade.execute_transpose_input(
-@@ -635,7 +637,7 @@ void jit_uni_pooling_fwd_t<isa, d_type>::execute_forward(const data_t *src,
-                     });
-         } else {
-             // nChw16c, nChw8c format
--            parallel(0, [&](std::size_t ithr, std::size_t nthr) {
-+            parallel(nthr, [&](int ithr, int nthr) {
-                 const std::size_t work_amount
-                         = static_cast<std::size_t>(jpp.mb) * jpp.nb_c * jpp.oh;
-                 if (ithr >= work_amount) return;
-@@ -739,6 +741,8 @@ void jit_uni_pooling_fwd_t<isa, d_type>::execute_forward_3d(const data_t *src,
-         (*kernel_)(&arg);
-     };
- 
-+    const int nthr = jpp.nthr;
-+
-     if (jpp.tag_kind == jit_memory_tag_kind_t::nspc) {
-         const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc);
-         parallel_nd(jpp.mb, jpp.od, nb2_c, [&](int n, int od, int b2_c) {
-@@ -757,7 +761,7 @@ void jit_uni_pooling_fwd_t<isa, d_type>::execute_forward_3d(const data_t *src,
-         });
-     } else {
-         if (trans_src || trans_dst) {
--            parallel_nd_ext(0, jpp.mb, jpp.nb_c,
-+            parallel_nd_ext(nthr, jpp.mb, jpp.nb_c,
-                     [&](int ithr, int nthr, int n, int b_c) {
-                         if (trans_src)
-                             transpose_facade.execute_transpose_input(
-@@ -948,7 +952,9 @@ void jit_uni_pooling_bwd_t<isa, d_type>::execute_backward(
-             transpose_facade.execute_transpose_output(ithr, n, b_c);
-     };
- 
--    parallel(0, [&](int ithr, int nthr) {
-+    const int nthr = jpp.nthr;
-+
-+    parallel(nthr, [&](int ithr, int nthr) {
-         const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc);
-         const std::size_t work_amount
-                 = static_cast<std::size_t>(jpp.mb) * nb2_c;
-@@ -1098,6 +1104,8 @@ void jit_uni_pooling_bwd_t<isa, d_type>::execute_backward_3d(
-         }
-     };
- 
-+    const int nthr = jpp.nthr;
-+
-     if (jpp.simple_alg) {
-         if (jpp.tag_kind == jit_memory_tag_kind_t::nspc) {
-             const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc);
-@@ -1109,7 +1117,7 @@ void jit_uni_pooling_bwd_t<isa, d_type>::execute_backward_3d(
-         } else {
-             assert(jpp.ur_bc == 1);
-             if (trans_src || trans_dst) {
--                parallel_nd_ext(0, jpp.mb, jpp.nb_c,
-+                parallel_nd_ext(nthr, jpp.mb, jpp.nb_c,
-                         [&](int ithr, int nthr, int n, int b_c) {
-                             if (trans_src)
-                                 transpose_facade.execute_transpose_input(
-@@ -1142,7 +1150,7 @@ void jit_uni_pooling_bwd_t<isa, d_type>::execute_backward_3d(
-             if (!trans_src) {
-                 const size_t chunk_size
-                         = (size_t)jpp.id * jpp.ih * jpp.iw * jpp.c_block;
--                parallel_nd_ext(0, jpp.mb, jpp.nb_c,
-+                parallel_nd_ext(nthr, jpp.mb, jpp.nb_c,
-                         [&](int ithr, int nthr, int n, int b_c) {
-                             const size_t offset
-                                     = ((size_t)n * jpp.nb_c + b_c) * chunk_size;
-@@ -1155,8 +1163,8 @@ void jit_uni_pooling_bwd_t<isa, d_type>::execute_backward_3d(
- 
-         const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc);
-         if (trans_src || trans_dst) {
--            parallel_nd_ext(
--                    0, jpp.mb, nb2_c, [&](int ithr, int nthr, int n, int b2_c) {
-+            parallel_nd_ext(nthr, jpp.mb, nb2_c,
-+                    [&](int ithr, int nthr, int n, int b2_c) {
-                         const auto b_c = b2_c * jpp.ur_bc;
- 
-                         if (trans_dst) {
-diff --git a/src/cpu/x64/jit_uni_pooling.hpp b/src/cpu/x64/jit_uni_pooling.hpp
-index ec4b04a2b..e25d9ce05 100644
---- a/src/cpu/x64/jit_uni_pooling.hpp
-+++ b/src/cpu/x64/jit_uni_pooling.hpp
-@@ -66,8 +66,9 @@ struct jit_uni_pooling_fwd_t : public primitive_t {
-                 init_default_ws();
- 
-             auto scratchpad = scratchpad_registry().registrar();
--            return jit_uni_pool_kernel<isa>::init_conf(
--                    jpp_, scratchpad, this, dnnl_get_max_threads());
-+            CHECK(jit_uni_pool_kernel<isa>::init_conf(jpp_, scratchpad, this));
-+
-+            return status::success;
-         }
- 
-         jit_pool_conf_t jpp_;
-@@ -130,9 +131,11 @@ struct jit_uni_pooling_bwd_t : public primitive_t {
-                 init_default_ws();
-                 if (!compare_ws(hint_fwd_pd_)) return status::unimplemented;
-             }
-+
-             auto scratchpad = scratchpad_registry().registrar();
--            return jit_uni_pool_kernel<isa>::init_conf(
--                    jpp_, scratchpad, this, dnnl_get_max_threads());
-+            CHECK(jit_uni_pool_kernel<isa>::init_conf(jpp_, scratchpad, this));
-+
-+            return status::success;
-         }
- 
-         jit_pool_conf_t jpp_;
diff --git a/third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch b/third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch
deleted file mode 100644
index 0c3b6a76..00000000
--- a/third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp
-index 1397073ba..041a3436f 100644
---- a/src/cpu/platform.cpp
-+++ b/src/cpu/platform.cpp
-@@ -154,6 +154,8 @@ unsigned get_num_cores() {
- // function supports process affinity.
- unsigned get_max_threads_to_use() {
-     int num_cores_per_socket = (int)dnnl::impl::cpu::platform::get_num_cores();
-+    if (num_cores_per_socket <= 1)
-+        num_cores_per_socket = std::thread::hardware_concurrency();
- #if defined(_WIN32)
-     DWORD_PTR proc_affinity_mask;
-     DWORD_PTR sys_affinity_mask;
diff --git a/third_party/patch/onednn/0004-fix-dnnl-limits.patch b/third_party/patch/onednn/0004-fix-dnnl-limits.patch
deleted file mode 100644
index 7638e4ae..00000000
--- a/third_party/patch/onednn/0004-fix-dnnl-limits.patch
+++ /dev/null
@@ -1,10 +0,0 @@
---- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h
-+++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h
-@@ -28,6 +28,7 @@
- #include <deque>
- #include <initializer_list>
- #include <iostream>
-+#include <limits>
- #include <list>
- #include <type_traits>
- #include <unordered_map>
-- 
Gitee