From 34af30cce570ab20bb7403cba80a63c45e6b2f64 Mon Sep 17 00:00:00 2001 From: qiuleilei Date: Mon, 4 Aug 2025 20:51:19 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=E4=B8=89=E6=96=B9=E4=BE=9D?= =?UTF-8?q?=E8=B5=96onednn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/external_libs/mkl_dnn.cmake | 44 --- cmake/package_lite.cmake | 4 - mindspore-lite/CMakeLists.txt | 1 - mindspore-lite/cmake/ccsrc_module.cmake | 1 - mindspore-lite/cmake/lite_dependences.cmake | 10 - .../providers/siteai/CMakeLists.txt | 1 - mindspore-lite/src/extendrt/CMakeLists.txt | 15 - .../acl/cxx_api_lite/cxx_api/CMakeLists.txt | 4 +- .../onednn/0001-fix-user-threadpool-bug.patch | 20 -- .../patch/onednn/0002-fix-pool-nthr-bug.patch | 334 ------------------ ...3-fix-zero-threads-identified-on-AMD.patch | 13 - .../patch/onednn/0004-fix-dnnl-limits.patch | 10 - 12 files changed, 2 insertions(+), 455 deletions(-) delete mode 100644 cmake/external_libs/mkl_dnn.cmake delete mode 100644 third_party/patch/onednn/0001-fix-user-threadpool-bug.patch delete mode 100644 third_party/patch/onednn/0002-fix-pool-nthr-bug.patch delete mode 100644 third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch delete mode 100644 third_party/patch/onednn/0004-fix-dnnl-limits.patch diff --git a/cmake/external_libs/mkl_dnn.cmake b/cmake/external_libs/mkl_dnn.cmake deleted file mode 100644 index 07256948..00000000 --- a/cmake/external_libs/mkl_dnn.cmake +++ /dev/null @@ -1,44 +0,0 @@ -set(onednn_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2") -set(onednn_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") -set(onednn_LDFLAGS "-s") - -if(NOT MINDSPORE_PROJECT_DIR) -set(MINDSPORE_PROJECT_DIR ${CMAKE_SOURCE_DIR}) -endif() - -if(USE_MS_THREADPOOL_FOR_DNNL) - set(USE_MS_THREADPOOL "-DDNNL_CPU_RUNTIME=THREADPOOL") -else() - set(USE_MS_THREADPOOL "") -endif() -if(ENABLE_GITEE_EULER) - set(GIT_REPOSITORY "git@gitee.com:src-openeuler/onednn.git") - set(GIT_TAG "0d726f1") - set(SHA256 "4d655c0751ee6439584ef5e3d465953fe0c2f4ee2700bc02699bdc1d1572af0d") - __download_pkg_with_git(ONEDNN ${GIT_REPOSITORY} ${GIT_TAG} ${SHA256}) - set(ONE_DNN_SRC "${CMAKE_BINARY_DIR}/_deps/onednn-src") - execute_process(COMMAND tar -xf ${ONE_DNN_SRC}/v2.2.tar.gz --strip-components 1 -C ${ONE_DNN_SRC}) -endif() - -if(ENABLE_GITEE) - set(REQ_URL "https://gitee.com/mirrors/MKL-DNN/repository/archive/v2.2.tar.gz") - set(SHA256 "2e809b11727af9d10784a5481b445a14387297161b5cc7f9c969c57fe40752bc") -else() - set(REQ_URL "https://github.com/oneapi-src/oneDNN/archive/v2.2.tar.gz") - set(SHA256 "4d655c0751ee6439584ef5e3d465953fe0c2f4ee2700bc02699bdc1d1572af0d") -endif() -mindspore_add_pkg(onednn - VER 2.2 - LIBS dnnl mkldnn - URL ${REQ_URL} - SHA256 ${SHA256} - PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/onednn/0001-fix-user-threadpool-bug.patch - PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/onednn/0002-fix-pool-nthr-bug.patch - PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch - PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/onednn/0004-fix-dnnl-limits.patch - CMAKE_OPTION -DDNNL_ARCH_OPT_FLAGS='' -DDNNL_BUILD_EXAMPLES=OFF -DDNNL_BUILD_TESTS=OFF - ${USE_MS_THREADPOOL} -DDNNL_ENABLE_CONCURRENT_EXEC=ON) - -include_directories(${onednn_INC}) -add_library(mindspore::dnnl ALIAS onednn::dnnl) -add_library(mindspore::mkldnn ALIAS onednn::mkldnn) diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake index 8e33fc3d..06cca6c5 100644 --- a/cmake/package_lite.cmake +++ b/cmake/package_lite.cmake @@ -867,10 +867,6 @@ else() endif() install(FILES ${glog_LIBPATH}/${glog_name} DESTINATION ${RUNTIME_LIB_DIR} RENAME libmindspore_glog.so.0 COMPONENT ${RUNTIME_COMPONENT_NAME}) - if(MSLITE_DEPS_MKLDNN) - install(FILES ${onednn_LIBPATH}/libdnnl.so.2.2 DESTINATION ${DNNL_DIR} - RENAME libdnnl.so.2 COMPONENT ${RUNTIME_COMPONENT_NAME}) - endif() install(TARGETS mindspore_core mindspore_ops DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME}) install(FILES ${BUILD_DIR}/src/extendrt/convert/libruntime_convert_plugin.so DESTINATION ${RUNTIME_LIB_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME}) diff --git a/mindspore-lite/CMakeLists.txt b/mindspore-lite/CMakeLists.txt index c497ee12..ea780917 100644 --- a/mindspore-lite/CMakeLists.txt +++ b/mindspore-lite/CMakeLists.txt @@ -947,7 +947,6 @@ endif() if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE OR MSLITE_ENABLE_CLOUD_INFERENCE) if(NOT MSLITE_TARGET_SITEAI) - set(MSLITE_DEPS_MKLDNN on) set(MSLITE_DEPS_LIBEVENT on) set(MSLITE_DEPS_PYBIND11 on) endif() diff --git a/mindspore-lite/cmake/ccsrc_module.cmake b/mindspore-lite/cmake/ccsrc_module.cmake index 6283cf57..23950e5c 100644 --- a/mindspore-lite/cmake/ccsrc_module.cmake +++ b/mindspore-lite/cmake/ccsrc_module.cmake @@ -18,7 +18,6 @@ list(APPEND MINDSPORE_PROTO_LIST ${COMM_PROTO_SRCS}) include(${TOP_DIR}/cmake/external_libs/robin.cmake) include(${TOP_DIR}/cmake/external_libs/eigen.cmake) -include(${TOP_DIR}/cmake/external_libs/mkl_dnn.cmake) find_package(Python3 COMPONENTS Interpreter Development) if(Python3_FOUND) diff --git a/mindspore-lite/cmake/lite_dependences.cmake b/mindspore-lite/cmake/lite_dependences.cmake index 295a51cd..c49ad38f 100644 --- a/mindspore-lite/cmake/lite_dependences.cmake +++ b/mindspore-lite/cmake/lite_dependences.cmake @@ -39,16 +39,6 @@ if(MSLITE_DEPS_FAST_TRANSFORMERS) include(${TOP_DIR}/cmake/external_libs/fast_transformers.cmake) endif() -if(MSLITE_DEPS_MKLDNN) - if(CMAKE_SYSTEM_NAME MATCHES "Linux") - set(USE_MS_THREADPOOL_FOR_DNNL ON) - endif() - if(USE_MS_THREADPOOL_FOR_DNNL) - add_compile_definitions(USE_MS_THREADPOOL_FOR_DNNL) - endif() - include(${TOP_DIR}/cmake/external_libs/mkl_dnn.cmake) -endif() - if(MSLITE_DEPS_LIBEVENT) include(${TOP_DIR}/cmake/external_libs/libevent.cmake) endif() diff --git a/mindspore-lite/providers/siteai/CMakeLists.txt b/mindspore-lite/providers/siteai/CMakeLists.txt index dccb9942..05a3f086 100644 --- a/mindspore-lite/providers/siteai/CMakeLists.txt +++ b/mindspore-lite/providers/siteai/CMakeLists.txt @@ -4,7 +4,6 @@ project(SiteAi) ##disable external libs set(MSLITE_DEPS_PYBIND11 on CACHE INTERNAL "setting MSLITE_DEPS_PYBIND11 value") set(MSLITE_DEPS_ROBIN_HOOD_HASHING off CACHE INTERNAL "setting MSLITE_DEPS_ROBIN_HOOD_HASHING value") -set(MSLITE_DEPS_MKLDNN off CACHE INTERNAL "setting MSLITE_DEPS_MKLDNN value") set(MSLITE_DEPS_LIBEVENT off CACHE INTERNAL "setting MSLITE_DEPS_LIBEVENT value") set(MSLITE_DEPS_OPENSSL off CACHE INTERNAL "setting MSLITE_DEPS_OPENSSL value") set(MSLITE_DEPS_CMSIS off CACHE INTERNAL "setting MSLITE_DEPS_CMSIS value") diff --git a/mindspore-lite/src/extendrt/CMakeLists.txt b/mindspore-lite/src/extendrt/CMakeLists.txt index afd29018..7084b282 100644 --- a/mindspore-lite/src/extendrt/CMakeLists.txt +++ b/mindspore-lite/src/extendrt/CMakeLists.txt @@ -177,21 +177,6 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE OR MSLITE_ENABLE_CLOUD_INFERENCE) mindspore-lite-proto) target_link_libraries(mindspore-extendrt_static _mindspore_cpu_kernel_mod_depend_obj mindspore-lite-proto) - if(MSLITE_DEPS_MKLDNN) - add_dependencies(mindspore-extendrt mindspore::dnnl) - target_link_libraries(mindspore-extendrt mindspore::dnnl) - add_dependencies(mindspore-extendrt_static mindspore::dnnl) - target_link_libraries(mindspore-extendrt_static mindspore::dnnl) - endif() - - if(MSLITE_DEPS_MKLDNN) - set(CPU_KERNEL_OBJECT_COUNT 0) - add_subdirectory(${OPS_DIR}/kernel/cpu lite_kernel_mod) - foreach(number RANGE 1 ${CPU_KERNEL_OBJECT_COUNT}) - target_link_libraries(mindspore-extendrt _mindspore_ops_cpu_kernel_obj) - target_link_libraries(mindspore-extendrt_static _mindspore_ops_cpu_kernel_obj) - endforeach() - endif() endif() diff --git a/mindspore-lite/tools/converter/adapter/acl/cxx_api_lite/cxx_api/CMakeLists.txt b/mindspore-lite/tools/converter/adapter/acl/cxx_api_lite/cxx_api/CMakeLists.txt index e1c1a89e..a187f3ae 100644 --- a/mindspore-lite/tools/converter/adapter/acl/cxx_api_lite/cxx_api/CMakeLists.txt +++ b/mindspore-lite/tools/converter/adapter/acl/cxx_api_lite/cxx_api/CMakeLists.txt @@ -186,9 +186,9 @@ endif() if(ENABLE_CPU) if(BUILD_LITE) - target_link_libraries(mslite_shared_lib PRIVATE mindspore::dnnl mindspore::mkldnn nnacl) + target_link_libraries(mslite_shared_lib PRIVATE mindspore::dnnl nnacl) else() - target_link_libraries(mindspore_shared_lib PRIVATE mindspore::dnnl mindspore::mkldnn nnacl) + target_link_libraries(mindspore_shared_lib PRIVATE mindspore::dnnl nnacl) endif() endif() diff --git a/third_party/patch/onednn/0001-fix-user-threadpool-bug.patch b/third_party/patch/onednn/0001-fix-user-threadpool-bug.patch deleted file mode 100644 index f5256517..00000000 --- a/third_party/patch/onednn/0001-fix-user-threadpool-bug.patch +++ /dev/null @@ -1,20 +0,0 @@ -diff --git a/src/common/dnnl_thread.hpp b/src/common/dnnl_thread.hpp -index 342bc3b00..0b9190f9c 100644 ---- a/src/common/dnnl_thread.hpp -+++ b/src/common/dnnl_thread.hpp -@@ -104,10 +104,11 @@ inline int dnnl_get_max_threads() { - def_max_threads - = (int)dnnl::impl::cpu::platform::get_max_threads_to_use(); - assert(def_max_threads > 0); -- // Use the default value if the threadpool-provided is outside the range -- // [1, def_max_threads] -- return tp ? std::min(std::max(1, tp->get_num_threads()), def_max_threads) -- : def_max_threads; -+ -+ // Make user responsible for number of threads provided at execution time. -+ // This relates to the fact that the library may identify `def_max_threads` -+ // incorrectly for a platform. -+ return tp ? std::max(1, tp->get_num_threads()) : def_max_threads; - } - inline int dnnl_in_parallel() { - using namespace dnnl::impl::threadpool_utils; diff --git a/third_party/patch/onednn/0002-fix-pool-nthr-bug.patch b/third_party/patch/onednn/0002-fix-pool-nthr-bug.patch deleted file mode 100644 index d0ecb2f0..00000000 --- a/third_party/patch/onednn/0002-fix-pool-nthr-bug.patch +++ /dev/null @@ -1,334 +0,0 @@ -diff --git a/src/cpu/nchw_pooling.cpp b/src/cpu/nchw_pooling.cpp -index b678200a1..09736ccae 100644 ---- a/src/cpu/nchw_pooling.cpp -+++ b/src/cpu/nchw_pooling.cpp -@@ -609,10 +609,12 @@ status_t nchw_pooling_bwd_t::execute_backward( - int od_end = min(OD, 1 + (padF + ID - 1) / SD); - - dim_t c_blk = pd()->channel_block_size_; -- int c_blk_tail = C % c_blk; -+ dim_t c_blk_tail = C % c_blk; -+ const int nthr = pd()->nthr_; -+ - if (alg == alg_kind::pooling_max) { -- parallel_nd_ext(0, MB, utils::div_up(C, c_blk), -- [&](int ithr, int, int mb, int cb) { -+ parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk), -+ [&](int ithr, int, dim_t mb, dim_t cb) { - bool is_last_c_block - = c_blk_tail > 0 && (cb + 1) * c_blk > C; - int curr_c_block = is_last_c_block ? c_blk_tail : c_blk; -@@ -649,8 +651,8 @@ status_t nchw_pooling_bwd_t::execute_backward( - diff_src_fp32, src_sp_size * curr_c_block); - }); - } else { -- parallel_nd_ext(0, MB, utils::div_up(C, c_blk), -- [&](int ithr, int, int mb, int cb) { -+ parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk), -+ [&](int ithr, int, dim_t mb, dim_t cb) { - bool is_last_c_block - = c_blk_tail > 0 && (cb + 1) * c_blk > C; - int curr_c_block = is_last_c_block ? c_blk_tail : c_blk; -diff --git a/src/cpu/nchw_pooling.hpp b/src/cpu/nchw_pooling.hpp -index 9d649f3f5..2a73f6ae6 100644 ---- a/src/cpu/nchw_pooling.hpp -+++ b/src/cpu/nchw_pooling.hpp -@@ -139,6 +139,7 @@ struct nchw_pooling_bwd_t : public primitive_t { - ws_md_ = *hint_fwd_pd_->workspace_md(); - } - -+ nthr_ = dnnl_get_max_threads(); - calculate_channel_block_size(); - init_scratchpad(); - -@@ -146,6 +147,7 @@ struct nchw_pooling_bwd_t : public primitive_t { - } - - dim_t channel_block_size_; -+ int nthr_; // To not exceed the limit in execute used for set up. - - private: - void init_scratchpad() { -@@ -153,13 +155,12 @@ struct nchw_pooling_bwd_t : public primitive_t { - if (diff_dst_md()->data_type == data_type::bf16) { - size_t dst_sz_ = OD() * OH() * OW(); - size_t src_sz_ = ID() * IH() * IW(); -- size_t nthrs = dnnl_get_max_threads(); - auto scratchpad = scratchpad_registry().registrar(); - - scratchpad.template book(key_pool_src_bf16cvt, -- src_sz_ * nthrs * channel_block_size_); -+ src_sz_ * nthr_ * channel_block_size_); - scratchpad.template book(key_pool_dst_bf16cvt, -- dst_sz_ * nthrs * channel_block_size_); -+ dst_sz_ * nthr_ * channel_block_size_); - } - } - -@@ -169,8 +170,7 @@ struct nchw_pooling_bwd_t : public primitive_t { - // spatial - dim_t dst_sz_ = OD() * OH() * OW(); - dim_t src_sz_ = ID() * IH() * IW(); -- dim_t nthrs = dnnl_get_max_threads(); -- dim_t C_per_thr = nstl::min(MB() * C() / nthrs, C()); -+ dim_t C_per_thr = nstl::min(MB() * C() / nthr_, C()); - const dim_t max_block_size - = platform::get_per_core_cache_size(1) / 2; - dim_t data_size_per_ch = (dst_sz_ + src_sz_) * 6; // f32 + bf16 -diff --git a/src/cpu/nhwc_pooling.cpp b/src/cpu/nhwc_pooling.cpp -index 48d9e1240..efe3083f7 100644 ---- a/src/cpu/nhwc_pooling.cpp -+++ b/src/cpu/nhwc_pooling.cpp -@@ -378,8 +378,9 @@ status_t nhwc_pooling_fwd_t::execute_forward( - return OSP * OC * mb + OSP * oc + SP * od + OW * oh + ow; - }; - const bool are_postops_set = !(pd()->attr()->post_ops_.entry_.empty()); -+ const int nthr = pd()->nthr_; - -- parallel_nd_ext(0, MB, OD, OH, OW, -+ parallel_nd_ext(nthr, MB, OD, OH, OW, - [&](int ithr, int, int mb, int od, int oh, int ow) { - const size_t dst_offset_init = strided_offset(mb, dst_n_stride, - od, dst_d_stride, oh, dst_h_stride, ow, dst_w_stride); -@@ -682,8 +683,9 @@ status_t nhwc_pooling_bwd_t::execute_backward( - auto apply_offset = [=](int index, int offset) { - return (index > offset) ? index - offset : 0; - }; -+ const int nthr = pd()->nthr_; - -- parallel_nd_ext(0, MB, ID, IH, IW, -+ parallel_nd_ext(nthr, MB, ID, IH, IW, - [&](int ithr, int, int mb, int id, int ih, int iw) { - size_t src_offset_init = strided_offset(mb, diff_src_n_stride, - id, diff_src_d_stride, ih, diff_src_h_stride, iw, -diff --git a/src/cpu/nhwc_pooling.hpp b/src/cpu/nhwc_pooling.hpp -index c65196a94..c16e840a2 100644 ---- a/src/cpu/nhwc_pooling.hpp -+++ b/src/cpu/nhwc_pooling.hpp -@@ -73,16 +73,19 @@ struct nhwc_pooling_fwd_t : public primitive_t { - init_default_ws(); - } - -+ nthr_ = dnnl_get_max_threads(); - init_scratchpad(); - - return status::success; - } - -+ int nthr_; // To not exceed the limit in execute used for set up. -+ - private: - void init_scratchpad() { - using namespace memory_tracking::names; - if (src_md()->data_type == data_type::bf16) { -- const size_t bf16cvt_sz_ = C() * dnnl_get_max_threads(); -+ const size_t bf16cvt_sz_ = C() * nthr_; - auto scratchpad = scratchpad_registry().registrar(); - scratchpad.template book( - key_pool_src_bf16cvt, bf16cvt_sz_); -@@ -148,16 +151,19 @@ struct nhwc_pooling_bwd_t : public primitive_t { - if (!compare_ws(hint_fwd_pd_)) return status::unimplemented; - } - -+ nthr_ = dnnl_get_max_threads(); - init_scratchpad(); - - return status::success; - } - -+ int nthr_; // To not exceed the limit in execute used for set up. -+ - private: - void init_scratchpad() { - using namespace memory_tracking::names; - if (diff_src_md()->data_type == data_type::bf16) { -- size_t bf16cvt_sz_ = C() * dnnl_get_max_threads(); -+ size_t bf16cvt_sz_ = C() * nthr_; - auto scratchpad = scratchpad_registry().registrar(); - scratchpad.template book( - key_pool_src_bf16cvt, bf16cvt_sz_); -diff --git a/src/cpu/x64/jit_primitive_conf.hpp b/src/cpu/x64/jit_primitive_conf.hpp -index a2a181cfa..5befb81ac 100644 ---- a/src/cpu/x64/jit_primitive_conf.hpp -+++ b/src/cpu/x64/jit_primitive_conf.hpp -@@ -672,6 +672,7 @@ struct jit_pool_conf_t { - bool with_postops; - bool with_eltwise; - bool with_binary; -+ int nthr; - }; - - struct jit_pool_call_s { -diff --git a/src/cpu/x64/jit_uni_pool_kernel.cpp b/src/cpu/x64/jit_uni_pool_kernel.cpp -index 36d129e6d..ebd4f3af1 100644 ---- a/src/cpu/x64/jit_uni_pool_kernel.cpp -+++ b/src/cpu/x64/jit_uni_pool_kernel.cpp -@@ -76,8 +76,7 @@ jit_uni_pool_kernel::jit_uni_pool_kernel( - - template - status_t jit_uni_pool_kernel::init_conf(jit_pool_conf_t &jpp, -- memory_tracking::registrar_t &scratchpad, const pooling_pd_t *ppd, -- int nthreads) { -+ memory_tracking::registrar_t &scratchpad, const pooling_pd_t *ppd) { - - const auto &pd = *ppd->desc(); - const memory_desc_wrapper src_d( -@@ -87,6 +86,7 @@ status_t jit_uni_pool_kernel::init_conf(jit_pool_conf_t &jpp, - - const int ndims = src_d.ndims(); - -+ jpp.nthr = dnnl_get_max_threads(); - jpp.is_training = pd.prop_kind == prop_kind::forward_training; - jpp.is_backward = pd.prop_kind == prop_kind::backward_data; - -@@ -248,7 +248,7 @@ status_t jit_uni_pool_kernel::init_conf(jit_pool_conf_t &jpp, - ? (ndims == 5 && jpp.simple_alg ? jpp.od : 1) - : (ndims == 5 ? jpp.od : jpp.oh); - work *= jpp.mb * nb2_c; -- auto eff = (float)work / utils::rnd_up(work, nthreads); -+ auto eff = (float)work / utils::rnd_up(work, jpp.nthr); - if (eff > best_eff) { - - best_eff = eff; -diff --git a/src/cpu/x64/jit_uni_pool_kernel.hpp b/src/cpu/x64/jit_uni_pool_kernel.hpp -index d5d5f25a2..57ce6f43d 100644 ---- a/src/cpu/x64/jit_uni_pool_kernel.hpp -+++ b/src/cpu/x64/jit_uni_pool_kernel.hpp -@@ -46,8 +46,7 @@ struct jit_uni_pool_kernel : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_pool_kernel) - - static status_t init_conf(jit_pool_conf_t &jbp, -- memory_tracking::registrar_t &scratchpad, const pooling_pd_t *ppd, -- int nthreads); -+ memory_tracking::registrar_t &scratchpad, const pooling_pd_t *ppd); - - private: - using Xmm = Xbyak::Xmm; -diff --git a/src/cpu/x64/jit_uni_pooling.cpp b/src/cpu/x64/jit_uni_pooling.cpp -index b2055f2a9..29987f70c 100644 ---- a/src/cpu/x64/jit_uni_pooling.cpp -+++ b/src/cpu/x64/jit_uni_pooling.cpp -@@ -612,6 +612,8 @@ void jit_uni_pooling_fwd_t::execute_forward(const data_t *src, - (*kernel_)(&arg); - }; - -+ const int nthr = jpp.nthr; -+ - if (jpp.tag_kind == jit_memory_tag_kind_t::nspc) { - const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc); - parallel_nd(jpp.mb, jpp.oh, nb2_c, [&](int n, int oh, int b2_c) { -@@ -622,7 +624,7 @@ void jit_uni_pooling_fwd_t::execute_forward(const data_t *src, - } else { - if (trans_src || trans_dst) { - // ncsp format -- parallel_nd_ext(0, jpp.mb, jpp.nb_c, -+ parallel_nd_ext(nthr, jpp.mb, jpp.nb_c, - [&](int ithr, int nthr, int n, int b_c) { - if (trans_src) - transpose_facade.execute_transpose_input( -@@ -635,7 +637,7 @@ void jit_uni_pooling_fwd_t::execute_forward(const data_t *src, - }); - } else { - // nChw16c, nChw8c format -- parallel(0, [&](std::size_t ithr, std::size_t nthr) { -+ parallel(nthr, [&](int ithr, int nthr) { - const std::size_t work_amount - = static_cast(jpp.mb) * jpp.nb_c * jpp.oh; - if (ithr >= work_amount) return; -@@ -739,6 +741,8 @@ void jit_uni_pooling_fwd_t::execute_forward_3d(const data_t *src, - (*kernel_)(&arg); - }; - -+ const int nthr = jpp.nthr; -+ - if (jpp.tag_kind == jit_memory_tag_kind_t::nspc) { - const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc); - parallel_nd(jpp.mb, jpp.od, nb2_c, [&](int n, int od, int b2_c) { -@@ -757,7 +761,7 @@ void jit_uni_pooling_fwd_t::execute_forward_3d(const data_t *src, - }); - } else { - if (trans_src || trans_dst) { -- parallel_nd_ext(0, jpp.mb, jpp.nb_c, -+ parallel_nd_ext(nthr, jpp.mb, jpp.nb_c, - [&](int ithr, int nthr, int n, int b_c) { - if (trans_src) - transpose_facade.execute_transpose_input( -@@ -948,7 +952,9 @@ void jit_uni_pooling_bwd_t::execute_backward( - transpose_facade.execute_transpose_output(ithr, n, b_c); - }; - -- parallel(0, [&](int ithr, int nthr) { -+ const int nthr = jpp.nthr; -+ -+ parallel(nthr, [&](int ithr, int nthr) { - const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc); - const std::size_t work_amount - = static_cast(jpp.mb) * nb2_c; -@@ -1098,6 +1104,8 @@ void jit_uni_pooling_bwd_t::execute_backward_3d( - } - }; - -+ const int nthr = jpp.nthr; -+ - if (jpp.simple_alg) { - if (jpp.tag_kind == jit_memory_tag_kind_t::nspc) { - const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc); -@@ -1109,7 +1117,7 @@ void jit_uni_pooling_bwd_t::execute_backward_3d( - } else { - assert(jpp.ur_bc == 1); - if (trans_src || trans_dst) { -- parallel_nd_ext(0, jpp.mb, jpp.nb_c, -+ parallel_nd_ext(nthr, jpp.mb, jpp.nb_c, - [&](int ithr, int nthr, int n, int b_c) { - if (trans_src) - transpose_facade.execute_transpose_input( -@@ -1142,7 +1150,7 @@ void jit_uni_pooling_bwd_t::execute_backward_3d( - if (!trans_src) { - const size_t chunk_size - = (size_t)jpp.id * jpp.ih * jpp.iw * jpp.c_block; -- parallel_nd_ext(0, jpp.mb, jpp.nb_c, -+ parallel_nd_ext(nthr, jpp.mb, jpp.nb_c, - [&](int ithr, int nthr, int n, int b_c) { - const size_t offset - = ((size_t)n * jpp.nb_c + b_c) * chunk_size; -@@ -1155,8 +1163,8 @@ void jit_uni_pooling_bwd_t::execute_backward_3d( - - const auto nb2_c = utils::div_up(jpp.nb_c, jpp.ur_bc); - if (trans_src || trans_dst) { -- parallel_nd_ext( -- 0, jpp.mb, nb2_c, [&](int ithr, int nthr, int n, int b2_c) { -+ parallel_nd_ext(nthr, jpp.mb, nb2_c, -+ [&](int ithr, int nthr, int n, int b2_c) { - const auto b_c = b2_c * jpp.ur_bc; - - if (trans_dst) { -diff --git a/src/cpu/x64/jit_uni_pooling.hpp b/src/cpu/x64/jit_uni_pooling.hpp -index ec4b04a2b..e25d9ce05 100644 ---- a/src/cpu/x64/jit_uni_pooling.hpp -+++ b/src/cpu/x64/jit_uni_pooling.hpp -@@ -66,8 +66,9 @@ struct jit_uni_pooling_fwd_t : public primitive_t { - init_default_ws(); - - auto scratchpad = scratchpad_registry().registrar(); -- return jit_uni_pool_kernel::init_conf( -- jpp_, scratchpad, this, dnnl_get_max_threads()); -+ CHECK(jit_uni_pool_kernel::init_conf(jpp_, scratchpad, this)); -+ -+ return status::success; - } - - jit_pool_conf_t jpp_; -@@ -130,9 +131,11 @@ struct jit_uni_pooling_bwd_t : public primitive_t { - init_default_ws(); - if (!compare_ws(hint_fwd_pd_)) return status::unimplemented; - } -+ - auto scratchpad = scratchpad_registry().registrar(); -- return jit_uni_pool_kernel::init_conf( -- jpp_, scratchpad, this, dnnl_get_max_threads()); -+ CHECK(jit_uni_pool_kernel::init_conf(jpp_, scratchpad, this)); -+ -+ return status::success; - } - - jit_pool_conf_t jpp_; diff --git a/third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch b/third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch deleted file mode 100644 index 0c3b6a76..00000000 --- a/third_party/patch/onednn/0003-fix-zero-threads-identified-on-AMD.patch +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp -index 1397073ba..041a3436f 100644 ---- a/src/cpu/platform.cpp -+++ b/src/cpu/platform.cpp -@@ -154,6 +154,8 @@ unsigned get_num_cores() { - // function supports process affinity. - unsigned get_max_threads_to_use() { - int num_cores_per_socket = (int)dnnl::impl::cpu::platform::get_num_cores(); -+ if (num_cores_per_socket <= 1) -+ num_cores_per_socket = std::thread::hardware_concurrency(); - #if defined(_WIN32) - DWORD_PTR proc_affinity_mask; - DWORD_PTR sys_affinity_mask; diff --git a/third_party/patch/onednn/0004-fix-dnnl-limits.patch b/third_party/patch/onednn/0004-fix-dnnl-limits.patch deleted file mode 100644 index 7638e4ae..00000000 --- a/third_party/patch/onednn/0004-fix-dnnl-limits.patch +++ /dev/null @@ -1,10 +0,0 @@ ---- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h -+++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h -@@ -28,6 +28,7 @@ - #include - #include - #include -+#include - #include - #include - #include -- Gitee