From d560171e789e65dc179b729e39b5dc343b7ced82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= Date: Wed, 23 Jul 2025 10:05:16 +0800 Subject: [PATCH] Split async examples --- .../CMakeLists.txt | 23 +- .../README.md | 111 ++++------ .../cmake/cpu_lib.cmake | 4 - .../cmake/npu_lib.cmake | 1 - .../main.cpp | 17 +- .../matmul_async_iterate_custom_tiling.cpp} | 7 +- .../matmul_async_iterate_custom_tiling.h} | 10 +- .../matmul_async_iterate_custom_kernel.cpp} | 45 ++-- .../matmul_async_iterate_custom_kernel.h} | 19 +- .../run.sh | 18 +- .../matmul_async_iterate/scripts/exec_test.py | 128 ++++++++++++ .../testcase/case.csv | 0 .../matmul_async_iterate_all/CMakeLists.txt | 79 +++++++ .../matrix/matmul_async_iterate_all/README.md | 107 ++++++++++ .../cmake/cpu_lib.cmake | 31 +++ .../cmake/npu_lib.cmake | 27 +++ .../matrix/matmul_async_iterate_all/main.cpp | 197 ++++++++++++++++++ ...matmul_async_iterate_all_custom_tiling.cpp | 51 +++++ .../matmul_async_iterate_all_custom_tiling.h | 39 ++++ ...matmul_async_iterate_all_custom_kernel.cpp | 112 ++++++++++ .../matmul_async_iterate_all_custom_kernel.h | 64 ++++++ .../matrix/matmul_async_iterate_all/run.sh | 86 ++++++++ .../scripts/exec_test.py | 11 +- .../testcase/case.csv | 1 + examples/readme.md | 10 +- 25 files changed, 1030 insertions(+), 168 deletions(-) rename examples/matrix/{matmul_async => matmul_async_iterate}/CMakeLists.txt (79%) rename examples/matrix/{matmul_async => matmul_async_iterate}/README.md (56%) rename examples/matrix/{matmul_async => matmul_async_iterate}/cmake/cpu_lib.cmake (89%) rename examples/matrix/{matmul_async => matmul_async_iterate}/cmake/npu_lib.cmake (94%) rename examples/matrix/{matmul_async => matmul_async_iterate}/main.cpp (93%) rename examples/matrix/{matmul_async/op_host/matmul_async_custom_tiling.cpp => matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.cpp} (90%) rename examples/matrix/{matmul_async/op_host/matmul_async_custom_tiling.h => 
matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.h} (75%) rename examples/matrix/{matmul_async/op_kernel/matmul_async_custom_kernel.cpp => matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.cpp} (78%) rename examples/matrix/{matmul_async/op_kernel/matmul_async_custom_kernel.h => matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.h} (79%) rename examples/matrix/{matmul_async => matmul_async_iterate}/run.sh (79%) create mode 100644 examples/matrix/matmul_async_iterate/scripts/exec_test.py rename examples/matrix/{matmul_async => matmul_async_iterate}/testcase/case.csv (100%) create mode 100644 examples/matrix/matmul_async_iterate_all/CMakeLists.txt create mode 100644 examples/matrix/matmul_async_iterate_all/README.md create mode 100644 examples/matrix/matmul_async_iterate_all/cmake/cpu_lib.cmake create mode 100644 examples/matrix/matmul_async_iterate_all/cmake/npu_lib.cmake create mode 100644 examples/matrix/matmul_async_iterate_all/main.cpp create mode 100644 examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.cpp create mode 100644 examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.h create mode 100644 examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.cpp create mode 100644 examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.h create mode 100644 examples/matrix/matmul_async_iterate_all/run.sh rename examples/matrix/{matmul_async => matmul_async_iterate_all}/scripts/exec_test.py (92%) create mode 100644 examples/matrix/matmul_async_iterate_all/testcase/case.csv diff --git a/examples/matrix/matmul_async/CMakeLists.txt b/examples/matrix/matmul_async_iterate/CMakeLists.txt similarity index 79% rename from examples/matrix/matmul_async/CMakeLists.txt rename to examples/matrix/matmul_async_iterate/CMakeLists.txt index e703a3f9..5cdf60c8 100644 --- 
a/examples/matrix/matmul_async/CMakeLists.txt +++ b/examples/matrix/matmul_async_iterate/CMakeLists.txt @@ -15,9 +15,6 @@ endif() if (${SOC_VERSION}) set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") endif() -if (${ASYNC_MODE}) - set(ASYNC_MODE "GM" CACHE STRING "GM/VECIN") -endif() set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") if(NOT CMAKE_BUILD_TYPE) @@ -29,10 +26,8 @@ if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) endif() file(GLOB KERNEL_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_async_custom_kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_async_iterate_custom_kernel.cpp ) -set(CUSTOM_ASYNC_LIST "VECIN") - if("${RUN_MODE}" STREQUAL "cpu") include(cmake/cpu_lib.cmake) elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") @@ -41,31 +36,29 @@ else() message("invalid RUN_MODE: ${RUN_MODE}") endif() -add_executable(ascendc_matmul_async_bbit +add_executable(ascendc_matmul_async_iterate_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_async_custom_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_async_iterate_custom_tiling.cpp ) -target_compile_options(ascendc_matmul_async_bbit PRIVATE +target_compile_options(ascendc_matmul_async_iterate_bbit PRIVATE $:-g>> -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 ) -target_compile_definitions(ascendc_matmul_async_bbit PRIVATE +target_compile_definitions(ascendc_matmul_async_iterate_bbit PRIVATE SOC_VERSION="${SOC_VERSION}" - $<$>:CUSTOM_ASYNC_VECIN> - ASYNC_MODE="${ASYNC_MODE}" ) -target_include_directories(ascendc_matmul_async_bbit PRIVATE +target_include_directories(ascendc_matmul_async_iterate_bbit PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} $:${ASCEND_CANN_PACKAGE_PATH}/include>> $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> ) -target_link_libraries(ascendc_matmul_async_bbit PRIVATE +target_link_libraries(ascendc_matmul_async_iterate_bbit PRIVATE $,$>:host_intf_pub>> 
$:tikicpulib::${SOC_VERSION}>> $:ascendcl>> @@ -78,7 +71,7 @@ target_link_libraries(ascendc_matmul_async_bbit PRIVATE dl ) -install(TARGETS ascendc_matmul_async_bbit +install(TARGETS ascendc_matmul_async_iterate_bbit LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} diff --git a/examples/matrix/matmul_async/README.md b/examples/matrix/matmul_async_iterate/README.md similarity index 56% rename from examples/matrix/matmul_async/README.md rename to examples/matrix/matmul_async_iterate/README.md index 34e59352..2767c464 100644 --- a/examples/matrix/matmul_async/README.md +++ b/examples/matrix/matmul_async_iterate/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现异步场景下的Matmul矩阵乘法,包含调用IterateAll输出到GM,和调用Iterate和GetTensorC输出到VECIN两种异步实现方式。 +本样例介绍了调用Matmul高阶API实现异步场景下的Matmul矩阵乘法,实现方式为调用Iterate和GetTensorC输出到VECIN。 异步场景指的是程序执行时,不需要等待某个操作完成就可以执行下一步操作。异步场景可以减少同步等待,提高并行度,开发者对计算性能要求较高时,可以选用该方式。 本样例以直调的方式调用算子核函数。 @@ -24,7 +24,7 @@ ## 算子描述 - 算子功能 - MatmulAsyncCustom单算子,实现了调用IterateAll输出到GM,或者调用Iterate和GetTensorC输出到VECIN,两种异步场景实现方式下的矩阵乘计算。 + MatmulAsyncIterateCustom单算子,实现了调用Iterate和GetTensorC输出到VECIN的异步场景矩阵乘计算。 - 算子规格 @@ -39,12 +39,12 @@ - +
算子输出c (640, 1024)floatND-
核函数名matmul_async_custom
核函数名matmul_async_iterate_custom
## 算子实现介绍 -本样例中实现的是固定shape为[M, N, K] = [640, 1024, 512], bias = [1024]的MatmulAsyncCustom单算子。根据算子执行时传入的参数设置对应的编译宏,再根据不同的编译宏选择用IterateAll或者Iterate&GetTensorC的方式实现异步。 +本样例中实现的是固定shape为[M, N, K] = [640, 1024, 512], bias = [1024]的MatmulAsyncIterateCustom单算子。 - Kernel实现 - 计算逻辑:C = A * B + Bias。 @@ -52,77 +52,58 @@ - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 - 具体步骤: - - 创建Matmul对象,根据异步实现方式,选择创建输出到GM或者VECIN的Matmul对象。 + - 创建Matmul对象,输出C矩阵的TPosition为VECIN。 ``` - #if defined(CUSTOM_ASYNC_VECIN) - AscendC::Matmul, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, CFG_MDL> matmulObj; - #else - AscendC::Matmul, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, CFG_MDL> matmulObj; - #endif + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL> matmulObj; ``` - 初始化操作。 - 设置左矩阵A、右矩阵B、Bias。 - 获取矩阵乘计算结果。 - - 调用IterateAll输出到GM的异步场景实现 - ``` - matmulObj.template IterateAll(cGlobal, 0, false, true); - matmulObj.WaitIterateAll(); - ``` - - 调用Iterate和GetTensorC输出到VECIN的异步场景实现 - ``` - matmulObj.SetWorkspace(workspaceGlobal); - matmulObj.template Iterate(); - uint32_t baseM = this->tiling.baseM; - uint32_t baseN = this->tiling.baseN; - pipe->InitBuffer(cInQueue, 1, baseM * baseN * sizeof(CType)); - pipe->InitBuffer(cOutQueue, 1, baseM * baseN * sizeof(CType)); - AscendC::DataCopyParams copyParams = { - (uint16_t)baseM, - (uint16_t)(baseN * sizeof(CType) / AscendC::DEFAULT_C0_SIZE), - (uint16_t)0, - (uint16_t)((this->tiling.N - baseN) * sizeof(CType) / AscendC::DEFAULT_C0_SIZE) - }; - uint32_t iterateTimes = Ceiling(this->tiling.singleCoreM, baseM) * Ceiling(this->tiling.singleCoreN, baseN); - for (uint32_t i = 0; i < iterateTimes; ++i) { - // compute - auto cInLocal = cInQueue.AllocTensor(); - matmulObj.template GetTensorC(cInLocal); - cInQueue.EnQue(cInLocal); - // any vector operator - auto src = cInQueue.DeQue(); - auto dst = cOutQueue.AllocTensor(); - DataCopy(dst, src, 
baseM * baseN); - cOutQueue.EnQue(dst); - cInQueue.FreeTensor(src); - // copy out - auto cOutLocal = cOutQueue.DeQue(); - DataCopy(cGlobal[CalcDstOffset(i)], cOutLocal, copyParams); - cOutQueue.FreeTensor(cOutLocal); - } - ``` + ``` + matmulObj.SetWorkspace(workspaceGlobal); + matmulObj.template Iterate(); + uint32_t baseM = this->tiling.baseM; + uint32_t baseN = this->tiling.baseN; + pipe->InitBuffer(cInQueue, 1, baseM * baseN * sizeof(CType)); + pipe->InitBuffer(cOutQueue, 1, baseM * baseN * sizeof(CType)); + AscendC::DataCopyParams copyParams = { + (uint16_t)baseM, + (uint16_t)(baseN * sizeof(CType) / AscendC::DEFAULT_C0_SIZE), + (uint16_t)0, + (uint16_t)((this->tiling.N - baseN) * sizeof(CType) / AscendC::DEFAULT_C0_SIZE) + }; + uint32_t iterateTimes = Ceiling(this->tiling.singleCoreM, baseM) * Ceiling(this->tiling.singleCoreN, baseN); + for (uint32_t i = 0; i < iterateTimes; ++i) { + // compute + auto cInLocal = cInQueue.AllocTensor(); + matmulObj.template GetTensorC(cInLocal); + cInQueue.EnQue(cInLocal); + // any vector operator + auto src = cInQueue.DeQue(); + auto dst = cOutQueue.AllocTensor(); + DataCopy(dst, src, baseM * baseN); + cOutQueue.EnQue(dst); + cInQueue.FreeTensor(src); + // copy out + auto cOutLocal = cOutQueue.DeQue(); + DataCopy(cGlobal[CalcDstOffset(i)], cOutLocal, copyParams); + cOutQueue.FreeTensor(cOutLocal); + } + ``` - 完成矩阵乘操作。 - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 - 设置C的参数时,需要根据异步的实现方式,选择C的TPosition为GM或者VECIN。 + 设置C的TPosition为VECIN。 ``` - #if defined(CUSTOM_ASYNC_VECIN) - cubeTiling->SetCType(matmul_tiling::TPosition::VECIN, matmul_tiling::CubeFormat::ND, - matmul_tiling::DataType::DT_FLOAT); - #else - 
cubeTiling->SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, - matmul_tiling::DataType::DT_FLOAT); - #endif + cubeTiling->SetCType(matmul_tiling::TPosition::VECIN, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); ``` - 调用GetTiling接口,获取Tiling信息。 @@ -139,17 +120,15 @@ - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -m [ASYNC_MODE] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - Atlas A2训练系列产品/Atlas 800I A2推理产品 - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 - - ASYNC_MODE : 选择异步场景的实现方式是调用IterateAll输出到GM,或者调用Iterate和GetTensorC输出到VECIN,对应参数分别为[GM / VECIN]。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -p 0 -m GM - # bash run.sh -r cpu -v Ascendxxxyy -p 0 -m VECIN + bash run.sh -r cpu -v Ascendxxxyy -p 0 ``` \ No newline at end of file diff --git a/examples/matrix/matmul_async/cmake/cpu_lib.cmake b/examples/matrix/matmul_async_iterate/cmake/cpu_lib.cmake similarity index 89% rename from examples/matrix/matmul_async/cmake/cpu_lib.cmake rename to examples/matrix/matmul_async_iterate/cmake/cpu_lib.cmake index aff00e34..2af75433 100644 --- a/examples/matrix/matmul_async/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_async_iterate/cmake/cpu_lib.cmake @@ -20,10 +20,6 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE tikicpulib::${SOC_VERSION} ) -target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - $<$>:CUSTOM_ASYNC_VECIN> -) - target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O2 diff --git a/examples/matrix/matmul_async/cmake/npu_lib.cmake 
b/examples/matrix/matmul_async_iterate/cmake/npu_lib.cmake similarity index 94% rename from examples/matrix/matmul_async/cmake/npu_lib.cmake rename to examples/matrix/matmul_async_iterate/cmake/npu_lib.cmake index d62c3df9..bc803099 100644 --- a/examples/matrix/matmul_async/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_async_iterate/cmake/npu_lib.cmake @@ -21,7 +21,6 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ) ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - $<$>:CUSTOM_ASYNC_VECIN> -DASCENDC_DUMP -DHAVE_WORKSPACE -DHAVE_TILING diff --git a/examples/matrix/matmul_async/main.cpp b/examples/matrix/matmul_async_iterate/main.cpp similarity index 93% rename from examples/matrix/matmul_async/main.cpp rename to examples/matrix/matmul_async_iterate/main.cpp index d3a456c2..0afbed88 100644 --- a/examples/matrix/matmul_async/main.cpp +++ b/examples/matrix/matmul_async_iterate/main.cpp @@ -12,15 +12,15 @@ #include "../../common/data_utils.h" #include "kernel_tiling/kernel_tiling.h" #include "tiling/platform/platform_ascendc.h" -#include "./op_host/matmul_async_custom_tiling.h" +#include "./op_host/matmul_async_iterate_custom_tiling.h" #ifdef ASCENDC_CPU_DEBUG #include "tikicpulib.h" -extern "C" void matmul_async_custom( +extern "C" void matmul_async_iterate_custom( uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling); #else #include "acl/acl.h" -extern void matmul_async_custom_do(uint32_t coreDim, void* stream, +extern void matmul_async_iterate_custom_do(uint32_t coreDim, void* stream, uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling); #endif @@ -38,7 +38,7 @@ void TestMatmulCpu(const optiling::TestcaseParams& caseParams) size_t bFileSize = static_cast(sizeof(uint16_t) * caseParams.k * caseParams.n); // uint16_t represent half size_t biasFileSize = static_cast(sizeof(float) * caseParams.n); size_t cFileSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); - 
size_t userWorkspaceSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); // async workspace, len is M * N + size_t userWorkspaceSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); // async_iterate workspace, len is M * N size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; size_t tilingFileSize = sizeof(TCubeTiling); @@ -61,7 +61,7 @@ void TestMatmulCpu(const optiling::TestcaseParams& caseParams) memcpy_s(tiling, tilingFileSize, tiling, tilingFileSize); uint32_t blockDim = ((uint32_t*)tiling)[0] / mixRatio; - ICPU_RUN_KF(matmul_async_custom, blockDim, a, b, bias, c, workspace, tiling); + ICPU_RUN_KF(matmul_async_iterate_custom, blockDim, a, b, bias, c, workspace, tiling); WriteFile("../output/output.bin", c, cFileSize); @@ -84,7 +84,7 @@ void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, const optiling: uint32_t aivCoreNum = ascendcPlatform->GetCoreNum(); uint8_t *workspaceDevice; - size_t userWorkspaceSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); // async workspace, len is M * N + size_t userWorkspaceSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); // async_iterate workspace, len is M * N size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); @@ -99,7 +99,7 @@ void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, const optiling: CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); uint32_t blockDim = ((uint32_t*)tilingHost)[0] / mixRatio; - matmul_async_custom_do(blockDim, stream, a, b, bias, c, workspaceDevice, tilingDevice); + matmul_async_iterate_custom_do(blockDim, stream, a, b, bias, c, workspaceDevice, tilingDevice); 
CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtFreeHost(tilingHost)); @@ -190,7 +190,8 @@ int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3]}; + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG MatmulHost::TestMatmulCpu(caseParams); diff --git a/examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.cpp b/examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.cpp similarity index 90% rename from examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.cpp rename to examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.cpp index 6a58b2b6..1fb3a260 100644 --- a/examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.cpp +++ b/examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#include "matmul_async_custom_tiling.h" +#include "matmul_async_iterate_custom_tiling.h" #include namespace optiling { @@ -21,13 +21,8 @@ bool ComputeTiling(uint32_t blockDim, TCubeTiling& tiling, matmul_tiling::MultiC matmul_tiling::DataType::DT_FLOAT16); cubeTiling->SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); -#if defined(CUSTOM_ASYNC_VECIN) cubeTiling->SetCType(matmul_tiling::TPosition::VECIN, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); -#else - cubeTiling->SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, - matmul_tiling::DataType::DT_FLOAT); -#endif cubeTiling->SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); cubeTiling->SetShape(caseParams.m, caseParams.n, caseParams.k); diff --git a/examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.h b/examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.h similarity index 75% rename from examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.h rename to examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.h index 4ab3ca35..c76a6388 100644 --- a/examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.h +++ b/examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.h @@ -7,17 +7,17 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_OP_HOST_MATMUL_ASYNC_CUSTOM_TILING_H -#define EXAMPLES_MATRIX_MATMUL_ASYNC_OP_HOST_MATMUL_ASYNC_CUSTOM_TILING_H +#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_HOST_MATMUL_ASYNC_ITERATE_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_HOST_MATMUL_ASYNC_ITERATE_CUSTOM_TILING_H #include "register/tilingdata_base.h" #include "tiling/tiling_api.h" namespace optiling { -BEGIN_TILING_DATA_DEF(MatmulAsyncCustomTilingData) +BEGIN_TILING_DATA_DEF(MatmulAsyncIterateCustomTilingData) TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, cubeTilingData); END_TILING_DATA_DEF; -REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulAsyncCustomTilingData) +REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulAsyncIterateCustomTilingData) struct TestcaseParams { uint32_t m; @@ -36,4 +36,4 @@ void GenerateTiling(uint32_t blockDim, matmul_tiling::MultiCoreMatmulTiling* cub const TestcaseParams& caseParams, uint8_t* tilingBuffer); } // namespace optiling -#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_OP_HOST_MATMUL_ASYNC_CUSTOM_TILING_H \ No newline at end of file +#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_HOST_MATMUL_ASYNC_ITERATE_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.cpp b/examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.cpp similarity index 78% rename from examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.cpp rename to examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.cpp index f8ff7e71..ebb8c284 100644 --- a/examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.cpp +++ b/examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.cpp @@ -8,20 +8,12 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#include "matmul_async_custom_kernel.h" +#include "matmul_async_iterate_custom_kernel.h" namespace MatmulCustom { -__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) -{ - if (b == 0) { - return a; - } - return (a + b - 1) / b; -} - template -__aicore__ inline void MatmulAsyncKernel::Init(GM_ADDR a, +__aicore__ inline void MatmulAsyncIterateKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling) { this->tiling = tiling; @@ -49,7 +41,7 @@ __aicore__ inline void MatmulAsyncKernel::Init(GM } template -__aicore__ inline void MatmulAsyncKernel::Process(AscendC::TPipe* pipe) +__aicore__ inline void MatmulAsyncIterateKernel::Process(AscendC::TPipe* pipe) { REGIST_MATMUL_OBJ(pipe, GetSysWorkSpacePtr(), matmulObj, &(this->tiling)); @@ -58,7 +50,7 @@ __aicore__ inline void MatmulAsyncKernel::Process if (this->tiling.isBias) { matmulObj.SetBias(biasGlobal); } -#if defined(CUSTOM_ASYNC_VECIN) + matmulObj.SetWorkspace(workspaceGlobal); matmulObj.template Iterate(); @@ -73,7 +65,8 @@ __aicore__ inline void MatmulAsyncKernel::Process (uint16_t)0, (uint16_t)((this->tiling.N - baseN) * sizeof(CType) / AscendC::DEFAULT_C0_SIZE) }; - uint32_t iterateTimes = Ceiling(this->tiling.singleCoreM, baseM) * Ceiling(this->tiling.singleCoreN, baseN); + uint32_t iterateTimes = AscendC::Ceil(this->tiling.singleCoreM, baseM) * + AscendC::Ceil(this->tiling.singleCoreN, baseN); for (uint32_t i = 0; i < iterateTimes; ++i) { // compute auto cInLocal = cInQueue.AllocTensor(); @@ -90,19 +83,15 @@ __aicore__ inline void MatmulAsyncKernel::Process DataCopy(cGlobal[CalcDstOffset(i)], cOutLocal, copyParams); cOutQueue.FreeTensor(cOutLocal); } -#else - matmulObj.template IterateAll(cGlobal, 0, false, true); - matmulObj.WaitIterateAll(); -#endif matmulObj.End(); } template -__aicore__ inline void MatmulAsyncKernel::CalcOffset( +__aicore__ inline void MatmulAsyncIterateKernel::CalcOffset( uint32_t blockIdx, uint32_t& offsetA, uint32_t& offsetB, 
uint32_t& offsetC, uint32_t& offsetBias) { - auto mSingleBlocks = Ceiling(this->tiling.M, this->tiling.singleCoreM); + auto mSingleBlocks = AscendC::Ceil(this->tiling.M, this->tiling.singleCoreM); auto mCoreIndx = blockIdx % mSingleBlocks; auto nCoreIndx = blockIdx / mSingleBlocks; @@ -122,16 +111,16 @@ __aicore__ inline void MatmulAsyncKernel::CalcOff } template -__aicore__ inline uint32_t MatmulAsyncKernel::CalcDstOffset(uint32_t i) +__aicore__ inline uint32_t MatmulAsyncIterateKernel::CalcDstOffset(uint32_t i) { uint32_t mIter = 0; uint32_t nIter = 0; if (this->tiling.iterateOrder != 1) { - uint32_t mIterTimes = Ceiling(this->tiling.singleCoreM, this->tiling.baseM); + uint32_t mIterTimes = AscendC::Ceil(this->tiling.singleCoreM, this->tiling.baseM); mIter = i % mIterTimes; nIter = i / mIterTimes; } else { - uint32_t nIterTimes = Ceiling(this->tiling.singleCoreN, this->tiling.baseN); + uint32_t nIterTimes = AscendC::Ceil(this->tiling.singleCoreN, this->tiling.baseN); mIter = i / nIterTimes; nIter = i % nIterTimes; } @@ -152,22 +141,22 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) } } -extern "C" __global__ __aicore__ void matmul_async_custom( +extern "C" __global__ __aicore__ void matmul_async_iterate_custom( GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { TCubeTiling tiling; CopyTiling(&tiling, tilingGm); AscendC::TPipe pipe; - MatmulCustom::MatmulAsyncKernel matmulAsyncKernel; - matmulAsyncKernel.Init(a, b, bias, c, workspace, tiling); - matmulAsyncKernel.Process(&pipe); + MatmulCustom::MatmulAsyncIterateKernel matmulAsyncIterateKernel; + matmulAsyncIterateKernel.Init(a, b, bias, c, workspace, tiling); + matmulAsyncIterateKernel.Process(&pipe); } #ifndef ASCENDC_CPU_DEBUG -void matmul_async_custom_do(uint32_t blockDim, void* stream, +void matmul_async_iterate_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { - 
matmul_async_custom<<>>(a, b, bias, c, workspace, tilingGm); + matmul_async_iterate_custom<<>>(a, b, bias, c, workspace, tilingGm); } #endif \ No newline at end of file diff --git a/examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.h b/examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.h similarity index 79% rename from examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.h rename to examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.h index 3da1cfa5..231a54c7 100644 --- a/examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.h +++ b/examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.h @@ -8,19 +8,17 @@ * See LICENSE in the root of the software repository for the full text of the License. */ -#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_OP_KERNEL_MATMUL_ASYNC_CUSTOM_KERNEL_H -#define EXAMPLES_MATRIX_MATMUL_ASYNC_OP_KERNEL_MATMUL_ASYNC_CUSTOM_KERNEL_H +#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_KERNEL_MATMUL_ASYNC_ITERATE_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_KERNEL_MATMUL_ASYNC_ITERATE_CUSTOM_KERNEL_H #include "kernel_operator.h" #include "lib/matmul_intf.h" namespace MatmulCustom { -__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b); - template -class MatmulAsyncKernel { +class MatmulAsyncIterateKernel { public: - __aicore__ inline MatmulAsyncKernel(){}; + __aicore__ inline MatmulAsyncIterateKernel(){}; /** * @brief Initialization before process. * @param a: A matrix gm addr. 
@@ -40,17 +38,10 @@ public: */ __aicore__ inline void Process(AscendC::TPipe* pipe); -#if defined(CUSTOM_ASYNC_VECIN) AscendC::Matmul, AscendC::MatmulType, AscendC::MatmulType, AscendC::MatmulType, CFG_MDL> matmulObj; -#else - AscendC::Matmul, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, CFG_MDL> matmulObj; -#endif private: /** @@ -76,4 +67,4 @@ private: TCubeTiling tiling; }; } // namespace MatmulCustom -#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_OP_KERNEL_MATMUL_ASYNC_CUSTOM_KERNEL_H \ No newline at end of file +#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_KERNEL_MATMUL_ASYNC_ITERATE_CUSTOM_KERNEL_H \ No newline at end of file diff --git a/examples/matrix/matmul_async/run.sh b/examples/matrix/matmul_async_iterate/run.sh similarity index 79% rename from examples/matrix/matmul_async/run.sh rename to examples/matrix/matmul_async_iterate/run.sh index 8a539cfc..34f794f7 100644 --- a/examples/matrix/matmul_async/run.sh +++ b/examples/matrix/matmul_async_iterate/run.sh @@ -10,8 +10,8 @@ export IS_PERF="0" -SHORT=r:,v:,p:,m:, -LONG=run-mode:,soc-version:,perf:,async-mode:, +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" while : @@ -26,9 +26,6 @@ do (-p | --perf) IS_PERF="$2" shift 2;; - (-m | --async-mode) - ASYNC_MODE="$2" - shift 2;; (--) shift; break;; @@ -60,11 +57,6 @@ if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then exit 1 fi -if [ "${ASYNC_MODE}" != "GM" ] && [ "${ASYNC_MODE}" != "VECIN" ]; then - echo "[ERROR] ASYNC_MODE only supports 'GM' and 'VECIN', but input: ${ASYNC_MODE}." - exit 1 -fi - rm -rf build mkdir build cd build @@ -74,7 +66,7 @@ export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASYNC_MODE=${ASYNC_MODE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. make -j16 cd ../ @@ -87,8 +79,8 @@ mkdir output rm -rf bin/ mkdir -p bin cd bin -cp ../build/ascendc_matmul_async_bbit ./ +cp ../build/ascendc_matmul_async_iterate_bbit ./ export TF_CPP_MIN_LOG_LEVEL=3 -python3 -u ../scripts/exec_test.py -r ${RUN_MODE} -p ${IS_PERF} -m ${ASYNC_MODE} +python3 -u ../scripts/exec_test.py -r ${RUN_MODE} -p ${IS_PERF} diff --git a/examples/matrix/matmul_async_iterate/scripts/exec_test.py b/examples/matrix/matmul_async_iterate/scripts/exec_test.py new file mode 100644 index 00000000..fe85a1af --- /dev/null +++ b/examples/matrix/matmul_async_iterate/scripts/exec_test.py @@ -0,0 +1,128 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import shlex +import subprocess +import sys +import csv +import time +import logging +import argparse +from collections import namedtuple + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from common_scripts.gen_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import clear_file_cache +from common_scripts.exec_utils import get_perf_task_duration + +DATA_TYPE_STR = "float16_float32" + +logging.basicConfig(level=logging.INFO) + +ProcessParams = namedtuple("ProcessParams", ["isProcess", "caseName", 'm', 'n', 'k', 'isBias']) + + +def get_case_list(work_path): + case_list = [] + case_dir = os.path.join(work_path, "testcase") + if not os.path.isdir(case_dir): + logging.error(f"case path: {case_dir} not exist!") + return case_list + + for file_name in os.listdir(case_dir): + if not file_name.endswith(".csv"): + continue + + abs_file_name = os.path.join(case_dir, file_name) + with open(abs_file_name, mode='r', encoding='utf-8') as file: + csv_reader = csv.reader(file) + for row in csv_reader: + item_list = [int(i) if i.strip().isdigit() else i.strip() for i in row] + case_list.append(item_list) + return case_list + + +def process_case(work_path, args, params): + case_name, m, n, k, is_bias = (params.caseName, params.m, params.n, params.k, params.isBias) + run_mode, is_perf = args.r, args.p + + logging.info(f"start process case [{case_name}]") + logging.info(f"IS_PERF is set [{is_perf}]") + + clear_file_cache(work_path) + + # gen data + MatmulGenData(m, n, k, 1, False, False, is_bias, DATA_TYPE_STR).gen_golden_data(work_path) + + params_str = f"{m} {n} {k} {is_bias}" + if is_perf: # npu(is_perf = 1) + cmd = f"msprof op --application=\"./ascendc_matmul_async_iterate_bbit {params_str}\" --output=./prof_out" + elif run_mode == "sim": # sim + cmd 
= (f"msprof op simulator --application=\"./ascendc_matmul_async_iterate_bbit {params_str}\"" + f" --output=./sim_out") + else: # cpu or npu(is_perf = 0) + cmd = f"./ascendc_matmul_async_iterate_bbit {params_str}" + subprocess.run(shlex.split(cmd)) + + total_num = params.m * params.n + if is_perf: + wrong_num = -1 + result = None + task_duration = get_perf_task_duration("./prof_out") + else: + logging.info(f"compare data case[{case_name}]") + wrong_num = compare_data(work_path, DATA_TYPE_STR) + result = "Fail" if (wrong_num / total_num > 0.001 or wrong_num < 0) else "Success" + task_duration = None + + return [params.caseName, wrong_num, total_num, result, task_duration] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', type=str, choices=['cpu', 'sim', 'npu'], required=True) + parser.add_argument('-p', type=int, default=0, required=False) + args = parser.parse_args() + + if args.r != 'npu' and args.p: + logging.error(f"IS_PERF can only be enabled in 'npu' mode, current mode: '{args.r}'.") + return -1 + + work_path = os.path.dirname(os.getcwd()) + + case_list = get_case_list(work_path) + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + + for case in case_list: + try: + process_params = ProcessParams(*case) + except Exception as e: + logging.error(f"{case[0]}: invalid parameter, error msg: {e}") + continue + if process_params.isProcess: + res_data = process_case(work_path, args, process_params) + res_list.append(res_data) + + result_file_name = f"result_{time.time()}.csv" + with open(os.path.join(work_path, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/matmul_async/testcase/case.csv b/examples/matrix/matmul_async_iterate/testcase/case.csv 
similarity index 100% rename from examples/matrix/matmul_async/testcase/case.csv rename to examples/matrix/matmul_async_iterate/testcase/case.csv diff --git a/examples/matrix/matmul_async_iterate_all/CMakeLists.txt b/examples/matrix/matmul_async_iterate_all/CMakeLists.txt new file mode 100644 index 00000000..f760a2d2 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/CMakeLists.txt @@ -0,0 +1,79 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_async_iterate_all_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_async_iterate_all_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_async_iterate_all_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_async_iterate_all_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_async_iterate_all_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_matmul_async_iterate_all_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_async_iterate_all_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) + +install(TARGETS 
ascendc_matmul_async_iterate_all_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/README.md b/examples/matrix/matmul_async_iterate_all/README.md new file mode 100644 index 00000000..c01c5bd7 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/README.md @@ -0,0 +1,107 @@ + +## 概述 + +本样例介绍了调用Matmul高阶API实现异步场景下的Matmul矩阵乘法,实现方式为调用IterateAll输出到GM。 +异步场景指的是程序执行时,不需要等待某个操作完成就可以执行下一步操作。异步场景可以减少同步等待,提高并行度,开发者对计算性能要求较高时,可以选用该方式。 + +本样例以直调的方式调用算子核函数。 +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方式。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录 | 描述 | +|------------------------------------|----------------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例tiling代码实现 | +| [op_kernel](op_kernel) | 本样例kernel代码实现 | +| [scripts](scripts) | 执行文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| [CMakeLists.txt](CMakeLists.txt) | 编译工程文件 | +| [main.cpp](main.cpp) | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| [run.sh](run.sh) | 编译执行脚本 | + +## 算子描述 +- 算子功能 + MatmulAsyncIterateAllCustom单算子,实现了调用IterateAll输出到GM的异步场景矩阵乘计算。 + +- 算子规格 + + + + + + + + + + + + + +
算子类型(OpType)MatmulAsyncIterateAllCustom
算子输入nameshapedata typeformatisTrans
a(640, 512)halfNDfalse
b(512, 1024)halfNDfalse
bias(1024, )floatND-
算子输出c(640, 1024)floatND-
核函数名matmul_async_iterate_all_custom
+ +## 算子实现介绍 + +本样例中实现的是固定shape为[M, N, K] = [640, 1024, 512], bias = [1024]的MatmulAsyncIterateAllCustom单算子。 + +- Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象,输出C矩阵的TPosition为GM。 + ``` + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL> matmulObj; + ``` + - 初始化操作。 + - 设置左矩阵A、右矩阵B、Bias。 + - 获取矩阵乘计算结果。 + ``` + matmulObj.template IterateAll(cGlobal, 0, false, true); + matmulObj.WaitIterateAll(); + ``` + - 完成矩阵乘操作。 + - 结束矩阵乘操作。 + +- Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + 设置C的TPosition为GM。 + ``` + cubeTiling->SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + ``` + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest。 + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/cmake/cpu_lib.cmake 
b/examples/matrix/matmul_async_iterate_all/cmake/cpu_lib.cmake new file mode 100644 index 00000000..2af75433 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O2 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/cmake/npu_lib.cmake b/examples/matrix/matmul_async_iterate_all/cmake/npu_lib.cmake new file mode 100644 index 00000000..bc803099 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/main.cpp b/examples/matrix/matmul_async_iterate_all/main.cpp new file mode 100644 index 00000000..141d51fd --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/main.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "./op_host/matmul_async_iterate_all_custom_tiling.h" + +#ifdef ASCENDC_CPU_DEBUG +#include "tikicpulib.h" +extern "C" void matmul_async_iterate_all_custom( + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling); +#else +#include "acl/acl.h" +extern void matmul_async_iterate_all_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling); +#endif + +namespace MatmulHost { + +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(const optiling::TestcaseParams& caseParams) +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t mixRatio = 2; // AIC:AIV = 1:2 + uint32_t aivCoreNum = ascendcPlatform->GetCoreNum(); + + size_t aFileSize = static_cast(sizeof(uint16_t) * caseParams.m * caseParams.k); // uint16_t represent half + size_t bFileSize = static_cast(sizeof(uint16_t) * caseParams.k * caseParams.n); // uint16_t represent half + size_t biasFileSize = static_cast(sizeof(float) * caseParams.n); + size_t cFileSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); + size_t workspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + size_t tilingFileSize = sizeof(TCubeTiling); + + uint8_t* a = (uint8_t *)AscendC::GmAlloc(aFileSize); + uint8_t* b = (uint8_t *)AscendC::GmAlloc(bFileSize); + uint8_t* bias; + uint8_t* c = (uint8_t *)AscendC::GmAlloc(cFileSize); + uint8_t* workspace = (uint8_t *)AscendC::GmAlloc(workspaceSize); + uint8_t* tiling = (uint8_t *)AscendC::GmAlloc(tilingFileSize); + + ReadFile("../input/x1_gm.bin", aFileSize, a, aFileSize); + ReadFile("../input/x2_gm.bin", bFileSize, b, bFileSize); + 
if (caseParams.isBias) { + bias = (uint8_t *)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + + optiling::GenerateTiling(aivCoreNum, &cubeTiling, caseParams, tiling); + memcpy_s(tiling, tilingFileSize, tiling, tilingFileSize); + + uint32_t blockDim = ((uint32_t*)tiling)[0] / mixRatio; + ICPU_RUN_KF(matmul_async_iterate_all_custom, blockDim, a, b, bias, c, workspace, tiling); + + WriteFile("../output/output.bin", c, cFileSize); + + AscendC::GmFree((void*)a); + AscendC::GmFree((void*)b); + if (caseParams.isBias) { + AscendC::GmFree((void*)bias); + } + AscendC::GmFree((void*)c); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); +} +#else +void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, const optiling::TestcaseParams& caseParams, + void* stream = nullptr) +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t mixRatio = 2; // AIC:AIV = 1:2 + uint32_t aivCoreNum = ascendcPlatform->GetCoreNum(); + + uint8_t *workspaceDevice; + size_t workspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t* tilingHost; + uint8_t* tilingDevice; + size_t tilingFileSize = sizeof(TCubeTiling); + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + optiling::GenerateTiling(aivCoreNum, &cubeTiling, caseParams, tilingHost); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint32_t blockDim = ((uint32_t*)tilingHost)[0] / mixRatio; + matmul_async_iterate_all_custom_do(blockDim, stream, a, b, 
bias, c, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(const optiling::TestcaseParams& caseParams) +{ + size_t aFileSize = static_cast(sizeof(uint16_t) * caseParams.m * caseParams.k); // uint16_t represent half + size_t bFileSize = static_cast(sizeof(uint16_t) * caseParams.k * caseParams.n); // uint16_t represent half + size_t biasFileSize = static_cast(sizeof(float) * caseParams.n); + size_t cFileSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t* aHost; + uint8_t* aDevice; + CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize)); + CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", aFileSize, aHost, aFileSize); + CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* bHost; + uint8_t* bDevice; + CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize)); + CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", bFileSize, bHost, bFileSize); + CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost; + uint8_t* biasDevice; + if 
(caseParams.isBias) { + CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + + uint8_t* cHost; + uint8_t* cDevice; + CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); + CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(aDevice, bDevice, biasDevice, cDevice, caseParams, stream); + + CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", cHost, cFileSize); + + CHECK_ACL(aclrtFree(aDevice)); + CHECK_ACL(aclrtFreeHost(aHost)); + + CHECK_ACL(aclrtFree(bDevice)); + CHECK_ACL(aclrtFreeHost(bHost)); + + CHECK_ACL(aclrtFree(cDevice)); + CHECK_ACL(aclrtFreeHost(cHost)); + + TestAclDeInit(context, stream, deviceId); +} +#endif +} // end namespace MatmulHost + +int32_t main(int32_t argc, const char* args[]) +{ + uint32_t problem[4]; + for (int32_t i = 1; i < argc && i < 5; ++i) { + std::stringstream ss(args[i]); + ss >> problem[i - 1]; + } + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias}; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(caseParams); +#else + MatmulHost::TestMatmul(caseParams); +#endif + return 0; +} \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.cpp b/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.cpp new file mode 100644 index 00000000..eab6037a --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.cpp @@ -0,0 +1,51 @@ +/* + * 
Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "matmul_async_iterate_all_custom_tiling.h" +#include + +namespace optiling { + +bool ComputeTiling(uint32_t blockDim, TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, + const TestcaseParams& caseParams) +{ + cubeTiling->SetDim(blockDim); + cubeTiling->SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16); + cubeTiling->SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16); + cubeTiling->SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling->SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling->SetShape(caseParams.m, caseParams.n, caseParams.k); + cubeTiling->SetOrgShape(caseParams.m, caseParams.n, caseParams.k); + cubeTiling->EnableBias(caseParams.isBias); + cubeTiling->SetBufferSpace(-1, -1, -1); + if (cubeTiling->GetTiling(tiling) == -1) { + return false; + } + return true; +} + +void GenerateTiling(uint32_t blockDim, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, + const TestcaseParams& caseParams, uint8_t* tilingBuffer) +{ + TCubeTiling tilingData; + + bool res = optiling::ComputeTiling(blockDim, tilingData, cubeTiling, caseParams); + if (!res) { + std::cout << "gen tiling failed." 
<< std::endl; + } + + uint32_t tilingSize = tilingData.GetDataSize(); + tilingData.SaveToBuffer(tilingBuffer, tilingSize); +} +} // namespace optiling \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.h b/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.h new file mode 100644 index 00000000..eef4289f --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.h @@ -0,0 +1,39 @@ +/** +* Copyright (c) 2025 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_HOST_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_HOST_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace optiling { + +BEGIN_TILING_DATA_DEF(MatmulAsyncIterateAllCustomTilingData) + TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, cubeTilingData); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulAsyncIterateAllCustomTilingData) + +struct TestcaseParams { + uint32_t m; + uint32_t n; + uint32_t k; + bool isBias; +}; +/** + * @brief Generate matmul tiling. + * @param blockDim: Number of cores involved in the computation. + * @param cubeTiling: TCubeTiling structure. + * @param caseParams: Testcase parameters. + * @param tilingBuf: Data buffer. 
+ */ +void GenerateTiling(uint32_t blockDim, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, + const TestcaseParams& caseParams, uint8_t* tilingBuffer); + +} // namespace optiling +#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_HOST_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.cpp b/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.cpp new file mode 100644 index 00000000..e6f43190 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "matmul_async_iterate_all_custom_kernel.h" + +namespace MatmulCustom { + +template +__aicore__ inline void MatmulAsyncIterateAllKernel::Init(GM_ADDR a, + GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling) +{ + this->tiling = tiling; + + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + + uint32_t offsetA = 0; + uint32_t offsetB = 0; + uint32_t offsetC = 0; + uint32_t offsetBias = 0; + CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + + if(GetSysWorkSpacePtr() == nullptr){ + return; + } +} + +template +__aicore__ inline void MatmulAsyncIterateAllKernel::Process(AscendC::TPipe* pipe) +{ + REGIST_MATMUL_OBJ(pipe, GetSysWorkSpacePtr(), matmulObj, &(this->tiling)); + + matmulObj.SetTensorA(aGlobal); + matmulObj.SetTensorB(bGlobal); + if (this->tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.template IterateAll(cGlobal, 0, false, true); + matmulObj.WaitIterateAll(); + matmulObj.End(); +} + + +template +__aicore__ inline void MatmulAsyncIterateAllKernel::CalcOffset( + uint32_t blockIdx, uint32_t& offsetA, uint32_t& offsetB, uint32_t& offsetC, uint32_t& offsetBias) +{ + auto mSingleBlocks = AscendC::Ceil(this->tiling.M, this->tiling.singleCoreM); + auto mCoreIndx = blockIdx % mSingleBlocks; + auto nCoreIndx = blockIdx / mSingleBlocks; + + offsetA = mCoreIndx * this->tiling.Ka * this->tiling.singleCoreM; + offsetB = nCoreIndx * this->tiling.singleCoreN; + offsetC = mCoreIndx * this->tiling.N * this->tiling.singleCoreM + nCoreIndx * this->tiling.singleCoreN; + offsetBias = nCoreIndx * 
this->tiling.singleCoreN; + + // process with tail block + int32_t tailM = this->tiling.M - mCoreIndx * this->tiling.singleCoreM; + tailM = tailM < this->tiling.singleCoreM ? tailM : this->tiling.singleCoreM; + int32_t tailN = this->tiling.N - nCoreIndx * this->tiling.singleCoreN; + tailN = tailN < this->tiling.singleCoreN ? tailN : this->tiling.singleCoreN; + if (tailM < this->tiling.singleCoreM || tailN < this->tiling.singleCoreN) { + matmulObj.SetTail(tailM, tailN); + } +} +} // namespace MatmulCustom + +namespace { +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} +} + +extern "C" __global__ __aicore__ void matmul_async_iterate_all_custom( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + AscendC::TPipe pipe; + + MatmulCustom::MatmulAsyncIterateAllKernel matmulAsyncIterateAllKernel; + matmulAsyncIterateAllKernel.Init(a, b, bias, c, tiling); + matmulAsyncIterateAllKernel.Process(&pipe); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_async_iterate_all_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_async_iterate_all_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.h b/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.h new file mode 100644 index 00000000..c6b67993 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.h @@ -0,0 +1,64 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. 
+ * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_KERNEL_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_KERNEL_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_KERNEL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace MatmulCustom { + +template +class MatmulAsyncIterateAllKernel { +public: + __aicore__ inline MatmulAsyncIterateAllKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling); + /** + * @brief Process matrix calculation. + * @param pipe: The TPipe object which manages global memory and synchronization. + * @retval None + */ + __aicore__ inline void Process(AscendC::TPipe* pipe); + + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL> matmulObj; + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
+ * @retval None + */ + __aicore__ inline void CalcOffset( + uint32_t blockIdx, uint32_t& offsetA, uint32_t& offsetB, uint32_t& offsetC, uint32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor biasGlobal; + AscendC::GlobalTensor cGlobal; + TCubeTiling tiling; +}; +} // namespace MatmulCustom +#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_KERNEL_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_KERNEL_H \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/run.sh b/examples/matrix/matmul_async_iterate_all/run.sh new file mode 100644 index 00000000..c7e80377 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/run.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! 
"${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_matmul_async_iterate_all_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +python3 -u ../scripts/exec_test.py -r ${RUN_MODE} -p ${IS_PERF} diff --git a/examples/matrix/matmul_async/scripts/exec_test.py b/examples/matrix/matmul_async_iterate_all/scripts/exec_test.py similarity index 92% rename from examples/matrix/matmul_async/scripts/exec_test.py rename to examples/matrix/matmul_async_iterate_all/scripts/exec_test.py index d6e0d3b7..51e14acb 100644 --- a/examples/matrix/matmul_async/scripts/exec_test.py +++ b/examples/matrix/matmul_async_iterate_all/scripts/exec_test.py @@ -54,10 +54,10 @@ def get_case_list(work_path): def process_case(work_path, args, params): case_name, m, n, k, is_bias = (params.caseName, params.m, params.n, params.k, params.isBias) - run_mode, is_perf, async_mode = args.r, args.p, args.m + run_mode, is_perf = args.r, args.p logging.info(f"start process case [{case_name}]") - logging.info(f"ASYNC_MODE is set [{async_mode}], IS_PERF is set [{is_perf}]") + logging.info(f"IS_PERF is set [{is_perf}]") clear_file_cache(work_path) @@ -66,11 +66,12 @@ def process_case(work_path, args, params): params_str = f"{m} {n} {k} {is_bias}" if is_perf: # npu(is_perf = 1) - cmd = f"msprof op --application=\"./ascendc_matmul_async_bbit {params_str}\" --output=./prof_out" + cmd = f"msprof op --application=\"./ascendc_matmul_async_iterate_all_bbit {params_str}\" --output=./prof_out" elif run_mode == "sim": # sim - cmd = f"msprof op simulator --application=\"./ascendc_matmul_async_bbit {params_str}\" --output=./sim_out" + cmd = (f"msprof op simulator --application=\"./ascendc_matmul_async_iterate_all_bbit {params_str}\"" + f" --output=./sim_out") else: # cpu or npu(is_perf = 0) - cmd = f"./ascendc_matmul_async_bbit {params_str}" + cmd = f"./ascendc_matmul_async_iterate_all_bbit {params_str}" subprocess.run(shlex.split(cmd)) total_num = 
params.m * params.n diff --git a/examples/matrix/matmul_async_iterate_all/testcase/case.csv b/examples/matrix/matmul_async_iterate_all/testcase/case.csv new file mode 100644 index 00000000..a38c3b73 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 640, 1024, 512, 1 \ No newline at end of file diff --git a/examples/readme.md b/examples/readme.md index 5b41ae3d..8a0e1fc8 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -36,7 +36,7 @@ 对输入tensor按行做如下公式的计算:zi = (xi - ∑(xi * yi)) * yi,其中∑为按行reduce求和。 - matrix + matrix basic_block_matmul 实现无尾块且tiling的base块大小固定的场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -49,8 +49,12 @@ 实现Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_async - 实现异步场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_async_iterate + 调用Iterate实现异步场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + + matmul_async_iterate_all + 调用IterateAll实现异步场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 matmul_constant -- Gitee