From d560171e789e65dc179b729e39b5dc343b7ced82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= Date: Wed, 23 Jul 2025 10:05:16 +0800 Subject: [PATCH] Split async examples --- .../CMakeLists.txt | 23 +- .../README.md | 111 ++++------ .../cmake/cpu_lib.cmake | 4 - .../cmake/npu_lib.cmake | 1 - .../main.cpp | 17 +- .../matmul_async_iterate_custom_tiling.cpp} | 7 +- .../matmul_async_iterate_custom_tiling.h} | 10 +- .../matmul_async_iterate_custom_kernel.cpp} | 45 ++-- .../matmul_async_iterate_custom_kernel.h} | 19 +- .../run.sh | 18 +- .../matmul_async_iterate/scripts/exec_test.py | 128 ++++++++++++ .../testcase/case.csv | 0 .../matmul_async_iterate_all/CMakeLists.txt | 79 +++++++ .../matrix/matmul_async_iterate_all/README.md | 107 ++++++++++ .../cmake/cpu_lib.cmake | 31 +++ .../cmake/npu_lib.cmake | 27 +++ .../matrix/matmul_async_iterate_all/main.cpp | 197 ++++++++++++++++++ ...matmul_async_iterate_all_custom_tiling.cpp | 51 +++++ .../matmul_async_iterate_all_custom_tiling.h | 39 ++++ ...matmul_async_iterate_all_custom_kernel.cpp | 112 ++++++++++ .../matmul_async_iterate_all_custom_kernel.h | 64 ++++++ .../matrix/matmul_async_iterate_all/run.sh | 86 ++++++++ .../scripts/exec_test.py | 11 +- .../testcase/case.csv | 1 + examples/readme.md | 10 +- 25 files changed, 1030 insertions(+), 168 deletions(-) rename examples/matrix/{matmul_async => matmul_async_iterate}/CMakeLists.txt (79%) rename examples/matrix/{matmul_async => matmul_async_iterate}/README.md (56%) rename examples/matrix/{matmul_async => matmul_async_iterate}/cmake/cpu_lib.cmake (89%) rename examples/matrix/{matmul_async => matmul_async_iterate}/cmake/npu_lib.cmake (94%) rename examples/matrix/{matmul_async => matmul_async_iterate}/main.cpp (93%) rename examples/matrix/{matmul_async/op_host/matmul_async_custom_tiling.cpp => matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.cpp} (90%) rename examples/matrix/{matmul_async/op_host/matmul_async_custom_tiling.h => 
matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.h} (75%) rename examples/matrix/{matmul_async/op_kernel/matmul_async_custom_kernel.cpp => matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.cpp} (78%) rename examples/matrix/{matmul_async/op_kernel/matmul_async_custom_kernel.h => matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.h} (79%) rename examples/matrix/{matmul_async => matmul_async_iterate}/run.sh (79%) create mode 100644 examples/matrix/matmul_async_iterate/scripts/exec_test.py rename examples/matrix/{matmul_async => matmul_async_iterate}/testcase/case.csv (100%) create mode 100644 examples/matrix/matmul_async_iterate_all/CMakeLists.txt create mode 100644 examples/matrix/matmul_async_iterate_all/README.md create mode 100644 examples/matrix/matmul_async_iterate_all/cmake/cpu_lib.cmake create mode 100644 examples/matrix/matmul_async_iterate_all/cmake/npu_lib.cmake create mode 100644 examples/matrix/matmul_async_iterate_all/main.cpp create mode 100644 examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.cpp create mode 100644 examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.h create mode 100644 examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.cpp create mode 100644 examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.h create mode 100644 examples/matrix/matmul_async_iterate_all/run.sh rename examples/matrix/{matmul_async => matmul_async_iterate_all}/scripts/exec_test.py (92%) create mode 100644 examples/matrix/matmul_async_iterate_all/testcase/case.csv diff --git a/examples/matrix/matmul_async/CMakeLists.txt b/examples/matrix/matmul_async_iterate/CMakeLists.txt similarity index 79% rename from examples/matrix/matmul_async/CMakeLists.txt rename to examples/matrix/matmul_async_iterate/CMakeLists.txt index e703a3f9..5cdf60c8 100644 --- 
a/examples/matrix/matmul_async/CMakeLists.txt +++ b/examples/matrix/matmul_async_iterate/CMakeLists.txt @@ -15,9 +15,6 @@ endif() if (${SOC_VERSION}) set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") endif() -if (${ASYNC_MODE}) - set(ASYNC_MODE "GM" CACHE STRING "GM/VECIN") -endif() set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") if(NOT CMAKE_BUILD_TYPE) @@ -29,10 +26,8 @@ if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) endif() file(GLOB KERNEL_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_async_custom_kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_async_iterate_custom_kernel.cpp ) -set(CUSTOM_ASYNC_LIST "VECIN") - if("${RUN_MODE}" STREQUAL "cpu") include(cmake/cpu_lib.cmake) elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") @@ -41,31 +36,29 @@ else() message("invalid RUN_MODE: ${RUN_MODE}") endif() -add_executable(ascendc_matmul_async_bbit +add_executable(ascendc_matmul_async_iterate_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_async_custom_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_async_iterate_custom_tiling.cpp ) -target_compile_options(ascendc_matmul_async_bbit PRIVATE +target_compile_options(ascendc_matmul_async_iterate_bbit PRIVATE $:-g>> -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 ) -target_compile_definitions(ascendc_matmul_async_bbit PRIVATE +target_compile_definitions(ascendc_matmul_async_iterate_bbit PRIVATE SOC_VERSION="${SOC_VERSION}" - $<$>:CUSTOM_ASYNC_VECIN> - ASYNC_MODE="${ASYNC_MODE}" ) -target_include_directories(ascendc_matmul_async_bbit PRIVATE +target_include_directories(ascendc_matmul_async_iterate_bbit PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} $:${ASCEND_CANN_PACKAGE_PATH}/include>> $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> ) -target_link_libraries(ascendc_matmul_async_bbit PRIVATE +target_link_libraries(ascendc_matmul_async_iterate_bbit PRIVATE $,$>:host_intf_pub>> 
$:tikicpulib::${SOC_VERSION}>> $:ascendcl>> @@ -78,7 +71,7 @@ target_link_libraries(ascendc_matmul_async_bbit PRIVATE dl ) -install(TARGETS ascendc_matmul_async_bbit +install(TARGETS ascendc_matmul_async_iterate_bbit LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} diff --git a/examples/matrix/matmul_async/README.md b/examples/matrix/matmul_async_iterate/README.md similarity index 56% rename from examples/matrix/matmul_async/README.md rename to examples/matrix/matmul_async_iterate/README.md index 34e59352..2767c464 100644 --- a/examples/matrix/matmul_async/README.md +++ b/examples/matrix/matmul_async_iterate/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现异步场景下的Matmul矩阵乘法,包含调用IterateAll输出到GM,和调用Iterate和GetTensorC输出到VECIN两种异步实现方式。 +本样例介绍了调用Matmul高阶API实现异步场景下的Matmul矩阵乘法,实现方式为调用Iterate和GetTensorC输出到VECIN。 异步场景指的是程序执行时,不需要等待某个操作完成就可以执行下一步操作。异步场景可以减少同步等待,提高并行度,开发者对计算性能要求较高时,可以选用该方式。 本样例以直调的方式调用算子核函数。 @@ -24,7 +24,7 @@ ## 算子描述 - 算子功能 - MatmulAsyncCustom单算子,实现了调用IterateAll输出到GM,或者调用Iterate和GetTensorC输出到VECIN,两种异步场景实现方式下的矩阵乘计算。 + MatmulAsyncIterateCustom单算子,实现了调用Iterate和GetTensorC输出到VECIN的异步场景矩阵乘计算。 - 算子规格 @@ -39,12 +39,12 @@ - +
算子输出c (640, 1024)floatND-
核函数名matmul_async_custom
核函数名matmul_async_iterate_custom
## 算子实现介绍 -本样例中实现的是固定shape为[M, N, K] = [640, 1024, 512], bias = [1024]的MatmulAsyncCustom单算子。根据算子执行时传入的参数设置对应的编译宏,再根据不同的编译宏选择用IterateAll或者Iterate&GetTensorC的方式实现异步。 +本样例中实现的是固定shape为[M, N, K] = [640, 1024, 512], bias = [1024]的MatmulAsyncIterateCustom单算子。 - Kernel实现 - 计算逻辑:C = A * B + Bias。 @@ -52,77 +52,58 @@ - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 - 具体步骤: - - 创建Matmul对象,根据异步实现方式,选择创建输出到GM或者VECIN的Matmul对象。 + - 创建Matmul对象,输出C矩阵的TPosition为VECIN。 ``` - #if defined(CUSTOM_ASYNC_VECIN) - AscendC::Matmul, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, CFG_MDL> matmulObj; - #else - AscendC::Matmul, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, CFG_MDL> matmulObj; - #endif + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL> matmulObj; ``` - 初始化操作。 - 设置左矩阵A、右矩阵B、Bias。 - 获取矩阵乘计算结果。 - - 调用IterateAll输出到GM的异步场景实现 - ``` - matmulObj.template IterateAll(cGlobal, 0, false, true); - matmulObj.WaitIterateAll(); - ``` - - 调用Iterate和GetTensorC输出到VECIN的异步场景实现 - ``` - matmulObj.SetWorkspace(workspaceGlobal); - matmulObj.template Iterate(); - uint32_t baseM = this->tiling.baseM; - uint32_t baseN = this->tiling.baseN; - pipe->InitBuffer(cInQueue, 1, baseM * baseN * sizeof(CType)); - pipe->InitBuffer(cOutQueue, 1, baseM * baseN * sizeof(CType)); - AscendC::DataCopyParams copyParams = { - (uint16_t)baseM, - (uint16_t)(baseN * sizeof(CType) / AscendC::DEFAULT_C0_SIZE), - (uint16_t)0, - (uint16_t)((this->tiling.N - baseN) * sizeof(CType) / AscendC::DEFAULT_C0_SIZE) - }; - uint32_t iterateTimes = Ceiling(this->tiling.singleCoreM, baseM) * Ceiling(this->tiling.singleCoreN, baseN); - for (uint32_t i = 0; i < iterateTimes; ++i) { - // compute - auto cInLocal = cInQueue.AllocTensor(); - matmulObj.template GetTensorC(cInLocal); - cInQueue.EnQue(cInLocal); - // any vector operator - auto src = cInQueue.DeQue(); - auto dst = cOutQueue.AllocTensor(); - DataCopy(dst, src, 
baseM * baseN); - cOutQueue.EnQue(dst); - cInQueue.FreeTensor(src); - // copy out - auto cOutLocal = cOutQueue.DeQue(); - DataCopy(cGlobal[CalcDstOffset(i)], cOutLocal, copyParams); - cOutQueue.FreeTensor(cOutLocal); - } - ``` + ``` + matmulObj.SetWorkspace(workspaceGlobal); + matmulObj.template Iterate(); + uint32_t baseM = this->tiling.baseM; + uint32_t baseN = this->tiling.baseN; + pipe->InitBuffer(cInQueue, 1, baseM * baseN * sizeof(CType)); + pipe->InitBuffer(cOutQueue, 1, baseM * baseN * sizeof(CType)); + AscendC::DataCopyParams copyParams = { + (uint16_t)baseM, + (uint16_t)(baseN * sizeof(CType) / AscendC::DEFAULT_C0_SIZE), + (uint16_t)0, + (uint16_t)((this->tiling.N - baseN) * sizeof(CType) / AscendC::DEFAULT_C0_SIZE) + }; + uint32_t iterateTimes = Ceiling(this->tiling.singleCoreM, baseM) * Ceiling(this->tiling.singleCoreN, baseN); + for (uint32_t i = 0; i < iterateTimes; ++i) { + // compute + auto cInLocal = cInQueue.AllocTensor(); + matmulObj.template GetTensorC(cInLocal); + cInQueue.EnQue(cInLocal); + // any vector operator + auto src = cInQueue.DeQue(); + auto dst = cOutQueue.AllocTensor(); + DataCopy(dst, src, baseM * baseN); + cOutQueue.EnQue(dst); + cInQueue.FreeTensor(src); + // copy out + auto cOutLocal = cOutQueue.DeQue(); + DataCopy(cGlobal[CalcDstOffset(i)], cOutLocal, copyParams); + cOutQueue.FreeTensor(cOutLocal); + } + ``` - 完成矩阵乘操作。 - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 - 设置C的参数时,需要根据异步的实现方式,选择C的TPosition为GM或者VECIN。 + 设置C的TPosition为VECIN。 ``` - #if defined(CUSTOM_ASYNC_VECIN) - cubeTiling->SetCType(matmul_tiling::TPosition::VECIN, matmul_tiling::CubeFormat::ND, - matmul_tiling::DataType::DT_FLOAT); - #else - 
cubeTiling->SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, - matmul_tiling::DataType::DT_FLOAT); - #endif + cubeTiling->SetCType(matmul_tiling::TPosition::VECIN, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); ``` - 调用GetTiling接口,获取Tiling信息。 @@ -139,17 +120,15 @@ - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -m [ASYNC_MODE] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - Atlas A2训练系列产品/Atlas 800I A2推理产品 - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 - - ASYNC_MODE : 选择异步场景的实现方式是调用IterateAll输出到GM,或者调用Iterate和GetTensorC输出到VECIN,对应参数分别为[GM / VECIN]。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -p 0 -m GM - # bash run.sh -r cpu -v Ascendxxxyy -p 0 -m VECIN + bash run.sh -r cpu -v Ascendxxxyy -p 0 ``` \ No newline at end of file diff --git a/examples/matrix/matmul_async/cmake/cpu_lib.cmake b/examples/matrix/matmul_async_iterate/cmake/cpu_lib.cmake similarity index 89% rename from examples/matrix/matmul_async/cmake/cpu_lib.cmake rename to examples/matrix/matmul_async_iterate/cmake/cpu_lib.cmake index aff00e34..2af75433 100644 --- a/examples/matrix/matmul_async/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_async_iterate/cmake/cpu_lib.cmake @@ -20,10 +20,6 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE tikicpulib::${SOC_VERSION} ) -target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - $<$>:CUSTOM_ASYNC_VECIN> -) - target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O2 diff --git a/examples/matrix/matmul_async/cmake/npu_lib.cmake 
b/examples/matrix/matmul_async_iterate/cmake/npu_lib.cmake similarity index 94% rename from examples/matrix/matmul_async/cmake/npu_lib.cmake rename to examples/matrix/matmul_async_iterate/cmake/npu_lib.cmake index d62c3df9..bc803099 100644 --- a/examples/matrix/matmul_async/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_async_iterate/cmake/npu_lib.cmake @@ -21,7 +21,6 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ) ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - $<$>:CUSTOM_ASYNC_VECIN> -DASCENDC_DUMP -DHAVE_WORKSPACE -DHAVE_TILING diff --git a/examples/matrix/matmul_async/main.cpp b/examples/matrix/matmul_async_iterate/main.cpp similarity index 93% rename from examples/matrix/matmul_async/main.cpp rename to examples/matrix/matmul_async_iterate/main.cpp index d3a456c2..0afbed88 100644 --- a/examples/matrix/matmul_async/main.cpp +++ b/examples/matrix/matmul_async_iterate/main.cpp @@ -12,15 +12,15 @@ #include "../../common/data_utils.h" #include "kernel_tiling/kernel_tiling.h" #include "tiling/platform/platform_ascendc.h" -#include "./op_host/matmul_async_custom_tiling.h" +#include "./op_host/matmul_async_iterate_custom_tiling.h" #ifdef ASCENDC_CPU_DEBUG #include "tikicpulib.h" -extern "C" void matmul_async_custom( +extern "C" void matmul_async_iterate_custom( uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling); #else #include "acl/acl.h" -extern void matmul_async_custom_do(uint32_t coreDim, void* stream, +extern void matmul_async_iterate_custom_do(uint32_t coreDim, void* stream, uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling); #endif @@ -38,7 +38,7 @@ void TestMatmulCpu(const optiling::TestcaseParams& caseParams) size_t bFileSize = static_cast(sizeof(uint16_t) * caseParams.k * caseParams.n); // uint16_t represent half size_t biasFileSize = static_cast(sizeof(float) * caseParams.n); size_t cFileSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); - 
size_t userWorkspaceSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); // async workspace, len is M * N + size_t userWorkspaceSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); // async_iterate workspace, len is M * N size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; size_t tilingFileSize = sizeof(TCubeTiling); @@ -61,7 +61,7 @@ void TestMatmulCpu(const optiling::TestcaseParams& caseParams) memcpy_s(tiling, tilingFileSize, tiling, tilingFileSize); uint32_t blockDim = ((uint32_t*)tiling)[0] / mixRatio; - ICPU_RUN_KF(matmul_async_custom, blockDim, a, b, bias, c, workspace, tiling); + ICPU_RUN_KF(matmul_async_iterate_custom, blockDim, a, b, bias, c, workspace, tiling); WriteFile("../output/output.bin", c, cFileSize); @@ -84,7 +84,7 @@ void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, const optiling: uint32_t aivCoreNum = ascendcPlatform->GetCoreNum(); uint8_t *workspaceDevice; - size_t userWorkspaceSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); // async workspace, len is M * N + size_t userWorkspaceSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); // async_iterate workspace, len is M * N size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); @@ -99,7 +99,7 @@ void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, const optiling: CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); uint32_t blockDim = ((uint32_t*)tilingHost)[0] / mixRatio; - matmul_async_custom_do(blockDim, stream, a, b, bias, c, workspaceDevice, tilingDevice); + matmul_async_iterate_custom_do(blockDim, stream, a, b, bias, c, workspaceDevice, tilingDevice); 
CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtFreeHost(tilingHost)); @@ -190,7 +190,8 @@ int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3]}; + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG MatmulHost::TestMatmulCpu(caseParams); diff --git a/examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.cpp b/examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.cpp similarity index 90% rename from examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.cpp rename to examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.cpp index 6a58b2b6..1fb3a260 100644 --- a/examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.cpp +++ b/examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#include "matmul_async_custom_tiling.h" +#include "matmul_async_iterate_custom_tiling.h" #include namespace optiling { @@ -21,13 +21,8 @@ bool ComputeTiling(uint32_t blockDim, TCubeTiling& tiling, matmul_tiling::MultiC matmul_tiling::DataType::DT_FLOAT16); cubeTiling->SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); -#if defined(CUSTOM_ASYNC_VECIN) cubeTiling->SetCType(matmul_tiling::TPosition::VECIN, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); -#else - cubeTiling->SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, - matmul_tiling::DataType::DT_FLOAT); -#endif cubeTiling->SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); cubeTiling->SetShape(caseParams.m, caseParams.n, caseParams.k); diff --git a/examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.h b/examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.h similarity index 75% rename from examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.h rename to examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.h index 4ab3ca35..c76a6388 100644 --- a/examples/matrix/matmul_async/op_host/matmul_async_custom_tiling.h +++ b/examples/matrix/matmul_async_iterate/op_host/matmul_async_iterate_custom_tiling.h @@ -7,17 +7,17 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_OP_HOST_MATMUL_ASYNC_CUSTOM_TILING_H -#define EXAMPLES_MATRIX_MATMUL_ASYNC_OP_HOST_MATMUL_ASYNC_CUSTOM_TILING_H +#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_HOST_MATMUL_ASYNC_ITERATE_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_HOST_MATMUL_ASYNC_ITERATE_CUSTOM_TILING_H #include "register/tilingdata_base.h" #include "tiling/tiling_api.h" namespace optiling { -BEGIN_TILING_DATA_DEF(MatmulAsyncCustomTilingData) +BEGIN_TILING_DATA_DEF(MatmulAsyncIterateCustomTilingData) TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, cubeTilingData); END_TILING_DATA_DEF; -REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulAsyncCustomTilingData) +REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulAsyncIterateCustomTilingData) struct TestcaseParams { uint32_t m; @@ -36,4 +36,4 @@ void GenerateTiling(uint32_t blockDim, matmul_tiling::MultiCoreMatmulTiling* cub const TestcaseParams& caseParams, uint8_t* tilingBuffer); } // namespace optiling -#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_OP_HOST_MATMUL_ASYNC_CUSTOM_TILING_H \ No newline at end of file +#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_HOST_MATMUL_ASYNC_ITERATE_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.cpp b/examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.cpp similarity index 78% rename from examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.cpp rename to examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.cpp index f8ff7e71..ebb8c284 100644 --- a/examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.cpp +++ b/examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.cpp @@ -8,20 +8,12 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#include "matmul_async_custom_kernel.h" +#include "matmul_async_iterate_custom_kernel.h" namespace MatmulCustom { -__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) -{ - if (b == 0) { - return a; - } - return (a + b - 1) / b; -} - template -__aicore__ inline void MatmulAsyncKernel::Init(GM_ADDR a, +__aicore__ inline void MatmulAsyncIterateKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling) { this->tiling = tiling; @@ -49,7 +41,7 @@ __aicore__ inline void MatmulAsyncKernel::Init(GM } template -__aicore__ inline void MatmulAsyncKernel::Process(AscendC::TPipe* pipe) +__aicore__ inline void MatmulAsyncIterateKernel::Process(AscendC::TPipe* pipe) { REGIST_MATMUL_OBJ(pipe, GetSysWorkSpacePtr(), matmulObj, &(this->tiling)); @@ -58,7 +50,7 @@ __aicore__ inline void MatmulAsyncKernel::Process if (this->tiling.isBias) { matmulObj.SetBias(biasGlobal); } -#if defined(CUSTOM_ASYNC_VECIN) + matmulObj.SetWorkspace(workspaceGlobal); matmulObj.template Iterate(); @@ -73,7 +65,8 @@ __aicore__ inline void MatmulAsyncKernel::Process (uint16_t)0, (uint16_t)((this->tiling.N - baseN) * sizeof(CType) / AscendC::DEFAULT_C0_SIZE) }; - uint32_t iterateTimes = Ceiling(this->tiling.singleCoreM, baseM) * Ceiling(this->tiling.singleCoreN, baseN); + uint32_t iterateTimes = AscendC::Ceil(this->tiling.singleCoreM, baseM) * + AscendC::Ceil(this->tiling.singleCoreN, baseN); for (uint32_t i = 0; i < iterateTimes; ++i) { // compute auto cInLocal = cInQueue.AllocTensor(); @@ -90,19 +83,15 @@ __aicore__ inline void MatmulAsyncKernel::Process DataCopy(cGlobal[CalcDstOffset(i)], cOutLocal, copyParams); cOutQueue.FreeTensor(cOutLocal); } -#else - matmulObj.template IterateAll(cGlobal, 0, false, true); - matmulObj.WaitIterateAll(); -#endif matmulObj.End(); } template -__aicore__ inline void MatmulAsyncKernel::CalcOffset( +__aicore__ inline void MatmulAsyncIterateKernel::CalcOffset( uint32_t blockIdx, uint32_t& offsetA, uint32_t& offsetB, 
uint32_t& offsetC, uint32_t& offsetBias) { - auto mSingleBlocks = Ceiling(this->tiling.M, this->tiling.singleCoreM); + auto mSingleBlocks = AscendC::Ceil(this->tiling.M, this->tiling.singleCoreM); auto mCoreIndx = blockIdx % mSingleBlocks; auto nCoreIndx = blockIdx / mSingleBlocks; @@ -122,16 +111,16 @@ __aicore__ inline void MatmulAsyncKernel::CalcOff } template -__aicore__ inline uint32_t MatmulAsyncKernel::CalcDstOffset(uint32_t i) +__aicore__ inline uint32_t MatmulAsyncIterateKernel::CalcDstOffset(uint32_t i) { uint32_t mIter = 0; uint32_t nIter = 0; if (this->tiling.iterateOrder != 1) { - uint32_t mIterTimes = Ceiling(this->tiling.singleCoreM, this->tiling.baseM); + uint32_t mIterTimes = AscendC::Ceil(this->tiling.singleCoreM, this->tiling.baseM); mIter = i % mIterTimes; nIter = i / mIterTimes; } else { - uint32_t nIterTimes = Ceiling(this->tiling.singleCoreN, this->tiling.baseN); + uint32_t nIterTimes = AscendC::Ceil(this->tiling.singleCoreN, this->tiling.baseN); mIter = i / nIterTimes; nIter = i % nIterTimes; } @@ -152,22 +141,22 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) } } -extern "C" __global__ __aicore__ void matmul_async_custom( +extern "C" __global__ __aicore__ void matmul_async_iterate_custom( GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { TCubeTiling tiling; CopyTiling(&tiling, tilingGm); AscendC::TPipe pipe; - MatmulCustom::MatmulAsyncKernel matmulAsyncKernel; - matmulAsyncKernel.Init(a, b, bias, c, workspace, tiling); - matmulAsyncKernel.Process(&pipe); + MatmulCustom::MatmulAsyncIterateKernel matmulAsyncIterateKernel; + matmulAsyncIterateKernel.Init(a, b, bias, c, workspace, tiling); + matmulAsyncIterateKernel.Process(&pipe); } #ifndef ASCENDC_CPU_DEBUG -void matmul_async_custom_do(uint32_t blockDim, void* stream, +void matmul_async_iterate_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { - 
matmul_async_custom<<>>(a, b, bias, c, workspace, tilingGm); + matmul_async_iterate_custom<<>>(a, b, bias, c, workspace, tilingGm); } #endif \ No newline at end of file diff --git a/examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.h b/examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.h similarity index 79% rename from examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.h rename to examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.h index 3da1cfa5..231a54c7 100644 --- a/examples/matrix/matmul_async/op_kernel/matmul_async_custom_kernel.h +++ b/examples/matrix/matmul_async_iterate/op_kernel/matmul_async_iterate_custom_kernel.h @@ -8,19 +8,17 @@ * See LICENSE in the root of the software repository for the full text of the License. */ -#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_OP_KERNEL_MATMUL_ASYNC_CUSTOM_KERNEL_H -#define EXAMPLES_MATRIX_MATMUL_ASYNC_OP_KERNEL_MATMUL_ASYNC_CUSTOM_KERNEL_H +#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_KERNEL_MATMUL_ASYNC_ITERATE_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_KERNEL_MATMUL_ASYNC_ITERATE_CUSTOM_KERNEL_H #include "kernel_operator.h" #include "lib/matmul_intf.h" namespace MatmulCustom { -__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b); - template -class MatmulAsyncKernel { +class MatmulAsyncIterateKernel { public: - __aicore__ inline MatmulAsyncKernel(){}; + __aicore__ inline MatmulAsyncIterateKernel(){}; /** * @brief Initialization before process. * @param a: A matrix gm addr. 
@@ -40,17 +38,10 @@ public: */ __aicore__ inline void Process(AscendC::TPipe* pipe); -#if defined(CUSTOM_ASYNC_VECIN) AscendC::Matmul, AscendC::MatmulType, AscendC::MatmulType, AscendC::MatmulType, CFG_MDL> matmulObj; -#else - AscendC::Matmul, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, CFG_MDL> matmulObj; -#endif private: /** @@ -76,4 +67,4 @@ private: TCubeTiling tiling; }; } // namespace MatmulCustom -#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_OP_KERNEL_MATMUL_ASYNC_CUSTOM_KERNEL_H \ No newline at end of file +#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_OP_KERNEL_MATMUL_ASYNC_ITERATE_CUSTOM_KERNEL_H \ No newline at end of file diff --git a/examples/matrix/matmul_async/run.sh b/examples/matrix/matmul_async_iterate/run.sh similarity index 79% rename from examples/matrix/matmul_async/run.sh rename to examples/matrix/matmul_async_iterate/run.sh index 8a539cfc..34f794f7 100644 --- a/examples/matrix/matmul_async/run.sh +++ b/examples/matrix/matmul_async_iterate/run.sh @@ -10,8 +10,8 @@ export IS_PERF="0" -SHORT=r:,v:,p:,m:, -LONG=run-mode:,soc-version:,perf:,async-mode:, +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" while : @@ -26,9 +26,6 @@ do (-p | --perf) IS_PERF="$2" shift 2;; - (-m | --async-mode) - ASYNC_MODE="$2" - shift 2;; (--) shift; break;; @@ -60,11 +57,6 @@ if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then exit 1 fi -if [ "${ASYNC_MODE}" != "GM" ] && [ "${ASYNC_MODE}" != "VECIN" ]; then - echo "[ERROR] ASYNC_MODE only supports 'GM' and 'VECIN', but input: ${ASYNC_MODE}." - exit 1 -fi - rm -rf build mkdir build cd build @@ -74,7 +66,7 @@ export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASYNC_MODE=${ASYNC_MODE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. make -j16 cd ../ @@ -87,8 +79,8 @@ mkdir output rm -rf bin/ mkdir -p bin cd bin -cp ../build/ascendc_matmul_async_bbit ./ +cp ../build/ascendc_matmul_async_iterate_bbit ./ export TF_CPP_MIN_LOG_LEVEL=3 -python3 -u ../scripts/exec_test.py -r ${RUN_MODE} -p ${IS_PERF} -m ${ASYNC_MODE} +python3 -u ../scripts/exec_test.py -r ${RUN_MODE} -p ${IS_PERF} diff --git a/examples/matrix/matmul_async_iterate/scripts/exec_test.py b/examples/matrix/matmul_async_iterate/scripts/exec_test.py new file mode 100644 index 00000000..fe85a1af --- /dev/null +++ b/examples/matrix/matmul_async_iterate/scripts/exec_test.py @@ -0,0 +1,128 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import shlex +import subprocess +import sys +import csv +import time +import logging +import argparse +from collections import namedtuple + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from common_scripts.gen_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import clear_file_cache +from common_scripts.exec_utils import get_perf_task_duration + +DATA_TYPE_STR = "float16_float32" + +logging.basicConfig(level=logging.INFO) + +ProcessParams = namedtuple("ProcessParams", ["isProcess", "caseName", 'm', 'n', 'k', 'isBias']) + + +def get_case_list(work_path): + case_list = [] + case_dir = os.path.join(work_path, "testcase") + if not os.path.isdir(case_dir): + logging.error(f"case path: {case_dir} not exist!") + return case_list + + for file_name in os.listdir(case_dir): + if not file_name.endswith(".csv"): + continue + + abs_file_name = os.path.join(case_dir, file_name) + with open(abs_file_name, mode='r', encoding='utf-8') as file: + csv_reader = csv.reader(file) + for row in csv_reader: + item_list = [int(i) if i.strip().isdigit() else i.strip() for i in row] + case_list.append(item_list) + return case_list + + +def process_case(work_path, args, params): + case_name, m, n, k, is_bias = (params.caseName, params.m, params.n, params.k, params.isBias) + run_mode, is_perf = args.r, args.p + + logging.info(f"start process case [{case_name}]") + logging.info(f"IS_PERF is set [{is_perf}]") + + clear_file_cache(work_path) + + # gen data + MatmulGenData(m, n, k, 1, False, False, is_bias, DATA_TYPE_STR).gen_golden_data(work_path) + + params_str = f"{m} {n} {k} {is_bias}" + if is_perf: # npu(is_perf = 1) + cmd = f"msprof op --application=\"./ascendc_matmul_async_iterate_bbit {params_str}\" --output=./prof_out" + elif run_mode == "sim": # sim + cmd 
= (f"msprof op simulator --application=\"./ascendc_matmul_async_iterate_bbit {params_str}\"" + f" --output=./sim_out") + else: # cpu or npu(is_perf = 0) + cmd = f"./ascendc_matmul_async_iterate_bbit {params_str}" + subprocess.run(shlex.split(cmd)) + + total_num = params.m * params.n + if is_perf: + wrong_num = -1 + result = None + task_duration = get_perf_task_duration("./prof_out") + else: + logging.info(f"compare data case[{case_name}]") + wrong_num = compare_data(work_path, DATA_TYPE_STR) + result = "Fail" if (wrong_num / total_num > 0.001 or wrong_num < 0) else "Success" + task_duration = None + + return [params.caseName, wrong_num, total_num, result, task_duration] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-r', type=str, choices=['cpu', 'sim', 'npu'], required=True) + parser.add_argument('-p', type=int, default=0, required=False) + args = parser.parse_args() + + if args.r != 'npu' and args.p: + logging.error(f"IS_PERF can only be enabled in 'npu' mode, current mode: '{args.r}'.") + return -1 + + work_path = os.path.dirname(os.getcwd()) + + case_list = get_case_list(work_path) + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + + for case in case_list: + try: + process_params = ProcessParams(*case) + except Exception as e: + logging.error(f"{case[0]}: invalid parameter, error msg: {e}") + continue + if process_params.isProcess: + res_data = process_case(work_path, args, process_params) + res_list.append(res_data) + + result_file_name = f"result_{time.time()}.csv" + with open(os.path.join(work_path, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/matmul_async/testcase/case.csv b/examples/matrix/matmul_async_iterate/testcase/case.csv 
similarity index 100% rename from examples/matrix/matmul_async/testcase/case.csv rename to examples/matrix/matmul_async_iterate/testcase/case.csv diff --git a/examples/matrix/matmul_async_iterate_all/CMakeLists.txt b/examples/matrix/matmul_async_iterate_all/CMakeLists.txt new file mode 100644 index 00000000..f760a2d2 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/CMakeLists.txt @@ -0,0 +1,79 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_async_iterate_all_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_async_iterate_all_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_async_iterate_all_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_async_iterate_all_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_async_iterate_all_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_matmul_async_iterate_all_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_async_iterate_all_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) + +install(TARGETS 
ascendc_matmul_async_iterate_all_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/README.md b/examples/matrix/matmul_async_iterate_all/README.md new file mode 100644 index 00000000..c01c5bd7 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/README.md @@ -0,0 +1,107 @@ + +## 概述 + +本样例介绍了调用Matmul高阶API实现异步场景下的Matmul矩阵乘法,实现方式为调用IterateAll输出到GM。 +异步场景指的是程序执行时,不需要等待某个操作完成就可以执行下一步操作。异步场景可以减少同步等待,提高并行度,开发者对计算性能要求较高时,可以选用该方式。 + +本样例以直调的方式调用算子核函数。 +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方式。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录 | 描述 | +|------------------------------------|----------------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例tiling代码实现 | +| [op_kernel](op_kernel) | 本样例kernel代码实现 | +| [scripts](scripts) | 执行文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| [CMakeLists.txt](CMakeLists.txt) | 编译工程文件 | +| [main.cpp](main.cpp) | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| [run.sh](run.sh) | 编译执行脚本 | + +## 算子描述 +- 算子功能 + MatmulAsyncIterateAllCustom单算子,实现了调用IterateAll输出到GM的异步场景矩阵乘计算。 + +- 算子规格 + + + + + + + + + + + + + +
算子类型(OpType)MatmulAsyncIterateAllCustom
算子输入nameshapedata typeformatisTrans
a(640, 512)halfNDfalse
b(512, 1024)halfNDfalse
bias(1024, )floatND-
算子输出c(640, 1024)floatND-
核函数名matmul_async_iterate_all_custom
+ +## 算子实现介绍 + +本样例中实现的是固定shape为[M, N, K] = [640, 1024, 512], bias = [1024]的MatmulAsyncIterateAllCustom单算子。 + +- Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象,输出C矩阵的TPosition为GM。 + ``` + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL> matmulObj; + ``` + - 初始化操作。 + - 设置左矩阵A、右矩阵B、Bias。 + - 获取矩阵乘计算结果。 + ``` + matmulObj.template IterateAll(cGlobal, 0, false, true); + matmulObj.WaitIterateAll(); + ``` + - 完成矩阵乘操作。 + - 结束矩阵乘操作。 + +- Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + 设置C的TPosition为GM。 + ``` + cubeTiling->SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + ``` + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest。 + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/cmake/cpu_lib.cmake 
b/examples/matrix/matmul_async_iterate_all/cmake/cpu_lib.cmake new file mode 100644 index 00000000..2af75433 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O2 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/cmake/npu_lib.cmake b/examples/matrix/matmul_async_iterate_all/cmake/npu_lib.cmake new file mode 100644 index 00000000..bc803099 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/main.cpp b/examples/matrix/matmul_async_iterate_all/main.cpp new file mode 100644 index 00000000..141d51fd --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/main.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "./op_host/matmul_async_iterate_all_custom_tiling.h" + +#ifdef ASCENDC_CPU_DEBUG +#include "tikicpulib.h" +extern "C" void matmul_async_iterate_all_custom( + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling); +#else +#include "acl/acl.h" +extern void matmul_async_iterate_all_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling); +#endif + +namespace MatmulHost { + +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(const optiling::TestcaseParams& caseParams) +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t mixRatio = 2; // AIC:AIV = 1:2 + uint32_t aivCoreNum = ascendcPlatform->GetCoreNum(); + + size_t aFileSize = static_cast(sizeof(uint16_t) * caseParams.m * caseParams.k); // uint16_t represent half + size_t bFileSize = static_cast(sizeof(uint16_t) * caseParams.k * caseParams.n); // uint16_t represent half + size_t biasFileSize = static_cast(sizeof(float) * caseParams.n); + size_t cFileSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); + size_t workspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + size_t tilingFileSize = sizeof(TCubeTiling); + + uint8_t* a = (uint8_t *)AscendC::GmAlloc(aFileSize); + uint8_t* b = (uint8_t *)AscendC::GmAlloc(bFileSize); + uint8_t* bias; + uint8_t* c = (uint8_t *)AscendC::GmAlloc(cFileSize); + uint8_t* workspace = (uint8_t *)AscendC::GmAlloc(workspaceSize); + uint8_t* tiling = (uint8_t *)AscendC::GmAlloc(tilingFileSize); + + ReadFile("../input/x1_gm.bin", aFileSize, a, aFileSize); + ReadFile("../input/x2_gm.bin", bFileSize, b, bFileSize); + 
if (caseParams.isBias) { + bias = (uint8_t *)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + + optiling::GenerateTiling(aivCoreNum, &cubeTiling, caseParams, tiling); + memcpy_s(tiling, tilingFileSize, tiling, tilingFileSize); + + uint32_t blockDim = ((uint32_t*)tiling)[0] / mixRatio; + ICPU_RUN_KF(matmul_async_iterate_all_custom, blockDim, a, b, bias, c, workspace, tiling); + + WriteFile("../output/output.bin", c, cFileSize); + + AscendC::GmFree((void*)a); + AscendC::GmFree((void*)b); + if (caseParams.isBias) { + AscendC::GmFree((void*)bias); + } + AscendC::GmFree((void*)c); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); +} +#else +void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, const optiling::TestcaseParams& caseParams, + void* stream = nullptr) +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t mixRatio = 2; // AIC:AIV = 1:2 + uint32_t aivCoreNum = ascendcPlatform->GetCoreNum(); + + uint8_t *workspaceDevice; + size_t workspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t* tilingHost; + uint8_t* tilingDevice; + size_t tilingFileSize = sizeof(TCubeTiling); + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + optiling::GenerateTiling(aivCoreNum, &cubeTiling, caseParams, tilingHost); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint32_t blockDim = ((uint32_t*)tilingHost)[0] / mixRatio; + matmul_async_iterate_all_custom_do(blockDim, stream, a, b, 
bias, c, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(const optiling::TestcaseParams& caseParams) +{ + size_t aFileSize = static_cast(sizeof(uint16_t) * caseParams.m * caseParams.k); // uint16_t represent half + size_t bFileSize = static_cast(sizeof(uint16_t) * caseParams.k * caseParams.n); // uint16_t represent half + size_t biasFileSize = static_cast(sizeof(float) * caseParams.n); + size_t cFileSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t* aHost; + uint8_t* aDevice; + CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize)); + CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", aFileSize, aHost, aFileSize); + CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* bHost; + uint8_t* bDevice; + CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize)); + CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", bFileSize, bHost, bFileSize); + CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost; + uint8_t* biasDevice; + if 
(caseParams.isBias) { + CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + + uint8_t* cHost; + uint8_t* cDevice; + CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); + CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(aDevice, bDevice, biasDevice, cDevice, caseParams, stream); + + CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", cHost, cFileSize); + + CHECK_ACL(aclrtFree(aDevice)); + CHECK_ACL(aclrtFreeHost(aHost)); + + CHECK_ACL(aclrtFree(bDevice)); + CHECK_ACL(aclrtFreeHost(bHost)); + + CHECK_ACL(aclrtFree(cDevice)); + CHECK_ACL(aclrtFreeHost(cHost)); + + TestAclDeInit(context, stream, deviceId); +} +#endif +} // end namespace MatmulHost + +int32_t main(int32_t argc, const char* args[]) +{ + uint32_t problem[4]; + for (int32_t i = 1; i < argc && i < 5; ++i) { + std::stringstream ss(args[i]); + ss >> problem[i - 1]; + } + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias}; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(caseParams); +#else + MatmulHost::TestMatmul(caseParams); +#endif + return 0; +} \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.cpp b/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.cpp new file mode 100644 index 00000000..eab6037a --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.cpp @@ -0,0 +1,51 @@ +/* + * 
Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "matmul_async_iterate_all_custom_tiling.h" +#include + +namespace optiling { + +bool ComputeTiling(uint32_t blockDim, TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, + const TestcaseParams& caseParams) +{ + cubeTiling->SetDim(blockDim); + cubeTiling->SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16); + cubeTiling->SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16); + cubeTiling->SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling->SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling->SetShape(caseParams.m, caseParams.n, caseParams.k); + cubeTiling->SetOrgShape(caseParams.m, caseParams.n, caseParams.k); + cubeTiling->EnableBias(caseParams.isBias); + cubeTiling->SetBufferSpace(-1, -1, -1); + if (cubeTiling->GetTiling(tiling) == -1) { + return false; + } + return true; +} + +void GenerateTiling(uint32_t blockDim, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, + const TestcaseParams& caseParams, uint8_t* tilingBuffer) +{ + TCubeTiling tilingData; + + bool res = optiling::ComputeTiling(blockDim, tilingData, cubeTiling, caseParams); + if (!res) { + std::cout << "gen tiling failed." 
<< std::endl; + } + + uint32_t tilingSize = tilingData.GetDataSize(); + tilingData.SaveToBuffer(tilingBuffer, tilingSize); +} +} // namespace optiling \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.h b/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.h new file mode 100644 index 00000000..eef4289f --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/op_host/matmul_async_iterate_all_custom_tiling.h @@ -0,0 +1,39 @@ +/** +* Copyright (c) 2025 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_HOST_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_HOST_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace optiling { + +BEGIN_TILING_DATA_DEF(MatmulAsyncIterateAllCustomTilingData) + TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, cubeTilingData); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulAsyncIterateAllCustomTilingData) + +struct TestcaseParams { + uint32_t m; + uint32_t n; + uint32_t k; + bool isBias; +}; +/** + * @brief Generate matmul tiling. + * @param blockDim: Number of cores involved in the computation. + * @param cubeTiling: TCubeTiling structure. + * @param caseParams: Testcase parameters. + * @param tilingBuf: Data buffer. 
+ */ +void GenerateTiling(uint32_t blockDim, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, + const TestcaseParams& caseParams, uint8_t* tilingBuffer); + +} // namespace optiling +#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_HOST_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.cpp b/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.cpp new file mode 100644 index 00000000..e6f43190 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "matmul_async_iterate_all_custom_kernel.h" + +namespace MatmulCustom { + +template +__aicore__ inline void MatmulAsyncIterateAllKernel::Init(GM_ADDR a, + GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling) +{ + this->tiling = tiling; + + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + + uint32_t offsetA = 0; + uint32_t offsetB = 0; + uint32_t offsetC = 0; + uint32_t offsetBias = 0; + CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + + if(GetSysWorkSpacePtr() == nullptr){ + return; + } +} + +template +__aicore__ inline void MatmulAsyncIterateAllKernel::Process(AscendC::TPipe* pipe) +{ + REGIST_MATMUL_OBJ(pipe, GetSysWorkSpacePtr(), matmulObj, &(this->tiling)); + + matmulObj.SetTensorA(aGlobal); + matmulObj.SetTensorB(bGlobal); + if (this->tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.template IterateAll(cGlobal, 0, false, true); + matmulObj.WaitIterateAll(); + matmulObj.End(); +} + + +template +__aicore__ inline void MatmulAsyncIterateAllKernel::CalcOffset( + uint32_t blockIdx, uint32_t& offsetA, uint32_t& offsetB, uint32_t& offsetC, uint32_t& offsetBias) +{ + auto mSingleBlocks = AscendC::Ceil(this->tiling.M, this->tiling.singleCoreM); + auto mCoreIndx = blockIdx % mSingleBlocks; + auto nCoreIndx = blockIdx / mSingleBlocks; + + offsetA = mCoreIndx * this->tiling.Ka * this->tiling.singleCoreM; + offsetB = nCoreIndx * this->tiling.singleCoreN; + offsetC = mCoreIndx * this->tiling.N * this->tiling.singleCoreM + nCoreIndx * this->tiling.singleCoreN; + offsetBias = nCoreIndx * 
this->tiling.singleCoreN; + + // process with tail block + int32_t tailM = this->tiling.M - mCoreIndx * this->tiling.singleCoreM; + tailM = tailM < this->tiling.singleCoreM ? tailM : this->tiling.singleCoreM; + int32_t tailN = this->tiling.N - nCoreIndx * this->tiling.singleCoreN; + tailN = tailN < this->tiling.singleCoreN ? tailN : this->tiling.singleCoreN; + if (tailM < this->tiling.singleCoreM || tailN < this->tiling.singleCoreN) { + matmulObj.SetTail(tailM, tailN); + } +} +} // namespace MatmulCustom + +namespace { +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} +} + +extern "C" __global__ __aicore__ void matmul_async_iterate_all_custom( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + AscendC::TPipe pipe; + + MatmulCustom::MatmulAsyncIterateAllKernel matmulAsyncIterateAllKernel; + matmulAsyncIterateAllKernel.Init(a, b, bias, c, tiling); + matmulAsyncIterateAllKernel.Process(&pipe); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_async_iterate_all_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_async_iterate_all_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.h b/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.h new file mode 100644 index 00000000..c6b67993 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/op_kernel/matmul_async_iterate_all_custom_kernel.h @@ -0,0 +1,64 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. 
+ * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_KERNEL_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_KERNEL_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_KERNEL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace MatmulCustom { + +template +class MatmulAsyncIterateAllKernel { +public: + __aicore__ inline MatmulAsyncIterateAllKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling); + /** + * @brief Process matrix calculation. + * @param pipe: The TPipe object which manages global memory and synchronization. + * @retval None + */ + __aicore__ inline void Process(AscendC::TPipe* pipe); + + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL> matmulObj; + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
+ * @retval None + */ + __aicore__ inline void CalcOffset( + uint32_t blockIdx, uint32_t& offsetA, uint32_t& offsetB, uint32_t& offsetC, uint32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor biasGlobal; + AscendC::GlobalTensor cGlobal; + TCubeTiling tiling; +}; +} // namespace MatmulCustom +#endif // EXAMPLES_MATRIX_MATMUL_ASYNC_ITERATE_ALL_OP_KERNEL_MATMUL_ASYNC_ITERATE_ALL_CUSTOM_KERNEL_H \ No newline at end of file diff --git a/examples/matrix/matmul_async_iterate_all/run.sh b/examples/matrix/matmul_async_iterate_all/run.sh new file mode 100644 index 00000000..c7e80377 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/run.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! 
"${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_matmul_async_iterate_all_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +python3 -u ../scripts/exec_test.py -r ${RUN_MODE} -p ${IS_PERF} diff --git a/examples/matrix/matmul_async/scripts/exec_test.py b/examples/matrix/matmul_async_iterate_all/scripts/exec_test.py similarity index 92% rename from examples/matrix/matmul_async/scripts/exec_test.py rename to examples/matrix/matmul_async_iterate_all/scripts/exec_test.py index d6e0d3b7..51e14acb 100644 --- a/examples/matrix/matmul_async/scripts/exec_test.py +++ b/examples/matrix/matmul_async_iterate_all/scripts/exec_test.py @@ -54,10 +54,10 @@ def get_case_list(work_path): def process_case(work_path, args, params): case_name, m, n, k, is_bias = (params.caseName, params.m, params.n, params.k, params.isBias) - run_mode, is_perf, async_mode = args.r, args.p, args.m + run_mode, is_perf = args.r, args.p logging.info(f"start process case [{case_name}]") - logging.info(f"ASYNC_MODE is set [{async_mode}], IS_PERF is set [{is_perf}]") + logging.info(f"IS_PERF is set [{is_perf}]") clear_file_cache(work_path) @@ -66,11 +66,12 @@ def process_case(work_path, args, params): params_str = f"{m} {n} {k} {is_bias}" if is_perf: # npu(is_perf = 1) - cmd = f"msprof op --application=\"./ascendc_matmul_async_bbit {params_str}\" --output=./prof_out" + cmd = f"msprof op --application=\"./ascendc_matmul_async_iterate_all_bbit {params_str}\" --output=./prof_out" elif run_mode == "sim": # sim - cmd = f"msprof op simulator --application=\"./ascendc_matmul_async_bbit {params_str}\" --output=./sim_out" + cmd = (f"msprof op simulator --application=\"./ascendc_matmul_async_iterate_all_bbit {params_str}\"" + f" --output=./sim_out") else: # cpu or npu(is_perf = 0) - cmd = f"./ascendc_matmul_async_bbit {params_str}" + cmd = f"./ascendc_matmul_async_iterate_all_bbit {params_str}" subprocess.run(shlex.split(cmd)) total_num = 
params.m * params.n diff --git a/examples/matrix/matmul_async_iterate_all/testcase/case.csv b/examples/matrix/matmul_async_iterate_all/testcase/case.csv new file mode 100644 index 00000000..a38c3b73 --- /dev/null +++ b/examples/matrix/matmul_async_iterate_all/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 640, 1024, 512, 1 \ No newline at end of file diff --git a/examples/readme.md b/examples/readme.md index 5b41ae3d..8a0e1fc8 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -36,7 +36,7 @@ 对输入tensor按行做如下公式的计算:zi = (xi - ∑(xi * yi)) * yi,其中∑为按行reduce求和。 - matrix + matrix basic_block_matmul 实现无尾块且tiling的base块大小固定的场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -49,8 +49,12 @@ 实现Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_async - 实现异步场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_async_iterate + 调用Iterate实现异步场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + + matmul_async_iterate_all + 调用IterateAll实现异步场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 matmul_constant -- Gitee