From ab1b3048be382b41dcf795b516cd379f3590ddc9 Mon Sep 17 00:00:00 2001 From: suqwe Date: Thu, 21 Aug 2025 10:48:19 +0800 Subject: [PATCH] add l2 cache example --- examples/matrix/matmul_l2cache/CMakeLists.txt | 149 +++++------ examples/matrix/matmul_l2cache/README.md | 85 ++++-- .../matrix/matmul_l2cache/cmake/cpu_lib.cmake | 2 +- .../matrix/matmul_l2cache/cmake/npu_lib.cmake | 5 +- examples/matrix/matmul_l2cache/main.cpp | 252 ++++++++++-------- .../op_kernel/l2_cache_optimizer.h | 229 ++++++++++++++++ .../matmul_l2cache_custom_kernel.cpp | 32 +++ .../op_kernel/matmul_l2cache_custom_kernel.h | 146 ++++++++++ examples/matrix/matmul_l2cache/run.sh | 19 +- .../matmul_l2cache/scripts/exec_test.py | 149 +++++------ .../matrix/matmul_l2cache/testcase/case.csv | 2 +- 11 files changed, 765 insertions(+), 305 deletions(-) create mode 100644 examples/matrix/matmul_l2cache/op_kernel/l2_cache_optimizer.h create mode 100644 examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.cpp create mode 100644 examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.h diff --git a/examples/matrix/matmul_l2cache/CMakeLists.txt b/examples/matrix/matmul_l2cache/CMakeLists.txt index 8834905f..aa0e3eed 100644 --- a/examples/matrix/matmul_l2cache/CMakeLists.txt +++ b/examples/matrix/matmul_l2cache/CMakeLists.txt @@ -1,80 +1,71 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. -# This file is a part of the CANN Open Software. -# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ======================================================================================================================
-
-cmake_minimum_required(VERSION 3.16)
-project(Ascend_c)
-if(${RUN_MODE})
-    set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
-endif()
-if (${SOC_VERSION})
-    set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type")
-endif()
-
-set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory")
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
-endif()
-
-if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
-    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
-endif()
-
-file(GLOB KERNEL_FILES
-    ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_l2_cache_custom_kernel.cpp
-)
-
-if("${RUN_MODE}" STREQUAL "cpu")
-    include(cmake/cpu_lib.cmake)
-elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu")
-    include(cmake/npu_lib.cmake)
-else()
-    message("invalid RUN_MODE: ${RUN_MODE}")
-endif()
-
-add_executable(ascendc_matmul_l2_cache_bbit
-    ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_l2_cache_custom_tiling.cpp
-)
-
-target_compile_options(ascendc_matmul_l2_cache_bbit PRIVATE
-    $<$<STREQUAL:${RUN_MODE},cpu>:-g>
-    -O2
-    -std=c++17
-    -D_GLIBCXX_USE_CXX11_ABI=0
-)
-
-target_compile_definitions(ascendc_matmul_l2_cache_bbit PRIVATE
-    SOC_VERSION="${SOC_VERSION}"
-)
-
-
-target_include_directories(ascendc_matmul_l2_cache_bbit PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    $<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/include>
-    $<$<STREQUAL:${RUN_MODE},npu>:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>
-)
-
-target_link_libraries(ascendc_matmul_l2_cache_bbit PRIVATE
-    $<$<AND:$<STREQUAL:${RUN_MODE},npu>,$<STREQUAL:${CMAKE_BUILD_TYPE},Debug>>:host_intf_pub>
-    $<$<STREQUAL:${RUN_MODE},cpu>:tikicpulib::${SOC_VERSION}>
-    $<$<STREQUAL:${RUN_MODE},npu>:ascendcl>
-    $<$<STREQUAL:${RUN_MODE},cpu>:c_sec>
-    ascendc_kernels_${RUN_MODE}
-    tiling_api
-    register
-    platform
-    ascendalog
-    dl
-)
-
-install(TARGETS ascendc_matmul_l2_cache_bbit
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+
+cmake_minimum_required(VERSION 3.16)
+project(Ascend_c)
+if(${RUN_MODE})
+    set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+endif()
+if (${SOC_VERSION})
+    set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type")
+endif()
+
+set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory")
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+endif()
+
+if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
+endif()
+
+file(GLOB KERNEL_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_l2cache_custom_kernel.cpp
+)
+
+if("${RUN_MODE}" STREQUAL "cpu")
+    include(cmake/cpu_lib.cmake)
+elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu")
+    include(cmake/npu_lib.cmake)
+else()
+    message("invalid RUN_MODE: ${RUN_MODE}")
+endif()
+
+add_executable(ascendc_matmul_bbit
+    ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+)
+
+target_compile_options(ascendc_matmul_bbit PRIVATE
+    $<$<STREQUAL:${RUN_MODE},cpu>:-g>
+    -O2
+    -std=c++17
+    -D_GLIBCXX_USE_CXX11_ABI=0
+)
+
+target_compile_definitions(ascendc_matmul_bbit PRIVATE
+    SOC_VERSION="${SOC_VERSION}"
+)
+target_include_directories(ascendc_matmul_bbit PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    $<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/include>
+    $<$<STREQUAL:${RUN_MODE},npu>:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>
+)
+
+target_link_libraries(ascendc_matmul_bbit PRIVATE
+    $<$<AND:$<STREQUAL:${RUN_MODE},npu>,$<STREQUAL:${CMAKE_BUILD_TYPE},Debug>>:host_intf_pub>
+    $<$<STREQUAL:${RUN_MODE},cpu>:tikicpulib::${SOC_VERSION}>
+    $<$<STREQUAL:${RUN_MODE},npu>:ascendcl>
+    $<$<STREQUAL:${RUN_MODE},cpu>:c_sec>
+    ascendc_kernels_${RUN_MODE}
+    tiling_api
+    register
+    platform
+    ascendalog
+    dl
 )
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/README.md b/examples/matrix/matmul_l2cache/README.md
index 6ccb24ab..fd2b3637 100644
--- a/examples/matrix/matmul_l2cache/README.md
+++ b/examples/matrix/matmul_l2cache/README.md
@@ -13,7 +13,6 @@
 | 目录 | 描述 |
 |----------------------------------|--------------------------------------|
 | [cmake](cmake) | 编译工程文件 |
-| [op_host](op_host) | 本样例tiling代码实现 |
 | [op_kernel](op_kernel) | 本样例kernel代码实现 |
 | [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 |
 | [testcase](testcase) | 用例文件,配置用例的计算shape信息 |
@@ -23,7 +22,7 @@
 ## 算子描述
 - 算子功能
 
-  MatmulL2CacheCustom算子在M方向切分输入的左矩阵,将左矩阵切分成多块数据,整体按数据切分的块数,分多次进行计算。每次计算前,第一个核第一次访问Global Memory上的左矩阵时,会将切分后的一块左矩阵数据加载到L2 Cache,其它核或第一个核后续数据访问都可以命中L2 Cache,访问完整的右矩阵行为相同,以提高算子性能。
+  MatmulL2CacheCustom算子在M方向或者N方向切分输入的矩阵,将矩阵切分成多块数据,整体按数据切分的块数,分多次进行计算。每次计算前,第一个核第一次访问Global Memory上的矩阵时,会将切分后的一块矩阵数据加载到L2 Cache,其它核或第一个核后续数据访问都可以命中L2 Cache,以提高算子性能。
 
 - 算子规格
 
@@ -31,22 +30,20 @@
 | 算子类型(OpType) | MatmulL2CacheCustom |
 | 算子输入 | name | shape | data type | format | isTrans |
-| a | (30720, 1024) | float | ND | false |
-| b | (1024, 1024) | float | ND | false |
+| a | (30720, 4096) | float16 | ND | false |
+| b | (4096, 1024) | float16 | ND | false |
 | bias | (1024, ) | float | ND | - |
 | 算子输出 | c |
-| (30720, 1024) | float | ND | - |
+| (30720, 1024) | float16 | ND | - |
 | 核函数名 | matmul_l2_cache_custom |
 
 ## 算子实现介绍
-本样例以AI处理器的L2 Cache大小192M为例,根据算子的输入、输出shape,计算算子输入和输出的总数据量为((30720 * 1024) + (1024 * 1024) + 1024 + (30720 * 1024)) * 4 = 255856640字节(约244M),
-大于L2 Cache(192M),无法保证计算前读取的数据能够命中L2 Cache,而Global Memory带宽低于L2 Cache,且两者之间差距较大,搬运数据成为算子运行的性能瓶颈。将左矩阵的输入数据均等切分成两份后,整体分两次矩阵乘计算,
-每次计算的输入和输出总数据量为(((30720 // 2) * 1024) + (1024 * 1024) + 1024 + ((30720 // 2) * 1024)) * 4 = 130027520字节(约124M),以保证计算前读取的数据能够命中L2 Cache。
+本样例以AI处理器的L2 Cache大小192M为例,根据算子的输入、输出shape,计算算子输入和输出的总数据量为((30720 * 4096) + (4096 * 1024) + (1024) + 
(30720 * 1024)) * 2 = 322963456字节(约308M),大于L2 Cache(192M),无法保证计算前读取的数据能够命中L2 Cache,而Global Memory带宽低于L2 Cache,且两者之间差距较大,搬运数据成为算子运行的性能瓶颈。因此需要将输入数据切分成多块,使得每个数据块的计算数据量(包含输入和输出),能够命中L2 Cache。本样例提供了L2CacheOptimizer类,其中GetTileNum接口用于自动根据左右矩阵的Shape获取左右矩阵总L2切分份数,GetBlockShape接口获取L2切分后M、N、K轴的长度,GetBlockCoord接口返回对应切块的位置坐标,即M、N、K方向相对矩阵起始位置的偏移。 - Kernel实现 - 计算逻辑:C = A * B + Bias。 @@ -56,34 +53,68 @@ - 具体步骤: - 创建Matmul对象。 - 初始化操作。 - - 设置右矩阵A、右矩阵B、Bias。将左矩阵A切分成两块,循环计算两次,每次对切分后一块左矩阵A的数据进行计算,并输出一半的计算结果。 + - 设置左矩阵A、右矩阵B、Bias。根据L2CacheOptimizer类的GetTileNum接口获取左右矩阵总L2切分份数,循环多次计算。 ``` - matmulObj.SetTensorB(bGlobal); - if (tiling.isBias) { - matmulObj.SetBias(biasGlobal); - } - const uint32_t splitTimes = 2; - const uint32_t splitOffsetA = this->tiling.M / splitTimes * this->tiling.Ka; - const uint32_t splitOffsetC = this->tiling.M / splitTimes * this->tiling.N; - for (uint32_t i = 0; i < splitTimes; i++) { - matmulObj.SetTensorA(aGlobal[splitOffsetA * i]); - matmulObj.IterateAll(cGlobal[splitOffsetC * i]); + L2CacheOpt l2Opt(shapes, blockNum); + matmulObj.SetOrgShape(shapes.m, shapes.n, shapes.k); + for (int64_t tileIdx = curBlockIdx; tileIdx < l2Opt.GetTileNum(); tileIdx += blockNum) { + auto blockShape = l2Opt.GetBlockShape(tileIdx); // 获取单次计算L2切分块大小 + if (Get<0>(blockShape) <= 0 || Get<1>(blockShape) <= 0) { + return; + } + auto blockCoord = l2Opt.GetBlockCoord(tileIdx); // 获取当前计算下标blockCoord + matmulObj.SetTail(Get<0>(blockShape), Get<1>(blockShape), Get<2>(blockShape)); + const auto& offsetCoord = CalcOffset(shapes, blockCoord); // 基于下标计算矩阵偏移 + int64_t offsetA = Get<0>(offsetCoord); + int64_t offsetB = Get<1>(offsetCoord); + int64_t offsetC = Get<2>(offsetCoord); + matmulObj.SetTensorA(aGlobal[offsetA], false); + matmulObj.SetTensorB(bGlobal[offsetB], false); + if (shapes.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal[offsetC]); // 计算L2切分块 } matmulObj.End(); ``` - - 完成两次矩阵乘操作。 + - 完成矩阵乘操作。 - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - - 获取Tiling参数的流程如下: - - 创建一个Tiling对象。 - - 设置A、B、C、Bias的参数类型信息。根据数据的切分方式,设置数据切分后每次计算的Shape和数据切分前的完整OrgShape。 + - 本样例采取了常量化Tiling计算,在Kernel侧通过设置一组固定的基本块信息,其他Tiling信息在Kernel侧通过常量化推导,Kernel侧不再需要运行时Tiling信息。基于这组最优的基本块信息,能够适用输入Shape中M,N较大的场景。样例中提供了一种L2Cache切分算法(参考样例L2CacheOptimizer类)。该算法当前在Kernel侧完成计算L2切块份数,也可以自行迁移代码至Host侧计算。 + - L2CacheOptimizer具体计算步骤: + - 判断是否需要进行L2分块 + ``` + bool smallDim = mTileNum_ < L1_MIN_UST_DIM && nTileNum_ < L1_MIN_UST_DIM; + if (smallDim || (!EnableL2Tile())) { // 判断计算数据总量是否小于L2Cache阈值 + mL2TileNum_ = mTileNum_; + nL2TileNum_ = nTileNum_; + mL2BlockNum_ = 1; + nL2BlockNum_ = 1; + return; // 不需要切分,提前返回 + } + InitL2TileTail(); // 计算L2切分 + ``` + - 基于负载均衡,计算L2最优分块 ``` - cubeTiling->SetShape(M / splitTimes, N, K); - cubeTiling->SetOrgShape(M, N, K); + int64_t mConflict = INT64_MAX; + int64_t nConflict = INT64_MAX; + constexpr bool isNMajor = l1N > l1M; // 根据shape大小,判断主维度 + for (int64_t i = maxMajor; i >= L1_MIN_UST_DIM; i--) { + for (int64_t j = maxMinor; j >= minMinor; j--) { + if (GetTotalSize(j * l1M, i * l1N, k_) <= L2_TILE_THRESHOLD) { // 确保分块小于L2Cache阈值 + uint64_t mConflictTmp = AscendC::Ceil(blockNum_, mL2TileNumTailTmp); // 计算负载冲突值 + uint64_t nConflictTmp = AscendC::Ceil(blockNum_, nL2TileNumTailTmp); + if (mConflict >= mConflictTmp && nConflict >= nConflictTmp) { // 若冲突值更小,更新分块数量 + mConflict = mConflictTmp; + nConflict = nConflictTmp; + mL2TileNum_ = curMajorDim; + nL2TileNum_ = curMinorDim; + } + } + } + } ``` - - 
调用GetTiling接口,获取Tiling信息。
 
 ## 编译运行样例
 
diff --git a/examples/matrix/matmul_l2cache/cmake/cpu_lib.cmake b/examples/matrix/matmul_l2cache/cmake/cpu_lib.cmake
index 583dd703..244469fb 100644
--- a/examples/matrix/matmul_l2cache/cmake/cpu_lib.cmake
+++ b/examples/matrix/matmul_l2cache/cmake/cpu_lib.cmake
@@ -22,7 +22,7 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE
 
 target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE
     -g
-    -O2
+    -O0
     -std=c++17
 )
 
diff --git a/examples/matrix/matmul_l2cache/cmake/npu_lib.cmake b/examples/matrix/matmul_l2cache/cmake/npu_lib.cmake
index 927d2490..05179a43 100644
--- a/examples/matrix/matmul_l2cache/cmake/npu_lib.cmake
+++ b/examples/matrix/matmul_l2cache/cmake/npu_lib.cmake
@@ -20,8 +20,11 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC
     ${KERNEL_FILES}
 )
 
+ascendc_include_directories(ascendc_kernels_${RUN_MODE} PRIVATE
+    ${ABS_WORK_DIR}
+)
+
 ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE
-    -DASCENDC_DUMP
     -DHAVE_WORKSPACE
     -DHAVE_TILING
 )
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/main.cpp b/examples/matrix/matmul_l2cache/main.cpp
index 608da572..d1bfa978 100644
--- a/examples/matrix/matmul_l2cache/main.cpp
+++ b/examples/matrix/matmul_l2cache/main.cpp
@@ -1,4 +1,4 @@
-/*
+/**
  * Copyright (c) 2025 Huawei Technologies Co., Ltd.
  * This file is a part of the CANN Open Software.
  * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
@@ -8,96 +8,120 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  */
 
+/*!
+* \file main.cpp
+* \brief
+*/
+
+#include <cstdint>
+#include <iostream>
 #include <sstream>
 #include "../../common/data_utils.h"
-#include "kernel_tiling/kernel_tiling.h"
+#include "tiling/tiling_api.h"
 #include "tiling/platform/platform_ascendc.h"
-#include "./op_host/matmul_l2_cache_custom_tiling.h"
-
 #ifndef ASCENDC_CPU_DEBUG
 #include "acl/acl.h"
-extern void matmul_l2_cache_custom_do(uint32_t coreDim, void* stream,
-    uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling);
+
+extern void matmul_l2cache_custom_do(uint32_t coreDim, void* stream,
+                                     uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+                                     uint8_t* workspace, uint8_t* tiling);
 #else
 #include "tikicpulib.h"
-extern "C" void matmul_l2_cache_custom(
-    uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling);
+extern "C" void matmul_l2cache_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+                                      uint8_t* workspace, uint8_t* tiling);
 #endif
-
+constexpr bool IS_BIAS = false;
 namespace MatmulHost {
-#ifdef ASCENDC_CPU_DEBUG
-void TestMatmulCpu(const optiling::TestcaseParams& caseParams)
+static size_t GetSysWorkSpaceSize()
 {
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    return static_cast<size_t>(ascendcPlatform->GetLibApiWorkSpaceSize());
+}
+// Matmul compute shape info
+struct MatmulProblemShape
+{
+    int32_t m;
+    int32_t n;
+    int32_t k;
+    int32_t isBias;
+};
+
+// Calculate the single-core problem shape passed to the kernel
+MatmulProblemShape GetSingleCoreShape(const int32_t m, const int32_t n, const int32_t k)
+{
+    MatmulProblemShape shapes{m, n, k, IS_BIAS};
+    return shapes;
+}
+
+int32_t GetUsedCoreNum()
+{
     auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
-    matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform);
-    uint32_t mixRatio = 2; // AIC:AIV = 1:2
-    uint32_t aivCoreNum = ascendcPlatform->GetCoreNum();
-    uint32_t aicCoreNum = aivCoreNum / mixRatio;
-
-    size_t aFileSize = 
static_cast(sizeof(float) * caseParams.m * caseParams.k); - size_t bFileSize = static_cast(sizeof(float) * caseParams.k * caseParams.n); - size_t biasFileSize = static_cast(sizeof(float) * caseParams.n); - size_t cFileSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); - size_t workspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); - size_t tilingFileSize = sizeof(TCubeTiling); - - uint8_t* a = (uint8_t *)AscendC::GmAlloc(aFileSize); - uint8_t* b = (uint8_t *)AscendC::GmAlloc(bFileSize); + return ascendcPlatform->GetCoreNumAic(); +} +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(int64_t m, int64_t n, int64_t k) +{ + size_t x1FileSize = m * k * sizeof(uint16_t); + size_t x2FileSize = k * n * sizeof(uint16_t); + size_t yFileSize = m * n * sizeof(uint16_t); + size_t biasFileSize = 1 * n * sizeof(uint32_t); + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); uint8_t* bias; - uint8_t* c = (uint8_t *)AscendC::GmAlloc(cFileSize); - uint8_t* workspace = (uint8_t *)AscendC::GmAlloc(workspaceSize); - uint8_t* tiling = (uint8_t *)AscendC::GmAlloc(tilingFileSize); - - ReadFile("../input/a_gm.bin", aFileSize, a, aFileSize); - ReadFile("../input/b_gm.bin", bFileSize, b, bFileSize); - if (caseParams.isBias) { - bias = (uint8_t *)AscendC::GmAlloc(biasFileSize); + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); } - - optiling::GenerateTiling(aicCoreNum, &cubeTiling, caseParams, tiling); - memcpy_s(tiling, tilingFileSize, tiling, tilingFileSize); - - ICPU_RUN_KF(matmul_l2_cache_custom, aicCoreNum, a, b, bias, c, workspace, tiling); - - WriteFile("../output/output.bin", c, cFileSize); - - AscendC::GmFree((void*)a); - AscendC::GmFree((void*)b); - if (caseParams.isBias) { - AscendC::GmFree((void*)bias); - } - AscendC::GmFree((void*)c); + const auto shapes = GetSingleCoreShape(m, n, k); + size_t tilingFileSize = sizeof(shapes); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + memcpy_s(tiling, tilingFileSize, &shapes, tilingFileSize); + ICPU_RUN_KF(matmul_l2cache_custom, GetUsedCoreNum(), x1, x2, bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); AscendC::GmFree((void*)workspace); AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } } #else -void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, const optiling::TestcaseParams& caseParams, +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, void* stream = nullptr) { - auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); - matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); - uint32_t mixRatio = 2; // AIC:AIV = 1:2 - uint32_t aivCoreNum = ascendcPlatform->GetCoreNum(); - uint32_t aicCoreNum = aivCoreNum / mixRatio; - + // Init args uint8_t *workspaceDevice; - size_t workspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + + // Query workspace size + size_t workspaceSize = 
GetSysWorkSpaceSize();
+
+    // Allocate workspace on device
     CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
 
+    // Build the problem-shape tiling passed to the kernel
+    const auto shapes = GetSingleCoreShape(m, n, k);
+    uint32_t coreNum = static_cast<uint32_t>(GetUsedCoreNum());
     uint8_t* tilingHost;
     uint8_t* tilingDevice;
-    size_t tilingFileSize = sizeof(TCubeTiling);
+    size_t tilingFileSize = sizeof(shapes);
     CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize));
-    CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
-    optiling::GenerateTiling(aicCoreNum, &cubeTiling, caseParams, tilingHost);
-    CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_HOST));
-    CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
-
-    matmul_l2_cache_custom_do(aicCoreNum, stream, a, b, bias, c, workspaceDevice, tilingDevice);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
+    CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize,
+                          ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &shapes,
+                          tilingFileSize, ACL_MEMCPY_HOST_TO_HOST));
+    CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost,
+                          tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    // Launch kernel
+    matmul_l2cache_custom_do(coreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice);
     CHECK_ACL(aclrtFreeHost(tilingHost));
     CHECK_ACL(aclrtFree(workspaceDevice));
     CHECK_ACL(aclrtFree(tilingDevice));
@@ -119,83 +143,77 @@ void TestAclDeInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId)
     CHECK_ACL(aclFinalize());
 }
 
-void TestMatmul(const optiling::TestcaseParams& caseParams)
+void TestMatmul(int64_t m, int64_t n, int64_t k)
 {
-    size_t aFileSize = static_cast<size_t>(sizeof(float) * caseParams.m * caseParams.k);
-    size_t bFileSize = static_cast<size_t>(sizeof(float) * caseParams.k * caseParams.n);
-    size_t biasFileSize = static_cast<size_t>(sizeof(float) * caseParams.n);
-    size_t cFileSize = static_cast<size_t>(sizeof(float) * caseParams.m * caseParams.n);
+    size_t x1FileSize = static_cast<size_t>(m * k) * sizeof(uint16_t);
+    size_t x2FileSize = static_cast<size_t>(k * n) * sizeof(uint16_t);
+    size_t yFileSize = static_cast<size_t>(m * n) * sizeof(uint16_t);
+    size_t biasFileSize = static_cast<size_t>(1 * n) * sizeof(uint32_t);
 
     aclrtContext context;
     aclrtStream stream = nullptr;
     int64_t deviceId = 0;
     TestAclInit(context, stream, deviceId);
 
-    uint8_t* aHost;
-    uint8_t* aDevice;
-    CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize));
-    CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
-    ReadFile("../input/a_gm.bin", aFileSize, aHost, aFileSize);
-    CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
-
-    uint8_t* bHost;
-    uint8_t* bDevice;
-    CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize));
-    CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
-    ReadFile("../input/b_gm.bin", bFileSize, bHost, bFileSize);
-    CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
-
-    uint8_t* biasHost;
-    uint8_t* biasDevice;
-    if (caseParams.isBias) {
+    uint8_t* x1Host;
+    uint8_t* x1Device;
+    CHECK_ACL(aclrtMallocHost((void **)(&x1Host), x1FileSize));
+    CHECK_ACL(aclrtMalloc((void **)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize);
+    CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, 
x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host; + uint8_t* x2Device; + CHECK_ACL(aclrtMallocHost((void **)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasFileSize)); CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); } + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void **)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - uint8_t* cHost; - uint8_t* cDevice; - CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); - CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - MatmulOp(aDevice, bDevice, biasDevice, cDevice, caseParams, stream); - - CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("../output/output.bin", cHost, cFileSize); - - CHECK_ACL(aclrtFree(aDevice)); - CHECK_ACL(aclrtFreeHost(aHost)); - - CHECK_ACL(aclrtFree(bDevice)); - CHECK_ACL(aclrtFreeHost(bHost)); - - CHECK_ACL(aclrtFree(biasDevice)); - CHECK_ACL(aclrtFreeHost(biasHost)); + MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(cDevice)); - CHECK_ACL(aclrtFreeHost(cHost)); + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); TestAclDeInit(context, stream, deviceId); } #endif -} // end namespace MatmulHost - -int32_t main(int32_t argc, const char* args[]) +} // namespace MatmulHost +int32_t main(int32_t argc, const char *args[]) { - uint32_t problem[5]; - for (int32_t i = 1; i < argc && i < 6; ++i) { + int64_t problem[3] = {1, 1, 1}; + for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 std::stringstream ss(args[i]); ss >> problem[i - 1]; } - - bool isBias = problem[3]; - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias, problem[4]}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG - MatmulHost::TestMatmulCpu(caseParams); + MatmulHost::TestMatmulCpu(problem[0], problem[1], problem[2]); // 2 means problem shape k #else - MatmulHost::TestMatmul(caseParams); + MatmulHost::TestMatmul(problem[0], problem[1], problem[2]); // 2 means problem shape k #endif return 0; -} \ No newline at end of file +} diff --git a/examples/matrix/matmul_l2cache/op_kernel/l2_cache_optimizer.h b/examples/matrix/matmul_l2cache/op_kernel/l2_cache_optimizer.h new file mode 100644 index 00000000..6b7c452a --- /dev/null +++ b/examples/matrix/matmul_l2cache/op_kernel/l2_cache_optimizer.h @@ -0,0 +1,229 @@ +/** + * Copyright (c) 2025 Huawei 
Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file l2_cache_optimizer.h
+ * \brief
+ */
+
+#ifndef EXAMPLES_MATRIX_MATMUL_L2CACHE_L2_CACHE_OPTIMIZER_H
+#define EXAMPLES_MATRIX_MATMUL_L2CACHE_L2_CACHE_OPTIMIZER_H
+
+namespace CustomMatmulL2Cache {
+
+constexpr int64_t L2_TILE_THRESHOLD = 100 * 1024 * 1024;
+constexpr int64_t L1_MIN_UST_DIM = 4;
+constexpr int64_t L1_MAX_UST_DIM = 8;
+
+template <int32_t Index, typename T>
+__aicore__ constexpr inline decltype(auto) Get(T &&t)
+{
+    return AscendC::Std::get<Index>(AscendC::Std::forward<T>(t));
+}
+
+template <class ProblemShape, const auto& MmTiling>
+class L2CacheOptimizer {
+public:
+    using BlockShape = AscendC::Std::tuple<int64_t, int64_t, int64_t>;
+    using BlockCoord = AscendC::Std::tuple<int64_t, int64_t, int64_t>;
+    static constexpr int64_t l1M = MmTiling.baseM * MmTiling.stepM;
+    static constexpr int64_t l1N = MmTiling.baseN * MmTiling.stepN;
+    static constexpr int64_t l0M = MmTiling.baseM;
+    static constexpr int64_t l0N = MmTiling.baseN;
+    static constexpr int64_t l0K = MmTiling.baseK;
+private:
+    int64_t mTileNum_;
+    int64_t nTileNum_;
+    int64_t blockNum_;
+    int64_t m_; // org shape
+    int64_t n_;
+    int64_t k_;
+    int64_t totalTileNum_;
+    // l2 split attributes
+    int64_t newBlockIdx_;
+    int64_t mL2TileNumTmp_;
+    int64_t nL2TileNumTmp_;
+    int64_t nL2Idx_;
+    int64_t mL2Idx_;
+    int64_t mL2BlockNum_; // l2 m block num
+    int64_t nL2BlockNum_; // l2 n block num
+    int64_t mL2TileNum_; // a1b1 m tile num of one l2 block
+    int64_t nL2TileNum_; // a1b1 n tile num of one l2 block
+public:
+    __aicore__ inline L2CacheOptimizer(ProblemShape shape, int64_t blockNum) :
+        blockNum_(blockNum)
+    {
+        m_ = shape.m;
+        n_ = shape.n;
+        k_ = shape.k;
+        mTileNum_ = AscendC::Ceil(m_, l1M);
+        nTileNum_ = AscendC::Ceil(n_, l1N);
+        totalTileNum_ = mTileNum_ * nTileNum_;
+        InitL2Tile();
+    }
+
+    __aicore__ inline int64_t GetTileNum()
+    {
+        return totalTileNum_;
+    }
+
+    __aicore__ inline BlockShape GetBlockShape(int64_t tileIdx)
+    {
+        GetCommonTileIndex(tileIdx);
+        int64_t mTileIdx = newBlockIdx_ % mL2TileNumTmp_;
+        mTileIdx = mTileIdx + mL2Idx_ * mL2TileNum_;
+
+        int64_t nTileIdx = 0;
+        if (mL2TileNumTmp_ != 0 && nL2TileNumTmp_ != 0) {
+            int64_t tmp = newBlockIdx_ / CalcLcm(mL2TileNumTmp_, nL2TileNumTmp_);
+            nTileIdx = (newBlockIdx_ + tmp) % nL2TileNumTmp_;
+        }
+        nTileIdx = nTileIdx + nL2Idx_ * nL2TileNum_;
+
+        // calc tail l1block mnk
+        int64_t tailL1M = (m_ % l1M == 0) ? l1M : m_ % l1M;
+        int64_t tailL1N = (n_ % l1N == 0) ? l1N : n_ % l1N;
+        int64_t blockShapeM = IsMTail(mTileIdx, mTileNum_) ? tailL1M : l1M;
+        int64_t blockShapeN = IsNTail(nTileIdx, nTileNum_) ? 
tailL1N : l1N; + + return {blockShapeM, blockShapeN, k_}; + } + + __aicore__ inline BlockCoord GetBlockCoord(int64_t tileIdx) + { + GetCommonTileIndex(tileIdx); + int64_t mTileIdx = newBlockIdx_ % mL2TileNumTmp_; + mTileIdx = mTileIdx + mL2Idx_ * mL2TileNum_; + + int64_t nTileIdx = 0; + if (mL2TileNumTmp_ != 0 && nL2TileNumTmp_ != 0) { + int64_t tmp = newBlockIdx_ / CalcLcm(mL2TileNumTmp_, nL2TileNumTmp_); + nTileIdx = (newBlockIdx_ + tmp) % nL2TileNumTmp_; + } + nTileIdx = nTileIdx + nL2Idx_ * nL2TileNum_; + + return {mTileIdx * l1M, nTileIdx * l1N, 0}; + } +private: + __aicore__ inline int64_t GetTotalSize(int64_t mL2, int64_t nL2, int64_t kL2) + { + int64_t sizeA = mL2 * kL2 * sizeof(half); + int64_t sizeB = kL2 * nL2 * sizeof(half); + int64_t sizeC = mL2 * nL2 * sizeof(half); + return sizeA + sizeB + sizeC; + } + + __aicore__ inline bool EnableL2Tile() + { + return GetTotalSize(m_, n_, k_) > L2_TILE_THRESHOLD; + } + + __aicore__ inline int64_t GetTail(int64_t dividend, int64_t divisor) + { + if (divisor == 0) { + return 0; + } + auto remainder = dividend % divisor; + return (remainder == 0) ? divisor : remainder; + } + + __aicore__ inline void InitL2TileTail() + { + int64_t mConflict = INT64_MAX; + int64_t nConflict = INT64_MAX; + constexpr bool isNMajor = l1N > l1M; + int64_t majorDim = isNMajor ? nTileNum_ : mTileNum_; + int64_t minorDim = isNMajor ? mTileNum_ : nTileNum_; + int64_t maxMajor = blockNum_ > majorDim ? majorDim : blockNum_; + int64_t maxMinor = blockNum_ > minorDim ? minorDim : blockNum_; + int64_t minMinor = isNMajor ? L1_MIN_UST_DIM : L1_MAX_UST_DIM; + for (int64_t i = maxMajor; i >= L1_MIN_UST_DIM; i--) { // if l1N greater than l1M, indicates n + for (int64_t j = maxMinor; j >= minMinor; j--) { + if (GetTotalSize(j * l1M, i * l1N, k_) <= L2_TILE_THRESHOLD) { + int64_t curMajorDim = isNMajor ? j : i; + int64_t curMinorDim = isNMajor ? i : j; + int64_t mL2TileNumTailTmp = GetTail(mTileNum_, curMajorDim); + int64_t nL2TileNumTailTmp = GetTail(nTileNum_, curMinorDim); + uint64_t mConflictTmp = AscendC::Ceil(blockNum_, mL2TileNumTailTmp); + uint64_t nConflictTmp = AscendC::Ceil(blockNum_, nL2TileNumTailTmp); + if (mConflict >= mConflictTmp && nConflict >= nConflictTmp) { + mConflict = mConflictTmp; + nConflict = nConflictTmp; + mL2TileNum_ = curMajorDim; + nL2TileNum_ = curMinorDim; + } + } + } + } + if (mL2TileNum_ == 0 || nL2TileNum_ == 0) { + mL2TileNum_ = mTileNum_; + nL2TileNum_ = nTileNum_; + } + } + + __aicore__ inline void InitL2Tile() + { + bool smallDim = mTileNum_ < L1_MIN_UST_DIM && nTileNum_ < L1_MIN_UST_DIM; + if (smallDim || (!EnableL2Tile())) { + mL2TileNum_ = mTileNum_; + nL2TileNum_ = nTileNum_; + mL2BlockNum_ = 1; + nL2BlockNum_ = 1; + return; + } + mL2TileNum_ = 0; + nL2TileNum_ = 0; + InitL2TileTail(); + mL2BlockNum_ = AscendC::Ceil(mTileNum_, mL2TileNum_); + nL2BlockNum_ = AscendC::Ceil(nTileNum_, nL2TileNum_); + } + + __aicore__ inline void GetCommonTileIndex(int64_t tileIdx) + { + mL2Idx_ = tileIdx / (mL2TileNum_ * nTileNum_); + mL2TileNumTmp_ = (mL2Idx_ == mL2BlockNum_ - 1) ? GetTail(mTileNum_, mL2TileNum_) : mL2TileNum_; + + nL2Idx_ = (tileIdx % (mL2TileNum_ * nTileNum_)) / (mL2TileNumTmp_ * nL2TileNum_); + nL2TileNumTmp_ = (nL2Idx_ == nL2BlockNum_ - 1) ? 
GetTail(nTileNum_, nL2TileNum_) : nL2TileNum_;
+
+        int64_t startIdx = mL2Idx_ * mL2TileNum_ * nTileNum_ + nL2Idx_ * nL2TileNum_ * mL2TileNumTmp_;
+        int64_t startBlockIdx = startIdx % blockNum_;
+        newBlockIdx_ = tileIdx - startIdx;
+    }
+
+    __aicore__ inline int64_t CalcLcm(int64_t a, int64_t b)
+    {
+        if (a == 0 || b == 0) {
+            return 0;
+        }
+        // calc GCD
+        int64_t m = a;
+        int64_t n = b;
+        while (n != 0) {
+            int64_t tmp = m % n;
+            m = n;
+            n = tmp;
+        }
+        int64_t gcd = m;
+        return (a / gcd) * b;
+    }
+
+    __aicore__ inline bool IsMTail(int64_t mTileIdx, int64_t mTileNum)
+    {
+        return (mTileIdx - (mTileNum - 1)) % mTileNum == 0;
+    }
+
+    __aicore__ inline bool IsNTail(int64_t nTileIdx, int64_t nTileNum)
+    {
+        return nTileIdx == (nTileNum - 1);
+    }
+};
+} // namespace CustomMatmulL2Cache
+#endif
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.cpp b/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.cpp
new file mode 100644
index 00000000..1291dea4
--- /dev/null
+++ b/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "kernel_operator.h"
+#include "matmul_l2cache_custom_kernel.h"
+// matmul kernel function
+extern "C" __global__ __aicore__ void matmul_l2cache_custom(
+    GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+    if (g_coreType == AscendC::AIV) {
+        return;
+    }
+    CustomMatmulL2Cache::MatmulKernel<half, half, half, float> matmulKernel;
+    AscendC::TPipe pipe;
+    matmulKernel.Init(a, b, bias, c, workspace, tilingGm);
+    matmulKernel.Process(&pipe);
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+void matmul_l2cache_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b,
+    GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+    matmul_l2cache_custom<<<blockDim, nullptr, stream>>>(a, b, bias, c, workspace, tilingGm);
+}
+#endif
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.h b/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.h
new file mode 100644
index 00000000..edf1845a
--- /dev/null
+++ b/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef EXAMPLES_MATRIX_MATMUL_L2CACHE_OP_KERNEL_MATMUL_L2_CACHE_CUSTOM_KERNEL_H
+#define EXAMPLES_MATRIX_MATMUL_L2CACHE_OP_KERNEL_MATMUL_L2_CACHE_CUSTOM_KERNEL_H
+#include "kernel_operator.h"
+#define ASCENDC_CUBE_ONLY
+#include "lib/matmul_intf.h"
+#include "l2_cache_optimizer.h"
+namespace CustomMatmulL2Cache {
+constexpr static auto GetCFG()
+{
+    auto cfg = CFG_MDL;
+    cfg.singleCoreM = 10000;
+    cfg.singleCoreN = 10000;
+    cfg.singleCoreK = 10000;
+    cfg.basicM = 128;
+    cfg.basicN = 256;
+    cfg.basicK = 64;
+    cfg.enableSetBias = false;
+    cfg.enUnitFlag = true;
+    return cfg;
+}
+constexpr auto CUSTOM_CFG = GetCFG();
+constexpr static auto GetL1Tiling(const MatmulApiStaticTiling& mmTiling)
+{
+    auto tiling = mmTiling;
+    tiling.stepM = 1;
+    tiling.stepN = 1;
+    tiling.stepKa = 4;
+    tiling.stepKb = 4;
+    tiling.depthA1 = 8;
+    tiling.depthB1 = 8;
+    return tiling;
+}
+struct MatmulProblemShape {
+    int32_t m;
+    int32_t n;
+    int32_t k;
+    int32_t isBias;
+};
+
+__aicore__ inline void CopyTiling(MatmulProblemShape* tiling, GM_ADDR tilingGM)
+{
+    int32_t *ptr = reinterpret_cast<int32_t *>(tiling);
+    auto tiling32 = reinterpret_cast<__gm__ int32_t *>(tilingGM);
+    for (size_t i = 0; i < sizeof(MatmulProblemShape) / sizeof(int32_t); ++i, ++ptr) {
+        *ptr = *(tiling32 + i);
+    }
+}
+
+template <typename aType, typename bType, typename cType, typename biasType>
+class MatmulKernel {
+public:
+    using BlockCoord = AscendC::Std::tuple<int64_t, int64_t, int64_t>;
+    __aicore__ inline MatmulKernel(){};
+    __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace,
+                                GM_ADDR tiling);
+    __aicore__ inline void Process(AscendC::TPipe* pipe);
+    using A_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, aType>;
+    using B_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, bType>;
+    using C_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, cType>;
+    using BIAS_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, biasType>;
+    constexpr static auto CONSTANT_CFG = AscendC::GetMatmulApiTiling<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE>(CUSTOM_CFG);
+    constexpr static auto MM_TILING = GetL1Tiling(CONSTANT_CFG);
+    // Use cubeOnly mode
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_TILING> matmulObj;
+    using L2CacheOpt = L2CacheOptimizer<MatmulProblemShape, MM_TILING>;
+    MatmulProblemShape shapes;
+private:
+    __aicore__ inline BlockCoord CalcOffset(
+        const MatmulProblemShape& param, const BlockCoord &blockCoord);
+    __aicore__ inline int64_t GetTotalSize(int64_t m, int64_t n, int64_t k);
+    AscendC::GlobalTensor<aType> aGlobal;
+    AscendC::GlobalTensor<bType> bGlobal;
+    AscendC::GlobalTensor<cType> cGlobal;
+    AscendC::GlobalTensor<biasType> biasGlobal;
+};
+
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulKernel<aType, bType, cType, biasType>::Init(GM_ADDR a,
+    GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling)
+{
+    CopyTiling(&shapes, tiling);
+    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType*>(a), shapes.m * shapes.k);
+    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType*>(b), shapes.k * shapes.n);
+    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType*>(c), shapes.m * shapes.n);
+    biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType*>(bias), shapes.n);
+}
+
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulKernel<aType, bType, cType, biasType>::Process(AscendC::TPipe* pipe)
+{
+    REGIST_MATMUL_OBJ(pipe, GetSysWorkSpacePtr(), matmulObj, (TCubeTiling*)nullptr);
+    auto curBlockIdx = AscendC::GetBlockIdx();
+    auto blockNum = AscendC::GetBlockNum();
+    if (curBlockIdx >= blockNum) {
+        return;
+    }
+    L2CacheOpt l2Opt(shapes, blockNum);
+    matmulObj.SetOrgShape(shapes.m, shapes.n, shapes.k);
+    for (int64_t tileIdx = curBlockIdx; tileIdx < l2Opt.GetTileNum(); tileIdx += blockNum) {
+        auto blockShape = l2Opt.GetBlockShape(tileIdx);
+        if (Get<0>(blockShape) <= 0 || Get<1>(blockShape) <= 0) {
+            return;
+        }
+        auto blockCoord = l2Opt.GetBlockCoord(tileIdx);
+        matmulObj.SetTail(Get<0>(blockShape), 
Get<1>(blockShape), Get<2>(blockShape));
+        const auto& offsetCoord = CalcOffset(shapes, blockCoord);
+        int64_t offsetA = Get<0>(offsetCoord);
+        int64_t offsetB = Get<1>(offsetCoord);
+        int64_t offsetC = Get<2>(offsetCoord);
+        matmulObj.SetTensorA(aGlobal[offsetA], false);
+        matmulObj.SetTensorB(bGlobal[offsetB], false);
+        if (shapes.isBias) {
+            matmulObj.SetBias(biasGlobal);
+        }
+        matmulObj.IterateAll(cGlobal[offsetC]);
+    }
+    matmulObj.End();
+}
+
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline typename MatmulKernel<aType, bType, cType, biasType>::BlockCoord
+MatmulKernel<aType, bType, cType, biasType>::CalcOffset(
+    const MatmulProblemShape& param, const BlockCoord& blockCoord)
+{
+    auto aCoord = AscendC::MakeCoord(Get<0>(blockCoord), Get<2>(blockCoord));
+    auto bCoord = AscendC::MakeCoord(Get<2>(blockCoord), Get<1>(blockCoord));
+    auto cCoord = AscendC::MakeCoord(Get<0>(blockCoord), Get<1>(blockCoord));
+    auto aLayOut = AscendC::MakeLayout(AscendC::MakeShape(param.m, param.k), AscendC::MakeStride(param.k, 1));
+    auto bLayOut = AscendC::MakeLayout(AscendC::MakeShape(param.k, param.n), AscendC::MakeStride(param.n, 1));
+    auto cLayOut = AscendC::MakeLayout(AscendC::MakeShape(param.m, param.n), AscendC::MakeStride(param.n, 1));
+    int64_t offsetA = aLayOut(aCoord);
+    int64_t offsetB = bLayOut(bCoord);
+    int64_t offsetC = cLayOut(cCoord);
+    return {offsetA, offsetB, offsetC};
+}
+} // namespace CustomMatmulL2Cache
+#endif // EXAMPLES_MATRIX_MATMUL_L2CACHE_OP_KERNEL_MATMUL_L2_CACHE_CUSTOM_KERNEL_H
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/run.sh b/examples/matrix/matmul_l2cache/run.sh
index c7bb8bf6..631f7ba3 100644
--- a/examples/matrix/matmul_l2cache/run.sh
+++ b/examples/matrix/matmul_l2cache/run.sh
@@ -8,6 +8,8 @@
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
 
+export IS_PERF="0"
+
 SHORT=r:,v:,p:,
 LONG=run-mode:,soc-version:,perf:,
 OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
@@ -76,6 +78,19 @@
 mkdir output
 rm -rf bin/
 mkdir -p bin
 cd bin
-cp ../build/ascendc_matmul_l2_cache_bbit ./
+cp ../build/ascendc_matmul_bbit ./
+
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python3 -u ../scripts/exec_test.py -r ${RUN_MODE} -p ${IS_PERF}
\ No newline at end of file
+if [ "${RUN_MODE}" = "npu" ]; then
+    if [ "${IS_PERF}" = "1" ]; then
+        export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR}
+        python3 -u ../scripts/exec_test.py npu "perf"
+    else
+        python3 -u ../scripts/exec_test.py npu "normal"
+    fi
+elif [ "${RUN_MODE}" = "sim" ]; then
+    python3 -u ../scripts/exec_test.py sim "normal"
+elif [ "${RUN_MODE}" = "cpu" ]; then
+    python3 -u ../scripts/exec_test.py cpu "normal"
+fi
diff --git a/examples/matrix/matmul_l2cache/scripts/exec_test.py b/examples/matrix/matmul_l2cache/scripts/exec_test.py
index 273be84c..aacdcd2d 100644
--- a/examples/matrix/matmul_l2cache/scripts/exec_test.py
+++ b/examples/matrix/matmul_l2cache/scripts/exec_test.py
@@ -10,109 +10,104 @@
 # See LICENSE in the root of the software repository for the full text of the License.
# ====================================================================================================================== import os -import shlex -import subprocess import sys import csv import time import logging -import argparse -from collections import namedtuple +import subprocess -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from gen_l2_cache_data import gen_golden_data +import numpy as np + +sys.path.append("../..") +from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import clear_file_cache -from common_scripts.exec_utils import get_perf_task_duration +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd -DATA_TYPE_STR = "float16_float32" +IS_BIAS = False +IS_TRANS_A = False +IS_TRANS_B = False +# float16 in float16 out +DATA_TYPE_STR = "float16_float16" logging.basicConfig(level=logging.INFO) -ProcessParams = namedtuple("ProcessParams", ["isProcess", "caseName", 'm', 'n', 'k', 'isBias', 'splitTimes']) - - -def get_case_list(work_path): - case_list = [] - case_dir = os.path.join(work_path, "testcase") - if not os.path.isdir(case_dir): - logging.error(f"case path: {case_dir} not exist!") - return case_list - - for file_name in os.listdir(case_dir): - if not file_name.endswith(".csv"): - continue - - abs_file_name = os.path.join(case_dir, file_name) - with open(abs_file_name, mode='r', encoding='utf-8') as file: - csv_reader = csv.reader(file) - for row in csv_reader: - item_list = [int(i) if i.strip().isdigit() else i.strip() for i in row] - case_list.append(item_list) - return case_list - -def process_case(work_path, args, params): - case_name, m, n, k, is_bias, split_times = \ - (params.caseName, params.m, params.n, params.k, params.isBias, params.splitTimes) - run_mode, is_perf = args.r, args.p +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode - logging.info(f"start process case [{case_name}]") - logging.info(f"IS_PERF is set [{args.p}]") - clear_file_cache(work_path) +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + clear_file_cache(file_work_dir) - gen_golden_data(work_path, params) - params_str = f"{m} {n} {k} {is_bias} {split_times}" - if args.p: # npu(is_perf = 1) - cmd = f"msprof op --application=\"./ascendc_matmul_l2_cache_bbit {params_str}\" --output=./prof_out" - elif run_mode == "sim": # sim - cmd = f"msprof op simulator --application=\"./ascendc_matmul_l2_cache_bbit {params_str}\" --output=./sim_out" - else: # cpu or npu(is_perf = 0) - cmd = f"./ascendc_matmul_l2_cache_bbit {params_str}" - subprocess.run(shlex.split(cmd)) - - total_num = params.m * params.n - if args.p: + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + else: + matmul_gen_data.gen_golden_data(file_work_dir) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_bbit", 
params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) + if is_perf: wrong_num = -1 - result = None - task_duration = get_perf_task_duration("./prof_out") else: - logging.info(f"compare data case[{case_name}]") - wrong_num = compare_data(work_path, DATA_TYPE_STR) - result = "Fail" if (wrong_num / total_num > 0.001 or wrong_num < 0) else "Success" - task_duration = None - - return [params.caseName, wrong_num, total_num, result, task_duration] + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(b * m * n) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (b * m * n) > 0.001: + res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data def main(): - parser = argparse.ArgumentParser() - parser.add_argument('-r', type=str, choices=['cpu', 'sim', 'npu'], required=True) - parser.add_argument('-p', type=int, default=0, required=False) - args = parser.parse_args() + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 - if args.r != 'npu' and args.p: - logging.error(f"IS_PERF can only be enabled in 'npu' mode, current mode: '{args.r}'.") + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" % (file_work_dir)) return -1 - work_path = os.path.dirname(os.getcwd()) + is_perf = False + if sys.argv[2] == "perf": + is_perf = True - case_list = get_case_list(work_path) + case_list = get_case_list() res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] - - for case in case_list: - try: - process_params = ProcessParams(*case) - except Exception as e: - logging.error(f"{case[0]}: invalid parameter, error msg: {e}") - continue - if process_params.isProcess: - res_data = process_case(work_path, args, process_params) + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) res_list.append(res_data) - result_file_name = f"result_{time.time()}.csv" - with open(os.path.join(work_path, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: writer = csv.writer(csvfile) writer.writerows(res_list) diff --git a/examples/matrix/matmul_l2cache/testcase/case.csv b/examples/matrix/matmul_l2cache/testcase/case.csv index c4c30f8f..0ee900b2 100644 --- a/examples/matrix/matmul_l2cache/testcase/case.csv +++ b/examples/matrix/matmul_l2cache/testcase/case.csv @@ -1 +1 @@ -1, case001, 30720, 1024, 1024, 1, 2 \ No newline at end of file +1, case001, 30720, 1024, 4096 \ No newline at end of file -- Gitee
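
For reference, below is a minimal host-side sketch (not part of the patch) of the footprint arithmetic that the README walks through and that `GetTotalSize()`/`L2_TILE_THRESHOLD` in `l2_cache_optimizer.h` apply per candidate block. The shape comes from `testcase/case.csv`; the quarter-M split is an arbitrary illustrative candidate, not necessarily the split the optimizer picks:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Problem shape from testcase/case.csv: m = 30720, n = 1024, k = 4096
    constexpr int64_t m = 30720;
    constexpr int64_t n = 1024;
    constexpr int64_t k = 4096;
    constexpr int64_t elemBytes = 2; // float16 elements for A, B and C

    // Full footprint of A (m*k), B (k*n), bias (n) and C (m*n), as in the README:
    // larger than the 192MB L2 cache, so reads cannot all stay L2-resident.
    int64_t total = (m * k + k * n + n + m * n) * elemBytes;
    std::printf("total footprint: %lld bytes (~%lld MB)\n",
                static_cast<long long>(total), static_cast<long long>(total >> 20));

    // The optimizer only accepts an L2 block whose A/B/C footprint fits the
    // 100MB threshold (GetTotalSize(...) <= L2_TILE_THRESHOLD).
    constexpr int64_t l2TileThreshold = 100 * 1024 * 1024;
    int64_t mL2 = m / 4; // hypothetical candidate: split the M axis into four blocks
    int64_t blockBytes = (mL2 * k + k * n + mL2 * n) * elemBytes;
    std::printf("m/4 block: %lld bytes, fits under threshold: %s\n",
                static_cast<long long>(blockBytes),
                blockBytes <= l2TileThreshold ? "yes" : "no");
    return 0;
}
```

Compiled as plain host C++, this prints a total of 322963456 bytes (the README's ≈308M) for the full problem, and an 87031808-byte (≈83MB) quarter-M block that passes the threshold check, which is the condition the kernel-side `InitL2TileTail()` search enforces for every candidate split.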