From 0f409d8261edd450c63358383952d67578f1ac07 Mon Sep 17 00:00:00 2001 From: anzoola Date: Sat, 29 Mar 2025 08:08:10 +0000 Subject: [PATCH 1/3] =?UTF-8?q?!2618=20VectorAdd=E6=A0=B7=E4=BE=8B?= =?UTF-8?q?=E6=8B=86=E5=88=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../add_custom.cpp | 213 ---- .../21_vectoradd_kernellaunch/README.md | 102 +- .../CMakeLists.txt | 46 + .../VectorAddMultiCoreWithTiling/README.md | 82 ++ .../add_custom.cpp | 399 ++++++++ .../add_custom_tiling.cpp | 112 +++ .../add_custom_tiling.h | 7 +- .../cmake/cpu_lib.cmake | 0 .../cmake/npu_lib.cmake | 0 .../data_utils.h | 0 .../main.cpp | 145 +-- .../run.sh | 0 .../scripts/gen_data.py | 11 +- .../scripts/verify_result.py | 9 +- .../CMakeLists.txt | 46 + .../README.md | 83 ++ .../add_custom.cpp | 909 ++++++++++++++++++ .../add_custom_tiling.cpp | 111 +++ .../add_custom_tiling.h | 39 + .../cmake/cpu_lib.cmake | 9 + .../cmake/npu_lib.cmake | 11 + .../data_utils.h | 203 ++++ .../main.cpp | 109 +++ .../run.sh | 124 +++ .../scripts/gen_data.py | 56 ++ .../scripts/verify_result.py | 58 ++ .../CMakeLists.txt | 0 .../README.md | 7 +- .../VectorAddSingleCore/add_custom.cpp | 81 ++ .../VectorAddSingleCore/cmake/cpu_lib.cmake | 9 + .../VectorAddSingleCore/cmake/npu_lib.cmake | 11 + .../VectorAddSingleCore/data_utils.h | 203 ++++ .../VectorAddSingleCore/main.cpp | 83 ++ .../VectorAddSingleCore/run.sh | 124 +++ .../VectorAddSingleCore/scripts/gen_data.py | 30 + .../scripts/verify_result.py | 57 ++ .../CMakeLists.txt | 44 + .../VectorAddSingleCoreWithTmpbuf/README.md | 72 ++ .../add_custom.cpp | 95 ++ .../cmake/cpu_lib.cmake | 9 + .../cmake/npu_lib.cmake | 11 + .../data_utils.h | 203 ++++ .../VectorAddSingleCoreWithTmpbuf/main.cpp | 83 ++ .../VectorAddSingleCoreWithTmpbuf/run.sh | 124 +++ .../scripts/gen_data.py | 30 + .../scripts/verify_result.py | 57 ++ operator/ascendc/0_introduction/README.md | 1 + 47 files changed, 3849 insertions(+), 369 deletions(-) 
delete mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/README.md create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddMultiCoreWithTiling}/add_custom_tiling.h (92%) rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddMultiCoreWithTiling}/cmake/cpu_lib.cmake (100%) rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddMultiCoreWithTiling}/cmake/npu_lib.cmake (100%) rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddMultiCoreWithTiling}/data_utils.h (100%) rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddMultiCoreWithTiling}/main.cpp (34%) rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddMultiCoreWithTiling}/run.sh (100%) rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddMultiCoreWithTiling}/scripts/gen_data.py (72%) rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddMultiCoreWithTiling}/scripts/verify_result.py (87%) create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/CMakeLists.txt create mode 100644 
operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/README.md create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.h create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/cmake/cpu_lib.cmake create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/cmake/npu_lib.cmake create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/data_utils.h create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/run.sh create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/verify_result.py rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddSingleCore}/CMakeLists.txt (100%) rename operator/ascendc/0_introduction/21_vectoradd_kernellaunch/{AddKernelInvocationTilingNeo => VectorAddSingleCore}/README.md (81%) create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/add_custom.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/cmake/cpu_lib.cmake create mode 100644 
operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/cmake/npu_lib.cmake create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/data_utils.h create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/main.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/run.sh create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/gen_data.py create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/verify_result.py create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/README.md create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/add_custom.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/cmake/cpu_lib.cmake create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/cmake/npu_lib.cmake create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/data_utils.h create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/main.cpp create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/run.sh create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/gen_data.py create mode 100644 operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp 
b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp deleted file mode 100644 index 78a6ca001..000000000 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/add_custom.cpp +++ /dev/null @@ -1,213 +0,0 @@ -/** - * @file add_custom.cpp - * - * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ -#include "add_custom_tiling.h" -#include "kernel_operator.h" - -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue - -constexpr uint32_t ADD_BFLOAT16 = 0; -constexpr uint32_t ADD_FLOAT16 = 1; -constexpr uint32_t ADD_FLOAT32 = 2; -constexpr uint32_t ADD_INT8 = 3; -constexpr uint32_t ADD_INT16 = 4; -constexpr uint32_t ADD_INT32 = 5; - -template class KernelAdd { -public: - __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) - { - this->xLength = tiling.xLength; - this->yLength = tiling.yLength; - if (tiling.isEvenCore) { - this->blockLength = tiling.blockLength; - this->tileNum = tiling.tileNum; - this->tileLength = tiling.tileLength / BUFFER_NUM; - this->lastTileLength = tiling.lastTileLength; - - xGm.SetGlobalBuffer((__gm__ dataType *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - yGm.SetGlobalBuffer((__gm__ dataType *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - zGm.SetGlobalBuffer((__gm__ dataType *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - } else { - this->formerNum = tiling.formerNum; - this->formerLength = tiling.formerLength; - this->formerTileNum = tiling.formerTileNum; - this->formerTileLength = tiling.formerTileLength; - this->formerLastTileLength = tiling.formerLastTileLength; - - this->tailNum = 
tiling.tailNum; - this->tailLength = tiling.tailLength; - this->tailTileNum = tiling.tailTileNum; - this->tailTileLength = tiling.tailTileLength; - this->tailLastTileLength = tiling.tailLastTileLength; - if (AscendC::GetBlockIdx() < this->formerNum) { - this->tileNum = this->formerTileNum; - this->tileLength = this->formerTileLength / BUFFER_NUM; - this->lastTileLength = this->formerLastTileLength; - - xGm.SetGlobalBuffer((__gm__ dataType *)x + this->formerLength * AscendC::GetBlockIdx(), this->formerLength); - yGm.SetGlobalBuffer((__gm__ dataType *)y + this->formerLength * AscendC::GetBlockIdx(), this->formerLength); - zGm.SetGlobalBuffer((__gm__ dataType *)z + this->formerLength * AscendC::GetBlockIdx(), this->formerLength); - } else { - this->tileNum = this->tailTileNum * BUFFER_NUM; - this->tileLength = this->tailTileLength / BUFFER_NUM; - this->lastTileLength = tailLastTileLength; - - xGm.SetGlobalBuffer((__gm__ dataType *)x + this->formerLength * this->formerNum + - this->tailLength * (AscendC::GetBlockIdx() - this->formerNum), this->tailLength); - yGm.SetGlobalBuffer((__gm__ dataType *)y + this->formerLength * this->formerNum + - this->tailLength * (AscendC::GetBlockIdx() - this->formerNum), this->tailLength); - zGm.SetGlobalBuffer((__gm__ dataType *)z + this->formerLength * this->formerNum + - this->tailLength * (AscendC::GetBlockIdx() - this->formerNum), this->tailLength); - } - } - pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(dataType)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(dataType)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(dataType)); - - if constexpr (AscendC::IsSameType::value) { - pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(float)); - pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(float)); - } else if constexpr (AscendC::IsSameType::value) { - pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(half)); - pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(half)); - } 
- } - __aicore__ inline void Process() - { - int32_t loopCount = this->tileNum * BUFFER_NUM; - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); - AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopy(xLocal, xGm[(progress - 2) * this->tileLength + this->lastTileLength], - this->tileLength); - AscendC::DataCopy(yLocal, yGm[(progress - 2) * this->tileLength + this->lastTileLength], - this->tileLength); - } else { - AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); - AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); - } - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute(int32_t progress) - { - AscendC::LocalTensor xLocal = inQueueX.DeQue(); - AscendC::LocalTensor yLocal = inQueueY.DeQue(); - AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - - if constexpr (AscendC::IsSameType::value) { - AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); - AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); - - AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - - AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); - AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, this->tileLength); - } else if constexpr (AscendC::IsSameType::value) { - AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); - AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); - - AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - - AscendC::Add(tmpTensor0, tmpTensor0, 
tmpTensor1, this->tileLength); - AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); - } else { - AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); - } - - outQueueZ.EnQue(zLocal); - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - - } - __aicore__ inline void CopyOut(int32_t progress) - { - AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopy(zGm[(progress - 2) * this->tileLength + this->lastTileLength], zLocal, - this->tileLength); - } else { - AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); - } - outQueueZ.FreeTensor(zLocal); - } - -private: - AscendC::TPipe pipe; - AscendC::TQue inQueueX, inQueueY; - AscendC::TQue outQueueZ; - AscendC::TBuf tmpBuf0, tmpBuf1; - - AscendC::GlobalTensor xGm, yGm, zGm; - - uint32_t xLength; - uint32_t yLength; - - uint32_t blockLength; - uint32_t tileNum; - uint32_t tileLength; - uint32_t lastTileLength; - - uint32_t formerNum; - uint32_t formerLength; - uint32_t formerTileNum; - uint32_t formerTileLength; - uint32_t formerLastTileLength; - - uint32_t tailNum; - uint32_t tailLength; - uint32_t tailTileNum; - uint32_t tailTileLength; - uint32_t tailLastTileLength; -}; - -extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) -{ - if (tiling.dataType == ADD_BFLOAT16) { - KernelAdd op; - op.Init(x, y, z, tiling); - op.Process(); - } else if (tiling.dataType == ADD_FLOAT16) { - KernelAdd op; - op.Init(x, y, z, tiling); - op.Process(); - } else if (tiling.dataType == ADD_FLOAT32) { - KernelAdd op; - op.Init(x, y, z, tiling); - op.Process(); - } else if (tiling.dataType == ADD_INT8) { - KernelAdd op; - op.Init(x, y, z, tiling); - op.Process(); - } else if (tiling.dataType == ADD_INT16) { - KernelAdd op; - op.Init(x, y, z, tiling); - op.Process(); - } else if (tiling.dataType == 
ADD_INT32) { - KernelAdd op; - op.Init(x, y, z, tiling); - op.Process(); - } else { - return; - } -} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md index c9e7f6771..bda5a0626 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md @@ -1,10 +1,100 @@ ## 概述 -本样例介绍Add算子的核函数直调方法。 +本样例介绍三个不同的Add自定义算子,场景分别如下: + +- [基础Add算子](./VectorAddSingleCore):支持的数据类型有:half,算子支持单核运行。 +- [使用临时内存Add算子](./VectorAddSingleCoreWithTmpbuf):支持的数据类型有:bfloat16_t,算子支持单核运行,算子内部使用TmpBuf。 +- [多核Add算子](./VectorAddMultiCoreWithTiling):支持的数据类型有:bfloat16_t/int8_t/float/half/int16_t/int32_t,算子支持多核运行、支持核间数据均分或不均分场景并且支持尾块处理。 +- [输入Broadcast的Add算子](./VectorAddMultiCoreWithTilingBroadcast):两个输入shape不相等,算子对其中一个输入进行Broadcast后再进行计算。支持的数据类型有:bfloat16_t/int8_t/float/half/int16_t/int32_t,算子支持多核运行、支持核间数据均分或不均分场景并且支持尾块处理。 ## 目录结构介绍 ``` -├── KernelLaunch // 使用核函数直调的方式调用Add自定义算子 -│ ├── AddKernelInvocationTilingNeo // Kernel Launch方式调用核函数样例,带有Tiling +├── 21_vectoradd_kernellaunch // 使用核函数直调的方式调用Add自定义算子 +│ ├── VectorAddSingleCore // Kernel Launch方式调用核函数样例 +│ ├── VectorAddSingleCoreWithTmpbuf // Kernel Launch方式调用核函数样例,带有TmpBuffer +│ ├── VectorAddMultiCoreWithTiling // Kernel Launch方式调用核函数样例,带有多核&tiling切分 +│ └── VectorAddMultiCoreWithTilingBroadcast // Kernel Launch方式调用核函数样例,多核&tiling场景下增加输入Broadcast +``` + +## 算子描述 +Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: +``` +z = x + y ``` +## 算子规格描述 +- VectorAddSingleCore + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x1 * 2048halfND
y1 * 2048halfND
算子输出z1 * 2048halfND
核函数名add_custom
+ +- VectorAddSingleCoreWithTmpbuf + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x1 * 2048bfloat16_tND
y1 * 2048bfloat16_tND
算子输出z1 * 2048bfloat16_tND
核函数名add_custom
+ +- VectorAddMultiCoreWithTiling + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x-bfloat16_t/int8_t/float/half/int16_t/int32_tND
y-bfloat16_t/int8_t/float/half/int16_t/int32_tND
算子输出z-bfloat16_t/int8_t/float/half/int16_t/int32_tND
核函数名add_custom
+ +- VectorAddMultiCoreWithTilingBroadcast + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x(m, n) / (1, n) / (m, 1)bfloat16_t/int8_t/float/half/int16_t/int32_tND
y(m, n) / (1, n) / (m, 1)bfloat16_t/int8_t/float/half/int16_t/int32_tND
算子输出z(m, n)bfloat16_t/int8_t/float/half/int16_t/int32_tND
核函数名add_custom
+ + +该算子支持对任一输入的某个轴进行广播,对应关系如下 + +- 针对axis = 0(第一个轴)进行广播 + - x(m, n), y(1, n) + - x(1, n), y(m, n) + + 注意,该场景下m需满足32字节对齐。 + +- 针对axis = 1(第二个轴)进行广播 + - x(m, n), y(m, 1) + - x(m, 1), y(m, n) + + + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + ## 编译运行样例算子 针对自定义算子工程,编译运行包含如下步骤: - 编译自定义算子工程; @@ -14,5 +104,7 @@ ### 1. 获取源码包 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 ### 2. 编译运行样例工程 -- [AddKernelInvocationTilingNeo样例运行](./AddKernelInvocationTilingNeo/README.md) - +- [VectorAddSingleCore样例运行](./VectorAddSingleCore/README.md) +- [VectorAddSingleCoreWithTmpbuf样例运行](./VectorAddSingleCoreWithTmpbuf/README.md) +- [VectorAddMultiCoreWithTiling样例运行](./VectorAddMultiCoreWithTiling/README.md) +- [VectorAddMultiCoreWithTilingBroadcast样例运行](./VectorAddMultiCoreWithTilingBroadcast/README.md) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/CMakeLists.txt b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/CMakeLists.txt new file mode 100644 index 000000000..dc32860d9 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/CMakeLists.txt @@ -0,0 +1,46 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
+# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom.cpp) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_tiling.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/README.md new file mode 100644 index 000000000..d27f69587 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/README.md @@ -0,0 +1,82 @@ +## 目录结构介绍 +``` +├── VectorAddMultiCoreWithTiling +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom_tiling.h // 算子tiling结构体 +│ ├── add_custom_tiling.cpp // 算子tiling实现 +│ ├── add_custom.cpp // 算子kernel实现 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先计算出不同核上计算的数据相对首地址的偏移并搬运进片上存储。如果涉及到Add算子不支持的数据类型,则需要申请额外的tmpBufeer,并在计算前使用Cast算子进行类型转换,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + 
Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 + + 具体请参考[add_custom.cpp](./add_custom.cpp)。 +- tiling实现 + - TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数。 + + 本样例尝试将输入数据均分在给定数目的核上计算,如不能均分,则计算整核和尾核的个数,整核和尾核分别处理不同数据量的数据。定义UB_BLOCK_NUM最为最大可用的UB数据块,对应UB的大小为UB_BLOCK_NUM * 32 Bytes。 并以UB_BLOCK_NUM为粒度进行tiling切分。算子中使用了17个tiling参数: + - totalLength、dataType为输入数据的总长度和数据类型; + - blockLength、tileNum、 tileLength、lastTileLength为核均分场景下每个核上的计算量、tiling切分个数以及整块和尾块长度; + - former(tail)Num、former(tail)Length、former(tail)TileNum、former(tail)TileLength、former(tail)LastTileLength为核不均分的场景下整核和尾核相关的切分数据,与核均分场景下对应; + - isEvenCore用来判断当前是否为核均分场景。 + + 具体请参考[add_custom_tiling.cpp](./add_custom_tiling.cpp)。 +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/ + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 配置仿真模式日志文件目录,默认为sim_log。 + ```bash + export CAMODEL_LOG_PATH=./sim_log + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu],默认值为npu。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi 
info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp new file mode 100644 index 000000000..8e9dc83f0 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp @@ -0,0 +1,399 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +constexpr uint32_t ADD_BFLOAT16 = 0; +constexpr uint32_t ADD_FLOAT16 = 1; +constexpr uint32_t ADD_FLOAT32 = 2; +constexpr uint32_t ADD_INT8 = 3; +constexpr uint32_t ADD_INT16 = 4; +constexpr uint32_t ADD_INT32 = 5; + +constexpr uint32_t LAST_TWO_TILE = 2; + +template class KernelAdd; +template <> class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + if (tiling.isEvenCore) { + this->blockLength = tiling.blockLength; + this->tileNum = tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + 
zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); + + pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(float)); + pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(float)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + 
AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + this->tileLength); + AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + this->tileLength); + } else { + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + } + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); + AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); + + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + + AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, this->tileLength); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, + this->tileLength); + } else { + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + 
AscendC::TQue outQueueZ; + AscendC::TBuf tmpBuf0; + AscendC::TBuf tmpBuf1; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +template <> class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + if (tiling.isEvenCore) { + this->blockLength = tiling.blockLength; + this->tileNum = tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + xGm.SetGlobalBuffer((__gm__ int8_t *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + xGm.SetGlobalBuffer((__gm__ int8_t *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; + + xGm.SetGlobalBuffer((__gm__ int8_t *)x + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)y + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), 
tiling.tailLength); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(int8_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(int8_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(int8_t)); + + pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(half)); + pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(half)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + this->tileLength); + AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + this->tileLength); + } else { + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + } + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); + AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); + + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + + 
AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, + this->tileLength); + } else { + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + AscendC::TBuf tmpBuf0; + AscendC::TBuf tmpBuf1; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +template class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + if (tiling.isEvenCore) { + this->blockLength = tiling.blockLength; + this->tileNum = tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + xGm.SetGlobalBuffer((__gm__ dataType *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ dataType *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ dataType *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + 
xGm.SetGlobalBuffer((__gm__ dataType *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ dataType *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; + + xGm.SetGlobalBuffer((__gm__ dataType *)x + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ dataType *)y + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(dataType)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(dataType)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(dataType)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + this->tileLength); + AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + this->tileLength); + } else { + 
AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + } + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, + this->tileLength); + } else { + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) +{ + if (tiling.dataType == ADD_BFLOAT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_FLOAT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_FLOAT32) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_INT8) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_INT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + 
} else if (tiling.dataType == ADD_INT32) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else { + return; + } +} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp new file mode 100644 index 000000000..cfb838a67 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp @@ -0,0 +1,112 @@ +/** + * @file add_custom_tiling.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include +#include +#include "add_custom_tiling.h" + +// bfloat16, float16, float, int8, int16, int32 +constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; +constexpr uint32_t BLOCK_SIZE = 32; +constexpr uint32_t BUFFER_NUM = 2; +constexpr uint32_t UB_BLOCK_NUM = 21; // UB最大可以使用的block数量 +constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM; +constexpr uint32_t BLOCK_DIM = 9; + +// tiling参数计算函数 +void TilingParamsCalc(uint32_t length, uint32_t alignNum, + uint32_t& tileNum, uint32_t& tileLength, uint32_t& lastTileLength) +{ + assert(alignNum != 0U); + tileNum = length / (alignNum * MAX_AVAILABLE_UB_BLOCK_NUM); + if ((static_cast(length / alignNum) % MAX_AVAILABLE_UB_BLOCK_NUM == 0U) || tileNum == 0U) { + if (tileNum == 0U) { + tileNum = 1U; + } + if (length < MAX_AVAILABLE_UB_BLOCK_NUM * alignNum) { + tileLength = ((static_cast(length) / alignNum) + 1) / BUFFER_NUM * BUFFER_NUM * alignNum; + lastTileLength = tileLength; + } else { + tileLength = MAX_AVAILABLE_UB_BLOCK_NUM * alignNum; + lastTileLength = (uint32_t)(length - (tileNum - 1) * tileLength); + } + } else { + 
tileNum++; + tileLength = MAX_AVAILABLE_UB_BLOCK_NUM * alignNum; + lastTileLength = (uint32_t)(length - (tileNum - 1) * tileLength); + } +} + +void GenerateTilingData(uint8_t* tilingBuf) +{ + uint32_t totalLength; + uint32_t dataTypeSize; + uint32_t blockLength; + uint32_t totalLengthAligned; + + AddCustomTilingData *tiling = reinterpret_cast(tilingBuf); + totalLength = tiling->totalLength; + dataTypeSize = DATA_TYPE_SIZE[tiling->dataType]; + + uint32_t alignNum = BLOCK_SIZE / dataTypeSize; + assert(alignNum != 0U); + /** 计算使用的核数 **/ + /* 如果传入数据的长度非32B对齐, 计算对齐后的长度*/ + totalLengthAligned = (totalLength % alignNum == 0U)? + static_cast(totalLength) : + ((static_cast(totalLength) + alignNum - 1) / alignNum) * alignNum; + + /* 核间可均分场景 */ + if ((totalLengthAligned / alignNum) % BLOCK_DIM == 0U) { + uint32_t tileNum = 0; + uint32_t tileLength = 0; + uint32_t lastTileLength = 0; + blockLength = totalLengthAligned / BLOCK_DIM; + TilingParamsCalc(blockLength, alignNum, tileNum, tileLength, lastTileLength); + + tiling->blockLength = blockLength; + tiling->tileNum = tileNum; + tiling->tileLength = tileLength; + tiling->lastTileLength = lastTileLength; + tiling->isEvenCore = 1U; + } else { // 核间不可均分 + uint32_t formerNum = (totalLengthAligned / alignNum) % BLOCK_DIM; + uint32_t tailNum = BLOCK_DIM - formerNum; + // 计算整块和尾块的数据量 + uint32_t formerLength = + static_cast(((totalLengthAligned + BLOCK_DIM - 1) / BLOCK_DIM + alignNum - 1) / alignNum) * alignNum; + uint32_t tailLength = (totalLengthAligned / BLOCK_DIM / alignNum) * alignNum; + + uint32_t formerTileNum; + uint32_t formerTileLength; + uint32_t formerLastTileLength; + + uint32_t tailTileNum; + uint32_t tailTileLength; + uint32_t tailLastTileLength; + + TilingParamsCalc(formerLength, alignNum, + formerTileNum, formerTileLength, formerLastTileLength); + TilingParamsCalc(tailLength, alignNum, + tailTileNum, tailTileLength, tailLastTileLength); + + tiling->formerNum = formerNum; + tiling->formerLength = formerLength; + 
tiling->formerTileNum = formerTileNum; + tiling->formerTileLength = formerTileLength; + tiling->formerLastTileLength = formerLastTileLength; + + tiling->tailNum = tailNum; + tiling->tailLength = tailLength; + tiling->tailTileNum = tailTileNum; + tiling->tailTileLength = tailTileLength; + tiling->tailLastTileLength = tailLastTileLength; + tiling->isEvenCore = 0U; + } +} \ No newline at end of file diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/add_custom_tiling.h b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h similarity index 92% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/add_custom_tiling.h rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h index 4f9e259ed..6293b7713 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/add_custom_tiling.h +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h @@ -12,8 +12,7 @@ #include struct AddCustomTilingData { - uint32_t xLength; - uint32_t yLength; + uint32_t totalLength; uint32_t dataType; uint32_t blockLength; @@ -25,16 +24,14 @@ struct AddCustomTilingData { uint32_t formerLength; uint32_t formerTileNum; uint32_t formerTileLength; + uint32_t formerLastTileLength; uint32_t tailNum; uint32_t tailLength; uint32_t tailTileNum; uint32_t tailTileLength; - - uint32_t formerLastTileLength; uint32_t tailLastTileLength; uint32_t isEvenCore; - uint32_t needBroadcast; }; #endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/cmake/cpu_lib.cmake similarity index 100% rename from 
operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/cmake/cpu_lib.cmake rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/cmake/cpu_lib.cmake diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/cmake/npu_lib.cmake similarity index 100% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/cmake/npu_lib.cmake rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/cmake/npu_lib.cmake diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/data_utils.h b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/data_utils.h similarity index 100% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/data_utils.h rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/data_utils.h diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/main.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp similarity index 34% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/main.cpp rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp index f855ea411..3e35408b7 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/main.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp @@ -17,47 +17,20 @@ #include "tikicpulib.h" extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData 
tiling); #endif +extern void GenerateTilingData(uint8_t* tilingBuf); -// bfloat16, float16, float, int8, int16, int32 -constexpr uint32_t DataTypeSize[] = {2, 2, 4, 1, 2, 4}; -constexpr uint32_t BlockSize = 32; -constexpr uint32_t BufferNum = 2; -constexpr uint32_t UbBlockNum = 20; // UB最大可以使用的block数量 -constexpr uint32_t BlockDim = 9; -void UnevenCoreTilingCalc(uint32_t length, uint32_t alignNum, - uint32_t& tileNum, uint32_t& tileLength, uint32_t& lastTileLength) -{ - tileNum = length / alignNum / UbBlockNum; - if ((length / alignNum) % UbBlockNum == 0 || tileNum == 0) { - if (tileNum == 0) { - tileNum = 1; - } - if (length < UbBlockNum * alignNum) { - tileLength = ((length / alignNum) + 1) / BufferNum * BufferNum * alignNum; - lastTileLength = tileLength; - } else { - tileLength = UbBlockNum * alignNum; - lastTileLength = tileLength; - } - } else { - tileNum = tileNum + 1; - tileLength = UbBlockNum * alignNum; - lastTileLength = (length - (tileNum - 1) * tileLength); - } -} int32_t main(int32_t argc, char *argv[]) { - size_t tilingSize = 19 * sizeof(uint32_t); - uint32_t xLength, yLength, totalLength, tileNum, dataTypeSize, blockLength, totalLengthAligned; + constexpr uint32_t BLOCK_DIM = 9; + constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; + uint8_t *tiling = nullptr; + size_t tilingSize = 17 * sizeof(uint32_t); + #ifdef ASCENDC_CPU_DEBUG - uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); ReadFile("./input/input_tiling.bin", tilingSize, tiling, tilingSize); - - xLength = reinterpret_cast(tiling)->xLength; - yLength = reinterpret_cast(tiling)->yLength; - dataTypeSize = DataTypeSize[reinterpret_cast(tiling)->dataType]; #else CHECK_ACL(aclInit(nullptr)); int32_t deviceId = 0; @@ -65,107 +38,16 @@ int32_t main(int32_t argc, char *argv[]) aclrtStream stream = nullptr; CHECK_ACL(aclrtCreateStream(&stream)); - AddCustomTilingData *tiling; uint8_t *xHost, *yHost, *zHost; uint8_t *xDevice, *yDevice, 
*zDevice; CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); ReadFile("./input/input_tiling.bin", tilingSize, tiling, tilingSize); - - xLength = tiling->xLength; - yLength = tiling->yLength; - dataTypeSize = DataTypeSize[tiling->dataType]; -#endif - totalLength = (xLength > yLength)? xLength : yLength; - - size_t inputByteSize = totalLength * dataTypeSize; - size_t outputByteSize = totalLength * dataTypeSize; - uint32_t alignNum = BlockSize / dataTypeSize; - - /** 计算使用的核数 **/ - /* 如果传入数据的长度非32B对齐, 计算对齐后的长度*/ - totalLengthAligned = (totalLength % alignNum == 0)? - totalLength : ((totalLength + alignNum - 1) / alignNum) * alignNum; - - /* 核间可均分场景 */ - if ((totalLengthAligned / alignNum) % BlockDim == 0) { - uint32_t tileLength = 0; - uint32_t lastTileLength = 0; - blockLength = totalLengthAligned / BlockDim; - tileNum = blockLength / alignNum / UbBlockNum; - - if ((blockLength / alignNum) % UbBlockNum == 0 || tileNum == 0) { //满足32字节对齐,可以核内均分 - if (tileNum == 0) { - tileNum = 1; - } - if (blockLength < UbBlockNum * alignNum) { - tileLength = ((blockLength / alignNum) + 1) / BufferNum * BufferNum * alignNum; - lastTileLength = tileLength; - } else { - tileLength = UbBlockNum * alignNum; - lastTileLength = tileLength; - } - } else { - tileNum = tileNum + 1; - tileLength = UbBlockNum * alignNum; - lastTileLength = blockLength - (tileNum - 1) * tileLength; - } - -#ifdef ASCENDC_CPU_DEBUG - reinterpret_cast(tiling)->blockLength = blockLength; - reinterpret_cast(tiling)->tileNum = tileNum; - reinterpret_cast(tiling)->tileLength = tileLength; - reinterpret_cast(tiling)->lastTileLength = lastTileLength; - reinterpret_cast(tiling)->isEvenCore = 1; -#else - tiling->blockLength = blockLength; - tiling->tileNum = tileNum; - tiling->tileLength = tileLength; - tiling->lastTileLength = lastTileLength; - tiling->isEvenCore = 1; -#endif - } else { // 核间不可均分 - uint32_t formerNum = (totalLengthAligned / alignNum) % BlockDim; - uint32_t tailNum = BlockDim - formerNum; - // 
计算整块和尾块的数据量 - uint32_t formerLength = - (((totalLengthAligned + BlockDim - 1) / BlockDim + alignNum - 1) / alignNum) * alignNum; - uint32_t tailLength = (totalLengthAligned / BlockDim / alignNum) * alignNum; - - uint32_t formerTileNum, formerTileLength, formerLastTileLength; - uint32_t tailTileNum, tailTileLength, tailLastTileLength; - - UnevenCoreTilingCalc(formerLength, alignNum, formerTileNum, formerTileLength, formerLastTileLength); - UnevenCoreTilingCalc(tailLength, alignNum, tailTileNum, tailTileLength, tailLastTileLength); - -#ifdef ASCENDC_CPU_DEBUG - reinterpret_cast(tiling)->formerNum = formerNum; - reinterpret_cast(tiling)->formerLength = formerLength; - reinterpret_cast(tiling)->formerTileNum = formerTileNum; - reinterpret_cast(tiling)->formerTileLength = formerTileLength; - reinterpret_cast(tiling)->formerLastTileLength = formerLastTileLength; - - reinterpret_cast(tiling)->tailNum = tailNum; - reinterpret_cast(tiling)->tailLength = tailLength; - reinterpret_cast(tiling)->tailTileNum = tailTileNum; - reinterpret_cast(tiling)->tailTileLength = tailTileLength; - reinterpret_cast(tiling)->tailLastTileLength = tailLastTileLength; - reinterpret_cast(tiling)->isEvenCore = 0; -#else - tiling->formerNum = formerNum; - tiling->formerLength = formerLength; - tiling->formerTileNum = formerTileNum; - tiling->formerTileLength = formerTileLength; - tiling->formerLastTileLength = formerLastTileLength; - - tiling->tailNum = tailNum; - tiling->tailLength = tailLength; - tiling->tailTileNum = tailTileNum; - tiling->tailTileLength = tailTileLength; - tiling->tailLastTileLength = tailLastTileLength; - tiling->isEvenCore = 0; #endif - } + GenerateTilingData(tiling); + uint32_t dataTypeSize = DATA_TYPE_SIZE[reinterpret_cast(tiling)->dataType]; + size_t inputByteSize = reinterpret_cast(tiling)->totalLength * dataTypeSize; + size_t outputByteSize = reinterpret_cast(tiling)->totalLength * dataTypeSize; #ifdef ASCENDC_CPU_DEBUG uint8_t *x = (uint8_t 
*)AscendC::GmAlloc(inputByteSize); @@ -177,7 +59,7 @@ int32_t main(int32_t argc, char *argv[]) AscendC::SetKernelMode(KernelMode::AIV_MODE); - ICPU_RUN_KF(add_custom, BlockDim, x, y, z, + ICPU_RUN_KF(add_custom, BLOCK_DIM, x, y, z, *reinterpret_cast(tiling)); WriteFile("./output/output_z.bin", z, outputByteSize); @@ -200,7 +82,8 @@ int32_t main(int32_t argc, char *argv[]) CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - ACLRT_LAUNCH_KERNEL(add_custom)(BlockDim, stream, xDevice, yDevice, zDevice, tiling); + ACLRT_LAUNCH_KERNEL(add_custom)(BLOCK_DIM, stream, xDevice, yDevice, zDevice, + reinterpret_cast(tiling)); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/run.sh b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/run.sh similarity index 100% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/run.sh rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/run.sh diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/scripts/gen_data.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py similarity index 72% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/scripts/gen_data.py rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py index 6b5a30741..70f6bae30 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/scripts/gen_data.py +++ 
b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py @@ -14,22 +14,19 @@ bfloat16 = tf.bfloat16.as_numpy_dtype dtype_emu = {bfloat16: 0, np.float16: 1, np.float32: 2, np.int8: 3, np.int16: 4, np.int32: 5} def gen_golden_data_simple(): - input_shape_x = [32, 6737] - input_shape_y = [32, 6737] + input_shape = [32, 6737] dtype = np.int8 - input_x = np.random.uniform(-50, 50, input_shape_x).astype(dtype) - input_y = np.random.uniform(-50, 50, input_shape_y).astype(dtype) + input_x = np.random.uniform(-50, 50, input_shape).astype(dtype) + input_y = np.random.uniform(-50, 50, input_shape).astype(dtype) golden = (input_x + input_y).astype(dtype) - tiling = np.array([input_shape_x[0] * input_shape_x[1], input_shape_y[0] * input_shape_y[1], - dtype_emu[dtype]], dtype=np.uint32) + tiling = np.array([input_shape[0] * input_shape[1], dtype_emu[dtype]], dtype=np.uint32) tiling.tofile("./input/input_tiling.bin") input_x.tofile("./input/input_x.bin") input_y.tofile("./input/input_y.bin") golden.tofile("./output/golden.bin") - if __name__ == "__main__": gen_golden_data_simple() diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/verify_result.py similarity index 87% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/verify_result.py index aac4ba310..277d94780 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/scripts/verify_result.py +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/verify_result.py @@ -17,11 +17,14 @@ bfloat16 = tf.bfloat16.as_numpy_dtype relative_tol = 1e-3 
absolute_tol = 1e-5 error_tol = 1e-3 - +data_type = np.int8 def verify_result(output, golden): - output = np.fromfile(output, dtype=np.int8).reshape(-1) - golden = np.fromfile(golden, dtype=np.int8).reshape(-1) + output = np.fromfile(output, dtype=data_type).reshape(-1) + golden = np.fromfile(golden, dtype=data_type).reshape(-1) + if data_type == bfloat16: + output = output.astype(np.float32) + golden = golden.astype(np.float32) different_element_results = np.isclose(output, golden, rtol=relative_tol, diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/CMakeLists.txt b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/CMakeLists.txt new file mode 100644 index 000000000..dc32860d9 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/CMakeLists.txt @@ -0,0 +1,46 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
+# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom.cpp) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_tiling.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/README.md new file mode 100644 index 000000000..75bf30d10 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/README.md @@ -0,0 +1,83 @@ +## 目录结构介绍 +``` +├── VectorAddMultiCoreWithTilingBroadcast +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom_tiling.h // 算子tiling结构体 +│ ├── add_custom_tiling.cpp // 算子tiling实现 +│ ├── add_custom.cpp // 算子kernel实现 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。 
+ - CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal; + - Compute任务负责对xLocal、yLocal进行Broadcast并存入临时内存,之后执行加法操作,计算结果存储在zLocal中; + - CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。具体请参考[add_custom.cpp](./add_custom.cpp)。 +- tiling实现 + - TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数。 + + 本样例尝试将输入数据均分在给定数目的核上计算,如不能均分,则计算整核和尾核的个数,整核和尾核分别处理不同数据量的数据。定义UB_BLOCK_NUM最为最大可用的UB数据块,对应UB的大小为UB_BLOCK_NUM * 32 Bytes。 并以UB_BLOCK_NUM为粒度进行tiling切分。 + + 本示例算子使用了20个tiling参数: xLen、yLen、dataType为x, y输入数据的总长度和数据类型;coef为需要Broadcast的输入扩维的倍数;axis为输入数据Broadcast的轴;blockLength、tileNum、 tileLength、lastTileLength为核均分场景下每个核上的计算量、tiling切分个数以及整块和尾块长度;former(tail)Num、former(tail)Length、former(tail)TileNum、former(tail)TileLength、former(tail)LastTileLength为核不均分的场景下整核和尾核相关的切分数据,与核均分场景下对应;isEvenCore用来判断当前是否为核均分场景。 + + 具体请参考[add_custom_tiling.cpp](./add_custom_tiling.cpp)。 + + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/ + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 配置仿真模式日志文件目录,默认为sim_log。 + ```bash + export CAMODEL_LOG_PATH=./sim_log + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu],默认值为npu。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi 
info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp new file mode 100644 index 000000000..a33c6a930 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp @@ -0,0 +1,909 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +constexpr uint32_t ADD_BFLOAT16 = 0; +constexpr uint32_t ADD_FLOAT16 = 1; +constexpr uint32_t ADD_FLOAT32 = 2; +constexpr uint32_t ADD_INT8 = 3; +constexpr uint32_t ADD_INT16 = 4; +constexpr uint32_t ADD_INT32 = 5; + +constexpr uint32_t BROADCAST_DIM = 2; +constexpr uint32_t BROADCAST_AXIS_ZERO = 0; +constexpr uint32_t BROADCAST_AXIS_ONE = 1; +constexpr uint32_t LAST_TWO_TILE = 2; +template class KernelAdd; + +// 针对axis = 0的场景 +template <> class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + GM_ADDR longerInputPtr; + GM_ADDR shorterInputPtr; + if (tiling.xLen > tiling.yLen) { + longerInputPtr = x; + shorterInputPtr = y; + } else { + longerInputPtr = y; + shorterInputPtr = x; + } + this->coef = tiling.coef; + if (tiling.isEvenCore) { + this->tileNum = 
tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(bfloat16_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); + + pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(bfloat16_t)); + pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(float)); + pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(float)); + } 
+ __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + + AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; + AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(bfloat16_t)), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; + + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + copyXParams, padParams); + } else { + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); + } + AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); + uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; + uint32_t srcShape[] = {1, this->coef}; + AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); + + AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); + AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); + + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, broadcastTmpTensor, AscendC::RoundMode::CAST_NONE, this->tileLength); + + AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, this->tileLength); + 
+ + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); + } else { + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::TBuf tmpBuf0; + AscendC::TBuf tmpBuf1; + AscendC::TBuf tmpBuf2; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t coef; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +template <> class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + GM_ADDR longerInputPtr; + GM_ADDR shorterInputPtr; + if (tiling.xLen > tiling.yLen) { + longerInputPtr = x; + shorterInputPtr = y; + } else { + longerInputPtr = y; + shorterInputPtr = x; + } + this->coef = tiling.coef; + if (tiling.isEvenCore) { + this->tileNum = tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + 
this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; + + xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(int8_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(int8_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(int8_t)); + + pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(int8_t)); + + pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(half)); + pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(half)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + + AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; + AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(int8_t)), 0, 0, 0}; + 
AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; + + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + copyXParams, padParams); + } else { + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); + } + AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); + uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; + uint32_t srcShape[] = {1, this->coef}; + AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); + + AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); + AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); + + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, broadcastTmpTensor, AscendC::RoundMode::CAST_NONE, this->tileLength); + + AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); + } else { + AscendC::DataCopyPad(zGm[progress 
* this->tileLength], zLocal, copyParams); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::TBuf tmpBuf0; + AscendC::TBuf tmpBuf1; + AscendC::TBuf tmpBuf2; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t coef; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +template class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + GM_ADDR longerInputPtr; + GM_ADDR shorterInputPtr; + if (tiling.xLen > tiling.yLen) { + longerInputPtr = x; + shorterInputPtr = y; + } else { + longerInputPtr = y; + shorterInputPtr = x; + } + this->coef = tiling.coef; + if (tiling.isEvenCore) { + this->tileNum = tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; 
+ + xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(dataType)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(dataType)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(dataType)); + + pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(dataType)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + + AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; + AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(dataType)), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; + + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + copyXParams, padParams); + } else { + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); + } + AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + 
AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); + uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; + uint32_t srcShape[] = {1, this->coef}; + AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); + + AscendC::Add(zLocal, xLocal, broadcastTmpTensor, this->tileLength); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); + } else { + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::TBuf tmpBuf2; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t coef; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +// 针对axis = 1的场景 +template <> class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + GM_ADDR longerInputPtr; + GM_ADDR shorterInputPtr; + if (tiling.xLen > tiling.yLen) { + longerInputPtr = x; + shorterInputPtr = y; + } else { + longerInputPtr = y; + shorterInputPtr = x; + } + this->coef = tiling.coef; + if (tiling.isEvenCore) { + this->tileNum = tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + 
tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr + tiling.blockLength * AscendC::GetBlockIdx() / this->coef, tiling.blockLength / this->coef); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr + tiling.formerLength * AscendC::GetBlockIdx() / this->coef, tiling.formerLength / this->coef); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; + + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr + tiling.formerLength * tiling.formerNum / this->coef + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum) / this->coef, tiling.tailLength / this->coef); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(bfloat16_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); + + pipe.InitBuffer(tmpBuf2, this->tileLength * 
sizeof(bfloat16_t)); + pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(float)); + pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(float)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + + AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; + AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t) / this->coef), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; + + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + copyXParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[((progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength) / this->coef], + copyYParams, padParams); + } else { + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[progress * this->tileLength / this->coef], copyYParams, padParams); + } + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); + uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; + uint32_t srcShape[] = {this->tileLength / this->coef, 1}; + AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); + + AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); + 
AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); + + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, broadcastTmpTensor, AscendC::RoundMode::CAST_NONE, this->tileLength); + + AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, this->tileLength); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); + } else { + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::TBuf tmpBuf0; + AscendC::TBuf tmpBuf1; + AscendC::TBuf tmpBuf2; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t coef; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +template <> class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + GM_ADDR longerInputPtr; + GM_ADDR shorterInputPtr; + if (tiling.xLen > tiling.yLen) { + longerInputPtr = x; + shorterInputPtr = y; + } else { + longerInputPtr = y; + shorterInputPtr = x; + } + this->coef = tiling.coef; + if (tiling.isEvenCore) { + this->tileNum = tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + 
xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr + tiling.blockLength * AscendC::GetBlockIdx() / this->coef, tiling.blockLength / this->coef); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr + tiling.formerLength * AscendC::GetBlockIdx() / this->coef, tiling.formerLength / this->coef); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; + + xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr + tiling.formerLength * tiling.formerNum / this->coef + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum) / this->coef, tiling.tailLength / this->coef); + zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(int8_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(int8_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(int8_t)); + + pipe.InitBuffer(tmpBuf2, this->tileLength * 
sizeof(int8_t)); + + pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(half)); + pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(half)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + + AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; + AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t) / this->coef), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; + + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + copyXParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[((progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength) / this->coef], + copyYParams, padParams); + } else { + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[progress * this->tileLength / this->coef], copyYParams, padParams); + } + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); + uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; + uint32_t srcShape[] = {this->tileLength / this->coef, 1}; + AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); + + AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); + AscendC::LocalTensor 
tmpTensor1 = tmpBuf1.Get(); + + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, broadcastTmpTensor, AscendC::RoundMode::CAST_NONE, this->tileLength); + + AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); + } else { + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::TBuf tmpBuf0; + AscendC::TBuf tmpBuf1; + AscendC::TBuf tmpBuf2; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t coef; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +template class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) + { + GM_ADDR longerInputPtr; + GM_ADDR shorterInputPtr; + if (tiling.xLen > tiling.yLen) { + longerInputPtr = x; + shorterInputPtr = y; + } else { + longerInputPtr = y; + shorterInputPtr = x; + } + this->coef = tiling.coef; + if (tiling.isEvenCore) { + this->tileNum = tiling.tileNum; + this->tileLength = tiling.tileLength / BUFFER_NUM; + this->lastTileLength = tiling.lastTileLength; + + xGm.SetGlobalBuffer((__gm__ dataType 
*)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr + tiling.blockLength * AscendC::GetBlockIdx() / this->coef, tiling.blockLength / this->coef); + zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); + } else { + if (AscendC::GetBlockIdx() < tiling.formerNum) { + this->tileNum = tiling.formerTileNum; + this->tileLength = tiling.formerTileLength / BUFFER_NUM; + this->lastTileLength = tiling.formerLastTileLength; + + xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr + tiling.formerLength * AscendC::GetBlockIdx() / this->coef, tiling.formerLength / this->coef); + zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); + } else { + this->tileNum = tiling.tailTileNum; + this->tileLength = tiling.tailTileLength / BUFFER_NUM; + this->lastTileLength = tiling.tailLastTileLength; + + xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr + tiling.formerLength * tiling.formerNum / this->coef + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum) / this->coef, tiling.tailLength / this->coef); + zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * tiling.formerNum + + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); + } + } + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(dataType)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(dataType)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(dataType)); + + pipe.InitBuffer(tmpBuf2, this->tileLength * 
sizeof(dataType)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + + AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; + AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->tileLength * sizeof(dataType) / this->coef), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; + + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], + copyXParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[((progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength) / this->coef], + copyYParams, padParams); + } else { + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[progress * this->tileLength / this->coef], copyYParams, padParams); + } + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); + uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; + uint32_t srcShape[] = {this->tileLength / this->coef, 1}; + AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); + + AscendC::Add(zLocal, xLocal, broadcastTmpTensor, this->tileLength); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void 
CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; + if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { + AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); + } else { + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); + } + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::TBuf tmpBuf2; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + + uint32_t coef; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) +{ + if (tiling.axis == 0) { + if (tiling.dataType == ADD_BFLOAT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_FLOAT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_FLOAT32) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_INT8) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_INT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_INT32) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else { + return; + } + } else if (tiling.axis == 1) { + if (tiling.dataType == ADD_BFLOAT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_FLOAT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_FLOAT32) { + KernelAdd op; + op.Init(x, y, z, tiling); + 
op.Process(); + } else if (tiling.dataType == ADD_INT8) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_INT16) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else if (tiling.dataType == ADD_INT32) { + KernelAdd op; + op.Init(x, y, z, tiling); + op.Process(); + } else { + return; + } + } else { + return; + } +} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp new file mode 100644 index 000000000..85e624b07 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp @@ -0,0 +1,111 @@ +/** + * @file add_custom_tiling.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#include +#include "add_custom_tiling.h" + +// bfloat16, float16, float, int8, int16, int32 +constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; +constexpr uint32_t BLOCK_SIZE = 32; +constexpr uint32_t BUFFER_NUM = 2; +constexpr uint32_t UB_BLOCK_NUM = 50; // UB最大可以使用的block数量 +constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM; +constexpr uint32_t BLOCK_DIM = 8; +void TilingParamsCalc(uint32_t length, uint32_t ubBlockNum, + uint32_t& tileNum, uint32_t& tileLength, uint32_t& lastTileLength) +{ + assert(ubBlockNum != 0U); + tileNum = length / ubBlockNum; + if (length % ubBlockNum == 0U || tileNum == 0U) { + if (tileNum == 0U) { + tileNum = 1U; + } + if (length < ubBlockNum) { + tileLength = length; + lastTileLength = tileLength; + } else { + tileLength = ubBlockNum; + lastTileLength = tileLength; + } + } else { + tileNum++; + tileLength = ubBlockNum; + lastTileLength = (uint32_t)(length - (tileNum - 1) * tileLength); + } +} + +void GenerateTilingData(uint8_t* tilingBuf) +{ + uint32_t xLen; + uint32_t yLen; + uint32_t totalLength; + uint32_t dataTypeSize; + + AddCustomTilingData *tiling = reinterpret_cast(tilingBuf); + xLen = tiling->xLen; + yLen = tiling->yLen; + assert((xLen != 0U) && (yLen != 0U)); + dataTypeSize = DATA_TYPE_SIZE[tiling->dataType]; + totalLength = (xLen > yLen)? xLen : yLen; + + uint32_t alignNum = BLOCK_SIZE / dataTypeSize; + uint32_t shorterAxisLen = (xLen < yLen)? xLen : yLen; + uint32_t coef = totalLength / shorterAxisLen; + uint32_t ubBlockAligned = + (MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (coef * BUFFER_NUM) * (coef * BUFFER_NUM) == 0U)? 
+ MAX_AVAILABLE_UB_BLOCK_NUM : MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (coef * BUFFER_NUM) * (coef * BUFFER_NUM); + + if (shorterAxisLen % (BLOCK_DIM * BUFFER_NUM) == 0U) { + uint32_t blockLength = shorterAxisLen / BLOCK_DIM * coef; + uint32_t tileNum = 0; + uint32_t tileLength = 0; + uint32_t lastTileLength = 0; + + TilingParamsCalc(blockLength, ubBlockAligned, tileNum, tileLength, lastTileLength); + + tiling->blockLength = blockLength; + tiling->tileNum = tileNum; + tiling->tileLength = tileLength; + tiling->lastTileLength = lastTileLength; + tiling->isEvenCore = 1U; + } else { + uint32_t formerNum = (shorterAxisLen / BUFFER_NUM) % BLOCK_DIM; + uint32_t tailNum = BLOCK_DIM - formerNum; + + uint32_t formerLength = (((shorterAxisLen / BUFFER_NUM) / BLOCK_DIM) + 1) * BUFFER_NUM * coef; + uint32_t tailLength = ((shorterAxisLen / BUFFER_NUM) / BLOCK_DIM) * BUFFER_NUM * coef; + + uint32_t formerTileNum; + uint32_t formerTileLength; + uint32_t formerLastTileLength; + + uint32_t tailTileNum; + uint32_t tailTileLength; + uint32_t tailLastTileLength; + + TilingParamsCalc(formerLength, ubBlockAligned, + formerTileNum, formerTileLength, formerLastTileLength); + TilingParamsCalc(tailLength, ubBlockAligned, + tailTileNum, tailTileLength, tailLastTileLength); + + tiling->formerNum = formerNum; + tiling->formerLength = formerLength; + tiling->formerTileNum = formerTileNum; + tiling->formerTileLength = formerTileLength; + tiling->formerLastTileLength = formerLastTileLength; + + tiling->tailNum = tailNum; + tiling->tailLength = tailLength; + tiling->tailTileNum = tailTileNum; + tiling->tailTileLength = tailTileLength; + tiling->tailLastTileLength = tailLastTileLength; + tiling->isEvenCore = 0U; + } +} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.h b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.h new file mode 100644 index 
000000000..8c31f6f4a --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.h @@ -0,0 +1,39 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2025. Huawei Technologies Co. Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +struct AddCustomTilingData { + uint32_t xLen; + uint32_t yLen; + uint32_t coef; + uint32_t axis; + uint32_t dataType; + + uint32_t isEvenCore; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; + uint32_t lastTileLength; + + uint32_t formerNum; + uint32_t formerLength; + uint32_t formerTileNum; + uint32_t formerTileLength; + uint32_t formerLastTileLength; + + uint32_t tailNum; + uint32_t tailLength; + uint32_t tailTileNum; + uint32_t tailTileLength; + uint32_t tailLastTileLength; +}; +#endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/cmake/cpu_lib.cmake new file mode 100644 index 000000000..5362c8b5a --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION 
${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/cmake/npu_lib.cmake new file mode 100644 index 000000000..f92b095d1 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/cmake/npu_lib.cmake @@ -0,0 +1,11 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/data_utils.h b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/data_utils.h new file mode 100644 index 000000000..9d3445780 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/data_utils.h @@ -0,0 +1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp new file mode 100644 index 000000000..b5c077951 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp @@ -0,0 +1,109 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "add_custom_tiling.h" +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_add_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling); +#endif +extern void GenerateTilingData(uint8_t* tilingBuf); + + +int32_t main(int32_t argc, char *argv[]) +{ + constexpr uint32_t BLOCK_DIM = 8; + constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; + uint8_t *tiling = nullptr; + size_t tilingSize = 20 * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + ReadFile("./input/input_tiling.bin", tilingSize, tiling, tilingSize); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *zHost; + uint8_t *xDevice, *yDevice, *zDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); + ReadFile("./input/input_tiling.bin", tilingSize, tiling, tilingSize); +#endif + GenerateTilingData(tiling); + uint32_t dataTypeSize = DATA_TYPE_SIZE[reinterpret_cast(tiling)->dataType]; + uint32_t xLen = reinterpret_cast(tiling)->xLen; + uint32_t yLen = reinterpret_cast(tiling)->yLen; + uint32_t totalLength = (xLen > yLen)? 
xLen : yLen; + + size_t inputXByteSize = xLen * dataTypeSize; + size_t inputYByteSize = yLen * dataTypeSize; + size_t outputByteSize = totalLength * dataTypeSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputXByteSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputYByteSize); + uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputByteSize); + + ReadFile("./input/input_x.bin", inputXByteSize, x, inputXByteSize); + ReadFile("./input/input_y.bin", inputYByteSize, y, inputYByteSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + + ICPU_RUN_KF(add_custom, BLOCK_DIM, x, y, z, + *reinterpret_cast(tiling)); + + WriteFile("./output/output_z.bin", z, outputByteSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)z); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputXByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputYByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputXByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, inputYByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("./input/input_x.bin", inputXByteSize, xHost, inputXByteSize); + ReadFile("./input/input_y.bin", inputYByteSize, yHost, inputYByteSize); + + CHECK_ACL(aclrtMemcpy(xDevice, inputXByteSize, xHost, inputXByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(yDevice, inputYByteSize, yHost, inputYByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ACLRT_LAUNCH_KERNEL(add_custom)(BLOCK_DIM, stream, xDevice, yDevice, zDevice, + reinterpret_cast(tiling)); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output_z.bin", zHost, outputByteSize); + + 
CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(zHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/run.sh b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/run.sh new file mode 100644 index 000000000..8fcd59730 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/run.sh @@ -0,0 +1,124 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +SOC_VERSION="Ascend310P3" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
+ exit -1 +fi + +VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +echo "Current compile soc version is ${SOC_VERSION}" +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! 
$CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py new file mode 100644 index 000000000..ce8a8f4ae --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py @@ -0,0 +1,56 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2025. Huawei Technologies Co., Ltd. All rights reserved. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import numpy as np +import tensorflow as tf +bfloat16 = tf.bfloat16.as_numpy_dtype +dtype_emu = {bfloat16: 0, np.float16: 1, np.float32: 2, np.int8: 3, np.int16: 4, np.int32: 5} + +def gen_golden_data_simple(): + input_shape_x = [32, 128] + input_shape_y = [1, 128] + + #input_shape_x = [14, 1] + #input_shape_y = [14, 280] + + dtype = np.int8 + + input_x = np.random.uniform(-50, 50, input_shape_x).astype(dtype) + input_y = np.random.uniform(-50, 50, input_shape_y).astype(dtype) + golden = (input_x + input_y).astype(dtype) + + if np.size(input_x) > np.size(input_y): + if input_shape_y[0] == 1: + axis = 0 + coef = np.size(input_y) + elif input_shape_y[1] == 1: + axis = 1 + coef = np.size(input_x) / np.size(input_y) + else: + if input_shape_x[0] == 1: + axis = 0 + coef = np.size(input_x) + elif input_shape_x[1] == 1: + axis = 1 + coef = np.size(input_y) / np.size(input_x) + tiling = np.array([input_shape_x[0] * input_shape_x[1], + input_shape_y[0] * input_shape_y[1], + coef, + axis, + dtype_emu[dtype]], + dtype=np.uint32) + + tiling.tofile("./input/input_tiling.bin") + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/verify_result.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/verify_result.py new file mode 100644 index 000000000..277d94780 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/verify_result.py @@ -0,0 +1,58 @@ 
+#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import sys +import numpy as np +import tensorflow as tf +bfloat16 = tf.bfloat16.as_numpy_dtype + +# for float16 +relative_tol = 1e-3 +absolute_tol = 1e-5 +error_tol = 1e-3 +data_type = np.int8 + +def verify_result(output, golden): + output = np.fromfile(output, dtype=data_type).reshape(-1) + golden = np.fromfile(golden, dtype=data_type).reshape(-1) + if data_type == bfloat16: + output = output.astype(np.float32) + golden = golden.astype(np.float32) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/CMakeLists.txt 
b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/CMakeLists.txt similarity index 100% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/CMakeLists.txt rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/CMakeLists.txt diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/README.md similarity index 81% rename from operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/README.md rename to operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/README.md index 56d20ae88..b2fe9aaef 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/AddKernelInvocationTilingNeo/README.md +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/README.md @@ -1,11 +1,10 @@ ## 目录结构介绍 ``` -├── AddKernelInvocationTilingNeo +├── VectorAddSingleCore │ ├── cmake // 编译工程文件 │ ├── scripts │ │ ├── gen_data.py // 输入数据和真值数据生成脚本 │ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 -│ ├── add_custom_tiling.h // 算子tiling实现 │ ├── add_custom.cpp // 算子kernel实现 │ ├── CMakeLists.txt // 编译工程文件 │ ├── data_utils.h // 数据读入写出函数 @@ -21,8 +20,6 @@ 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。具体请参考[add_custom.cpp](./add_custom.cpp)。 -- tiling实现 - TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数,本示例算子使用了2个tiling参数:totalLength、tileNum。totalLength是指需要计算的数据量大小,tileNum是指每个核上总计算数据分块个数。比如,totalLength这个参数传递到kernel侧后,可以通过除以参与计算的核数,得到每个核上的计算量,这样就完成了多核数据的切分。tiling实现代码中通过上下文获取输入输出的shape信息,并对应设置TilingData。 - 调用实现 1. 
CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; @@ -34,7 +31,7 @@ - 打开样例目录 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/AddCustomSample/KernelLaunch/AddKernelInvocationTilingNeo + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/ ``` - 配置环境变量 diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/add_custom.cpp new file mode 100644 index 000000000..968a4dc16 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/add_custom.cpp @@ -0,0 +1,81 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" +constexpr uint32_t TOTAL_LENGTH = 2048; + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + xGm.SetGlobalBuffer((__gm__ half *)x, TOTAL_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y, TOTAL_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z, TOTAL_LENGTH); + + pipe.InitBuffer(inQueueX, 1, TOTAL_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, 1, TOTAL_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, 1, TOTAL_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + + AscendC::DataCopy(xLocal, xGm, TOTAL_LENGTH); + AscendC::DataCopy(yLocal, yGm, TOTAL_LENGTH); + + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::Add(zLocal, xLocal, yLocal, TOTAL_LENGTH); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm, zLocal, TOTAL_LENGTH); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/cmake/cpu_lib.cmake 
b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/cmake/cpu_lib.cmake new file mode 100644 index 000000000..5362c8b5a --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/cmake/npu_lib.cmake new file mode 100644 index 000000000..f92b095d1 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/cmake/npu_lib.cmake @@ -0,0 +1,11 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/data_utils.h b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/data_utils.h 
new file mode 100644 index 000000000..9d3445780 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/data_utils.h @@ -0,0 +1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + 
break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/main.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/main.cpp new file mode 100644 index 000000000..d0c372ba0 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/main.cpp @@ -0,0 +1,83 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_add_custom.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z); +#endif + + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = 1; + size_t inputByteSize = static_cast(1) * 2048 * sizeof(uint16_t); + size_t outputByteSize = static_cast(1) * 2048 * sizeof(uint16_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputByteSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputByteSize); + uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputByteSize); + + ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize); + ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(add_custom, blockDim, x, y, z); // use this macro for cpu debug + + WriteFile("./output/output_z.bin", z, outputByteSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)z); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 
0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *zHost; + uint8_t *xDevice, *yDevice, *zDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize); + ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ACLRT_LAUNCH_KERNEL(add_custom)(blockDim, stream, xDevice, yDevice, zDevice); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output_z.bin", zHost, outputByteSize); + + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(zHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/run.sh b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/run.sh new file mode 100644 index 000000000..8fcd59730 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/run.sh @@ -0,0 +1,124 @@ +#!/bin/bash 
+CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +SOC_VERSION="Ascend310P3" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" + exit -1 +fi + +VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +echo "Current compile soc version is ${SOC_VERSION}" +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if 
[ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/gen_data.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/gen_data.py new file mode 100644 index 000000000..a0aceb3bc --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/gen_data.py @@ -0,0 +1,30 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2025. Huawei Technologies Co., Ltd. All rights reserved. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import numpy as np +import tensorflow as tf +bfloat16 = tf.bfloat16.as_numpy_dtype + +def gen_golden_data_simple(): + input_shape_x = [1, 2048] + input_shape_y = [1, 2048] + dtype = np.float16 + + input_x = np.random.uniform(-50, 50, input_shape_x).astype(dtype) + input_y = np.random.uniform(-50, 50, input_shape_y).astype(dtype) + golden = (input_x + input_y).astype(dtype) + + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/verify_result.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/verify_result.py new file mode 100644 index 000000000..4e1c4ad45 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/scripts/verify_result.py @@ -0,0 +1,57 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== +import sys +import numpy as np +import tensorflow as tf +bfloat16 = tf.bfloat16.as_numpy_dtype + +# for float16 +relative_tol = 1e-3 +absolute_tol = 1e-5 +error_tol = 1e-3 +data_type = bfloat16 + +def verify_result(output, golden): + output = np.fromfile(output, dtype=data_type).reshape(-1) + golden = np.fromfile(golden, dtype=data_type).reshape(-1) + if data_type == bfloat16: + output = output.astype(np.float32) + golden = golden.astype(np.float32) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/CMakeLists.txt b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/CMakeLists.txt new file mode 100644 index 000000000..1e4d6de99 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING 
"cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. +# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom.cpp) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/README.md new file mode 100644 index 000000000..a8f76986d --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/README.md @@ -0,0 +1,72 @@ +## 目录结构介绍 +``` +├── VectorAddSingleCore +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── 
verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom.cpp // 算子kernel实现 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Cast接口进行数据转换,并将结果存入临时内存中。之后再调用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。 + - CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal; + - Compute任务负责对xLocal、yLocal进行类型转换,并执行加法操作,计算结果存储在zLocal中; + - CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。具体请参考[add_custom.cpp](./add_custom.cpp)。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCore/ + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 配置仿真模式日志文件目录,默认为sim_log。 + ```bash + export CAMODEL_LOG_PATH=./sim_log + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu],默认值为npu。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ```bash + bash run.sh -r cpu -v Ascendxxxyy 
+ ``` \ No newline at end of file diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/add_custom.cpp new file mode 100644 index 000000000..948da72bc --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/add_custom.cpp @@ -0,0 +1,95 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" +constexpr uint32_t TOTAL_LENGTH = 2048; + + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x, TOTAL_LENGTH); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y, TOTAL_LENGTH); + zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z, TOTAL_LENGTH); + + pipe.InitBuffer(inQueueX, 1, TOTAL_LENGTH * sizeof(bfloat16_t)); + pipe.InitBuffer(inQueueY, 1, TOTAL_LENGTH * sizeof(bfloat16_t)); + pipe.InitBuffer(outQueueZ, 1, TOTAL_LENGTH * sizeof(bfloat16_t)); + + pipe.InitBuffer(tmpBuf0, TOTAL_LENGTH * sizeof(float)); + pipe.InitBuffer(tmpBuf1, TOTAL_LENGTH * sizeof(float)); + } + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + + AscendC::DataCopy(xLocal, xGm, TOTAL_LENGTH); + AscendC::DataCopy(yLocal, yGm, TOTAL_LENGTH); + + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = 
inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + + AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); + AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); + + AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, TOTAL_LENGTH); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, TOTAL_LENGTH); + + AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, TOTAL_LENGTH); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, TOTAL_LENGTH); + + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm, zLocal, TOTAL_LENGTH); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + + AscendC::TBuf tmpBuf0; + AscendC::TBuf tmpBuf1; + + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/cmake/cpu_lib.cmake new file mode 100644 index 000000000..5362c8b5a --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) 
+install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/cmake/npu_lib.cmake new file mode 100644 index 000000000..f92b095d1 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/cmake/npu_lib.cmake @@ -0,0 +1,11 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/data_utils.h b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/data_utils.h new file mode 100644 index 000000000..9d3445780 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/data_utils.h @@ -0,0 +1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/main.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/main.cpp new file mode 100644 index 000000000..d0c372ba0 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/main.cpp @@ -0,0 +1,83 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_add_custom.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z); +#endif + + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = 1; + size_t inputByteSize = static_cast(1) * 2048 * sizeof(uint16_t); + size_t outputByteSize = static_cast(1) * 2048 * sizeof(uint16_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputByteSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputByteSize); + uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputByteSize); + + ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize); + ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(add_custom, blockDim, x, y, z); // use this macro for cpu debug + + WriteFile("./output/output_z.bin", z, outputByteSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)z); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *zHost; + uint8_t *xDevice, *yDevice, *zDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize); + ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, 
ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ACLRT_LAUNCH_KERNEL(add_custom)(blockDim, stream, xDevice, yDevice, zDevice); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output_z.bin", zHost, outputByteSize); + + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(zHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/run.sh b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/run.sh new file mode 100644 index 000000000..8fcd59730 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/run.sh @@ -0,0 +1,124 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +SOC_VERSION="Ascend310P3" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; 
then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" + exit -1 +fi + +VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +echo "Current compile soc version is ${SOC_VERSION}" +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! 
$CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/gen_data.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/gen_data.py new file mode 100644 index 000000000..872333fc0 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/gen_data.py @@ -0,0 +1,30 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2025. Huawei Technologies Co., Ltd. All rights reserved. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import numpy as np +import tensorflow as tf +bfloat16 = tf.bfloat16.as_numpy_dtype + +def gen_golden_data_simple(): + input_shape_x = [1, 2048] + input_shape_y = [1, 2048] + dtype = bfloat16 + + input_x = np.random.uniform(-50, 50, input_shape_x).astype(dtype) + input_y = np.random.uniform(-50, 50, input_shape_y).astype(dtype) + golden = (input_x + input_y).astype(dtype) + + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py new file mode 100644 index 000000000..4e1c4ad45 --- /dev/null +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddSingleCoreWithTmpbuf/scripts/verify_result.py @@ -0,0 +1,57 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== +import sys +import numpy as np +import tensorflow as tf +bfloat16 = tf.bfloat16.as_numpy_dtype + +# for float16 +relative_tol = 1e-3 +absolute_tol = 1e-5 +error_tol = 1e-3 +data_type = bfloat16 + +def verify_result(output, golden): + output = np.fromfile(output, dtype=data_type).reshape(-1) + golden = np.fromfile(golden, dtype=data_type).reshape(-1) + if data_type == bfloat16: + output = output.astype(np.float32) + golden = golden.astype(np.float32) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/0_introduction/README.md b/operator/ascendc/0_introduction/README.md index 70a27a117..0c9e46bf7 100644 --- a/operator/ascendc/0_introduction/README.md +++ b/operator/ascendc/0_introduction/README.md @@ -34,6 +34,7 @@ | [18_unaligned_wholereduces_frameworklaunch](./18_unaligned_wholereduces_frameworklaunch) | 基于Ascend C的非对齐WholeReduceSum自定义算子及FrameworkLaunch调用样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | 
[19_unaligned_wholereduces_kernellaunch](./19_unaligned_wholereduces_kernellaunch) | 基于Ascend C的非对齐WholeReduceSum自定义算子及KernelLaunch调用样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | [20_mmad_kernellaunch](./20_mmad_kernellaunch) | 基于Ascend C基础API的Matmul自定义Cube算子及KernelLaunch调用样例 | Atlas 推理系列产品AI Core
Atlas A2训练系列产品/Atlas 800I A2推理产品 | +| [21_vectoradd_kernellaunch](./21_vectoradd_kernellaunch) | 基于Ascend C的Add多场景自定义Vector算子的KernelLaunch调用样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 ## 获取样例代码 -- Gitee From ac5945ec5c2758a67041abc7dcdcecd31c9fd19f Mon Sep 17 00:00:00 2001 From: anzoola Date: Wed, 2 Apr 2025 13:23:52 +0000 Subject: [PATCH 2/3] =?UTF-8?q?VecAdd=E6=A0=B7=E4=BE=8B=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../21_vectoradd_kernellaunch/README.md | 59 +++++++-- .../add_custom_tiling.cpp | 17 ++- .../VectorAddMultiCoreWithTiling/main.cpp | 13 +- .../scripts/gen_data.py | 15 ++- .../add_custom.cpp | 125 ++++++------------ .../add_custom_tiling.cpp | 50 ++++--- .../main.cpp | 4 +- .../scripts/gen_data.py | 38 +++++- 8 files changed, 183 insertions(+), 138 deletions(-) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md index bda5a0626..7bf8b7bea 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md @@ -53,41 +53,74 @@ z = x + y 算子类型(OpType)Add 算子输入nameshapedata typeformat -x-bfloat16_t/int8_t/float/half/int16_t/int32_tND -y-bfloat16_t/int8_t/float/half/int16_t/int32_tND +x(32, 1024) / (8, 1023) / (32, 1023) / (17, 1023) +bfloat16_t/int8_t/float/half/int16_t/int32_tND +y(32, 1024) / (8, 1023) / (32, 1023) / (17, 1023)bfloat16_t/int8_t/float/half/int16_t/int32_tND -算子输出z-bfloat16_t/int8_t/float/half/int16_t/int32_tND +算子输出z(32, 1024) / (8, 1023) / (32, 1023) / (17, 1023)bfloat16_t/int8_t/float/half/int16_t/int32_tND 核函数名add_custom + +该算子支持在不同输入数据长度下采用不同策略对数据进行核间切分以及tiling,shape对应场景如下: + + 1. 核间均分,单核计算量满足32B对齐: (32, 1024) + + 2. 核间均分,单核计算量不满足32B对齐: (8, 1023) + + 3. 核间不均分,单核计算量满足32B对齐: (32, 1023) + + 4. 核间不均分,单核计算量不满足32B对齐:(17, 1023) + + + - VectorAddMultiCoreWithTilingBroadcast - - + + - +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x(m, n) / (1, n) / (m, 1)bfloat16_t/int8_t/float/half/int16_t/int32_tND
y(m, n) / (1, n) / (m, 1)bfloat16_t/int8_t/float/half/int16_t/int32_tND
x +axis = 0:(8, 1024) / (8, 1022) / (17, 1024) / (17, 1022) +axis = 1:(16, 1) / (16, 1) / (20, 1) / (20, 1) +bfloat16_t/int8_t/float/half/int16_t/int32_tND
y +axis = 0:(8, 1024) / (8, 1022) / (17, 1024) / (17, 1022) +axis = 1:(16, 256) / (16, 255) / (20, 256) / (20, 255)bfloat16_t/int8_t/float/half/int16_t/int32_tND
算子输出z(m * n))bfloat16_t/int8_t/float/half/int16_t/int32_tND
算子输出z +axis = 0:(8, 1024) / (8, 1022) / (17, 1024) / (17, 1022) +axis = 1:(16, 256) / (16, 255) / (20, 256) / (20, 255)bfloat16_t/int8_t/float/half/int16_t/int32_tND
核函数名add_custom
-该算子支持对任一输入的某个轴进行广播,对应关系如下 -- 针对axis = 0(第一个轴)进行广播 - - x(m, n), y(1, n) - - x(1, n), y(m, n) - 注意,该场景下m需满足32字节对齐。 +该算子支持对任一输入的某个轴进行广播,其中输入x,y的shape可以交换。表格中提到的shape对应不同的策略对数据进行核间切分以及tiling,对应关系如下: + + - 针对axis = 0(第一个轴)进行广播 + + 1. 核间均分,单核计算量对齐 x shape:(8, 1024), y shape:(1, 1024); + + 2. 核间均分,单核计算量非对齐 x shape:(8, 1022), y shape:(1, 1022); + + 3. 核间不均分,单核计算量对齐 x shape:(17, 1024), y shape:(1, 1024); + + 4. 核间不均分,单核计算量非对齐 x shape:(17, 1022), y shape:(1, 1022)。 + - 针对axis = 1(第二个轴)进行广播 - - x(m, n), y(m, 1) - - x(m, 1), y(m, n) + 1. 核间均分,单核计算量对齐 x shape:(16, 1), y shape:(16, 256); + + 2. 核间均分,单核计算量非对齐 x shape:(16, 1), y shape:(16, 255); + + 3. 核间不均分,单核计算量对齐 x shape:(20, 1), y shape:(20, 256); + + 4. 核间不均分,单核计算量非对齐 x shape:(20, 1), y shape:(20, 255)。 diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp index cfb838a67..b71573e2d 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp @@ -15,9 +15,8 @@ constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; constexpr uint32_t BLOCK_SIZE = 32; constexpr uint32_t BUFFER_NUM = 2; -constexpr uint32_t UB_BLOCK_NUM = 21; // UB最大可以使用的block数量 +constexpr uint32_t UB_BLOCK_NUM = 100; // UB最大可以使用的block数量 constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM; -constexpr uint32_t BLOCK_DIM = 9; // tiling参数计算函数 void TilingParamsCalc(uint32_t length, uint32_t alignNum, @@ -43,7 +42,7 @@ void TilingParamsCalc(uint32_t length, uint32_t alignNum, } } -void GenerateTilingData(uint8_t* tilingBuf) +void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) { uint32_t totalLength; uint32_t dataTypeSize; @@ -63,11 +62,11 @@ void 
GenerateTilingData(uint8_t* tilingBuf) ((static_cast(totalLength) + alignNum - 1) / alignNum) * alignNum; /* 核间可均分场景 */ - if ((totalLengthAligned / alignNum) % BLOCK_DIM == 0U) { + if ((totalLengthAligned / alignNum) % blockDim == 0U) { uint32_t tileNum = 0; uint32_t tileLength = 0; uint32_t lastTileLength = 0; - blockLength = totalLengthAligned / BLOCK_DIM; + blockLength = totalLengthAligned / blockDim; TilingParamsCalc(blockLength, alignNum, tileNum, tileLength, lastTileLength); tiling->blockLength = blockLength; @@ -76,12 +75,12 @@ void GenerateTilingData(uint8_t* tilingBuf) tiling->lastTileLength = lastTileLength; tiling->isEvenCore = 1U; } else { // 核间不可均分 - uint32_t formerNum = (totalLengthAligned / alignNum) % BLOCK_DIM; - uint32_t tailNum = BLOCK_DIM - formerNum; + uint32_t formerNum = (totalLengthAligned / alignNum) % blockDim; + uint32_t tailNum = blockDim - formerNum; // 计算整块和尾块的数据量 uint32_t formerLength = - static_cast(((totalLengthAligned + BLOCK_DIM - 1) / BLOCK_DIM + alignNum - 1) / alignNum) * alignNum; - uint32_t tailLength = (totalLengthAligned / BLOCK_DIM / alignNum) * alignNum; + static_cast(((totalLengthAligned + blockDim - 1) / blockDim + alignNum - 1) / alignNum) * alignNum; + uint32_t tailLength = (totalLengthAligned / blockDim / alignNum) * alignNum; uint32_t formerTileNum; uint32_t formerTileLength; diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp index 3e35408b7..acfeb24cf 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp @@ -17,12 +17,13 @@ #include "tikicpulib.h" extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling); #endif -extern void GenerateTilingData(uint8_t* 
tilingBuf); +extern void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim); int32_t main(int32_t argc, char *argv[]) { - constexpr uint32_t BLOCK_DIM = 9; + constexpr uint32_t BLOCK_DIM = 8; + constexpr uint32_t DATA_BLOCK_BYTE = 32; constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; uint8_t *tiling = nullptr; size_t tilingSize = 17 * sizeof(uint32_t); @@ -44,15 +45,15 @@ int32_t main(int32_t argc, char *argv[]) CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); ReadFile("./input/input_tiling.bin", tilingSize, tiling, tilingSize); #endif - GenerateTilingData(tiling); + GenerateTilingData(tiling, BLOCK_DIM); uint32_t dataTypeSize = DATA_TYPE_SIZE[reinterpret_cast(tiling)->dataType]; size_t inputByteSize = reinterpret_cast(tiling)->totalLength * dataTypeSize; size_t outputByteSize = reinterpret_cast(tiling)->totalLength * dataTypeSize; #ifdef ASCENDC_CPU_DEBUG - uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputByteSize); - uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputByteSize); - uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputByteSize); + uint8_t *x = (uint8_t *)AscendC::GmAlloc((inputByteSize + DATA_BLOCK_BYTE - 1) / DATA_BLOCK_BYTE * DATA_BLOCK_BYTE); + uint8_t *y = (uint8_t *)AscendC::GmAlloc((inputByteSize + DATA_BLOCK_BYTE - 1) / DATA_BLOCK_BYTE * DATA_BLOCK_BYTE); + uint8_t *z = (uint8_t *)AscendC::GmAlloc((inputByteSize + DATA_BLOCK_BYTE - 1) / DATA_BLOCK_BYTE * DATA_BLOCK_BYTE); ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize); ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize); diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py index 70f6bae30..e12157d5a 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py +++ 
b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py @@ -14,13 +14,24 @@ bfloat16 = tf.bfloat16.as_numpy_dtype dtype_emu = {bfloat16: 0, np.float16: 1, np.float32: 2, np.int8: 3, np.int16: 4, np.int32: 5} def gen_golden_data_simple(): - input_shape = [32, 6737] dtype = np.int8 + # dtype = bfloat16 + + ## 核间均分,单核计算量对齐: + # input_shape = [32, 1024] + + ## 核间均分,单核计算量非对齐: + # input_shape = [8, 1023] + + ## 核间不均分,单核计算量对齐: + # input_shape = [32, 1023] + + ## 核间不均分,单核计算量非对齐: + input_shape = [17, 1023] input_x = np.random.uniform(-50, 50, input_shape).astype(dtype) input_y = np.random.uniform(-50, 50, input_shape).astype(dtype) golden = (input_x + input_y).astype(dtype) - tiling = np.array([input_shape[0] * input_shape[1], dtype_emu[dtype]], dtype=np.uint32) tiling.tofile("./input/input_tiling.bin") diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp index a33c6a930..029e3e098 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp @@ -36,9 +36,11 @@ public: if (tiling.xLen > tiling.yLen) { longerInputPtr = x; shorterInputPtr = y; + this->shorterAxisLen = tiling.yLen; } else { longerInputPtr = y; shorterInputPtr = x; + this->shorterAxisLen = tiling.xLen; } this->coef = tiling.coef; if (tiling.isEvenCore) { @@ -47,7 +49,7 @@ public: this->lastTileLength = tiling.lastTileLength; xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->shorterAxisLen); 
zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { @@ -56,7 +58,7 @@ public: this->lastTileLength = tiling.formerLastTileLength; xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); } else { this->tileNum = tiling.tailTileNum; @@ -65,16 +67,15 @@ public: xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); } } pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(bfloat16_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); - pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(bfloat16_t)); pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(float)); pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(float)); } @@ -94,17 +95,11 @@ private: AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; - 
AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(bfloat16_t)), 0, 0, 0}; + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - copyXParams, padParams); - } else { - AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); - } - AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[(progress % BUFFER_NUM) * this->tileLength], copyParams, padParams); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } @@ -114,16 +109,11 @@ private: AscendC::LocalTensor yLocal = inQueueY.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); - uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; - uint32_t srcShape[] = {1, this->coef}; - AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); - AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - AscendC::Cast(tmpTensor1, broadcastTmpTensor, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, this->tileLength); @@ -137,11 +127,8 @@ private: { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; - if 
((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); - } else { - AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); - } + + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); outQueueZ.FreeTensor(zLocal); } @@ -153,7 +140,6 @@ private: AscendC::TBuf tmpBuf0; AscendC::TBuf tmpBuf1; - AscendC::TBuf tmpBuf2; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; @@ -163,6 +149,7 @@ private: uint32_t tileNum; uint32_t tileLength; uint32_t lastTileLength; + uint32_t shorterAxisLen; }; template <> class KernelAdd { @@ -175,9 +162,11 @@ public: if (tiling.xLen > tiling.yLen) { longerInputPtr = x; shorterInputPtr = y; + this->shorterAxisLen = tiling.yLen; } else { longerInputPtr = y; shorterInputPtr = x; + this->shorterAxisLen = tiling.xLen; } this->coef = tiling.coef; if (tiling.isEvenCore) { @@ -186,7 +175,7 @@ public: this->lastTileLength = tiling.lastTileLength; xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { @@ -195,7 +184,7 @@ public: this->lastTileLength = tiling.formerLastTileLength; xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); } else { 
this->tileNum = tiling.tailTileNum; @@ -204,17 +193,15 @@ public: xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); } } pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(int8_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(int8_t)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(half)); pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(half)); } @@ -234,17 +221,11 @@ private: AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; - AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(int8_t)), 0, 0, 0}; + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - copyXParams, padParams); - } else { - AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); - } - AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + AscendC::DataCopyPad(xLocal, xGm[progress * 
this->tileLength], copyParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[(progress % BUFFER_NUM) * this->tileLength], copyParams, padParams); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } @@ -254,19 +235,15 @@ private: AscendC::LocalTensor yLocal = inQueueY.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); - uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; - uint32_t srcShape[] = {1, this->coef}; - AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); - AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - AscendC::Cast(tmpTensor1, broadcastTmpTensor, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); - AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); + outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); @@ -276,11 +253,8 @@ private: { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); - } else { - AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); - } + + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); outQueueZ.FreeTensor(zLocal); } @@ -292,7 +266,6 @@ private: AscendC::TBuf tmpBuf0; AscendC::TBuf tmpBuf1; - AscendC::TBuf tmpBuf2; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; 
@@ -302,8 +275,9 @@ private: uint32_t tileNum; uint32_t tileLength; uint32_t lastTileLength; + uint32_t shorterAxisLen; }; - + template class KernelAdd { public: __aicore__ inline KernelAdd() {} @@ -314,9 +288,11 @@ public: if (tiling.xLen > tiling.yLen) { longerInputPtr = x; shorterInputPtr = y; + this->shorterAxisLen = tiling.yLen; } else { longerInputPtr = y; shorterInputPtr = x; + this->shorterAxisLen = tiling.xLen; } this->coef = tiling.coef; if (tiling.isEvenCore) { @@ -325,7 +301,7 @@ public: this->lastTileLength = tiling.lastTileLength; xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); - yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { @@ -334,7 +310,7 @@ public: this->lastTileLength = tiling.formerLastTileLength; xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); } else { this->tileNum = tiling.tailTileNum; @@ -343,16 +319,14 @@ public: xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - 
tiling.formerNum), tiling.tailLength); } } pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(dataType)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(dataType)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(dataType)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(dataType)); - - pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(dataType)); } __aicore__ inline void Process() { @@ -370,17 +344,11 @@ private: AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; - AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(dataType)), 0, 0, 0}; + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - copyXParams, padParams); - } else { - AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); - } - AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[(progress % BUFFER_NUM) * this->tileLength], copyParams, padParams); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } @@ -390,12 +358,7 @@ private: AscendC::LocalTensor yLocal = inQueueY.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); - uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; - uint32_t srcShape[] = {1, this->coef}; - AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); - - 
AscendC::Add(zLocal, xLocal, broadcastTmpTensor, this->tileLength); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); @@ -405,11 +368,8 @@ private: { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); - } else { - AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); - } + + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); outQueueZ.FreeTensor(zLocal); } @@ -419,8 +379,6 @@ private: AscendC::TQue inQueueY; AscendC::TQue outQueueZ; - AscendC::TBuf tmpBuf2; - AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; @@ -429,6 +387,7 @@ private: uint32_t tileNum; uint32_t tileLength; uint32_t lastTileLength; + uint32_t shorterAxisLen; }; // 针对axis = 1的场景 diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp index 85e624b07..3a92ab43b 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp @@ -15,9 +15,8 @@ constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; constexpr uint32_t BLOCK_SIZE = 32; constexpr uint32_t BUFFER_NUM = 2; -constexpr uint32_t UB_BLOCK_NUM = 50; // UB最大可以使用的block数量 +constexpr uint32_t UB_BLOCK_NUM = 100; // UB最大可以使用的block数量 constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * 
BUFFER_NUM; -constexpr uint32_t BLOCK_DIM = 8; void TilingParamsCalc(uint32_t length, uint32_t ubBlockNum, uint32_t& tileNum, uint32_t& tileLength, uint32_t& lastTileLength) { @@ -41,7 +40,7 @@ void TilingParamsCalc(uint32_t length, uint32_t ubBlockNum, } } -void GenerateTilingData(uint8_t* tilingBuf) +void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) { uint32_t xLen; uint32_t yLen; @@ -57,18 +56,24 @@ void GenerateTilingData(uint8_t* tilingBuf) uint32_t alignNum = BLOCK_SIZE / dataTypeSize; uint32_t shorterAxisLen = (xLen < yLen)? xLen : yLen; - uint32_t coef = totalLength / shorterAxisLen; + uint32_t alignCoef = (tiling->axis == 0)? shorterAxisLen : totalLength / shorterAxisLen; + uint32_t divDimCoef = (tiling->axis == 0)? totalLength / shorterAxisLen : shorterAxisLen; uint32_t ubBlockAligned = - (MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (coef * BUFFER_NUM) * (coef * BUFFER_NUM) == 0U)? - MAX_AVAILABLE_UB_BLOCK_NUM : MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (coef * BUFFER_NUM) * (coef * BUFFER_NUM); + (MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (alignCoef * BUFFER_NUM) * (alignCoef * BUFFER_NUM) == 0U)? 
+ MAX_AVAILABLE_UB_BLOCK_NUM : MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (alignCoef * BUFFER_NUM) * (alignCoef * BUFFER_NUM); - if (shorterAxisLen % (BLOCK_DIM * BUFFER_NUM) == 0U) { - uint32_t blockLength = shorterAxisLen / BLOCK_DIM * coef; + if (divDimCoef % blockDim == 0U) { + uint32_t blockLength = divDimCoef / blockDim * alignCoef; uint32_t tileNum = 0; uint32_t tileLength = 0; uint32_t lastTileLength = 0; - - TilingParamsCalc(blockLength, ubBlockAligned, tileNum, tileLength, lastTileLength); + if (tiling->axis == 0) { + tileNum = blockLength / shorterAxisLen; + tileLength = shorterAxisLen; + lastTileLength = tileLength; + } else { + TilingParamsCalc(blockLength, ubBlockAligned, tileNum, tileLength, lastTileLength); + } tiling->blockLength = blockLength; tiling->tileNum = tileNum; @@ -76,11 +81,11 @@ void GenerateTilingData(uint8_t* tilingBuf) tiling->lastTileLength = lastTileLength; tiling->isEvenCore = 1U; } else { - uint32_t formerNum = (shorterAxisLen / BUFFER_NUM) % BLOCK_DIM; - uint32_t tailNum = BLOCK_DIM - formerNum; + uint32_t formerNum = (divDimCoef) % blockDim; + uint32_t tailNum = blockDim - formerNum; - uint32_t formerLength = (((shorterAxisLen / BUFFER_NUM) / BLOCK_DIM) + 1) * BUFFER_NUM * coef; - uint32_t tailLength = ((shorterAxisLen / BUFFER_NUM) / BLOCK_DIM) * BUFFER_NUM * coef; + uint32_t formerLength = (divDimCoef / blockDim + 1) * alignCoef; + uint32_t tailLength = divDimCoef / blockDim * alignCoef; uint32_t formerTileNum; uint32_t formerTileLength; @@ -89,11 +94,20 @@ void GenerateTilingData(uint8_t* tilingBuf) uint32_t tailTileNum; uint32_t tailTileLength; uint32_t tailLastTileLength; + if (tiling->axis == 0) { + formerTileNum = formerLength / shorterAxisLen; + formerTileLength = shorterAxisLen; + formerLastTileLength = shorterAxisLen; - TilingParamsCalc(formerLength, ubBlockAligned, - formerTileNum, formerTileLength, formerLastTileLength); - TilingParamsCalc(tailLength, ubBlockAligned, - tailTileNum, tailTileLength, tailLastTileLength); 
+ tailTileNum = tailLength / shorterAxisLen; + tailTileLength = shorterAxisLen; + tailLastTileLength = shorterAxisLen; + } else { + TilingParamsCalc(formerLength, ubBlockAligned, + formerTileNum, formerTileLength, formerLastTileLength); + TilingParamsCalc(tailLength, ubBlockAligned, + tailTileNum, tailTileLength, tailLastTileLength); + } tiling->formerNum = formerNum; tiling->formerLength = formerLength; diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp index b5c077951..3b76bdf69 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp @@ -17,7 +17,7 @@ #include "tikicpulib.h" extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling); #endif -extern void GenerateTilingData(uint8_t* tilingBuf); +extern void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim); int32_t main(int32_t argc, char *argv[]) @@ -43,7 +43,7 @@ int32_t main(int32_t argc, char *argv[]) CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); ReadFile("./input/input_tiling.bin", tilingSize, tiling, tilingSize); #endif - GenerateTilingData(tiling); + GenerateTilingData(tiling, BLOCK_DIM); uint32_t dataTypeSize = DATA_TYPE_SIZE[reinterpret_cast(tiling)->dataType]; uint32_t xLen = reinterpret_cast(tiling)->xLen; uint32_t yLen = reinterpret_cast(tiling)->yLen; diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py index ce8a8f4ae..95cd2435c 100644 --- 
a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py @@ -14,13 +14,41 @@ bfloat16 = tf.bfloat16.as_numpy_dtype dtype_emu = {bfloat16: 0, np.float16: 1, np.float32: 2, np.int8: 3, np.int16: 4, np.int32: 5} def gen_golden_data_simple(): - input_shape_x = [32, 128] - input_shape_y = [1, 128] + # dtype = np.float32 + # dtype = bfloat16 + dtype = np.int8 - #input_shape_x = [14, 1] - #input_shape_y = [14, 280] + ## Broadcast场景 axis = 0时, 核间均分, 单核计算量对齐 + # input_shape_x = [8, 1024] + # input_shape_y = [1, 1024] - dtype = np.int8 + ## Broadcast场景 axis = 0时, 核间均分, 单核计算量非对齐 + # input_shape_x = [8, 1022] + # input_shape_y = [1, 1022] + + ## Broadcast场景 axis = 0时, 核间不均分, 单核计算量对齐 + input_shape_x = [17, 1024] + input_shape_y = [1, 1024] + + ## Broadcast场景 axis = 0时, 核间不均分, 单核计算量非对齐 + input_shape_x = [17, 1022] + input_shape_y = [1, 1022] + + ## Broadcast场景 axis = 1时, 核间均分, 单核计算量对齐 + # input_shape_x = [16, 1] + # input_shape_y = [16, 256] + + ## Broadcast场景 axis = 1时, 核间均分, 单核计算量非对齐 + # input_shape_x = [16, 1] + # input_shape_y = [16, 255] + + ## Broadcast场景 axis = 1时, 核间不均分, 单核计算量对齐 + # input_shape_x = [20, 1] + # input_shape_y = [20, 256] + + ## Broadcast场景 axis = 1时, 核间不均分, 单核计算量非对齐 + # input_shape_x = [20, 1] + # input_shape_y = [20, 255] input_x = np.random.uniform(-50, 50, input_shape_x).astype(dtype) input_y = np.random.uniform(-50, 50, input_shape_y).astype(dtype) -- Gitee From 834aa966a81d8568a954ee9ebcacc16be634bfb2 Mon Sep 17 00:00:00 2001 From: anzoola Date: Thu, 3 Apr 2025 07:07:27 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=E8=A1=A5=E5=85=85=E9=99=A40=E6=A0=A1?= =?UTF-8?q?=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../VectorAddMultiCoreWithTiling/add_custom_tiling.cpp | 2 +- .../add_custom_tiling.cpp | 7 ++++--- 2 files 
changed, 5 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp index b71573e2d..ca3f2cee6 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp @@ -54,7 +54,7 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) dataTypeSize = DATA_TYPE_SIZE[tiling->dataType]; uint32_t alignNum = BLOCK_SIZE / dataTypeSize; - assert(alignNum != 0U); + assert((alignNum != 0U) && (blockDim != 0U)); /** 计算使用的核数 **/ /* 如果传入数据的长度非32B对齐, 计算对齐后的长度*/ totalLengthAligned = (totalLength % alignNum == 0U)? diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp index 3a92ab43b..1690d529a 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp @@ -55,9 +55,10 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) totalLength = (xLen > yLen)? xLen : yLen; uint32_t alignNum = BLOCK_SIZE / dataTypeSize; + assert((alignNum != 0U) && (blockDim != 0U)); uint32_t shorterAxisLen = (xLen < yLen)? xLen : yLen; - uint32_t alignCoef = (tiling->axis == 0)? shorterAxisLen : totalLength / shorterAxisLen; - uint32_t divDimCoef = (tiling->axis == 0)? totalLength / shorterAxisLen : shorterAxisLen; + uint32_t alignCoef = (tiling->axis == 0U)? 
shorterAxisLen : totalLength / shorterAxisLen; + uint32_t divDimCoef = (tiling->axis == 0U)? totalLength / shorterAxisLen : shorterAxisLen; uint32_t ubBlockAligned = (MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (alignCoef * BUFFER_NUM) * (alignCoef * BUFFER_NUM) == 0U)? MAX_AVAILABLE_UB_BLOCK_NUM : MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (alignCoef * BUFFER_NUM) * (alignCoef * BUFFER_NUM); @@ -67,7 +68,7 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) uint32_t tileNum = 0; uint32_t tileLength = 0; uint32_t lastTileLength = 0; - if (tiling->axis == 0) { + if (tiling->axis == 0U) { tileNum = blockLength / shorterAxisLen; tileLength = shorterAxisLen; lastTileLength = tileLength; -- Gitee