diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp
index df072c8e677da1757641fdc125991ab5154ddb6e..0d3d3c29726b51b06f930c2276c21704fa996cf5 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom.cpp
@@ -33,66 +33,72 @@ public:
             this->tileLength = tiling.tileLength / BUFFER_NUM;
             this->lastTileLength = tiling.lastTileLength;
 
-            xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-            yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-            zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+            uint64_t offset = this->blockLength * AscendC::GetBlockIdx();
+            xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + offset, this->blockLength);
+            yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + offset, this->blockLength);
+            zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + offset, this->blockLength);
         } else {
             if (AscendC::GetBlockIdx() < tiling.formerNum) {
                 this->tileNum = tiling.formerTileNum;
                 this->tileLength = tiling.formerTileLength / BUFFER_NUM;
                 this->lastTileLength = tiling.formerLastTileLength;
 
-                xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
-                yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
-                zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
+                uint64_t offset = tiling.formerLength * AscendC::GetBlockIdx();
+                xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + offset, tiling.formerLength);
+                yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + offset, tiling.formerLength);
+                zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + offset, tiling.formerLength);
             } else {
                 this->tileNum = tiling.tailTileNum;
                 this->tileLength = tiling.tailTileLength / BUFFER_NUM;
                 this->lastTileLength = tiling.tailLastTileLength;
 
-                xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
-                yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
-                zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
+                uint64_t offset = tiling.formerLength * tiling.formerNum + tiling.tailLength *
+                    (AscendC::GetBlockIdx() - tiling.formerNum);
+                xGm.SetGlobalBuffer((__gm__ bfloat16_t *)x + offset, tiling.tailLength);
+                yGm.SetGlobalBuffer((__gm__ bfloat16_t *)y + offset, tiling.tailLength);
+                zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + offset, tiling.tailLength);
             }
         }
-        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t));
-        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t));
-        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t));
 
-        pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(float));
-        pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(float));
+        this->initBufferLength = AscendC::Std::max(this->tileLength, this->lastTileLength);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->initBufferLength * sizeof(bfloat16_t));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->initBufferLength * sizeof(bfloat16_t));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->initBufferLength * sizeof(bfloat16_t));
+
+        pipe.InitBuffer(tmpBuf0, this->initBufferLength * sizeof(float));
+        pipe.InitBuffer(tmpBuf1, this->initBufferLength * sizeof(float));
     }
     __aicore__ inline void Process()
     {
-        int32_t loopCount = this->tileNum * BUFFER_NUM;
-        for (int32_t i = 0; i < loopCount; i++) {
-            CopyIn(i);
-            Compute(i);
-            CopyOut(i);
+        // 整块进行double buffer计算
+        uint32_t loopCount = this->tileNum * BUFFER_NUM;
+        for (uint32_t i = 0; i < loopCount; i++) {
+            CopyIn(i, this->tileLength);
+            Compute(i, this->tileLength);
+            CopyOut(i, this->tileLength);
+        }
+
+        // 进行尾块计算, 不做double buffer操作
+        if (this->lastTileLength > 0) {
+            CopyIn(loopCount, this->lastTileLength);
+            Compute(loopCount, this->lastTileLength);
+            CopyOut(loopCount, this->lastTileLength);
         }
     }
 
 private:
-    __aicore__ inline void CopyIn(int32_t progress)
+    __aicore__ inline void CopyIn(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<bfloat16_t> xLocal = inQueueX.AllocTensor<bfloat16_t>();
         AscendC::LocalTensor<bfloat16_t> yLocal = inQueueY.AllocTensor<bfloat16_t>();
-        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
-            AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
-                this->tileLength);
-            AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
-                this->tileLength);
-        } else {
-            AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
-            AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
-        }
+
+        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], tileLength);
+        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], tileLength);
+
         inQueueX.EnQue(xLocal);
         inQueueY.EnQue(yLocal);
     }
-    __aicore__ inline void Compute(int32_t progress)
+    __aicore__ inline void Compute(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<bfloat16_t> xLocal = inQueueX.DeQue<bfloat16_t>();
         AscendC::LocalTensor<bfloat16_t> yLocal = inQueueY.DeQue<bfloat16_t>();
@@ -101,25 +107,20 @@ private:
         AscendC::LocalTensor<float> tmpTensor0 = tmpBuf0.Get<float>();
         AscendC::LocalTensor<float> tmpTensor1 = tmpBuf1.Get<float>();
 
-        AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength);
-        AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength);
+        AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, tileLength);
+        AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, tileLength);
 
-        AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength);
-        AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, this->tileLength); 
+        AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, tileLength);
+        AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, tileLength);
 
         outQueueZ.EnQue<bfloat16_t>(zLocal);
         inQueueX.FreeTensor(xLocal);
         inQueueY.FreeTensor(yLocal);
     }
-    __aicore__ inline void CopyOut(int32_t progress)
+    __aicore__ inline void CopyOut(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<bfloat16_t> zLocal = outQueueZ.DeQue<bfloat16_t>();
-        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
-            AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal,
-                this->tileLength);
-        } else {
-            AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
-        }
+        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, tileLength);
         outQueueZ.FreeTensor(zLocal);
     }
 
@@ -135,6 +136,7 @@ private:
     AscendC::GlobalTensor<bfloat16_t> yGm;
     AscendC::GlobalTensor<bfloat16_t> zGm;
 
+    uint32_t initBufferLength;     // initBuffer所用长度
     uint32_t blockLength;
     uint32_t tileNum;
     uint32_t tileLength;
@@ -152,66 +154,72 @@ public:
             this->tileLength = tiling.tileLength / BUFFER_NUM;
             this->lastTileLength = tiling.lastTileLength;
 
-            xGm.SetGlobalBuffer((__gm__ int8_t *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-            yGm.SetGlobalBuffer((__gm__ int8_t *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-            zGm.SetGlobalBuffer((__gm__ int8_t *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+            uint64_t offset = this->blockLength * AscendC::GetBlockIdx();
+            xGm.SetGlobalBuffer((__gm__ int8_t *)x + offset, this->blockLength);
+            yGm.SetGlobalBuffer((__gm__ int8_t *)y + offset, this->blockLength);
+            zGm.SetGlobalBuffer((__gm__ int8_t *)z + offset, this->blockLength);
         } else {
             if (AscendC::GetBlockIdx() < tiling.formerNum) {
                 this->tileNum = tiling.formerTileNum;
                 this->tileLength = tiling.formerTileLength / BUFFER_NUM;
                 this->lastTileLength = tiling.formerLastTileLength;
 
-                xGm.SetGlobalBuffer((__gm__ int8_t *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
-                yGm.SetGlobalBuffer((__gm__ int8_t *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
-                zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
+                uint64_t offset = tiling.formerLength * AscendC::GetBlockIdx();
+                xGm.SetGlobalBuffer((__gm__ int8_t *)x + offset, tiling.formerLength);
+                yGm.SetGlobalBuffer((__gm__ int8_t *)y + offset, tiling.formerLength);
+                zGm.SetGlobalBuffer((__gm__ int8_t *)z + offset, tiling.formerLength);
             } else {
                 this->tileNum = tiling.tailTileNum;
                 this->tileLength = tiling.tailTileLength / BUFFER_NUM;
                 this->lastTileLength = tiling.tailLastTileLength;
 
-                xGm.SetGlobalBuffer((__gm__ int8_t *)x + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
-                yGm.SetGlobalBuffer((__gm__ int8_t *)y + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
-                zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
+                uint64_t offset = tiling.formerLength * tiling.formerNum + tiling.tailLength *
+                    (AscendC::GetBlockIdx() - tiling.formerNum);
+                xGm.SetGlobalBuffer((__gm__ int8_t *)x + offset, tiling.tailLength);
+                yGm.SetGlobalBuffer((__gm__ int8_t *)y + offset, tiling.tailLength);
+                zGm.SetGlobalBuffer((__gm__ int8_t *)z + offset, tiling.tailLength);
             }
         }
-        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(int8_t));
-        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(int8_t));
-        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(int8_t));
 
-        pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(half));
-        pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(half));
+        this->initBufferLength = AscendC::Std::max(this->tileLength, this->lastTileLength);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->initBufferLength * sizeof(int8_t));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->initBufferLength * sizeof(int8_t));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->initBufferLength * sizeof(int8_t));
+
+        pipe.InitBuffer(tmpBuf0, this->initBufferLength * sizeof(half));
+        pipe.InitBuffer(tmpBuf1, this->initBufferLength * sizeof(half));
     }
     __aicore__ inline void Process()
     {
-        int32_t loopCount = this->tileNum * BUFFER_NUM;
-        for (int32_t i = 0; i < loopCount; i++) {
-            CopyIn(i);
-            Compute(i);
-            CopyOut(i);
+        // 整块进行double buffer计算
+        uint32_t loopCount = this->tileNum * BUFFER_NUM;
+        for (uint32_t i = 0; i < loopCount; i++) {
+            CopyIn(i, this->tileLength);
+            Compute(i, this->tileLength);
+            CopyOut(i, this->tileLength);
+        }
+
+        // 进行尾块计算, 不做double buffer操作
+        if (this->lastTileLength > 0) {
+            CopyIn(loopCount, this->lastTileLength);
+            Compute(loopCount, this->lastTileLength);
+            CopyOut(loopCount, this->lastTileLength);
         }
     }
 
 private:
-    __aicore__ inline void CopyIn(int32_t progress)
+    __aicore__ inline void CopyIn(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<int8_t> xLocal = inQueueX.AllocTensor<int8_t>();
         AscendC::LocalTensor<int8_t> yLocal = inQueueY.AllocTensor<int8_t>();
-        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
-            AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
-                this->tileLength);
-            AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
-                this->tileLength);
-        } else {
-            AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
-            AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
-        }
+
+        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], tileLength);
+        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], tileLength);
+
         inQueueX.EnQue(xLocal);
         inQueueY.EnQue(yLocal);
     }
-    __aicore__ inline void Compute(int32_t progress)
+    __aicore__ inline void Compute(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<int8_t> xLocal = inQueueX.DeQue<int8_t>();
         AscendC::LocalTensor<int8_t> yLocal = inQueueY.DeQue<int8_t>();
@@ -220,25 +228,20 @@ private:
         AscendC::LocalTensor<half> tmpTensor0 = tmpBuf0.Get<half>();
         AscendC::LocalTensor<half> tmpTensor1 = tmpBuf1.Get<half>();
 
-        AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength);
-        AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength);
+        AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, tileLength);
+        AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, tileLength);
 
-        AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength);
-        AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength);
+        AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, tileLength);
+        AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, tileLength);
 
         outQueueZ.EnQue<int8_t>(zLocal);
         inQueueX.FreeTensor(xLocal);
         inQueueY.FreeTensor(yLocal);
     }
-    __aicore__ inline void CopyOut(int32_t progress)
+    __aicore__ inline void CopyOut(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<int8_t> zLocal = outQueueZ.DeQue<int8_t>();
-        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
-            AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal,
-                this->tileLength);
-        } else {
-            AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
-        }
+        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, tileLength);
         outQueueZ.FreeTensor(zLocal);
     }
 
@@ -254,6 +257,7 @@ private:
     AscendC::GlobalTensor<int8_t> yGm;
     AscendC::GlobalTensor<int8_t> zGm;
 
+    uint32_t initBufferLength;     // initBuffer所用长度
     uint32_t blockLength;
     uint32_t tileNum;
     uint32_t tileLength;
@@ -271,83 +275,83 @@ public:
             this->tileLength = tiling.tileLength / BUFFER_NUM;
             this->lastTileLength = tiling.lastTileLength;
 
-            xGm.SetGlobalBuffer((__gm__ dataType *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-            yGm.SetGlobalBuffer((__gm__ dataType *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
-            zGm.SetGlobalBuffer((__gm__ dataType *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+            uint64_t offset = this->blockLength * AscendC::GetBlockIdx();
+            xGm.SetGlobalBuffer((__gm__ dataType *)x + offset, this->blockLength);
+            yGm.SetGlobalBuffer((__gm__ dataType *)y + offset, this->blockLength);
+            zGm.SetGlobalBuffer((__gm__ dataType *)z + offset, this->blockLength);
         } else {
             if (AscendC::GetBlockIdx() < tiling.formerNum) {
                 this->tileNum = tiling.formerTileNum;
                 this->tileLength = tiling.formerTileLength / BUFFER_NUM;
                 this->lastTileLength = tiling.formerLastTileLength;
 
-                xGm.SetGlobalBuffer((__gm__ dataType *)x + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
-                yGm.SetGlobalBuffer((__gm__ dataType *)y + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
-                zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength);
+                uint64_t offset = tiling.formerLength * AscendC::GetBlockIdx();
+                xGm.SetGlobalBuffer((__gm__ dataType *)x + offset, tiling.formerLength);
+                yGm.SetGlobalBuffer((__gm__ dataType *)y + offset, tiling.formerLength);
+                zGm.SetGlobalBuffer((__gm__ dataType *)z + offset, tiling.formerLength);
             } else {
                 this->tileNum = tiling.tailTileNum;
                 this->tileLength = tiling.tailTileLength / BUFFER_NUM;
                 this->lastTileLength = tiling.tailLastTileLength;
 
-                xGm.SetGlobalBuffer((__gm__ dataType *)x + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
-                yGm.SetGlobalBuffer((__gm__ dataType *)y + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
-                zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * tiling.formerNum + 
-                    tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength);
+                uint64_t offset = tiling.formerLength * tiling.formerNum + tiling.tailLength *
+                    (AscendC::GetBlockIdx() - tiling.formerNum);
+                xGm.SetGlobalBuffer((__gm__ dataType *)x + offset, tiling.tailLength);
+                yGm.SetGlobalBuffer((__gm__ dataType *)y + offset, tiling.tailLength);
+                zGm.SetGlobalBuffer((__gm__ dataType *)z + offset, tiling.tailLength);
             }
         }
-        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(dataType));
-        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(dataType));
-        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(dataType));
+        this->initBufferLength = AscendC::Std::max(this->tileLength, this->lastTileLength);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->initBufferLength * sizeof(dataType));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->initBufferLength * sizeof(dataType));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->initBufferLength * sizeof(dataType));
     }
     __aicore__ inline void Process()
     {
-        int32_t loopCount = this->tileNum * BUFFER_NUM;
-        for (int32_t i = 0; i < loopCount; i++) {
-            CopyIn(i);
-            Compute(i);
-            CopyOut(i);
+        // 整块进行double buffer计算
+        uint32_t loopCount = this->tileNum * BUFFER_NUM;
+        for (uint32_t i = 0; i < loopCount; i++) {
+            CopyIn(i, this->tileLength);
+            Compute(i, this->tileLength);
+            CopyOut(i, this->tileLength);
+        }
+
+        // 进行尾块计算, 不做double buffer操作
+        if (this->lastTileLength > 0) {
+            CopyIn(loopCount, this->lastTileLength);
+            Compute(loopCount, this->lastTileLength);
+            CopyOut(loopCount, this->lastTileLength);
         }
     }
 
 private:
-    __aicore__ inline void CopyIn(int32_t progress)
+    __aicore__ inline void CopyIn(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<dataType> xLocal = inQueueX.AllocTensor<dataType>();
         AscendC::LocalTensor<dataType> yLocal = inQueueY.AllocTensor<dataType>();
-        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
-            AscendC::DataCopy(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
-                this->tileLength);
-            AscendC::DataCopy(yLocal, yGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength],
-                this->tileLength);
-        } else {
-            AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
-            AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
-        }
+
+        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], tileLength);
+        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], tileLength);
+
         inQueueX.EnQue(xLocal);
         inQueueY.EnQue(yLocal);
     }
-    __aicore__ inline void Compute(int32_t progress)
+    __aicore__ inline void Compute(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<dataType> xLocal = inQueueX.DeQue<dataType>();
         AscendC::LocalTensor<dataType> yLocal = inQueueY.DeQue<dataType>();
         AscendC::LocalTensor<dataType> zLocal = outQueueZ.AllocTensor<dataType>();
 
-        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
+        AscendC::Add(zLocal, xLocal, yLocal, tileLength);
 
         outQueueZ.EnQue<dataType>(zLocal);
         inQueueX.FreeTensor(xLocal);
         inQueueY.FreeTensor(yLocal);
     }
-    __aicore__ inline void CopyOut(int32_t progress)
+    __aicore__ inline void CopyOut(uint32_t progress, uint32_t tileLength)
     {
         AscendC::LocalTensor<dataType> zLocal = outQueueZ.DeQue<dataType>();
-        if (progress == (this->tileNum * BUFFER_NUM - 1)) {
-            AscendC::DataCopy(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal,
-                this->tileLength);
-        } else {
-            AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
-        }
+        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
         outQueueZ.FreeTensor(zLocal);
     }
 
@@ -361,12 +365,13 @@ private:
     AscendC::GlobalTensor<dataType> yGm;
     AscendC::GlobalTensor<dataType> zGm;
 
+    uint32_t initBufferLength;     // initBuffer所用长度
     uint32_t blockLength;
     uint32_t tileNum;
     uint32_t tileLength;
     uint32_t lastTileLength;
 };
-        
+
 extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
 {
     if (tiling.dataType == ADD_BFLOAT16) {
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp
index 8940940b055a4d85e9e0da7398263337bfbe7ab5..1c951d7e45d5e683eb968995b314b2592888284f 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp
@@ -15,36 +15,35 @@
 constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4};
 constexpr uint32_t BLOCK_SIZE = 32;
 constexpr uint32_t BUFFER_NUM = 2;
-constexpr uint32_t UB_BLOCK_NUM = 100;  // UB最大可以使用的block数量
-constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM;
+constexpr uint32_t UB_BLOCK_NUM = 100;                                                      // UB最大可以使用的block数量
+constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM;     // 每个buffer可用block数量
 
 // tiling参数计算函数
-void TilingParamsCalc(uint32_t length, uint32_t alignNum,
-    uint32_t& tileNum, uint32_t& tileLength, uint32_t& lastTileLength)
+// length: 每个核的数据长度
+void TilingParamsCalc(uint32_t length, uint32_t alignNum, uint32_t& tileNum, uint32_t& tileLength,
+    uint32_t& lastTileLength)
 {
     assert(alignNum != 0U);
     tileNum = length / (alignNum * MAX_AVAILABLE_UB_BLOCK_NUM);
-    if ((static_cast<uint32_t>(length / alignNum) % MAX_AVAILABLE_UB_BLOCK_NUM == 0U) || tileNum == 0U) {
-        if (tileNum == 0U) {
-            tileNum = 1U;
-        }
-        if (length < MAX_AVAILABLE_UB_BLOCK_NUM * alignNum) {
-            tileLength = ((static_cast<size_t>(length) + alignNum - 1) / alignNum) * alignNum;
-            lastTileLength = tileLength;
-        } else {
-            tileLength = MAX_AVAILABLE_UB_BLOCK_NUM * alignNum;
-            lastTileLength = (uint32_t)(length - (tileNum - 1) * tileLength);
-        }
+
+    // 单核需要计算的长度 < 单核UB最大一次可计算长度 -> 仅有尾块
+    if (tileNum == 0U) {
+        tileLength = 0;
+        lastTileLength = ((static_cast<size_t>(length) + alignNum - 1) / alignNum) * alignNum;
+    } else if (static_cast<uint32_t>(length / alignNum) % MAX_AVAILABLE_UB_BLOCK_NUM == 0U) {
+        // 单核需要计算的长度 = 单核UB最大一次可计算长度 的整数倍 -> 仅有整块
+        tileLength = MAX_AVAILABLE_UB_BLOCK_NUM * alignNum;
+        lastTileLength = 0;
     } else {
-        tileNum++;
+        // 有整块 + 尾块
         tileLength = MAX_AVAILABLE_UB_BLOCK_NUM * alignNum;
-        lastTileLength = (uint32_t)(length - (tileNum - 1) * tileLength);
+        lastTileLength = (uint32_t)(length - tileNum* tileLength);
     }
 }
 
 void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim)
 {
-    uint32_t totalLength;
+    uint32_t totalLength;           // 总共要计算的元素个数
     uint32_t dataTypeSize;
     uint32_t blockLength;
     uint32_t totalLengthAligned;
@@ -53,7 +52,7 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim)
     totalLength = tiling->totalLength;
     dataTypeSize = DATA_TYPE_SIZE[tiling->dataType];
 
-    uint32_t alignNum = BLOCK_SIZE / dataTypeSize;
+    uint32_t alignNum = BLOCK_SIZE / dataTypeSize;     // 一个block中的元素个数
     assert((alignNum != 0U) && (blockDim != 0U));
     /** 计算使用的核数 **/
     /* 如果传入数据的长度非32B对齐, 计算对齐后的长度*/
@@ -90,10 +89,8 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim)
         uint32_t tailTileLength;
         uint32_t tailLastTileLength;
 
-        TilingParamsCalc(formerLength, alignNum,
-            formerTileNum, formerTileLength, formerLastTileLength);
-        TilingParamsCalc(tailLength, alignNum,
-            tailTileNum, tailTileLength, tailLastTileLength);
+        TilingParamsCalc(formerLength, alignNum, formerTileNum, formerTileLength, formerLastTileLength);
+        TilingParamsCalc(tailLength, alignNum, tailTileNum, tailTileLength, tailLastTileLength);
 
         tiling->formerNum = formerNum;
         tiling->formerLength = formerLength;
@@ -106,6 +103,7 @@ void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim)
         tiling->tailTileNum = tailTileNum;
         tiling->tailTileLength = tailTileLength;
         tiling->tailLastTileLength = tailLastTileLength;
+        
         tiling->isEvenCore = 0U;
     }
 }
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h
index 7b82c3377b1a8a51bb93cd5d0e5ee3f61a0594f2..441a56757b400685672a0abb0f1cd588d23f87cb 100644
--- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h
+++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.h
@@ -14,23 +14,24 @@
 struct AddCustomTilingData {
     uint32_t totalLength;
     uint32_t dataType;
+    // 核间可均分数据块时，主块的参数
+    uint32_t blockLength;      // 每个核要计算的数据总长度
+    uint32_t tileNum;          // 数据整块数
+    uint32_t tileLength;       // 数据整块的元素个数
+    uint32_t lastTileLength;   // 数据尾块的元素个数
 
-    uint32_t blockLength;
-    uint32_t tileNum;
-    uint32_t tileLength;
-    uint32_t lastTileLength;
-
-    uint32_t formerNum;
+    // 核间不可均分数据块时，主块的参数
+    uint32_t formerNum;              // 主块核数
     uint32_t formerLength;
     uint32_t formerTileNum;
     uint32_t formerTileLength;
     uint32_t formerLastTileLength;
-
-    uint32_t tailNum; 
+    // 核间不可均分数据块时，尾块的参数
+    uint32_t tailNum;      // 尾块核数
     uint32_t tailLength;
     uint32_t tailTileNum;
     uint32_t tailTileLength;
-    uint32_t tailLastTileLength; 
+    uint32_t tailLastTileLength;
 
     uint32_t isEvenCore;
 };