diff --git a/ccsrc/CMakeLists.txt b/ccsrc/CMakeLists.txt index baa1cecbee3497fc22105d569110d701c90a9f74..b74143bf35c8e574ff61e36d1d9e6d4862c882f6 100644 --- a/ccsrc/CMakeLists.txt +++ b/ccsrc/CMakeLists.txt @@ -58,21 +58,27 @@ message(STATUS "CFLAGS_INCLUDES: ${CFLAGS_INCLUDES}") # ============================================================================= # Get YAML files # ============================================================================= -file(GLOB_RECURSE YAML_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../yaml/*.yaml") -message(STATUS "YAML_FILES: ${YAML_FILES}") - -# Build Python list format: ['path1', 'path2', 'path3'] -set(YAML_STRING "[") -set(FIRST_ITEM TRUE) -foreach(YAML_FILE ${YAML_FILES}) - if(NOT FIRST_ITEM) - set(YAML_STRING "${YAML_STRING}, ") - endif() - set(YAML_STRING "${YAML_STRING}'${YAML_FILE}'") - set(FIRST_ITEM FALSE) -endforeach() -set(YAML_STRING "${YAML_STRING}]") -message(STATUS "YAML_STRING: ${YAML_STRING}") +function(get_yaml_files YAML_FILES OUTPUT_VAR) + set(YAML_STRING "[") + set(FIRST_ITEM TRUE) + foreach(YAML_FILE ${YAML_FILES}) + if(NOT FIRST_ITEM) + set(YAML_STRING "${YAML_STRING}, ") + endif() + set(YAML_STRING "${YAML_STRING}'${YAML_FILE}'") + set(FIRST_ITEM FALSE) + endforeach() + set(YAML_STRING "${YAML_STRING}]") + set(${OUTPUT_VAR} "${YAML_STRING}" PARENT_SCOPE) +endfunction() + +file(GLOB_RECURSE OPS_YAML_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../yaml/*_op.yaml") +message(STATUS "OPS_YAML_FILES: ${OPS_YAML_FILES}") +get_yaml_files("${OPS_YAML_FILES}" OPS_YAML_STRING) + +file(GLOB_RECURSE DOC_YAML_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../yaml/*_doc.yaml") +message(STATUS "DOC_YAML_FILES: ${DOC_YAML_FILES}") +get_yaml_files("${DOC_YAML_FILES}" DOC_YAML_STRING) # ============================================================================= # Custom Op Builder @@ -86,7 +92,8 @@ src_files = '${SRC_FILES}'.split(';') ms.ops.CustomOpBuilder( name='${MS_EXTENSION_NAME}', sources=src_files, - yaml=${YAML_STRING}, + 
yaml=${OPS_YAML_STRING}, + doc=${DOC_YAML_STRING}, backend='Ascend', cflags='${CFLAGS_INCLUDES}', ldflags='-L${INTERNAL_KERNEL_LIB_PATH} -l${LIBS}', diff --git a/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom.cpp b/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom.cpp index df71699db490ebf349f7d38dc6684abc1234ccc0..9d2b3578fa90647a48bbb6435241cf553622e892 100644 --- a/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom.cpp +++ b/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom.cpp @@ -9,68 +9,120 @@ */ #include "add_custom_tiling.h" #include "register/op_def_registry.h" +#include "graph/utils/type_utils.h" +#include "tiling/platform/platform_ascendc.h" namespace optiling { -const uint32_t BLOCK_DIM = 8; -const uint32_t TILE_NUM = 8; -static ge::graphStatus TilingFunc(gert::TilingContext *context) +const uint32_t BLOCK_SIZE = 32; +const uint32_t BUFFER_NUM = 2; +static ge::graphStatus TilingFunc(gert::TilingContext* context) { TilingData tiling; - uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); - context->SetBlockDim(BLOCK_DIM); - tiling.set_totalLength(totalLength); - tiling.set_tileNum(TILE_NUM); + uint64_t ubSize; + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize); + auto coreNum = ascendcPlatform.GetCoreNum(); + + // Based on the input length and the number of inputs, the number of bytes of the input data type is obtained + uint32_t inputNum = context->GetInputShape(0)->GetStorageShape().GetShapeSize(); + uint32_t typeLength = 0; + ge::TypeUtils::GetDataTypeLength(context->GetInputDesc(0)->GetDataType(), typeLength); + uint32_t inputLength = inputNum * typeLength; + uint32_t inputBytes = inputLength / inputNum; + + // There are a total of 3 shared UB spaces in the input and output. If it's int8, there are 2 more TBUFs + uint32_t ubDataNumber = (inputBytes == 1) ? 
5 : 3; + // The number of 32B data blocks that can be used for each data. DOUBLE BUFFER is already counted here + uint32_t tileBlockNum = (ubSize / BLOCK_SIZE / BUFFER_NUM) / ubDataNumber; + uint32_t tileDataNum = (tileBlockNum * BLOCK_SIZE) / inputBytes; + + // Input data for 32B alignment + uint32_t inputLengthAlgin32 = (((inputLength + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE); + // Each core must receive at least one 32B block of data; cap the used core count at the actual number of 32B blocks + coreNum = (coreNum < inputLengthAlgin32 / BLOCK_SIZE) ? coreNum : inputLengthAlgin32 / BLOCK_SIZE; + coreNum = (coreNum >= 1) ? coreNum : 1; + uint32_t everyCoreInputBlockNum = inputLengthAlgin32 / BLOCK_SIZE / coreNum; + uint32_t tailBlockNum = (inputLengthAlgin32 / BLOCK_SIZE) % coreNum; + + // Small chunks are calculated and sliced several times using the number of data on each core + uint32_t smallCoreDataNum = everyCoreInputBlockNum * BLOCK_SIZE / inputBytes; + uint32_t smallTileNum = everyCoreInputBlockNum / tileBlockNum; + uint32_t finalSmallTileNum = (everyCoreInputBlockNum % tileBlockNum) == 0 ? smallTileNum : smallTileNum + 1; + // Tail block calculation for small chunks of data + uint32_t smallTailDataNum = smallCoreDataNum - (tileDataNum * smallTileNum); + smallTailDataNum = smallTailDataNum == 0 ? tileDataNum : smallTailDataNum; + + // The total length of a large block of data is 32B larger than that of a small block of data + everyCoreInputBlockNum += 1; + uint32_t bigCoreDataNum = everyCoreInputBlockNum * BLOCK_SIZE / inputBytes; + uint32_t bigTileNum = everyCoreInputBlockNum / tileBlockNum; + uint32_t finalBigTileNum = (everyCoreInputBlockNum % tileBlockNum) == 0 ? bigTileNum : bigTileNum + 1; + uint32_t bigTailDataNum = bigCoreDataNum - tileDataNum * bigTileNum; + bigTailDataNum = bigTailDataNum == 0 ? 
tileDataNum : bigTailDataNum; + + tiling.set_smallCoreDataNum(smallCoreDataNum); + tiling.set_bigCoreDataNum(bigCoreDataNum); + tiling.set_tileDataNum(tileDataNum); + tiling.set_smallTailDataNum(smallTailDataNum); + tiling.set_bigTailDataNum(bigTailDataNum); + tiling.set_finalSmallTileNum(finalSmallTileNum); + tiling.set_finalBigTileNum(finalBigTileNum); + tiling.set_tailBlockNum(tailBlockNum); + + context->SetBlockDim(coreNum); tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = 0; return ge::GRAPH_SUCCESS; } -} // namespace optiling +} namespace ge { -static graphStatus InferShape(gert::InferShapeContext *context) +static ge::graphStatus InferShape(gert::InferShapeContext* context) { - const gert::Shape *x1_shape = context->GetInputShape(0); - gert::Shape *y_shape = context->GetOutputShape(0); + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); *y_shape = *x1_shape; return GRAPH_SUCCESS; } - -static graphStatus InferDataType(gert::InferDataTypeContext *context) +static graphStatus InferDataType(gert::InferDataTypeContext* context) { const auto inputDataType = context->GetInputDataType(0); context->SetOutputDataType(0, inputDataType); return ge::GRAPH_SUCCESS; } -} // namespace ge +} namespace ops { class AddCustom : public OpDef { public: - explicit AddCustom(const char *name) : OpDef(name) + explicit AddCustom(const char* name) : OpDef(name) { this->Input("x") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT16}) - .Format({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("y") .ParamType(REQUIRED) - 
.DataType({ge::DT_FLOAT16}) - .Format({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); this->Output("z") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT16}) - .Format({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType); this->AICore() .SetTiling(optiling::TilingFunc) - .AddConfig("ascend910") - .AddConfig("ascend310p") .AddConfig("ascend310b") + .AddConfig("ascend310p") + .AddConfig("ascend910") .AddConfig("ascend910b"); } }; OP_ADD(AddCustom); -} // namespace ops +} diff --git a/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom_tiling.h b/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom_tiling.h index 08b09da03040a92612310c46cba6145c85dcada6..d775bc6bab9ca5f4bdc2747480ca87564f0d6058 100644 --- a/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom_tiling.h +++ b/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom_tiling.h @@ -13,10 +13,16 @@ namespace optiling { BEGIN_TILING_DATA_DEF(TilingData) -TILING_DATA_FIELD_DEF(uint32_t, totalLength); -TILING_DATA_FIELD_DEF(uint32_t, tileNum); + TILING_DATA_FIELD_DEF(uint32_t, smallCoreDataNum); + TILING_DATA_FIELD_DEF(uint32_t, bigCoreDataNum); + TILING_DATA_FIELD_DEF(uint32_t, finalBigTileNum); + TILING_DATA_FIELD_DEF(uint32_t, finalSmallTileNum); + TILING_DATA_FIELD_DEF(uint32_t, tileDataNum); + TILING_DATA_FIELD_DEF(uint32_t, smallTailDataNum); + TILING_DATA_FIELD_DEF(uint32_t, bigTailDataNum); + TILING_DATA_FIELD_DEF(uint32_t, tailBlockNum); END_TILING_DATA_DEF; REGISTER_TILING_DATA_CLASS(AddCustom, TilingData) -} // namespace optiling +} #endif // ADD_CUSTOM_TILING_H diff 
--git a/ccsrc/ops/ascendc/kernel_impl/op_kernel/add_custom.cpp b/ccsrc/ops/ascendc/kernel_impl/op_kernel/add_custom.cpp index 419bb10a8c840918c51bd97dc18ae5b038b33c8d..15fc847f77914ac44ee337dedb4246e83676c378 100644 --- a/ccsrc/ops/ascendc/kernel_impl/op_kernel/add_custom.cpp +++ b/ccsrc/ops/ascendc/kernel_impl/op_kernel/add_custom.cpp @@ -8,28 +8,51 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #include "kernel_operator.h" -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +// tensor num for each queue +constexpr int32_t BUFFER_NUM = 2; -class KernelAdd { +template class KernelAdd { + using T = TYPE_X; public: __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t smallCoreDataNum, + uint32_t bigCoreDataNum, uint32_t finalBigTileNum, + uint32_t finalSmallTileNum, uint32_t tileDataNum, + uint32_t smallTailDataNum, uint32_t bigTailDataNum, + uint32_t tailBlockNum) { - this->blockLength = totalLength / AscendC::GetBlockNum(); - this->tileNum = tileNum; - this->tileLength = this->blockLength / tileNum / BUFFER_NUM; - - xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z)); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + uint32_t coreNum = AscendC::GetBlockIdx(); + uint32_t globalBufferIndex = bigCoreDataNum * AscendC::GetBlockIdx(); + this->tileDataNum = tileDataNum; + if 
(coreNum < tailBlockNum) { + this->coreDataNum = bigCoreDataNum; + this->tileNum = finalBigTileNum; + this->tailDataNum = bigTailDataNum; + } + else { + this->coreDataNum = smallCoreDataNum; + this->tileNum = finalSmallTileNum; + this->tailDataNum = smallTailDataNum; + globalBufferIndex -= (bigCoreDataNum - smallCoreDataNum) * (AscendC::GetBlockIdx() - tailBlockNum); + } + xGm.SetGlobalBuffer((__gm__ TYPE_X*)x + globalBufferIndex, this->coreDataNum); + yGm.SetGlobalBuffer((__gm__ TYPE_Y*)y + globalBufferIndex, this->coreDataNum); + zGm.SetGlobalBuffer((__gm__ TYPE_Z*)z + globalBufferIndex, this->coreDataNum); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_X)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_Y)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_Z)); + pipe.InitBuffer(tmp1, this->tileDataNum * sizeof(half)); + pipe.InitBuffer(tmp2, this->tileDataNum * sizeof(half)); } __aicore__ inline void Process() { - int32_t loopCount = this->tileNum * BUFFER_NUM; + int32_t loopCount = this->tileNum; + this->processDataNum = this->tileDataNum; for (int32_t i = 0; i < loopCount; i++) { + if (i == this->tileNum - 1) { + this->processDataNum = this->tailDataNum; + } CopyIn(i); Compute(i); CopyOut(i); @@ -39,54 +62,75 @@ public: private: __aicore__ inline void CopyIn(int32_t progress) { - AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); - AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); - AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileDataNum], this->processDataNum); + AscendC::DataCopy(yLocal, yGm[progress * this->tileDataNum], this->processDataNum); + 
inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progress) { - AscendC::LocalTensor xLocal = inQueueX.DeQue(); - AscendC::LocalTensor yLocal = inQueueY.DeQue(); - AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); - outQueueZ.EnQue(zLocal); - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + if constexpr (std::is_same_v) { + auto p1 = tmp1.Get(); + auto p2 = tmp2.Get(); + AscendC::Cast(p1, xLocal, AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Cast(p2, yLocal, AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Add(p2, p1, p2, this->processDataNum); + AscendC::Cast(p1.ReinterpretCast(), p2, AscendC::RoundMode::CAST_RINT, this->processDataNum); + AscendC::ShiftLeft(p1.ReinterpretCast(), p1.ReinterpretCast(), int16_t(8), this->processDataNum); + AscendC::ShiftRight(p1.ReinterpretCast(), p1.ReinterpretCast(), int16_t(8), this->processDataNum); + AscendC::Cast(p2, p1.ReinterpretCast(), AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Cast(zLocal, p2, AscendC::RoundMode::CAST_NONE, this->processDataNum); + } + else { + AscendC::Add(zLocal, xLocal, yLocal, this->processDataNum); + } + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); } __aicore__ inline void CopyOut(int32_t progress) { - AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); - outQueueZ.FreeTensor(zLocal); + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileDataNum], zLocal, this->processDataNum); + outQueueZ.FreeTensor(zLocal); } private: AscendC::TPipe pipe; - AscendC::TQue inQueueX, inQueueY; - AscendC::TQue outQueueZ; - 
AscendC::GlobalTensor xGm; - AscendC::GlobalTensor yGm; - AscendC::GlobalTensor zGm; - uint32_t blockLength; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::TBuf tmp1, tmp2; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t coreDataNum; uint32_t tileNum; - uint32_t tileLength; + uint32_t tileDataNum; + uint32_t tailDataNum; + uint32_t processDataNum; }; extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tiling_data, tiling); - KernelAdd op; - op.Init(x, y, z, tiling_data.totalLength, tiling_data.tileNum); + KernelAdd op; + op.Init(x, y, z, tiling_data.smallCoreDataNum, + tiling_data.bigCoreDataNum, tiling_data.finalBigTileNum, + tiling_data.finalSmallTileNum, tiling_data.tileDataNum, + tiling_data.smallTailDataNum, tiling_data.bigTailDataNum, + tiling_data.tailBlockNum); op.Process(); } #ifndef ASCENDC_CPU_DEBUG // call of kernel function -void add_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, - uint8_t *workspace, uint8_t *tiling) +void add_custom_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* x, uint8_t* y, uint8_t* z, + uint8_t* workspace, uint8_t* tiling) { add_custom<<>>(x, y, z, workspace, tiling); } diff --git a/python/ms_custom_ops/__init__.py b/python/ms_custom_ops/__init__.py index de2a5ee26b6e8ca7777fe946daf15bf72a9a7aae..b4094d88d619da325e5f52de7b446b83251493b8 100644 --- a/python/ms_custom_ops/__init__.py +++ b/python/ms_custom_ops/__init__.py @@ -23,7 +23,7 @@ def _init_env(): _init_env() -from ms_custom_ops.ms_custom_ops import * +from .ms_custom_ops import * # Import generated ops interfaces try: from .gen_ops_def import * @@ -40,7 +40,7 @@ __all__ = [] # Add ops from gen_ops_def if available try: - import ms_custom_ops.gen_ops_def as gen_ops_def + from . 
import gen_ops_def if hasattr(gen_ops_def, '__all__'): __all__.extend(gen_ops_def.__all__) else: @@ -51,7 +51,7 @@ except ImportError: # Add ops from gen_ops_prim if available try: - import ms_custom_ops.gen_ops_prim as gen_ops_prim + from . import gen_ops_prim if hasattr(gen_ops_prim, '__all__'): __all__.extend(gen_ops_prim.__all__) else: diff --git a/setup.py b/setup.py index 8042fe81a01c59bfafaa5a6d766d098048a9b01e..4fd97f4395d180e38df53c8d03971a3e9b2e1e7e 100644 --- a/setup.py +++ b/setup.py @@ -188,6 +188,11 @@ class CustomBuildExt(build_ext): if os.path.exists(src_gen_path): dst_gen_path = os.path.join(python_package_path, gen_file) shutil.copy(src_gen_path, dst_gen_path) + replace_cmd = ["sed", "-i", "s/import ms_custom_ops/from . import ms_custom_ops/g", dst_gen_path] + try: + result = subprocess.run(replace_cmd, cwd=self.ROOT_DIR, text=True, shell=False, check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to exec command {replace_cmd}: {e}") logger.info(f"Copied {gen_file} to {dst_gen_path}") else: logger.warning(f"Generated file not found: {src_gen_path}") diff --git a/yaml/doc/add_doc.yaml b/yaml/doc/add_doc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a7b9b3988c895c358a4f741f60b8c5feaa87959 --- /dev/null +++ b/yaml/doc/add_doc.yaml @@ -0,0 +1,43 @@ +add: + description: | + Compute the element-wise sum of the two input tensors. + + .. math:: + + out_{i} = input_{i} + other_{i} + + Note: + - The two inputs can not be bool type at the same time, + [True, Tensor(True), Tensor(np.array([True]))] are all considered bool type. + - Support broadcast, support implicit type conversion and type promotion. + - When the input is a tensor, the dimension should be greater than or equal to 1. + + Args: + input (Union[Tensor, number.Number, bool]): The first input tensor. + other (Union[Tensor, number.Number, bool]): The second input tensor. 
+ + Returns: + Tensor + + Supported Platforms: + ``Ascend`` + + Examples: + >>> import mindspore as ms + >>> import ms_custom_ops + >>> # case 1: x and y are both tensor. + >>> x = ms.tensor([1., 2., 3.]) + >>> y = ms.tensor([4., 5., 6.]) + >>> output = ms_custom_ops.add(x, y) + >>> print(output) + [5. 7. 9.] + >>> # case 2: x is a scalar and y is a tensor + >>> x = ms.tensor(1, ms.int32) + >>> y = ms.tensor([4., 5., 6.]) + >>> output = ms_custom_ops.add(x, y) + >>> print(output) + [5. 6. 7.] + >>> # the data type of x is int32, the data type of y is float32, + >>> # and the output is the data format of higher precision float32. + >>> print(output.dtype) + Float32 diff --git a/yaml/doc/reshape_and_cache_doc.yaml b/yaml/doc/reshape_and_cache_doc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67b9925fd2034c0b2197f7ec668d542235bd04d3 --- /dev/null +++ b/yaml/doc/reshape_and_cache_doc.yaml @@ -0,0 +1,34 @@ +reshape_and_cache: + description: | + The ReshapeAndCache is used for updating the block-wise KVCache of transformer network. + + Args: + key (Tensor): The key tensor with data type of float16. + :math:`(num\_tokens, num\_head, head\_dim)`. + value (Tensor, optional): The value tensor with data type of float16. + :math:`(num\_tokens, num\_head, head\_dim)`. + key_cache (Tensor): The cache tensor with data type of float16. + :math:`(num\_blocks, block\_size, num\_head, head\_dim)`. + value_cache (Tensor, optional): The cache tensor with data type of float16. + :math:`(num\_blocks, block\_size, num\_head, head\_dim)`. + slot_mapping (Tensor): The slot mapping tensor with data type of int32. + :math:`(num\_tokens,)`. + + Outputs: + With same data type and same shape as `key` tensor. 
+ + Examples: + >>> from mindspore import Tensor, Parameter + >>> import ms_custom_ops + >>> num_tokens = 4 + >>> num_head = 40 + >>> head_dim = 128 + >>> block_size = 16 + >>> num_blocks = 128 + >>> key = Tensor(np.random.randn(num_tokens, num_head, head_dim).astype(np.float16)) + >>> value = Tensor(np.random.randn(num_tokens, num_head, head_dim).astype(np.float16)) + >>> key_cache = Parameter(default_input=Tensor(np.random.randn(num_blocks, block_size, num_head, head_dim).astype(np.float16))) + >>> value_cache = Parameter(default_input=Tensor(np.random.randn(num_blocks, block_size, num_head, head_dim).astype(np.float16))) + >>> slot_mapping = Tensor(np.random.permutation(num_tokens).astype(np.int32)) + >>> output = ms_custom_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) + >>> print(key_cache)