diff --git a/ccsrc/CMakeLists.txt b/ccsrc/CMakeLists.txt index baa1cecbee3497fc22105d569110d701c90a9f74..b74143bf35c8e574ff61e36d1d9e6d4862c882f6 100644 --- a/ccsrc/CMakeLists.txt +++ b/ccsrc/CMakeLists.txt @@ -58,21 +58,27 @@ message(STATUS "CFLAGS_INCLUDES: ${CFLAGS_INCLUDES}") # ============================================================================= # Get YAML files # ============================================================================= -file(GLOB_RECURSE YAML_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../yaml/*.yaml") -message(STATUS "YAML_FILES: ${YAML_FILES}") - -# Build Python list format: ['path1', 'path2', 'path3'] -set(YAML_STRING "[") -set(FIRST_ITEM TRUE) -foreach(YAML_FILE ${YAML_FILES}) - if(NOT FIRST_ITEM) - set(YAML_STRING "${YAML_STRING}, ") - endif() - set(YAML_STRING "${YAML_STRING}'${YAML_FILE}'") - set(FIRST_ITEM FALSE) -endforeach() -set(YAML_STRING "${YAML_STRING}]") -message(STATUS "YAML_STRING: ${YAML_STRING}") +function(get_yaml_files YAML_FILES OUTPUT_VAR) + set(YAML_STRING "[") + set(FIRST_ITEM TRUE) + foreach(YAML_FILE ${YAML_FILES}) + if(NOT FIRST_ITEM) + set(YAML_STRING "${YAML_STRING}, ") + endif() + set(YAML_STRING "${YAML_STRING}'${YAML_FILE}'") + set(FIRST_ITEM FALSE) + endforeach() + set(YAML_STRING "${YAML_STRING}]") + set(${OUTPUT_VAR} "${YAML_STRING}" PARENT_SCOPE) +endfunction() + +file(GLOB_RECURSE OPS_YAML_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../yaml/*_op.yaml") +message(STATUS "OPS_YAML_FILES: ${OPS_YAML_FILES}") +get_yaml_files("${OPS_YAML_FILES}" OPS_YAML_STRING) + +file(GLOB_RECURSE DOC_YAML_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../yaml/*_doc.yaml") +message(STATUS "DOC_YAML_FILES: ${DOC_YAML_FILES}") +get_yaml_files("${DOC_YAML_FILES}" DOC_YAML_STRING) # ============================================================================= # Custom Op Builder @@ -86,7 +92,8 @@ src_files = '${SRC_FILES}'.split(';') ms.ops.CustomOpBuilder( name='${MS_EXTENSION_NAME}', sources=src_files, - yaml=${YAML_STRING}, + 
yaml=${OPS_YAML_STRING}, + doc=${DOC_YAML_STRING}, backend='Ascend', cflags='${CFLAGS_INCLUDES}', ldflags='-L${INTERNAL_KERNEL_LIB_PATH} -l${LIBS}', diff --git a/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom.cpp b/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom.cpp index df71699db490ebf349f7d38dc6684abc1234ccc0..9d2b3578fa90647a48bbb6435241cf553622e892 100644 --- a/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom.cpp +++ b/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom.cpp @@ -9,68 +9,120 @@ */ #include "add_custom_tiling.h" #include "register/op_def_registry.h" +#include "graph/utils/type_utils.h" +#include "tiling/platform/platform_ascendc.h" namespace optiling { -const uint32_t BLOCK_DIM = 8; -const uint32_t TILE_NUM = 8; -static ge::graphStatus TilingFunc(gert::TilingContext *context) +const uint32_t BLOCK_SIZE = 32; +const uint32_t BUFFER_NUM = 2; +static ge::graphStatus TilingFunc(gert::TilingContext* context) { TilingData tiling; - uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); - context->SetBlockDim(BLOCK_DIM); - tiling.set_totalLength(totalLength); - tiling.set_tileNum(TILE_NUM); + uint64_t ubSize; + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize); + auto coreNum = ascendcPlatform.GetCoreNum(); + + // Based on the input length and the number of inputs, the number of bytes of the input data type is obtained + uint32_t inputNum = context->GetInputShape(0)->GetStorageShape().GetShapeSize(); + uint32_t typeLength = 0; + ge::TypeUtils::GetDataTypeLength(context->GetInputDesc(0)->GetDataType(), typeLength); + uint32_t inputLength = inputNum * typeLength; + uint32_t inputBytes = inputLength / inputNum; + + // There are a total of 3 shared UB spaces in the input and output. If it's int8, there are 2 more TBUFs + uint32_t ubDataNumber = (inputBytes == 1) ? 
5 : 3; + // The number of 32B data blocks that can be used for each data. DOUBLE BUFFER is already counted here + uint32_t tileBlockNum = (ubSize / BLOCK_SIZE / BUFFER_NUM) / ubDataNumber; + uint32_t tileDataNum = (tileBlockNum * BLOCK_SIZE) / inputBytes; + + // Input data for 32B alignment + uint32_t inputLengthAlgin32 = (((inputLength + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE); + // Each core must receive at least one 32B block of data; cap the used core count at the actual number of 32B blocks + coreNum = (coreNum < inputLengthAlgin32 / BLOCK_SIZE) ? coreNum : inputLengthAlgin32 / BLOCK_SIZE; + coreNum = (coreNum >= 1) ? coreNum : 1; + uint32_t everyCoreInputBlockNum = inputLengthAlgin32 / BLOCK_SIZE / coreNum; + uint32_t tailBlockNum = (inputLengthAlgin32 / BLOCK_SIZE) % coreNum; + + // Small chunks are calculated and sliced several times using the number of data on each core + uint32_t smallCoreDataNum = everyCoreInputBlockNum * BLOCK_SIZE / inputBytes; + uint32_t smallTileNum = everyCoreInputBlockNum / tileBlockNum; + uint32_t finalSmallTileNum = (everyCoreInputBlockNum % tileBlockNum) == 0 ? smallTileNum : smallTileNum + 1; + // Tail block calculation for small chunks of data + uint32_t smallTailDataNum = smallCoreDataNum - (tileDataNum * smallTileNum); + smallTailDataNum = smallTailDataNum == 0 ? tileDataNum : smallTailDataNum; + + // The total length of a large block of data is 32B larger than that of a small block of data + everyCoreInputBlockNum += 1; + uint32_t bigCoreDataNum = everyCoreInputBlockNum * BLOCK_SIZE / inputBytes; + uint32_t bigTileNum = everyCoreInputBlockNum / tileBlockNum; + uint32_t finalBigTileNum = (everyCoreInputBlockNum % tileBlockNum) == 0 ? bigTileNum : bigTileNum + 1; + uint32_t bigTailDataNum = bigCoreDataNum - tileDataNum * bigTileNum; + bigTailDataNum = bigTailDataNum == 0 ? 
tileDataNum : bigTailDataNum; + + tiling.set_smallCoreDataNum(smallCoreDataNum); + tiling.set_bigCoreDataNum(bigCoreDataNum); + tiling.set_tileDataNum(tileDataNum); + tiling.set_smallTailDataNum(smallTailDataNum); + tiling.set_bigTailDataNum(bigTailDataNum); + tiling.set_finalSmallTileNum(finalSmallTileNum); + tiling.set_finalBigTileNum(finalBigTileNum); + tiling.set_tailBlockNum(tailBlockNum); + + context->SetBlockDim(coreNum); tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = 0; return ge::GRAPH_SUCCESS; } -} // namespace optiling +} namespace ge { -static graphStatus InferShape(gert::InferShapeContext *context) +static ge::graphStatus InferShape(gert::InferShapeContext* context) { - const gert::Shape *x1_shape = context->GetInputShape(0); - gert::Shape *y_shape = context->GetOutputShape(0); + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); *y_shape = *x1_shape; return GRAPH_SUCCESS; } - -static graphStatus InferDataType(gert::InferDataTypeContext *context) +static graphStatus InferDataType(gert::InferDataTypeContext* context) { const auto inputDataType = context->GetInputDataType(0); context->SetOutputDataType(0, inputDataType); return ge::GRAPH_SUCCESS; } -} // namespace ge +} namespace ops { class AddCustom : public OpDef { public: - explicit AddCustom(const char *name) : OpDef(name) + explicit AddCustom(const char* name) : OpDef(name) { this->Input("x") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT16}) - .Format({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("y") .ParamType(REQUIRED) - 
.DataType({ge::DT_FLOAT16}) - .Format({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); this->Output("z") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT16}) - .Format({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType); this->AICore() .SetTiling(optiling::TilingFunc) - .AddConfig("ascend910") - .AddConfig("ascend310p") .AddConfig("ascend310b") + .AddConfig("ascend310p") + .AddConfig("ascend910") .AddConfig("ascend910b"); } }; OP_ADD(AddCustom); -} // namespace ops +} diff --git a/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom_tiling.h b/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom_tiling.h index 08b09da03040a92612310c46cba6145c85dcada6..d775bc6bab9ca5f4bdc2747480ca87564f0d6058 100644 --- a/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom_tiling.h +++ b/ccsrc/ops/ascendc/kernel_impl/op_host/add_custom_tiling.h @@ -13,10 +13,16 @@ namespace optiling { BEGIN_TILING_DATA_DEF(TilingData) -TILING_DATA_FIELD_DEF(uint32_t, totalLength); -TILING_DATA_FIELD_DEF(uint32_t, tileNum); + TILING_DATA_FIELD_DEF(uint32_t, smallCoreDataNum); + TILING_DATA_FIELD_DEF(uint32_t, bigCoreDataNum); + TILING_DATA_FIELD_DEF(uint32_t, finalBigTileNum); + TILING_DATA_FIELD_DEF(uint32_t, finalSmallTileNum); + TILING_DATA_FIELD_DEF(uint32_t, tileDataNum); + TILING_DATA_FIELD_DEF(uint32_t, smallTailDataNum); + TILING_DATA_FIELD_DEF(uint32_t, bigTailDataNum); + TILING_DATA_FIELD_DEF(uint32_t, tailBlockNum); END_TILING_DATA_DEF; REGISTER_TILING_DATA_CLASS(AddCustom, TilingData) -} // namespace optiling +} #endif // ADD_CUSTOM_TILING_H diff 
--git a/ccsrc/ops/ascendc/kernel_impl/op_kernel/add_custom.cpp b/ccsrc/ops/ascendc/kernel_impl/op_kernel/add_custom.cpp index 419bb10a8c840918c51bd97dc18ae5b038b33c8d..15fc847f77914ac44ee337dedb4246e83676c378 100644 --- a/ccsrc/ops/ascendc/kernel_impl/op_kernel/add_custom.cpp +++ b/ccsrc/ops/ascendc/kernel_impl/op_kernel/add_custom.cpp @@ -8,28 +8,51 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #include "kernel_operator.h" -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +// tensor num for each queue +constexpr int32_t BUFFER_NUM = 2; -class KernelAdd { +template class KernelAdd { + using T = TYPE_X; public: __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t smallCoreDataNum, + uint32_t bigCoreDataNum, uint32_t finalBigTileNum, + uint32_t finalSmallTileNum, uint32_t tileDataNum, + uint32_t smallTailDataNum, uint32_t bigTailDataNum, + uint32_t tailBlockNum) { - this->blockLength = totalLength / AscendC::GetBlockNum(); - this->tileNum = tileNum; - this->tileLength = this->blockLength / tileNum / BUFFER_NUM; - - xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); - pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z)); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + uint32_t coreNum = AscendC::GetBlockIdx(); + uint32_t globalBufferIndex = bigCoreDataNum * AscendC::GetBlockIdx(); + this->tileDataNum = tileDataNum; + if 
(coreNum < tailBlockNum) { + this->coreDataNum = bigCoreDataNum; + this->tileNum = finalBigTileNum; + this->tailDataNum = bigTailDataNum; + } + else { + this->coreDataNum = smallCoreDataNum; + this->tileNum = finalSmallTileNum; + this->tailDataNum = smallTailDataNum; + globalBufferIndex -= (bigCoreDataNum - smallCoreDataNum) * (AscendC::GetBlockIdx() - tailBlockNum); + } + xGm.SetGlobalBuffer((__gm__ TYPE_X*)x + globalBufferIndex, this->coreDataNum); + yGm.SetGlobalBuffer((__gm__ TYPE_Y*)y + globalBufferIndex, this->coreDataNum); + zGm.SetGlobalBuffer((__gm__ TYPE_Z*)z + globalBufferIndex, this->coreDataNum); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_X)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_Y)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_Z)); + pipe.InitBuffer(tmp1, this->tileDataNum * sizeof(half)); + pipe.InitBuffer(tmp2, this->tileDataNum * sizeof(half)); } __aicore__ inline void Process() { - int32_t loopCount = this->tileNum * BUFFER_NUM; + int32_t loopCount = this->tileNum; + this->processDataNum = this->tileDataNum; for (int32_t i = 0; i < loopCount; i++) { + if (i == this->tileNum - 1) { + this->processDataNum = this->tailDataNum; + } CopyIn(i); Compute(i); CopyOut(i); @@ -39,54 +62,75 @@ public: private: __aicore__ inline void CopyIn(int32_t progress) { - AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); - AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); - AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileDataNum], this->processDataNum); + AscendC::DataCopy(yLocal, yGm[progress * this->tileDataNum], this->processDataNum); + 
inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progress) { - AscendC::LocalTensor xLocal = inQueueX.DeQue(); - AscendC::LocalTensor yLocal = inQueueY.DeQue(); - AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); - outQueueZ.EnQue(zLocal); - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + if constexpr (std::is_same_v) { + auto p1 = tmp1.Get(); + auto p2 = tmp2.Get(); + AscendC::Cast(p1, xLocal, AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Cast(p2, yLocal, AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Add(p2, p1, p2, this->processDataNum); + AscendC::Cast(p1.ReinterpretCast(), p2, AscendC::RoundMode::CAST_RINT, this->processDataNum); + AscendC::ShiftLeft(p1.ReinterpretCast(), p1.ReinterpretCast(), int16_t(8), this->processDataNum); + AscendC::ShiftRight(p1.ReinterpretCast(), p1.ReinterpretCast(), int16_t(8), this->processDataNum); + AscendC::Cast(p2, p1.ReinterpretCast(), AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Cast(zLocal, p2, AscendC::RoundMode::CAST_NONE, this->processDataNum); + } + else { + AscendC::Add(zLocal, xLocal, yLocal, this->processDataNum); + } + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); } __aicore__ inline void CopyOut(int32_t progress) { - AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); - outQueueZ.FreeTensor(zLocal); + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileDataNum], zLocal, this->processDataNum); + outQueueZ.FreeTensor(zLocal); } private: AscendC::TPipe pipe; - AscendC::TQue inQueueX, inQueueY; - AscendC::TQue outQueueZ; - 
AscendC::GlobalTensor xGm; - AscendC::GlobalTensor yGm; - AscendC::GlobalTensor zGm; - uint32_t blockLength; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::TBuf tmp1, tmp2; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t coreDataNum; uint32_t tileNum; - uint32_t tileLength; + uint32_t tileDataNum; + uint32_t tailDataNum; + uint32_t processDataNum; }; extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tiling_data, tiling); - KernelAdd op; - op.Init(x, y, z, tiling_data.totalLength, tiling_data.tileNum); + KernelAdd op; + op.Init(x, y, z, tiling_data.smallCoreDataNum, + tiling_data.bigCoreDataNum, tiling_data.finalBigTileNum, + tiling_data.finalSmallTileNum, tiling_data.tileDataNum, + tiling_data.smallTailDataNum, tiling_data.bigTailDataNum, + tiling_data.tailBlockNum); op.Process(); } #ifndef ASCENDC_CPU_DEBUG // call of kernel function -void add_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, - uint8_t *workspace, uint8_t *tiling) +void add_custom_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* x, uint8_t* y, uint8_t* z, + uint8_t* workspace, uint8_t* tiling) { add_custom<<>>(x, y, z, workspace, tiling); } diff --git a/python/ms_custom_ops/__init__.py b/python/ms_custom_ops/__init__.py index de2a5ee26b6e8ca7777fe946daf15bf72a9a7aae..b4094d88d619da325e5f52de7b446b83251493b8 100644 --- a/python/ms_custom_ops/__init__.py +++ b/python/ms_custom_ops/__init__.py @@ -23,7 +23,7 @@ def _init_env(): _init_env() -from ms_custom_ops.ms_custom_ops import * +from .ms_custom_ops import * # Import generated ops interfaces try: from .gen_ops_def import * @@ -40,7 +40,7 @@ __all__ = [] # Add ops from gen_ops_def if available try: - import ms_custom_ops.gen_ops_def as gen_ops_def + from . 
import gen_ops_def if hasattr(gen_ops_def, '__all__'): __all__.extend(gen_ops_def.__all__) else: @@ -51,7 +51,7 @@ except ImportError: # Add ops from gen_ops_prim if available try: - import ms_custom_ops.gen_ops_prim as gen_ops_prim + from . import gen_ops_prim if hasattr(gen_ops_prim, '__all__'): __all__.extend(gen_ops_prim.__all__) else: diff --git a/setup.py b/setup.py index 8042fe81a01c59bfafaa5a6d766d098048a9b01e..4fd97f4395d180e38df53c8d03971a3e9b2e1e7e 100644 --- a/setup.py +++ b/setup.py @@ -188,6 +188,11 @@ class CustomBuildExt(build_ext): if os.path.exists(src_gen_path): dst_gen_path = os.path.join(python_package_path, gen_file) shutil.copy(src_gen_path, dst_gen_path) + replace_cmd = ["sed", "-i", "s/import ms_custom_ops/from . import ms_custom_ops/g", dst_gen_path] + try: + result = subprocess.run(replace_cmd, cwd=self.ROOT_DIR, text=True, shell=False, check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to exec command {replace_cmd}: {e}") logger.info(f"Copied {gen_file} to {dst_gen_path}") else: logger.warning(f"Generated file not found: {src_gen_path}") diff --git a/yaml/doc/add_doc.yaml b/yaml/doc/add_doc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a7b9b3988c895c358a4f741f60b8c5feaa87959 --- /dev/null +++ b/yaml/doc/add_doc.yaml @@ -0,0 +1,43 @@ +add: + description: | + Compute the element-wise sum of the two input tensors. + + .. math:: + + out_{i} = input_{i} + other_{i} + + Note: + - The two inputs can not be bool type at the same time, + [True, Tensor(True), Tensor(np.array([True]))] are all considered bool type. + - Support broadcast, support implicit type conversion and type promotion. + - When the input is a tensor, the dimension should be greater than or equal to 1. + + Args: + input (Union[Tensor, number.Number, bool]): The first input tensor. + other (Union[Tensor, number.Number, bool]): The second input tensor. 
+ + Returns: + Tensor + + Supported Platforms: + ``Ascend`` + + Examples: + >>> import mindspore as ms + >>> import ms_custom_ops + >>> # case 1: x and y are both tensor. + >>> x = ms.tensor([1., 2., 3.]) + >>> y = ms.tensor([4., 5., 6.]) + >>> output = ms_custom_ops.add(x, y) + >>> print(output) + [5. 7. 9.] + >>> # case 2: x is a scalar and y is a tensor + >>> x = ms.tensor(1, ms.int32) + >>> y = ms.tensor([4., 5., 6.]) + >>> output = ms_custom_ops.add(x, y) + >>> print(output) + [5. 6. 7.] + >>> # the data type of x is int32, the data type of y is float32, + >>> # and the output is the data format of higher precision float32. + >>> print(output.dtype) + Float32 diff --git a/yaml/doc/reshape_and_cache_doc.yaml b/yaml/doc/reshape_and_cache_doc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67b9925fd2034c0b2197f7ec668d542235bd04d3 --- /dev/null +++ b/yaml/doc/reshape_and_cache_doc.yaml @@ -0,0 +1,34 @@ +reshape_and_cache: + description: | + The ReshapeAndCache is used for updating the block-wise KVCache of transformer network. + + Args: + key (Tensor): The key tensor with data type of float16. + :math:`(num\_tokens, num\_head, head\_dim)`. + value (Tensor, optional): The value tensor with data type of float16. + :math:`(num\_tokens, num\_head, head\_dim)`. + key_cache (Tensor): The cache tensor with data type of float16. + :math:`(num\_blocks, block\_size, num\_head, head\_dim)`. + value_cache (Tensor, optional): The cache tensor with data type of float16. + :math:`(num\_blocks, block\_size, num\_head, head\_dim)`. + slot_mapping (Tensor): The slot mapping tensor with data type of int32. + :math:`(num\_tokens,)`. + + Outputs: + With same data type and same shape as `key` tensor. 
+ + Examples: + >>> from mindspore import Tensor, Parameter + >>> import ms_custom_ops + >>> num_tokens = 4 + >>> num_head = 40 + >>> head_dim = 128 + >>> block_size = 16 + >>> num_blocks = 128 + >>> key = Tensor(np.random.randn(num_tokens, num_head, head_dim).astype(np.float16)) + >>> value = Tensor(np.random.randn(num_tokens, num_head, head_dim).astype(np.float16)) + >>> key_cache = Parameter(default_input=Tensor(np.random.randn(num_blocks, block_size, num_head, head_dim).astype(np.float16))) + >>> value_cache = Parameter(default_input=Tensor(np.random.randn(num_blocks, block_size, num_head, head_dim).astype(np.float16))) + >>> slot_mapping = Tensor(np.random.permutation(num_tokens).astype(np.int32)) + >>> output = ms_custom_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) + >>> print(key_cache)