From ab1b3048be382b41dcf795b516cd379f3590ddc9 Mon Sep 17 00:00:00 2001 From: suqwe Date: Thu, 21 Aug 2025 10:48:19 +0800 Subject: [PATCH] add l2 cache example --- examples/matrix/matmul_l2cache/CMakeLists.txt | 149 +++++------ examples/matrix/matmul_l2cache/README.md | 85 ++++-- .../matrix/matmul_l2cache/cmake/cpu_lib.cmake | 2 +- .../matrix/matmul_l2cache/cmake/npu_lib.cmake | 5 +- examples/matrix/matmul_l2cache/main.cpp | 252 ++++++++++-------- .../op_kernel/l2_cache_optimizer.h | 229 ++++++++++++++++ .../matmul_l2cache_custom_kernel.cpp | 32 +++ .../op_kernel/matmul_l2cache_custom_kernel.h | 146 ++++++++++ examples/matrix/matmul_l2cache/run.sh | 19 +- .../matmul_l2cache/scripts/exec_test.py | 149 +++++------ .../matrix/matmul_l2cache/testcase/case.csv | 2 +- 11 files changed, 765 insertions(+), 305 deletions(-) create mode 100644 examples/matrix/matmul_l2cache/op_kernel/l2_cache_optimizer.h create mode 100644 examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.cpp create mode 100644 examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.h diff --git a/examples/matrix/matmul_l2cache/CMakeLists.txt b/examples/matrix/matmul_l2cache/CMakeLists.txt index 8834905f..aa0e3eed 100644 --- a/examples/matrix/matmul_l2cache/CMakeLists.txt +++ b/examples/matrix/matmul_l2cache/CMakeLists.txt @@ -1,80 +1,71 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. -# This file is a part of the CANN Open Software. -# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ======================================================================================================================
-
-cmake_minimum_required(VERSION 3.16)
-project(Ascend_c)
-if(${RUN_MODE})
-    set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
-endif()
-if (${SOC_VERSION})
-    set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type")
-endif()
-
-set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory")
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
-endif()
-
-if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
-    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
-endif()
-
-file(GLOB KERNEL_FILES
-    ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_l2_cache_custom_kernel.cpp
-)
-
-if("${RUN_MODE}" STREQUAL "cpu")
-    include(cmake/cpu_lib.cmake)
-elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu")
-    include(cmake/npu_lib.cmake)
-else()
-    message("invalid RUN_MODE: ${RUN_MODE}")
-endif()
-
-add_executable(ascendc_matmul_l2_cache_bbit
-    ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_l2_cache_custom_tiling.cpp
-)
-
-target_compile_options(ascendc_matmul_l2_cache_bbit PRIVATE
-    $<$<STREQUAL:${RUN_MODE},cpu>:-g>
-    -O2
-    -std=c++17
-    -D_GLIBCXX_USE_CXX11_ABI=0
-)
-
-target_compile_definitions(ascendc_matmul_l2_cache_bbit PRIVATE
-    SOC_VERSION="${SOC_VERSION}"
-)
-
-
-target_include_directories(ascendc_matmul_l2_cache_bbit PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    $<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/include>
-    $<$<STREQUAL:${RUN_MODE},npu>:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>
-)
-
-target_link_libraries(ascendc_matmul_l2_cache_bbit PRIVATE
-    $<$<AND:$<STREQUAL:${RUN_MODE},npu>,$<STREQUAL:${CMAKE_BUILD_TYPE},Debug>>:host_intf_pub>
-    $<$<STREQUAL:${RUN_MODE},cpu>:tikicpulib::${SOC_VERSION}>
-    $<$<STREQUAL:${RUN_MODE},npu>:ascendcl>
-    $<$<STREQUAL:${RUN_MODE},cpu>:c_sec>
-    ascendc_kernels_${RUN_MODE}
-    tiling_api
-    register
-    platform
-    ascendalog
-    dl
-)
-
-install(TARGETS ascendc_matmul_l2_cache_bbit
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+
+cmake_minimum_required(VERSION 3.16)
+project(Ascend_c)
+if(${RUN_MODE})
+    set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+endif()
+if (${SOC_VERSION})
+    set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type")
+endif()
+
+set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory")
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+endif()
+
+if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
+endif()
+
+file(GLOB KERNEL_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_l2cache_custom_kernel.cpp
+)
+
+if("${RUN_MODE}" STREQUAL "cpu")
+    include(cmake/cpu_lib.cmake)
+elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu")
+    include(cmake/npu_lib.cmake)
+else()
+    message("invalid RUN_MODE: ${RUN_MODE}")
+endif()
+
+add_executable(ascendc_matmul_bbit
+    ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+)
+
+target_compile_options(ascendc_matmul_bbit PRIVATE
+    $<$<STREQUAL:${RUN_MODE},cpu>:-g>
+    -O2
+    -std=c++17
+    -D_GLIBCXX_USE_CXX11_ABI=0
+)
+
+target_compile_definitions(ascendc_matmul_bbit PRIVATE
+    SOC_VERSION="${SOC_VERSION}"
+)
+target_include_directories(ascendc_matmul_bbit PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    $<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/include>
+    $<$<STREQUAL:${RUN_MODE},npu>:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>
+)
+
+target_link_libraries(ascendc_matmul_bbit PRIVATE
+    $<$<AND:$<STREQUAL:${RUN_MODE},npu>,$<STREQUAL:${CMAKE_BUILD_TYPE},Debug>>:host_intf_pub>
+    $<$<STREQUAL:${RUN_MODE},cpu>:tikicpulib::${SOC_VERSION}>
+    $<$<STREQUAL:${RUN_MODE},npu>:ascendcl>
+    $<$<STREQUAL:${RUN_MODE},cpu>:c_sec>
+    ascendc_kernels_${RUN_MODE}
+    tiling_api
+    register
+    platform
+    ascendalog
+    dl
 )
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/README.md b/examples/matrix/matmul_l2cache/README.md
index 6ccb24ab..fd2b3637 100644
--- a/examples/matrix/matmul_l2cache/README.md
+++ b/examples/matrix/matmul_l2cache/README.md
@@ -13,7 +13,6 @@
 | 目录 | 描述 |
 |----------------------------------|--------------------------------------|
 | [cmake](cmake) | 编译工程文件 |
-| [op_host](op_host) | 本样例tiling代码实现 |
 | [op_kernel](op_kernel) | 本样例kernel代码实现 |
 | [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 |
 | [testcase](testcase) | 用例文件,配置用例的计算shape信息 |
@@ -23,7 +22,7 @@
 ## 算子描述
 - 算子功能
 
-  MatmulL2CacheCustom算子在M方向切分输入的左矩阵,将左矩阵切分成多块数据,整体按数据切分的块数,分多次进行计算。每次计算前,第一个核第一次访问Global Memory上的左矩阵时,会将切分后的一块左矩阵数据加载到L2 Cache,其它核或第一个核后续数据访问都可以命中L2 Cache,访问完整的右矩阵行为相同,以提高算子性能。
+  MatmulL2CacheCustom算子在M方向或者N方向切分输入的矩阵,将矩阵切分成多块数据,整体按数据切分的块数,分多次进行计算。每次计算前,第一个核第一次访问Global Memory上的矩阵时,会将切分后的一块矩阵数据加载到L2 Cache,其它核或第一个核后续数据访问都可以命中L2 Cache,以提高算子性能。
 
 - 算子规格
 
@@ -31,22 +30,20 @@
 | 算子类型(OpType) | MatmulL2CacheCustom |
 | 算子输入 | name | shape | data type | format | isTrans |
-| a | (30720, 1024) | float | ND | false |
-| b | (1024, 1024) | float | ND | false |
+| a | (30720, 4096) | float16 | ND | false |
+| b | (4096, 1024) | float16 | ND | false |
 | bias | (1024, ) | float | ND | - |
 | 算子输出 | c |
-| (30720, 1024) | float | ND | - |
+| (30720, 1024) | float16 | ND | - |
 | 核函数名 | matmul_l2_cache_custom |
 
 ## 算子实现介绍
-本样例以AI处理器的L2 Cache大小192M为例,根据算子的输入、输出shape,计算算子输入和输出的总数据量为((30720 * 1024) + (1024 * 1024) + 1024 + (30720 * 1024)) * 4 = 255856640字节(约244M),
-大于L2 Cache(192M),无法保证计算前读取的数据能够命中L2 Cache,而Global Memory带宽低于L2 Cache,且两者之间差距较大,搬运数据成为算子运行的性能瓶颈。将左矩阵的输入数据均等切分成两份后,整体分两次矩阵乘计算,
-每次计算的输入和输出总数据量为(((30720 // 2) * 1024) + (1024 * 1024) + 1024 + ((30720 // 2) * 1024)) * 4 = 130027520字节(约124M),以保证计算前读取的数据能够命中L2 Cache。
+本样例以AI处理器的L2 Cache大小192M为例,根据算子的输入、输出shape,计算算子输入和输出的总数据量为((30720 * 4096) + (4096 * 1024) + (1024) + 
(30720 * 1024)) * 2 = 322963456字节(约308M),大于L2 Cache(192M),无法保证计算前读取的数据能够命中L2 Cache,而Global Memory带宽低于L2 Cache,且两者之间差距较大,搬运数据成为算子运行的性能瓶颈。因此需要将输入数据切分成多块,使得每个数据块的计算数据量(包含输入和输出),能够命中L2 Cache。本样例提供了L2CacheOptimizer类,其中GetTileNum接口用于自动根据左右矩阵的Shape获取左右矩阵总L2切分份数,GetBlockShape接口获取L2切分后M、N、K轴的长度,GetBlockCoord接口返回对应切块的位置坐标,即M、N、K方向相对矩阵起始位置的偏移。 - Kernel实现 - 计算逻辑:C = A * B + Bias。 @@ -56,34 +53,68 @@ - 具体步骤: - 创建Matmul对象。 - 初始化操作。 - - 设置右矩阵A、右矩阵B、Bias。将左矩阵A切分成两块,循环计算两次,每次对切分后一块左矩阵A的数据进行计算,并输出一半的计算结果。 + - 设置左矩阵A、右矩阵B、Bias。根据L2CacheOptimizer类的GetTileNum接口获取左右矩阵总L2切分份数,循环多次计算。 ``` - matmulObj.SetTensorB(bGlobal); - if (tiling.isBias) { - matmulObj.SetBias(biasGlobal); - } - const uint32_t splitTimes = 2; - const uint32_t splitOffsetA = this->tiling.M / splitTimes * this->tiling.Ka; - const uint32_t splitOffsetC = this->tiling.M / splitTimes * this->tiling.N; - for (uint32_t i = 0; i < splitTimes; i++) { - matmulObj.SetTensorA(aGlobal[splitOffsetA * i]); - matmulObj.IterateAll(cGlobal[splitOffsetC * i]); + L2CacheOpt l2Opt(shapes, blockNum); + matmulObj.SetOrgShape(shapes.m, shapes.n, shapes.k); + for (int64_t tileIdx = curBlockIdx; tileIdx < l2Opt.GetTileNum(); tileIdx += blockNum) { + auto blockShape = l2Opt.GetBlockShape(tileIdx); // 获取单次计算L2切分块大小 + if (Get<0>(blockShape) <= 0 || Get<1>(blockShape) <= 0) { + return; + } + auto blockCoord = l2Opt.GetBlockCoord(tileIdx); // 获取当前计算下标blockCoord + matmulObj.SetTail(Get<0>(blockShape), Get<1>(blockShape), Get<2>(blockShape)); + const auto& offsetCoord = CalcOffset(shapes, blockCoord); // 基于下标计算矩阵偏移 + int64_t offsetA = Get<0>(offsetCoord); + int64_t offsetB = Get<1>(offsetCoord); + int64_t offsetC = Get<2>(offsetCoord); + matmulObj.SetTensorA(aGlobal[offsetA], false); + matmulObj.SetTensorB(bGlobal[offsetB], false); + if (shapes.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal[offsetC]); // 计算L2切分块 } matmulObj.End(); ``` - - 完成两次矩阵乘操作。 + - 完成矩阵乘操作。 - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - - 获取Tiling参数的流程如下: - - 创建一个Tiling对象。 - - 设置A、B、C、Bias的参数类型信息。根据数据的切分方式,设置数据切分后每次计算的Shape和数据切分前的完整OrgShape。 + - 本样例采取了常量化Tiling计算,在Kernel侧通过设置一组固定的基本块信息,其他Tiling信息在Kernel侧通过常量化推导,Kernel侧不再需要运行时Tiling信息。基于这组最优的基本块信息,能够适用输入Shape中M,N较大的场景。样例中提供了一种L2Cache切分算法(参考样例L2CacheOptimizer类)。该算法当前在Kernel侧完成计算L2切块份数,也可以自行迁移代码至Host侧计算。 + - L2CacheOptimizer具体计算步骤: + - 判断是否需要进行L2分块 + ``` + bool smallDim = mTileNum_ < L1_MIN_UST_DIM && nTileNum_ < L1_MIN_UST_DIM; + if (smallDim || (!EnableL2Tile())) { // 判断计算数据总量是否小于L2Cache阈值 + mL2TileNum_ = mTileNum_; + nL2TileNum_ = nTileNum_; + mL2BlockNum_ = 1; + nL2BlockNum_ = 1; + return; // 不需要切分,提前返回 + } + InitL2TileTail(); // 计算L2切分 + ``` + - 基于负载均衡,计算L2最优分块 ``` - cubeTiling->SetShape(M / splitTimes, N, K); - cubeTiling->SetOrgShape(M, N, K); + int64_t mConflict = INT64_MAX; + int64_t nConflict = INT64_MAX; + constexpr bool isNMajor = l1N > l1M; // 根据shape大小,判断主维度 + for (int64_t i = maxMajor; i >= L1_MIN_UST_DIM; i--) { + for (int64_t j = maxMinor; j >= minMinor; j--) { + if (GetTotalSize(j * l1M, i * l1N, k_) <= L2_TILE_THRESHOLD) { // 确保分块小于L2Cache阈值 + uint64_t mConflictTmp = AscendC::Ceil(blockNum_, mL2TileNumTailTmp); // 计算负载冲突值 + uint64_t nConflictTmp = AscendC::Ceil(blockNum_, nL2TileNumTailTmp); + if (mConflict >= mConflictTmp && nConflict >= nConflictTmp) { // 若冲突值更小,更新分块数量 + mConflict = mConflictTmp; + nConflict = nConflictTmp; + mL2TileNum_ = curMajorDim; + nL2TileNum_ = curMinorDim; + } + } + } + } ``` - - 
调用GetTiling接口,获取Tiling信息。
 
 ## 编译运行样例
 
diff --git a/examples/matrix/matmul_l2cache/cmake/cpu_lib.cmake b/examples/matrix/matmul_l2cache/cmake/cpu_lib.cmake
index 583dd703..244469fb 100644
--- a/examples/matrix/matmul_l2cache/cmake/cpu_lib.cmake
+++ b/examples/matrix/matmul_l2cache/cmake/cpu_lib.cmake
@@ -22,7 +22,7 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE
 
 target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE
     -g
-    -O2
+    -O0
     -std=c++17
 )
 
diff --git a/examples/matrix/matmul_l2cache/cmake/npu_lib.cmake b/examples/matrix/matmul_l2cache/cmake/npu_lib.cmake
index 927d2490..05179a43 100644
--- a/examples/matrix/matmul_l2cache/cmake/npu_lib.cmake
+++ b/examples/matrix/matmul_l2cache/cmake/npu_lib.cmake
@@ -20,8 +20,11 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC
     ${KERNEL_FILES}
 )
 
+ascendc_include_directories(ascendc_kernels_${RUN_MODE} PRIVATE
+    ${ABS_WORK_DIR}
+)
+
 ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE
-    -DASCENDC_DUMP
     -DHAVE_WORKSPACE
     -DHAVE_TILING
 )
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/main.cpp b/examples/matrix/matmul_l2cache/main.cpp
index 608da572..d1bfa978 100644
--- a/examples/matrix/matmul_l2cache/main.cpp
+++ b/examples/matrix/matmul_l2cache/main.cpp
@@ -1,4 +1,4 @@
-/*
+/**
  * Copyright (c) 2025 Huawei Technologies Co., Ltd.
  * This file is a part of the CANN Open Software.
  * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
@@ -8,96 +8,120 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  */
 
+/*!
+* \file main.cpp
+* \brief
+*/
+
+#include <cstdint>
+#include <iostream>
 #include <sstream>
 #include "../../common/data_utils.h"
-#include "kernel_tiling/kernel_tiling.h"
+#include "tiling/tiling_api.h"
 #include "tiling/platform/platform_ascendc.h"
-#include "./op_host/matmul_l2_cache_custom_tiling.h"
-
 #ifndef ASCENDC_CPU_DEBUG
 #include "acl/acl.h"
-extern void matmul_l2_cache_custom_do(uint32_t coreDim, void* stream,
-    uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling);
+
+extern void matmul_l2cache_custom_do(uint32_t coreDim, void* stream,
+                                     uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+                                     uint8_t* workspace, uint8_t* tiling);
 #else
 #include "tikicpulib.h"
-extern "C" void matmul_l2_cache_custom(
-    uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, uint8_t* workspace, uint8_t* tiling);
+extern "C" void matmul_l2cache_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+                                      uint8_t* workspace, uint8_t* tiling);
 #endif
-
+constexpr bool IS_BIAS = false;
 namespace MatmulHost {
-#ifdef ASCENDC_CPU_DEBUG
-void TestMatmulCpu(const optiling::TestcaseParams& caseParams)
+static size_t GetSysWorkSpaceSize()
 {
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    return static_cast<size_t>(ascendcPlatform->GetLibApiWorkSpaceSize());
+}
+// Matmul compute shape info
+struct MatmulProblemShape
+{
+    int32_t m;
+    int32_t n;
+    int32_t k;
+    int32_t isBias;
+};
+
+// Calculate the single-core problem shape passed to the kernel
+MatmulProblemShape GetSingleCoreShape(const int32_t m, const int32_t n, const int32_t k)
+{
+    MatmulProblemShape shapes{m, n, k, IS_BIAS};
+    return shapes;
+}
+
+int32_t GetUsedCoreNum()
+{
     auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
-    matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform);
-    uint32_t mixRatio = 2; // AIC:AIV = 1:2
-    uint32_t aivCoreNum = ascendcPlatform->GetCoreNum();
-    uint32_t aicCoreNum = aivCoreNum / mixRatio;
-
-    size_t aFileSize = 
static_cast(sizeof(float) * caseParams.m * caseParams.k); - size_t bFileSize = static_cast(sizeof(float) * caseParams.k * caseParams.n); - size_t biasFileSize = static_cast(sizeof(float) * caseParams.n); - size_t cFileSize = static_cast(sizeof(float) * caseParams.m * caseParams.n); - size_t workspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); - size_t tilingFileSize = sizeof(TCubeTiling); - - uint8_t* a = (uint8_t *)AscendC::GmAlloc(aFileSize); - uint8_t* b = (uint8_t *)AscendC::GmAlloc(bFileSize); + return ascendcPlatform->GetCoreNumAic(); +} +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(int64_t m, int64_t n, int64_t k) +{ + size_t x1FileSize = m * k * sizeof(uint16_t); + size_t x2FileSize = k * n * sizeof(uint16_t); + size_t yFileSize = m * n * sizeof(uint16_t); + size_t biasFileSize = 1 * n * sizeof(uint32_t); + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); uint8_t* bias; - uint8_t* c = (uint8_t *)AscendC::GmAlloc(cFileSize); - uint8_t* workspace = (uint8_t *)AscendC::GmAlloc(workspaceSize); - uint8_t* tiling = (uint8_t *)AscendC::GmAlloc(tilingFileSize); - - ReadFile("../input/a_gm.bin", aFileSize, a, aFileSize); - ReadFile("../input/b_gm.bin", bFileSize, b, bFileSize); - if (caseParams.isBias) { - bias = (uint8_t *)AscendC::GmAlloc(biasFileSize); + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); } - - optiling::GenerateTiling(aicCoreNum, &cubeTiling, caseParams, tiling); - memcpy_s(tiling, tilingFileSize, tiling, tilingFileSize); - - ICPU_RUN_KF(matmul_l2_cache_custom, aicCoreNum, a, b, bias, c, workspace, tiling); - - WriteFile("../output/output.bin", c, cFileSize); - - AscendC::GmFree((void*)a); - AscendC::GmFree((void*)b); - if (caseParams.isBias) { - AscendC::GmFree((void*)bias); - } - AscendC::GmFree((void*)c); + const auto shapes = GetSingleCoreShape(m, n, k); + size_t tilingFileSize = sizeof(shapes); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + memcpy_s(tiling, tilingFileSize, &shapes, tilingFileSize); + ICPU_RUN_KF(matmul_l2cache_custom, GetUsedCoreNum(), x1, x2, bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); AscendC::GmFree((void*)workspace); AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } } #else -void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, const optiling::TestcaseParams& caseParams, +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, void* stream = nullptr) { - auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); - matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); - uint32_t mixRatio = 2; // AIC:AIV = 1:2 - uint32_t aivCoreNum = ascendcPlatform->GetCoreNum(); - uint32_t aicCoreNum = aivCoreNum / mixRatio; - + // Init args uint8_t *workspaceDevice; - size_t workspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + + // Query workspace size + size_t workspaceSize = 
GetSysWorkSpaceSize();
+
+    // Allocate workspace on device
     CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
 
+    // Build the problem-shape tiling passed to the kernel
+    const auto shapes = GetSingleCoreShape(m, n, k);
+    uint32_t coreNum = static_cast<uint32_t>(GetUsedCoreNum());
     uint8_t* tilingHost;
     uint8_t* tilingDevice;
-    size_t tilingFileSize = sizeof(TCubeTiling);
+    size_t tilingFileSize = sizeof(shapes);
     CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize));
-    CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
-    optiling::GenerateTiling(aicCoreNum, &cubeTiling, caseParams, tilingHost);
-    CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_HOST));
-    CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
-
-    matmul_l2_cache_custom_do(aicCoreNum, stream, a, b, bias, c, workspaceDevice, tilingDevice);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
+    CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize,
+                          ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &shapes,
+                          tilingFileSize, ACL_MEMCPY_HOST_TO_HOST));
+    CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost,
+                          tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    // Launch kernel
+    matmul_l2cache_custom_do(coreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice);
     CHECK_ACL(aclrtFreeHost(tilingHost));
     CHECK_ACL(aclrtFree(workspaceDevice));
     CHECK_ACL(aclrtFree(tilingDevice));
@@ -119,83 +143,77 @@ void TestAclDeInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId)
     CHECK_ACL(aclFinalize());
 }
 
-void TestMatmul(const optiling::TestcaseParams& caseParams)
+void TestMatmul(int64_t m, int64_t n, int64_t k)
 {
-    size_t aFileSize = static_cast<size_t>(sizeof(float) * caseParams.m * caseParams.k);
-    size_t bFileSize = static_cast<size_t>(sizeof(float) * caseParams.k * caseParams.n);
-    size_t biasFileSize = static_cast<size_t>(sizeof(float) * caseParams.n);
-    size_t cFileSize = static_cast<size_t>(sizeof(float) * caseParams.m * caseParams.n);
+    size_t x1FileSize = static_cast<size_t>(m * k) * sizeof(uint16_t);
+    size_t x2FileSize = static_cast<size_t>(k * n) * sizeof(uint16_t);
+    size_t yFileSize = static_cast<size_t>(m * n) * sizeof(uint16_t);
+    size_t biasFileSize = static_cast<size_t>(1 * n) * sizeof(uint32_t);
 
     aclrtContext context;
     aclrtStream stream = nullptr;
     int64_t deviceId = 0;
     TestAclInit(context, stream, deviceId);
 
-    uint8_t* aHost;
-    uint8_t* aDevice;
-    CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize));
-    CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
-    ReadFile("../input/a_gm.bin", aFileSize, aHost, aFileSize);
-    CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
-
-    uint8_t* bHost;
-    uint8_t* bDevice;
-    CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize));
-    CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
-    ReadFile("../input/b_gm.bin", bFileSize, bHost, bFileSize);
-    CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
-
-    uint8_t* biasHost;
-    uint8_t* biasDevice;
-    if (caseParams.isBias) {
+    uint8_t* x1Host;
+    uint8_t* x1Device;
+    CHECK_ACL(aclrtMallocHost((void **)(&x1Host), x1FileSize));
+    CHECK_ACL(aclrtMalloc((void **)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize);
+    CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, 
x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host; + uint8_t* x2Device; + CHECK_ACL(aclrtMallocHost((void **)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasFileSize)); CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); } + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void **)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - uint8_t* cHost; - uint8_t* cDevice; - CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); - CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - MatmulOp(aDevice, bDevice, biasDevice, cDevice, caseParams, stream); - - CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("../output/output.bin", cHost, cFileSize); - - CHECK_ACL(aclrtFree(aDevice)); - CHECK_ACL(aclrtFreeHost(aHost)); - - CHECK_ACL(aclrtFree(bDevice)); - CHECK_ACL(aclrtFreeHost(bHost)); - - CHECK_ACL(aclrtFree(biasDevice)); - CHECK_ACL(aclrtFreeHost(biasHost)); + MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(cDevice)); - CHECK_ACL(aclrtFreeHost(cHost)); + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); TestAclDeInit(context, stream, deviceId); } #endif -} // end namespace MatmulHost - -int32_t main(int32_t argc, const char* args[]) +} // namespace MatmulHost +int32_t main(int32_t argc, const char *args[]) { - uint32_t problem[5]; - for (int32_t i = 1; i < argc && i < 6; ++i) { + int64_t problem[3] = {1, 1, 1}; + for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 std::stringstream ss(args[i]); ss >> problem[i - 1]; } - - bool isBias = problem[3]; - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias, problem[4]}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG - MatmulHost::TestMatmulCpu(caseParams); + MatmulHost::TestMatmulCpu(problem[0], problem[1], problem[2]); // 2 means problem shape k #else - MatmulHost::TestMatmul(caseParams); + MatmulHost::TestMatmul(problem[0], problem[1], problem[2]); // 2 means problem shape k #endif return 0; -} \ No newline at end of file +} diff --git a/examples/matrix/matmul_l2cache/op_kernel/l2_cache_optimizer.h b/examples/matrix/matmul_l2cache/op_kernel/l2_cache_optimizer.h new file mode 100644 index 00000000..6b7c452a --- /dev/null +++ b/examples/matrix/matmul_l2cache/op_kernel/l2_cache_optimizer.h @@ -0,0 +1,229 @@ +/** + * Copyright (c) 2025 Huawei 
Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file l2_cache_optimizer.h
+ * \brief
+ */
+
+#ifndef EXAMPLES_MATRIX_MATMUL_L2CACHE_L2_CACHE_OPTIMIZER_H
+#define EXAMPLES_MATRIX_MATMUL_L2CACHE_L2_CACHE_OPTIMIZER_H
+
+namespace CustomMatmulL2Cache {
+
+constexpr int64_t L2_TILE_THRESHOLD = 100 * 1024 * 1024;
+constexpr int64_t L1_MIN_UST_DIM = 4;
+constexpr int64_t L1_MAX_UST_DIM = 8;
+
+template <int32_t Index, typename T>
+__aicore__ constexpr inline decltype(auto) Get(T &&t)
+{
+    return AscendC::Std::get<Index>(AscendC::Std::forward<T>(t));
+}
+
+template <class ProblemShape, const auto& MmTiling>
+class L2CacheOptimizer {
+public:
+    using BlockShape = AscendC::Std::tuple<int64_t, int64_t, int64_t>;
+    using BlockCoord = AscendC::Std::tuple<int64_t, int64_t, int64_t>;
+    static constexpr int64_t l1M = MmTiling.baseM * MmTiling.stepM;
+    static constexpr int64_t l1N = MmTiling.baseN * MmTiling.stepN;
+    static constexpr int64_t l0M = MmTiling.baseM;
+    static constexpr int64_t l0N = MmTiling.baseN;
+    static constexpr int64_t l0K = MmTiling.baseK;
+private:
+    int64_t mTileNum_;
+    int64_t nTileNum_;
+    int64_t blockNum_;
+    int64_t m_; // org shape
+    int64_t n_;
+    int64_t k_;
+    int64_t totalTileNum_;
+    // l2 split attributes
+    int64_t newBlockIdx_;
+    int64_t mL2TileNumTmp_;
+    int64_t nL2TileNumTmp_;
+    int64_t nL2Idx_;
+    int64_t mL2Idx_;
+    int64_t mL2BlockNum_; // l2 m block num
+    int64_t nL2BlockNum_; // l2 n block num
+    int64_t mL2TileNum_; // a1b1 m tile num of one l2 block
+    int64_t nL2TileNum_; // a1b1 n tile num of one l2 block
+public:
+    __aicore__ inline L2CacheOptimizer(ProblemShape shape, int64_t blockNum) :
+        blockNum_(blockNum)
+    {
+        m_ = shape.m;
+        n_ = shape.n;
+        k_ = shape.k;
+        mTileNum_ = AscendC::Ceil(m_, l1M);
+        nTileNum_ = AscendC::Ceil(n_, l1N);
+        totalTileNum_ = mTileNum_ * nTileNum_;
+        InitL2Tile();
+    }
+
+    __aicore__ inline int64_t GetTileNum()
+    {
+        return totalTileNum_;
+    }
+
+    __aicore__ inline BlockShape GetBlockShape(int64_t tileIdx)
+    {
+        GetCommonTileIndex(tileIdx);
+        int64_t mTileIdx = newBlockIdx_ % mL2TileNumTmp_;
+        mTileIdx = mTileIdx + mL2Idx_ * mL2TileNum_;
+
+        int64_t nTileIdx = 0;
+        if (mL2TileNumTmp_ != 0 && nL2TileNumTmp_ != 0) {
+            int64_t tmp = newBlockIdx_ / CalcLcm(mL2TileNumTmp_, nL2TileNumTmp_);
+            nTileIdx = (newBlockIdx_ + tmp) % nL2TileNumTmp_;
+        }
+        nTileIdx = nTileIdx + nL2Idx_ * nL2TileNum_;
+
+        // calc tail l1block mnk
+        int64_t tailL1M = (m_ % l1M == 0) ? l1M : m_ % l1M;
+        int64_t tailL1N = (n_ % l1N == 0) ? l1N : n_ % l1N;
+        int64_t blockShapeM = IsMTail(mTileIdx, mTileNum_) ? tailL1M : l1M;
+        int64_t blockShapeN = IsNTail(nTileIdx, nTileNum_) ? 
tailL1N : l1N; + + return {blockShapeM, blockShapeN, k_}; + } + + __aicore__ inline BlockCoord GetBlockCoord(int64_t tileIdx) + { + GetCommonTileIndex(tileIdx); + int64_t mTileIdx = newBlockIdx_ % mL2TileNumTmp_; + mTileIdx = mTileIdx + mL2Idx_ * mL2TileNum_; + + int64_t nTileIdx = 0; + if (mL2TileNumTmp_ != 0 && nL2TileNumTmp_ != 0) { + int64_t tmp = newBlockIdx_ / CalcLcm(mL2TileNumTmp_, nL2TileNumTmp_); + nTileIdx = (newBlockIdx_ + tmp) % nL2TileNumTmp_; + } + nTileIdx = nTileIdx + nL2Idx_ * nL2TileNum_; + + return {mTileIdx * l1M, nTileIdx * l1N, 0}; + } +private: + __aicore__ inline int64_t GetTotalSize(int64_t mL2, int64_t nL2, int64_t kL2) + { + int64_t sizeA = mL2 * kL2 * sizeof(half); + int64_t sizeB = kL2 * nL2 * sizeof(half); + int64_t sizeC = mL2 * nL2 * sizeof(half); + return sizeA + sizeB + sizeC; + } + + __aicore__ inline bool EnableL2Tile() + { + return GetTotalSize(m_, n_, k_) > L2_TILE_THRESHOLD; + } + + __aicore__ inline int64_t GetTail(int64_t dividend, int64_t divisor) + { + if (divisor == 0) { + return 0; + } + auto remainder = dividend % divisor; + return (remainder == 0) ? divisor : remainder; + } + + __aicore__ inline void InitL2TileTail() + { + int64_t mConflict = INT64_MAX; + int64_t nConflict = INT64_MAX; + constexpr bool isNMajor = l1N > l1M; + int64_t majorDim = isNMajor ? nTileNum_ : mTileNum_; + int64_t minorDim = isNMajor ? mTileNum_ : nTileNum_; + int64_t maxMajor = blockNum_ > majorDim ? majorDim : blockNum_; + int64_t maxMinor = blockNum_ > minorDim ? minorDim : blockNum_; + int64_t minMinor = isNMajor ? L1_MIN_UST_DIM : L1_MAX_UST_DIM; + for (int64_t i = maxMajor; i >= L1_MIN_UST_DIM; i--) { // if l1N greater than l1M, indicates n + for (int64_t j = maxMinor; j >= minMinor; j--) { + if (GetTotalSize(j * l1M, i * l1N, k_) <= L2_TILE_THRESHOLD) { + int64_t curMajorDim = isNMajor ? j : i; + int64_t curMinorDim = isNMajor ? i : j; + int64_t mL2TileNumTailTmp = GetTail(mTileNum_, curMajorDim); + int64_t nL2TileNumTailTmp = GetTail(nTileNum_, curMinorDim); + uint64_t mConflictTmp = AscendC::Ceil(blockNum_, mL2TileNumTailTmp); + uint64_t nConflictTmp = AscendC::Ceil(blockNum_, nL2TileNumTailTmp); + if (mConflict >= mConflictTmp && nConflict >= nConflictTmp) { + mConflict = mConflictTmp; + nConflict = nConflictTmp; + mL2TileNum_ = curMajorDim; + nL2TileNum_ = curMinorDim; + } + } + } + } + if (mL2TileNum_ == 0 || nL2TileNum_ == 0) { + mL2TileNum_ = mTileNum_; + nL2TileNum_ = nTileNum_; + } + } + + __aicore__ inline void InitL2Tile() + { + bool smallDim = mTileNum_ < L1_MIN_UST_DIM && nTileNum_ < L1_MIN_UST_DIM; + if (smallDim || (!EnableL2Tile())) { + mL2TileNum_ = mTileNum_; + nL2TileNum_ = nTileNum_; + mL2BlockNum_ = 1; + nL2BlockNum_ = 1; + return; + } + mL2TileNum_ = 0; + nL2TileNum_ = 0; + InitL2TileTail(); + mL2BlockNum_ = AscendC::Ceil(mTileNum_, mL2TileNum_); + nL2BlockNum_ = AscendC::Ceil(nTileNum_, nL2TileNum_); + } + + __aicore__ inline void GetCommonTileIndex(int64_t tileIdx) + { + mL2Idx_ = tileIdx / (mL2TileNum_ * nTileNum_); + mL2TileNumTmp_ = (mL2Idx_ == mL2BlockNum_ - 1) ? GetTail(mTileNum_, mL2TileNum_) : mL2TileNum_; + + nL2Idx_ = (tileIdx % (mL2TileNum_ * nTileNum_)) / (mL2TileNumTmp_ * nL2TileNum_); + nL2TileNumTmp_ = (nL2Idx_ == nL2BlockNum_ - 1) ? 
GetTail(nTileNum_, nL2TileNum_) : nL2TileNum_;
+
+        int64_t startIdx = mL2Idx_ * mL2TileNum_ * nTileNum_ + nL2Idx_ * nL2TileNum_ * mL2TileNumTmp_;
+        int64_t startBlockIdx = startIdx % blockNum_;
+        newBlockIdx_ = tileIdx - startIdx;
+    }
+
+    __aicore__ inline int64_t CalcLcm(int64_t a, int64_t b)
+    {
+        if (a == 0 || b == 0) {
+            return 0;
+        }
+        // calc GCD
+        int64_t m = a;
+        int64_t n = b;
+        while (n != 0) {
+            int64_t tmp = m % n;
+            m = n;
+            n = tmp;
+        }
+        int64_t gcd = m;
+        return (a / gcd) * b;
+    }
+
+    __aicore__ inline bool IsMTail(int64_t mTileIdx, int64_t mTileNum)
+    {
+        return (mTileIdx - (mTileNum - 1)) % mTileNum == 0;
+    }
+
+    __aicore__ inline bool IsNTail(int64_t nTileIdx, int64_t nTileNum)
+    {
+        return nTileIdx == (nTileNum - 1);
+    }
+};
+} // namespace CustomMatmulL2Cache
+#endif
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.cpp b/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.cpp
new file mode 100644
index 00000000..1291dea4
--- /dev/null
+++ b/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "kernel_operator.h"
+#include "matmul_l2cache_custom_kernel.h"
+// matmul kernel function
+extern "C" __global__ __aicore__ void matmul_l2cache_custom(
+    GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+    if (g_coreType == AscendC::AIV) {
+        return;
+    }
+    CustomMatmulL2Cache::MatmulKernel<half, half, half, float> matmulKernel;
+    AscendC::TPipe pipe;
+    matmulKernel.Init(a, b, bias, c, workspace, tilingGm);
+    matmulKernel.Process(&pipe);
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+void matmul_l2cache_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b,
+    GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+    matmul_l2cache_custom<<<blockDim, nullptr, stream>>>(a, b, bias, c, workspace, tilingGm);
+}
+#endif
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.h b/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.h
new file mode 100644
index 00000000..edf1845a
--- /dev/null
+++ b/examples/matrix/matmul_l2cache/op_kernel/matmul_l2cache_custom_kernel.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef EXAMPLES_MATRIX_MATMUL_L2CACHE_OP_KERNEL_MATMUL_L2_CACHE_CUSTOM_KERNEL_H
+#define EXAMPLES_MATRIX_MATMUL_L2CACHE_OP_KERNEL_MATMUL_L2_CACHE_CUSTOM_KERNEL_H
+#include "kernel_operator.h"
+#define ASCENDC_CUBE_ONLY
+#include "lib/matmul_intf.h"
+#include "l2_cache_optimizer.h"
+namespace CustomMatmulL2Cache {
+constexpr static auto GetCFG()
+{
+    auto cfg = CFG_MDL;
+    cfg.singleCoreM = 10000;
+    cfg.singleCoreN = 10000;
+    cfg.singleCoreK = 10000;
+    cfg.basicM = 128;
+    cfg.basicN = 256;
+    cfg.basicK = 64;
+    cfg.enableSetBias = false;
+    cfg.enUnitFlag = true;
+    return cfg;
+}
+constexpr auto CUSTOM_CFG = GetCFG();
+constexpr static auto GetL1Tiling(const MatmulApiStaticTiling& mmTiling)
+{
+    auto tiling = mmTiling;
+    tiling.stepM = 1;
+    tiling.stepN = 1;
+    tiling.stepKa = 4;
+    tiling.stepKb = 4;
+    tiling.depthA1 = 8;
+    tiling.depthB1 = 8;
+    return tiling;
+}
+struct MatmulProblemShape {
+    int32_t m;
+    int32_t n;
+    int32_t k;
+    int32_t isBias;
+};
+
+__aicore__ inline void CopyTiling(MatmulProblemShape* tiling, GM_ADDR tilingGM)
+{
+    int32_t *ptr = reinterpret_cast<int32_t *>(tiling);
+    auto tiling32 = reinterpret_cast<__gm__ int32_t *>(tilingGM);
+    for (size_t i = 0; i < sizeof(MatmulProblemShape) / sizeof(int32_t); ++i, ++ptr) {
+        *ptr = *(tiling32 + i);
+    }
+}
+
+template <typename aType, typename bType, typename cType, typename biasType>
+class MatmulKernel {
+public:
+    using BlockCoord = AscendC::Std::tuple<int64_t, int64_t, int64_t>;
+    __aicore__ inline MatmulKernel(){};
+    __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace,
+                                GM_ADDR tiling);
+    __aicore__ inline void Process(AscendC::TPipe* pipe);
+    using A_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, aType>;
+    using B_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, bType>;
+    using C_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, cType>;
+    using BIAS_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, biasType>;
+    constexpr static auto CONSTANT_CFG = AscendC::GetMatmulApiTiling<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE>(CUSTOM_CFG);
+    constexpr static auto MM_TILING = GetL1Tiling(CONSTANT_CFG);
+    // Use cubeOnly mode
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_TILING> matmulObj;
+    using L2CacheOpt = L2CacheOptimizer<MatmulProblemShape, MM_TILING>;
+    MatmulProblemShape shapes;
+private:
+    __aicore__ inline BlockCoord CalcOffset(
+        const MatmulProblemShape& param, const BlockCoord &blockCoord);
+    __aicore__ inline int64_t GetTotalSize(int64_t m, int64_t n, int64_t k);
+    AscendC::GlobalTensor<aType> aGlobal;
+    AscendC::GlobalTensor<bType> bGlobal;
+    AscendC::GlobalTensor<cType> cGlobal;
+    AscendC::GlobalTensor<biasType> biasGlobal;
+};
+
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulKernel<aType, bType, cType, biasType>::Init(GM_ADDR a,
+    GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling)
+{
+    CopyTiling(&shapes, tiling);
+    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType*>(a), shapes.m * shapes.k);
+    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType*>(b), shapes.k * shapes.n);
+    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType*>(c), shapes.m * shapes.n);
+    biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType*>(bias), shapes.n);
+}
+
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulKernel<aType, bType, cType, biasType>::Process(AscendC::TPipe* pipe)
+{
+    REGIST_MATMUL_OBJ(pipe, GetSysWorkSpacePtr(), matmulObj, (TCubeTiling*)nullptr);
+    auto curBlockIdx = AscendC::GetBlockIdx();
+    auto blockNum = AscendC::GetBlockNum();
+    if (curBlockIdx >= blockNum) {
+        return;
+    }
+    L2CacheOpt l2Opt(shapes, blockNum);
+    matmulObj.SetOrgShape(shapes.m, shapes.n, shapes.k);
+    for (int64_t tileIdx = curBlockIdx; tileIdx < l2Opt.GetTileNum(); tileIdx += blockNum) {
+        auto blockShape = l2Opt.GetBlockShape(tileIdx);
+        if (Get<0>(blockShape) <= 0 || Get<1>(blockShape) <= 0) {
+            return;
+        }
+        auto blockCoord = l2Opt.GetBlockCoord(tileIdx);
+        matmulObj.SetTail(Get<0>(blockShape), 
Get<1>(blockShape), Get<2>(blockShape));
+        const auto& offsetCoord = CalcOffset(shapes, blockCoord);
+        int64_t offsetA = Get<0>(offsetCoord);
+        int64_t offsetB = Get<1>(offsetCoord);
+        int64_t offsetC = Get<2>(offsetCoord);
+        matmulObj.SetTensorA(aGlobal[offsetA], false);
+        matmulObj.SetTensorB(bGlobal[offsetB], false);
+        if (shapes.isBias) {
+            matmulObj.SetBias(biasGlobal);
+        }
+        matmulObj.IterateAll(cGlobal[offsetC]);
+    }
+    matmulObj.End();
+}
+
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline typename MatmulKernel<aType, bType, cType, biasType>::BlockCoord
+MatmulKernel<aType, bType, cType, biasType>::CalcOffset(
+    const MatmulProblemShape& param, const BlockCoord& blockCoord)
+{
+    auto aCoord = AscendC::MakeCoord(Get<0>(blockCoord), Get<2>(blockCoord));
+    auto bCoord = AscendC::MakeCoord(Get<2>(blockCoord), Get<1>(blockCoord));
+    auto cCoord = AscendC::MakeCoord(Get<0>(blockCoord), Get<1>(blockCoord));
+    auto aLayOut = AscendC::MakeLayout(AscendC::MakeShape(param.m, param.k), AscendC::MakeStride(param.k, 1));
+    auto bLayOut = AscendC::MakeLayout(AscendC::MakeShape(param.k, param.n), AscendC::MakeStride(param.n, 1));
+    auto cLayOut = AscendC::MakeLayout(AscendC::MakeShape(param.m, param.n), AscendC::MakeStride(param.n, 1));
+    int64_t offsetA = aLayOut(aCoord);
+    int64_t offsetB = bLayOut(bCoord);
+    int64_t offsetC = cLayOut(cCoord);
+    return {offsetA, offsetB, offsetC};
+}
+} // namespace CustomMatmulL2Cache
+#endif // EXAMPLES_MATRIX_MATMUL_L2CACHE_OP_KERNEL_MATMUL_L2_CACHE_CUSTOM_KERNEL_H
\ No newline at end of file
diff --git a/examples/matrix/matmul_l2cache/run.sh b/examples/matrix/matmul_l2cache/run.sh
index c7bb8bf6..631f7ba3 100644
--- a/examples/matrix/matmul_l2cache/run.sh
+++ b/examples/matrix/matmul_l2cache/run.sh
@@ -8,6 +8,8 @@
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
 
+export IS_PERF="0"
+
 SHORT=r:,v:,p:,
 LONG=run-mode:,soc-version:,perf:,
 OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
@@ -76,6 +78,19 @@
 mkdir output
 rm -rf bin/
 mkdir -p bin
 cd bin
-cp ../build/ascendc_matmul_l2_cache_bbit ./
+cp ../build/ascendc_matmul_bbit ./
+
+export TF_CPP_MIN_LOG_LEVEL=3
 
-python3 -u ../scripts/exec_test.py -r ${RUN_MODE} -p ${IS_PERF}
\ No newline at end of file
+if [ "${RUN_MODE}" = "npu" ]; then
+    if [ "${IS_PERF}" = "1" ]; then
+        export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR}
+        python3 -u ../scripts/exec_test.py npu "perf"
+    else
+        python3 -u ../scripts/exec_test.py npu "normal"
+    fi
+elif [ "${RUN_MODE}" = "sim" ]; then
+    python3 -u ../scripts/exec_test.py sim "normal"
+elif [ "${RUN_MODE}" = "cpu" ]; then
+    python3 -u ../scripts/exec_test.py cpu "normal"
+fi
diff --git a/examples/matrix/matmul_l2cache/scripts/exec_test.py b/examples/matrix/matmul_l2cache/scripts/exec_test.py
index 273be84c..aacdcd2d 100644
--- a/examples/matrix/matmul_l2cache/scripts/exec_test.py
+++ b/examples/matrix/matmul_l2cache/scripts/exec_test.py
@@ -10,109 +10,104 @@
 # See LICENSE in the root of the software repository for the full text of the License.
# ====================================================================================================================== import os -import shlex -import subprocess import sys import csv import time import logging -import argparse -from collections import namedtuple +import subprocess -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from gen_l2_cache_data import gen_golden_data +import numpy as np + +sys.path.append("../..") +from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import clear_file_cache -from common_scripts.exec_utils import get_perf_task_duration +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd -DATA_TYPE_STR = "float16_float32" +IS_BIAS = False +IS_TRANS_A = False +IS_TRANS_B = False +# float16 in float16 out +DATA_TYPE_STR = "float16_float16" logging.basicConfig(level=logging.INFO) -ProcessParams = namedtuple("ProcessParams", ["isProcess", "caseName", 'm', 'n', 'k', 'isBias', 'splitTimes']) - - -def get_case_list(work_path): - case_list = [] - case_dir = os.path.join(work_path, "testcase") - if not os.path.isdir(case_dir): - logging.error(f"case path: {case_dir} not exist!") - return case_list - - for file_name in os.listdir(case_dir): - if not file_name.endswith(".csv"): - continue - - abs_file_name = os.path.join(case_dir, file_name) - with open(abs_file_name, mode='r', encoding='utf-8') as file: - csv_reader = csv.reader(file) - for row in csv_reader: - item_list = [int(i) if i.strip().isdigit() else i.strip() for i in row] - case_list.append(item_list) - return case_list - -def process_case(work_path, args, params): - case_name, m, n, k, is_bias, split_times = \ - (params.caseName, params.m, params.n, params.k, params.isBias, params.splitTimes) - run_mode, is_perf = args.r, args.p +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode - logging.info(f"start process case [{case_name}]") - logging.info(f"IS_PERF is set [{args.p}]") - clear_file_cache(work_path) +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + clear_file_cache(file_work_dir) - gen_golden_data(work_path, params) - params_str = f"{m} {n} {k} {is_bias} {split_times}" - if args.p: # npu(is_perf = 1) - cmd = f"msprof op --application=\"./ascendc_matmul_l2_cache_bbit {params_str}\" --output=./prof_out" - elif run_mode == "sim": # sim - cmd = f"msprof op simulator --application=\"./ascendc_matmul_l2_cache_bbit {params_str}\" --output=./sim_out" - else: # cpu or npu(is_perf = 0) - cmd = f"./ascendc_matmul_l2_cache_bbit {params_str}" - subprocess.run(shlex.split(cmd)) - - total_num = params.m * params.n - if args.p: + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + else: + matmul_gen_data.gen_golden_data(file_work_dir) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_bbit", 
params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) + if is_perf: wrong_num = -1 - result = None - task_duration = get_perf_task_duration("./prof_out") else: - logging.info(f"compare data case[{case_name}]") - wrong_num = compare_data(work_path, DATA_TYPE_STR) - result = "Fail" if (wrong_num / total_num > 0.001 or wrong_num < 0) else "Success" - task_duration = None - - return [params.caseName, wrong_num, total_num, result, task_duration] + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(b * m * n) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (b * m * n) > 0.001: + res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data def main(): - parser = argparse.ArgumentParser() - parser.add_argument('-r', type=str, choices=['cpu', 'sim', 'npu'], required=True) - parser.add_argument('-p', type=int, default=0, required=False) - args = parser.parse_args() + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 - if args.r != 'npu' and args.p: - logging.error(f"IS_PERF can only be enabled in 'npu' mode, current mode: '{args.r}'.") + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" % (file_work_dir)) return -1 - work_path = os.path.dirname(os.getcwd()) + is_perf = False + if sys.argv[2] == "perf": + is_perf = True - case_list = get_case_list(work_path) + case_list = get_case_list() res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] - - for case in case_list: - try: - process_params = ProcessParams(*case) - except Exception as e: - logging.error(f"{case[0]}: invalid parameter, error msg: {e}") - continue - if process_params.isProcess: - res_data = process_case(work_path, args, process_params) + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) res_list.append(res_data) - result_file_name = f"result_{time.time()}.csv" - with open(os.path.join(work_path, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: writer = csv.writer(csvfile) writer.writerows(res_list) diff --git a/examples/matrix/matmul_l2cache/testcase/case.csv b/examples/matrix/matmul_l2cache/testcase/case.csv index c4c30f8f..0ee900b2 100644 --- a/examples/matrix/matmul_l2cache/testcase/case.csv +++ b/examples/matrix/matmul_l2cache/testcase/case.csv @@ -1 +1 @@ -1, case001, 30720, 1024, 1024, 1, 2 \ No newline at end of file +1, case001, 30720, 1024, 4096 \ No newline at end of file -- Gitee
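
For reference, below is a minimal host-side sketch (not part of the patch) of the footprint arithmetic that the README walks through and that `GetTotalSize()`/`L2_TILE_THRESHOLD` in `l2_cache_optimizer.h` apply per candidate block. The shape comes from `testcase/case.csv`; the quarter-M split is an arbitrary illustrative candidate, not necessarily the split the optimizer picks:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Problem shape from testcase/case.csv: m = 30720, n = 1024, k = 4096
    constexpr int64_t m = 30720;
    constexpr int64_t n = 1024;
    constexpr int64_t k = 4096;
    constexpr int64_t elemBytes = 2; // float16 elements for A, B and C

    // Full footprint of A (m*k), B (k*n), bias (n) and C (m*n), as in the README:
    // larger than the 192MB L2 cache, so reads cannot all stay L2-resident.
    int64_t total = (m * k + k * n + n + m * n) * elemBytes;
    std::printf("total footprint: %lld bytes (~%lld MB)\n",
                static_cast<long long>(total), static_cast<long long>(total >> 20));

    // The optimizer only accepts an L2 block whose A/B/C footprint fits the
    // 100MB threshold (GetTotalSize(...) <= L2_TILE_THRESHOLD).
    constexpr int64_t l2TileThreshold = 100 * 1024 * 1024;
    int64_t mL2 = m / 4; // hypothetical candidate: split the M axis into four blocks
    int64_t blockBytes = (mL2 * k + k * n + mL2 * n) * elemBytes;
    std::printf("m/4 block: %lld bytes, fits under threshold: %s\n",
                static_cast<long long>(blockBytes),
                blockBytes <= l2TileThreshold ? "yes" : "no");
    return 0;
}
```

Compiled as plain host C++, this prints a total of 322963456 bytes (the README's ≈308M) for the full problem, and an 87031808-byte (≈83MB) quarter-M block that passes the threshold check, which is the condition the kernel-side `InitL2TileTail()` search enforces for every candidate split.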