diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4478eda45222a2675fff10fc81dd983ca678c0a6
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,57 @@
+cmake_minimum_required(VERSION 3.16)
+project(OnlineSoftmax_CPU)
+
+# 1. Resolve ASCEND_HOME (environment variable wins; fall back to default CANN 7.0 path)
+if(NOT DEFINED ENV{ASCEND_HOME})
+    set(ASCEND_HOME "/usr/local/Ascend/ascend-toolkit/7.0.0")
+else()
+    set(ASCEND_HOME $ENV{ASCEND_HOME})
+endif()
+
+# 2. Executable target
+add_executable(online_softmax_cpu main.cpp online_softmax.cpp)
+
+# 3. Include paths: cover all CANN 7.0 TikCpp interface/implementation directories
+target_include_directories(online_softmax_cpu PRIVATE
+    ${ASCEND_HOME}/include
+    ${ASCEND_HOME}/tools/tikicpulib/lib/include
+
+    # Deep paths located via `find` in the toolkit tree
+    ${ASCEND_HOME}/x86_64-linux/tikcpp/tikcfw           # kernel_operator.h
+    ${ASCEND_HOME}/x86_64-linux/tikcpp/tikcfw/interface # kernel_common.h
+    ${ASCEND_HOME}/x86_64-linux/tikcpp/tikcfw/impl      # kernel_utils.h
+
+    # Standard toolkit paths
+    ${ASCEND_HOME}/compiler/include
+    ${ASCEND_HOME}/runtime/include
+)
+
+# 4. Simulation macros:
+# __CCE_KT_TEST__ resolves the g_coreType redefinition conflict in CPU simulation
+target_compile_definitions(online_softmax_cpu PRIVATE
+    ASCENDC_CPU_DEBUG
+    __CCE_KT_TEST__   # required: skips redefinition of NPU hardware constants
+    __TIC_V200__      # target architecture: unlocks AddIntrinsicsImpl and similar intrinsics
+    __TIC_CORE_MIX__  # mixed-core architecture support
+)
+
+# 5. Compile options
+target_compile_options(online_softmax_cpu PRIVATE
+    -std=c++17
+    -fPIC
+    -fpermissive      # tolerate minor syntax quirks in toolkit headers
+    -Wno-attributes   # silence NPU-specific attribute warnings
+)
+
+# 6. Library search paths and linkage
+target_link_directories(online_softmax_cpu PRIVATE
+    ${ASCEND_HOME}/tools/tikicpulib/lib
+    ${ASCEND_HOME}/compiler/lib
+    ${ASCEND_HOME}/runtime/lib64
+)
+
+target_link_libraries(online_softmax_cpu PRIVATE
+    tikicpulib_sim
+    ascendl2_simulator
+    pthread dl rt
+)
diff --git a/README.md b/README.md
index 055353f7c199d1a70bc4f441fb6e9333a6601284..5aee18f9b8d69bd0f1ffb3afea35160313af88e9 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,53 @@
-# AscendC_2025_2026_1
-#### 介绍
-本仓库为2025-2026-1并行处理与体系结构课程Ascend C算法开发案例库,由课程学生结合个人研究方向开发的案例代码。
+# Ascend C 算法开发作业
+本作业尝试实现在华为昇腾 NPU 上利用 Ascend C 编程语言(算子开发)实现高效的 Softmax 计算。该案例结合了并行处理理论与昇腾 AI 处理器的硬件特性,采用 **Tiling(分块)** 技术解决大算力场景下的片上内存(UB)限制问题。
-#### 提交说明
-请先fork本仓库,然后在自己的仓库中提交代码和markdown文档后,提交Pull Requests到本仓库。具体操作方法请参考以下链接:
-https://gitee.com/betty-bell/programming2025
+
+## 1. 背景介绍
+### 1.1 算法原理
+Softmax 算子通常定义为:
+$$y_i = \frac{e^{x_i - \max(X)}}{\sum_{j} e^{x_j - \max(X)}}$$
+其中减去 $\max(X)$ 是为了防止指数运算溢出。
+**Online Softmax** 是一种优化方案(常用于 FlashAttention 等),旨在减少内存读写次数。在本案例中,我们实现了分块化的处理流程:
+1. **第一遍扫描 (Pass 1)**:遍历所有分块,提取全局最大值 $max$ 和全局指数和 $sum$。
+2. **第二遍扫描 (Pass 2)**:利用全局统计量进行归一化计算并写回结果。
+
+### 1.2 课程关联
+本案例体现了《并行处理与体系结构》中的核心概念:
+- **数据并行**:利用向量计算单元(Vector Core)同时处理多个浮点数。
+- **存储层次结构优化**:管理 Global Memory (DDR) 与 Local Memory (Unified Buffer) 之间的数据搬运。
+- **流水线(Pipeline)**:通过 `TQue` 和 `TPipe` 实现搬运与计算的重叠。
+
+## 2. 代码结构
+```bash
+.
+├── online_softmax_tiling.h   # 结构定义:定义 Host 与 Kernel 传递的 Tiling 参数
+├── online_softmax_kernel.cpp # 核函数实现:基于 Ascend C 的类封装实现
+└── main.cpp                  # 仿真测试:CPU 端验证逻辑与调用模拟器
+```
+
+## 3. 核心实现细节
+
+### 3.1 Tiling 策略
+为了处理超出片上内存(UB)大小的数据,我们将 $N$ 长度的向量划分为多个 `tileLength` 大小的块。通过 `OnlineSoftmaxTilingData` 结构体在 Host 端计算分块信息并传递至 NPU。
+
+### 3.2 算子类封装
+采用 `KernelOnlineSoftmax` 类进行封装,主要包含:
+- **Init()**: 初始化 `TPipe` 内存池,设置输入输出队列(TQue)。
+- **Process()**:
+  - 使用 `DataCopy` 将数据从 Global Memory 搬运至 Local Memory。
+  - 调用指令级 API 如 `ReduceMax`, `ReduceSum`, `Subs`, `Exp`, `Muls` 进行矢量化计算。
+
+
+
+## 4. 环境要求
+- **昇腾 AI 处理器硬件** (或 Ascend 仿真环境)
+- **CANN 软件栈**: 推荐 7.0 及以上版本
+- **编译器**: `ascend-build` 或配套的 `g++` 交叉编译器
+
+
+
+---
+
+**课程项目说明**:
+- **学期**:2025-2026-1
+- **课程名称**:并行处理与体系结构
+- **作者** : 张云哲
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a2677622e1d7777e6357c925649fe5229eeb1b6
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,35 @@
+#include <iostream>
+#include <vector>
+#include <cstdint>
+#include "tikicpulib.h"
+#include "online_softmax_tiling.h"
+
+// CANN 7.0 simulation patch: supply an internal constant missing from the headers
+namespace AscendC {
+    #ifndef RESERVED_WORKSPACE
+    constexpr uint32_t RESERVED_WORKSPACE = 0;
+    #endif
+}
+
+extern "C" void online_softmax_kernel(uint8_t*, uint8_t*, uint8_t*);
+
+int main() {
+    uint32_t N = 1024;
+    uint32_t tileLen = 256;
+
+    OnlineSoftmaxTilingData tiling;
+    tiling.totalLength = N;
+    tiling.tileLength = tileLen;
+    tiling.tileNum = N / tileLen;
+
+    std::vector<float> x(N);
+    std::vector<float> y(N, 0.0f);
+    for (uint32_t i = 0; i < N; ++i) x[i] = static_cast<float>(i % 10);
+
+    std::cout << "Starting Simulation (CANN 7.0)..." << std::endl;
+    ICPU_RUN_KF(online_softmax_kernel, 1, (uint8_t*)x.data(), (uint8_t*)y.data(), (uint8_t*)&tiling);
+    std::cout << "Simulation Success!" << std::endl;
+
+    for (int i = 0; i < 5; ++i) std::cout << "y[" << i << "] = " << y[i] << std::endl;
+    return 0;
+}
diff --git a/online_softmax.cpp b/online_softmax.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3026a393afcc46f55981c841890cc55476e676d5
--- /dev/null
+++ b/online_softmax.cpp
@@ -0,0 +1,85 @@
+#include "kernel_operator.h"
+#include "online_softmax_tiling.h"
+
+using namespace AscendC;
+
+class KernelOnlineSoftmax {
+public:
+    __aicore__ inline KernelOnlineSoftmax() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, OnlineSoftmaxTilingData* tiling) {
+        this->totalLength = tiling->totalLength;
+        this->tileLength = tiling->tileLength;
+        this->tileNum = tiling->tileNum;
+
+        xGm.SetGlobalBuffer((__gm__ float*)x);
+        yGm.SetGlobalBuffer((__gm__ float*)y);
+
+        // Initialize UB (Unified Buffer) space; tmpBuffer holds two scratch regions
+        pipe.InitBuffer(inQueueX, 1, tileLength * sizeof(float));
+        pipe.InitBuffer(outQueueY, 1, tileLength * sizeof(float));
+        pipe.InitBuffer(tmpBuffer, 2 * tileLength * sizeof(float));
+    }
+
+    __aicore__ inline void Process() {
+        // Running global statistics for the online-softmax recurrence
+        float maxVal = -3.4028235e+38f;  // -FLT_MAX
+        float sumVal = 0.0f;
+
+        // Pass 1: compute the global max and the rescaled exp-sum over all tiles
+        for (uint32_t i = 0; i < tileNum; i++) {
+            LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+            DataCopy(xLocal, xGm[i * tileLength], tileLength);
+
+            LocalTensor<float> tmpMaxTensor = tmpBuffer.Get<float>();
+            ReduceMax(tmpMaxTensor, xLocal, tmpMaxTensor, tileLength);
+            float curTileMax = tmpMaxTensor.GetValue(0);
+
+            float oldMax = maxVal;
+            if (curTileMax > maxVal) maxVal = curTileMax;
+
+            // Rescale the previously accumulated sum when the running max grows
+            float scaleOld = exp(oldMax - maxVal);
+            LocalTensor<float> xExp = tmpBuffer.Get<float>()[tileLength];
+
+            // Vector-minus-scalar uses Subs
+            Subs(xExp, xLocal, maxVal, tileLength);
+            Exp(xExp, xExp, tileLength);
+
+            LocalTensor<float> tmpSumTensor = tmpBuffer.Get<float>();
+            ReduceSum(tmpSumTensor, xExp, tmpSumTensor, tileLength);
+            sumVal = sumVal * scaleOld + tmpSumTensor.GetValue(0);
+
+            inQueueX.FreeTensor(xLocal);
+        }
+
+        // Pass 2: normalize with the global statistics and write back
+        for (uint32_t i = 0; i < tileNum; i++) {
+            LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+            LocalTensor<float> yLocal = outQueueY.AllocTensor<float>();
+            DataCopy(xLocal, xGm[i * tileLength], tileLength);
+
+            Subs(xLocal, xLocal, maxVal, tileLength);
+            Exp(xLocal, xLocal, tileLength);
+            Muls(yLocal, xLocal, 1.0f / sumVal, tileLength);
+
+            DataCopy(yGm[i * tileLength], yLocal, tileLength);
+            inQueueX.FreeTensor(xLocal);
+            outQueueY.FreeTensor(yLocal);
+        }
+    }
+
+private:
+    TPipe pipe;
+    TQue<QuePosition::VECIN, 1> inQueueX;
+    TQue<QuePosition::VECOUT, 1> outQueueY;
+    TBuf<TPosition::VECCALC> tmpBuffer;
+    GlobalTensor<float> xGm, yGm;
+    uint32_t totalLength, tileLength, tileNum;
+};
+
+extern "C" __global__ __aicore__ void online_softmax_kernel(GM_ADDR x, GM_ADDR y, GM_ADDR tiling) {
+    OnlineSoftmaxTilingData* tilingData = (OnlineSoftmaxTilingData*)tiling;
+    KernelOnlineSoftmax op;
+    op.Init(x, y, tilingData);
+    op.Process();
+}
diff --git a/online_softmax_tiling.h b/online_softmax_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..4dbe1b49e28c3c5d7bebfd30f414e925d0cafbc
--- /dev/null
+++ b/online_softmax_tiling.h
@@ -0,0 +1,10 @@
+#ifndef ONLINE_SOFTMAX_TILING_H
+#define ONLINE_SOFTMAX_TILING_H
+#include <cstdint>
+
+struct OnlineSoftmaxTilingData {
+    uint32_t totalLength;
+    uint32_t tileLength;
+    uint32_t tileNum;
+};
+#endif