From 09092b704e6fcd55716c76766710144433453265 Mon Sep 17 00:00:00 2001 From: betty_bell Date: Tue, 10 Mar 2026 08:04:56 +0000 Subject: [PATCH] =?UTF-8?q?=E5=9B=9E=E9=80=80=20'Pull=20Request=20!4=20:?= =?UTF-8?q?=20update=20README.md.'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 57 ---------------------------- README.md | 58 +++------------------------- main.cpp | 35 ----------------- online_softmax.cpp | 84 ----------------------------------------- online_softmax_tiling.h | 10 ----- 5 files changed, 6 insertions(+), 238 deletions(-) delete mode 100644 CMakeLists.txt delete mode 100644 main.cpp delete mode 100644 online_softmax.cpp delete mode 100644 online_softmax_tiling.h diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index 4478eda..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,57 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(OnlineSoftmax_CPU) - -# 1. 设置 ASCEND_HOME -if(NOT DEFINED ENV{ASCEND_HOME}) - set(ASCEND_HOME "/usr/local/Ascend/ascend-toolkit/7.0.0") -else() - set(ASCEND_HOME $ENV{ASCEND_HOME}) -endif() - -# 2. 定义可执行文件 -add_executable(online_softmax_cpu main.cpp online_softmax.cpp) - -# 3. 核心包含路径:覆盖 CANN 7.0 TikCpp 的所有接口与实现目录 -target_include_directories(online_softmax_cpu PRIVATE - ${ASCEND_HOME}/include - ${ASCEND_HOME}/tools/tikicpulib/lib/include - - # 针对你 find 到的深度路径进行匹配 - ${ASCEND_HOME}/x86_64-linux/tikcpp/tikcfw # kernel_operator.h 所在 - ${ASCEND_HOME}/x86_64-linux/tikcpp/tikcfw/interface # kernel_common.h 所在 - ${ASCEND_HOME}/x86_64-linux/tikcpp/tikcfw/impl # kernel_utils.h 所在 (解决本次报错) - - # 其他标准路径 - ${ASCEND_HOME}/compiler/include - ${ASCEND_HOME}/runtime/include -) - -# 4. 关键仿真宏定义: -# __CCE_KT_TEST__ 是解决 g_coreType 冲突的金钥匙 -target_compile_definitions(online_softmax_cpu PRIVATE - ASCENDC_CPU_DEBUG - __CCE_KT_TEST__ # 必须定义,用于跳过 NPU 硬件常量的重定义 - __TIC_V200__ # 定义架构,解锁 AddIntrinsicsImpl 等底层指令 - __TIC_CORE_MIX__ # 混合核架构支持 -) - -# 5. 编译选项 -target_compile_options(online_softmax_cpu PRIVATE - -std=c++17 - -fPIC - -fpermissive # 允许忽略头文件中的细微语法瑕疵 - -Wno-attributes # 忽略 NPU 专有属性警告 -) - -# 6. 库搜索与链接 -target_link_directories(online_softmax_cpu PRIVATE - ${ASCEND_HOME}/tools/tikicpulib/lib - ${ASCEND_HOME}/compiler/lib - ${ASCEND_HOME}/runtime/lib64 -) - -target_link_libraries(online_softmax_cpu PRIVATE - tikicpulib_sim - ascendl2_simulator - pthread dl rt -) diff --git a/README.md b/README.md index 5aee18f..055353f 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,10 @@ +# AscendC_2025_2026_1 -# Ascend C 算法开发作业 +#### 介绍 +本仓库为2025-2026-1并行处理与体系结构课程Ascend C算法开发案例库,由课程学生结合个人研究方向开发的案例代码。 -本作业尝试实现在华为昇腾 NPU 上利用 Ascend C 编程语言(算子开发)实现高效的 Softmax 计算。该案例结合了并行处理理论与昇腾 AI 处理器的硬件特性,采用 **Tiling(分块)** 技术解决大算力场景下的片上内存(UB)限制问题。 -## 1. 背景介绍 -### 1.1 算法原理 -Softmax 算子通常定义为: -$$y_i = \frac{e^{x_i - \max(X)}}{\sum_{j} e^{x_j - \max(X)}}$$ -其中减去 $\max(X)$ 是为了防止指数运算溢出。 +#### 提交说明 +请先fork本仓库,然后在自己的仓库中提交代码和markdown文档后,提交Pull Requests到本仓库。具体操作方法请参考以下链接: +https://gitee.com/betty-bell/programming2025 -**Online Softmax** 是一种优化方案(常用于 FlashAttention 等),旨在减少内存读写次数。在本案例中,我们实现了分块化的处理流程: -1. **第一遍扫描 (Pass 1)**:遍历所有分块,提取全局最大值 $max$ 和全局指数和 $sum$。 -2. **第二遍扫描 (Pass 2)**:利用全局统计量进行归一化计算并写回结果。 - -### 1.2 课程关联 -本案例体现了《并行处理与体系结构》中的核心概念: -- **数据并行**:利用向量计算单元(Vector Core)同时处理多个浮点数。 -- **存储层次结构优化**:管理 Global Memory (DDR) 与 Local Memory (Unified Buffer) 之间的数据搬运。 -- **流水线(Pipeline)**:通过 `TQue` 和 `TPipe` 实现搬运与计算的重叠。 - -## 2. 代码结构 -```bash -. -├── online_softmax_tiling.h # 结构定义:定义 Host 与 Kernel 传递的 Tiling 参数 -├── online_softmax_kernel.cpp # 核函数实现:基于 Ascend C 的类封装实现 -└── main.cpp # 仿真测试:CPU 端验证逻辑与调用模拟器 -``` - -## 3. 核心实现细节 - -### 3.1 Tiling 策略 -为了处理超出片上内存(UB)大小的数据,我们将 $N$ 长度的向量划分为多个 `tileLength` 大小的块。通过 `OnlineSoftmaxTilingData` 结构体在 Host 端计算分块信息并传递至 NPU。 - -### 3.2 算子类封装 -采用 `KernelOnlineSoftmax` 类进行封装,主要包含: -- **Init()**: 初始化 `TPipe` 内存池,设置输入输出队列(TQue)。 -- **Process()**: - - 使用 `DataCopy` 将数据从 Global Memory 搬运至 Local Memory。 - - 调用指令级 API 如 `ReduceMax`, `ReduceSum`, `Subs`, `Exp`, `Muls` 进行矢量化计算。 - - - -## 4. 环境要求 -- **昇腾 AI 处理器硬件** (或 Ascend 仿真环境) -- **CANN 软件栈**: 推荐 7.0 及以上版本 -- **编译器**: `ascend-build` 或配套的 `g++` 交叉编译器 - - - ---- - -**课程项目说明**: -- **学期**:2025-2026-1 -- **课程名称**:并行处理与体系结构 -- **作者** : 张云哲 diff --git a/main.cpp b/main.cpp deleted file mode 100644 index 0a26776..0000000 --- a/main.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include -#include -#include -#include "tikicpulib.h" -#include "online_softmax_tiling.h" - -// 7.0 仿真补丁:补充丢失的内部常量 -namespace AscendC { - #ifndef RESERVED_WORKSPACE - constexpr uint32_t RESERVED_WORKSPACE = 0; - #endif -} - -extern "C" void online_softmax_kernel(uint8_t*, uint8_t*, uint8_t*); - -int main() { - uint32_t N = 1024; - uint32_t tileLen = 256; - - OnlineSoftmaxTilingData tiling; - tiling.totalLength = N; - tiling.tileLength = tileLen; - tiling.tileNum = N / tileLen; - - std::vector x(N); - std::vector y(N, 0.0f); - for (int i = 0; i < N; ++i) x[i] = static_cast(i % 10); - - std::cout << "Starting Simulation (CANN 7.0)..." << std::endl; - ICPU_RUN_KF(online_softmax_kernel, 1, (uint8_t*)x.data(), (uint8_t*)y.data(), (uint8_t*)&tiling); - std::cout << "Simulation Success!" << std::endl; - - for (int i = 0; i < 5; ++i) std::cout << "y[" << i << "] = " << y[i] << std::endl; - return 0; -} diff --git a/online_softmax.cpp b/online_softmax.cpp deleted file mode 100644 index 3026a39..0000000 --- a/online_softmax.cpp +++ /dev/null @@ -1,84 +0,0 @@ -#include "kernel_operator.h" -#include "online_softmax_tiling.h" - -using namespace AscendC; - -class KernelOnlineSoftmax { -public: - __aicore__ inline KernelOnlineSoftmax() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, OnlineSoftmaxTilingData* tiling) { - this->totalLength = tiling->totalLength; - this->tileLength = tiling->tileLength; - this->tileNum = tiling->tileNum; - - xGm.SetGlobalBuffer((__gm__ float*)x); - yGm.SetGlobalBuffer((__gm__ float*)y); - - // 初始化 UB 空间 - pipe.InitBuffer(inQueueX, 1, tileLength * sizeof(float)); - pipe.InitBuffer(outQueueY, 1, tileLength * sizeof(float)); - pipe.InitBuffer(tmpBuffer, 2, tileLength * sizeof(float)); - } - - __aicore__ inline void Process() { - float maxVal = -3.4028235e+38f; - float sumVal = 0.0f; - - // Pass 1: 计算全局 Max 和 Sum - for (uint32_t i = 0; i < tileNum; i++) { - LocalTensor xLocal = inQueueX.AllocTensor(); - DataCopy(xLocal, xGm[i * tileLength], tileLength); - - LocalTensor tmpMaxTensor = tmpBuffer.Get(0); - ReduceMax(tmpMaxTensor, xLocal, tmpMaxTensor, tileLength); - float curTileMax = tmpMaxTensor.GetValue(0); - - float oldMax = maxVal; - if (curTileMax > maxVal) maxVal = curTileMax; - - float scaleOld = exp(oldMax - maxVal); - LocalTensor xExp = tmpBuffer.Get(1); - - // --- 修正:向量减标量使用 Subs --- - Subs(xExp, xLocal, maxVal, tileLength); - Exp(xExp, xExp, tileLength); - - LocalTensor tmpSumTensor = tmpBuffer.Get(0); - ReduceSum(tmpSumTensor, xExp, tmpSumTensor, tileLength); - sumVal = sumVal * scaleOld + tmpSumTensor.GetValue(0); - - inQueueX.FreeTensor(xLocal); - } - - // Pass 2: 归一化并写回 - for (uint32_t i = 0; i < tileNum; i++) { - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = outQueueY.AllocTensor(); - DataCopy(xLocal, xGm[i * tileLength], tileLength); - - // --- 修正:向量减标量使用 Subs --- - Subs(xLocal, xLocal, maxVal, tileLength); - Exp(xLocal, xLocal, tileLength); - Muls(yLocal, xLocal, 1.0f / sumVal, tileLength); - - DataCopy(yGm[i * tileLength], yLocal, tileLength); - inQueueX.FreeTensor(xLocal); - outQueueY.FreeTensor(yLocal); - } - } - -private: - TPipe pipe; - TQue inQueueX; - TQue outQueueY; - TBuf tmpBuffer; - GlobalTensor xGm, yGm; - uint32_t totalLength, tileLength, tileNum; -}; - -extern "C" __global__ __aicore__ void online_softmax_kernel(GM_ADDR x, GM_ADDR y, GM_ADDR tiling) { - OnlineSoftmaxTilingData* tilingData = (OnlineSoftmaxTilingData*)tiling; - KernelOnlineSoftmax op; - op.Init(x, y, tilingData); - op.Process(); -} diff --git a/online_softmax_tiling.h b/online_softmax_tiling.h deleted file mode 100644 index 4dbe1b4..0000000 --- a/online_softmax_tiling.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef ONLINE_SOFTMAX_TILING_H -#define ONLINE_SOFTMAX_TILING_H -#include - -struct OnlineSoftmaxTilingData { - uint32_t totalLength; - uint32_t tileLength; - uint32_t tileNum; -}; -#endif -- Gitee