diff --git a/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal.cpp b/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4ce12680427a68108ee4f244814f548a04b27427 --- /dev/null +++ b/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal.cpp @@ -0,0 +1,33 @@ +/* + * SPDX-License-Identifier: MIT + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * Description: Normal combine function device implementation file + * Create: 2025-11-28 + * Note: + * History: 2025-11-28 create normal combine function file in device part + */ + +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "cam_moe_combine_normal.h" +#include "cam_moe_combine_normal_tiling.h" +using namespace AscendC; +using namespace CamMoeCombineNormalImpl; + +extern "C" __global__ __aicore__ void cam_moe_combine_normal(GM_ADDR recvX, GM_ADDR tokenSrcInfo, GM_ADDR epRecvCount, + GM_ADDR topkWeights, GM_ADDR tpRecvCount, GM_ADDR XOut, + GM_ADDR sendCostStatsOut, GM_ADDR workspaceGM, + GM_ADDR tilingGM) + +{ + REGISTER_TILING_DEFAULT(CamMoeCombineNormalTilingData); + TPipe pipe; + +#if (ORIG_DTYPE_RECV_X == DT_BF16 || ORIG_DTYPE_RECV_X == DT_FLOAT16) + GET_TILING_DATA_WITH_STRUCT(CamMoeCombineNormalTilingData, tilingData, tilingGM); + CamMoeCombineNormal op; + op.Init(recvX, tokenSrcInfo, epRecvCount, topkWeights, tpRecvCount, XOut, sendCostStatsOut, workspaceGM, &pipe, + &tilingData); + op.Process(); +#endif +} diff --git a/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal.h b/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal.h new file mode 100644 index 0000000000000000000000000000000000000000..04b56390e5f6f904c02afb1b8d177e28210bfedf --- /dev/null +++ b/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal.h @@ -0,0 +1,432 @@ +/* + * SPDX-License-Identifier: MIT + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * Description: Normal combine function device header file + * Create: 2025-11-28 + * Note: + * History: 2025-11-28 create normal combine header file in device part + */ + +#ifndef CAM_MOE_COMBINE_NORMAL_H +#define CAM_MOE_COMBINE_NORMAL_H + +#include "kernel_operator.h" +#include "kernel_tiling/kernel_tiling.h" +#include "moe_distribute_base.h" +#include "cam_moe_combine_normal_tiling.h" +#include "comm_args.h" + +namespace CamMoeCombineNormalImpl { +constexpr uint32_t RANK_ID_OFFSET_IN_SRC_INFO = 0U; +constexpr uint32_t TOKEN_IDX_OFFSET_IN_SRC_INFO = 1U; +constexpr uint32_t TOPK_IDX_OFFSET_IN_SRC_INFO = 2U; +constexpr uint64_t COMBINE_STATE_WIN_OFFSET = 3UL * 1024UL * 1024UL; +constexpr uint64_t MAGIC_WIN_OFFSET = 975UL * 1024UL; +constexpr uint32_t TOKEN_SRC_INFO_LEN = 3U; +constexpr uint32_t UB_32_ALIGN = 32U; +constexpr uint32_t MUL_256_ALIGN = 256U; +constexpr uint64_t WIN_512_ALIGN = 512UL; +constexpr uint32_t FLOAT_NUM_PER_ALIGN = 8U; +constexpr uint8_t DOUBLE_BUFFER = 2; +constexpr int64_t CYCLE_TO_TIME = 50; // cycle num is converted into a fixed base unit of time, set at 50 + +template +__aicore__ inline void SyncFunc() +{ + int32_t eventID = static_cast(GetTPipePtr()->FetchEventID(event)); + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); +} + +#define TemplateMC2TypeClass typename RecvXType, typename XType, typename SrcInfoType +#define TemplateMC2TypeFunc RecvXType, XType, SrcInfoType + +using namespace AscendC; +template +class CamMoeCombineNormal +{ +public: + __aicore__ inline CamMoeCombineNormal(){}; + __aicore__ inline void Init(GM_ADDR recvX, GM_ADDR tokenSrcInfo, GM_ADDR epRecvCount, GM_ADDR topkWeights, + GM_ADDR tpRecvCount, GM_ADDR XOut, GM_ADDR sendCostStatsOut, GM_ADDR workspaceGM, + TPipe *pipe, const CamMoeCombineNormalTilingData *tilingData); + __aicore__ inline void Process(); + +private: + __aicore__ inline void InitMagic(); + __aicore__ inline void InitGlobalBuffer(GM_ADDR recvX, GM_ADDR tokenSrcInfo, GM_ADDR epRecvCount, + GM_ADDR topkWeights, GM_ADDR XOut, GM_ADDR sendCostStatsOut); + __aicore__ inline void InitTilingData(const CamMoeCombineNormalTilingData *tilingData); + __aicore__ inline void InitBuffLen(); + __aicore__ inline void CopyBufferToShareAndSetStatus(); + __aicore__ inline void CopyBufferToShare(uint32_t srcRankId, uint32_t srcTokenId, uint32_t srcTopkId, + uint32_t tkIndex); + __aicore__ inline void ReadBufferFromRemote(); + __aicore__ inline void WaitBuffCopy(uint32_t tokenIndex); + __aicore__ inline void SetStatusBySrcInfo(uint32_t srcRankId, uint32_t srcTokenId, uint32_t srcTopkId); + __aicore__ inline void ReadBufferAndWeightedSum(uint32_t tokenIndex, uint32_t startTokenIndex); + + __aicore__ GM_ADDR GetStateAddrByRankId(const int32_t rankId) + { + GM_ADDR bufferAddr; + if (epRankId_ == rankId) { + bufferAddr = (GM_ADDR)epWinContext_->localWindowsIn; + } else { + bufferAddr = (GM_ADDR)((HcclRankRelationResV2 *)epWinContext_->remoteRes[rankId].nextDevicePtr)->windowsIn; + } + return (GM_ADDR)(bufferAddr + winDataSizeOffset_ + Moe::NOTIFY_DISPATCH_BUFF_OFFSET); + } + + __aicore__ GM_ADDR GetBufferAddrByRankId(const int32_t rankId) + { + return GetStateAddrByRankId(rankId) + COMBINE_STATE_WIN_OFFSET; + } + + __aicore__ inline void SplitCoreCal(uint32_t totalNum, uint32_t &perCoreNum, uint32_t &startIdx, uint32_t &endIdx) + { + perCoreNum = totalNum / aivNum_; + uint32_t remainderRankNum = totalNum % aivNum_; + + startIdx = perCoreNum * coreIdx_; + if (coreIdx_ < remainderRankNum) { + perCoreNum++; + startIdx += coreIdx_; + } else { + startIdx += remainderRankNum; + } + endIdx = startIdx + perCoreNum; + } + + __gm__ HcclOpResParam *epWinContext_{nullptr}; + __gm__ HcclOpResParam *tpWinContext_{nullptr}; + uint32_t axisBS_{0}; + uint32_t axisH_{0}; + uint32_t axisK_{0}; + uint32_t aivNum_{0}; + uint32_t epWorldSize_{0}; + uint32_t epRankId_{0}; + uint32_t coreIdx_{0}; + uint32_t moeExpertNum_{0}; + uint32_t moeExpertPerRankNum_{0}; + uint32_t magic_{0}; + uint64_t winDataSizeOffset_{0}; + uint32_t selfSendCnt_{0}; + uint32_t hRecvXTypeLen_{0}; + uint32_t h32AlignFloatLen_{0}; + uint32_t h256AlignFloatLen_{0}; + uint32_t h32AlignRecvXLen_{0}; + uint32_t h512AlignRecvXLen_{0}; + uint32_t sendCostStatsBufSize_{0}; + + bool isEnableDiagnose_{false}; + + TPipe *tpipe_{nullptr}; + TQue weightedSumQueue_; + TQue sendCostStatsOutQueue_; + TQueBind localCopyQueue_; + TBuf<> stateBuf_; + TBuf<> topkWeightsBuf_; + TBuf<> tokenFloatBuf_; + TBuf<> sumFloatBuf_; + TBuf<> weightedMulBuf_; + TBuf<> srcInfoBuf_; + TBuf<> xOutBuf_; + TBuf<> tempStateBuf_; + + GlobalTensor recvXGM_; + GlobalTensor tokenSrcInfoGM_; + GlobalTensor epRecvCountGM_; + GlobalTensor topkWeightsGM_; + GlobalTensor xOutGlobal_; + GlobalTensor sendCostStatsGT_; + GM_ADDR localRankGM_; + GM_ADDR workspaceGM_; +}; + +template +__aicore__ inline void CamMoeCombineNormal::InitMagic() +{ + auto contextGM0 = AscendC::GetHcclContext(); + epWinContext_ = (__gm__ HcclOpResParam *)contextGM0; + + GlobalTensor selfMagicTensor; + selfMagicTensor.SetGlobalBuffer( + (__gm__ int32_t *)((GM_ADDR)epWinContext_->localWindowsExp + MAGIC_WIN_OFFSET + coreIdx_ * WIN_512_ALIGN)); + DataCacheCleanAndInvalid(selfMagicTensor); + magic_ = selfMagicTensor(0); + selfMagicTensor(0) = ((magic_ == 0) ? 1 : 0); + DataCacheCleanAndInvalid(selfMagicTensor); +} + +template +__aicore__ inline void CamMoeCombineNormal::InitGlobalBuffer(GM_ADDR recvX, GM_ADDR tokenSrcInfo, + GM_ADDR epRecvCount, + GM_ADDR topkWeights, GM_ADDR XOut, + GM_ADDR sendCostStatsOut) +{ + recvXGM_.SetGlobalBuffer((__gm__ RecvXType *)recvX); + tokenSrcInfoGM_.SetGlobalBuffer((__gm__ SrcInfoType *)tokenSrcInfo); + epRecvCountGM_.SetGlobalBuffer((__gm__ int32_t *)epRecvCount); + topkWeightsGM_.SetGlobalBuffer((__gm__ float *)topkWeights); + xOutGlobal_.SetGlobalBuffer((__gm__ XType *)XOut); + if (isEnableDiagnose_) { + sendCostStatsGT_.SetGlobalBuffer((__gm__ int32_t *)sendCostStatsOut); + } +} + +template +__aicore__ inline void +CamMoeCombineNormal::InitTilingData(const CamMoeCombineNormalTilingData *tilingData) +{ + axisBS_ = tilingData->camMoeCombineNormalInfo.bs; + axisH_ = tilingData->camMoeCombineNormalInfo.h; + axisK_ = tilingData->camMoeCombineNormalInfo.k; + aivNum_ = tilingData->camMoeCombineNormalInfo.aivNum; + moeExpertNum_ = tilingData->camMoeCombineNormalInfo.moeExpertNum; + moeExpertPerRankNum_ = tilingData->camMoeCombineNormalInfo.moeExpertPerRankNum; + epWorldSize_ = tilingData->camMoeCombineNormalInfo.epWorldSize; + epRankId_ = tilingData->camMoeCombineNormalInfo.epRankId; + isEnableDiagnose_ = tilingData->camMoeCombineNormalInfo.isEnableDiagnose; +} + +template +__aicore__ inline void CamMoeCombineNormal::InitBuffLen() +{ + uint32_t hFloatSize = axisH_ * static_cast(sizeof(float)); + h32AlignFloatLen_ = Ceil(hFloatSize, UB_32_ALIGN) * UB_32_ALIGN; + h256AlignFloatLen_ = Ceil(hFloatSize, MUL_256_ALIGN) * MUL_256_ALIGN; + hRecvXTypeLen_ = axisH_ * sizeof(RecvXType); + h32AlignRecvXLen_ = Ceil(hRecvXTypeLen_, UB_32_ALIGN) * UB_32_ALIGN; + h512AlignRecvXLen_ = Ceil(hRecvXTypeLen_, WIN_512_ALIGN) * WIN_512_ALIGN; + if (isEnableDiagnose_) { + sendCostStatsBufSize_ = Ceil(epWorldSize_ * sizeof(int32_t), UB_32_ALIGN) * UB_32_ALIGN; + } +} + +template +__aicore__ inline void CamMoeCombineNormal::Init( + GM_ADDR recvX, GM_ADDR tokenSrcInfo, GM_ADDR epRecvCount, GM_ADDR topkWeights, GM_ADDR tpRecvCount, GM_ADDR XOut, + GM_ADDR sendCostStatsOut, GM_ADDR workspaceGM, TPipe *pipe, const CamMoeCombineNormalTilingData *tilingData) +{ + workspaceGM_ = workspaceGM; + tpipe_ = pipe; + coreIdx_ = GetBlockIdx(); + + InitMagic(); + InitTilingData(tilingData); + InitGlobalBuffer(recvX, tokenSrcInfo, epRecvCount, topkWeights, XOut, sendCostStatsOut); + InitBuffLen(); + + PipeBarrier(); + winDataSizeOffset_ = static_cast(magic_) * (tilingData->camMoeCombineNormalInfo.totalWinSize / 2UL); + localRankGM_ = GetBufferAddrByRankId(epRankId_); + DataCacheCleanAndInvalid( + epRecvCountGM_[moeExpertNum_ - 1]); + selfSendCnt_ = epRecvCountGM_(moeExpertNum_ - 1); +} + +template +__aicore__ inline void CamMoeCombineNormal::CopyBufferToShareAndSetStatus() +{ + PipeBarrier(); + uint32_t perBlockSendNum = 0, startTokenId = 0, endTokenId = 0; + SplitCoreCal(selfSendCnt_, perBlockSendNum, startTokenId, endTokenId); + if (perBlockSendNum == 0U) { + return; + } + + uint32_t blockLen = static_cast(perBlockSendNum * TOKEN_SRC_INFO_LEN * sizeof(uint32_t)); + tpipe_->Reset(); + tpipe_->InitBuffer(stateBuf_, UB_32_ALIGN); + tpipe_->InitBuffer(localCopyQueue_, DOUBLE_BUFFER, h32AlignRecvXLen_); + tpipe_->InitBuffer(srcInfoBuf_, blockLen); + LocalTensor statusTensor = stateBuf_.AllocTensor(); + Duplicate(statusTensor, 0x3F800000, FLOAT_NUM_PER_ALIGN); + + LocalTensor srcInfoLocal = srcInfoBuf_.Get(); + const DataCopyExtParams dataCopyParams{1U, blockLen, 0U, 0U, 0U}; + const DataCopyPadExtParams padParams{false, 0U, 0U, 0U}; + DataCopyPad(srcInfoLocal, tokenSrcInfoGM_[startTokenId * TOKEN_SRC_INFO_LEN], dataCopyParams, padParams); + SyncFunc(); + + LocalTensor sendCostStatsTensor; + if (isEnableDiagnose_) { + tpipe_->InitBuffer(sendCostStatsOutQueue_, DOUBLE_BUFFER, sendCostStatsBufSize_); + sendCostStatsTensor = sendCostStatsOutQueue_.AllocTensor(); + Duplicate(sendCostStatsTensor, 0, sendCostStatsBufSize_ / sizeof(int32_t)); + } + + for (uint32_t tokenIndex = startTokenId; tokenIndex < endTokenId; tokenIndex++) { + uint32_t index = (tokenIndex - startTokenId) * TOKEN_SRC_INFO_LEN; + uint32_t srcRankId = static_cast(srcInfoLocal(index + RANK_ID_OFFSET_IN_SRC_INFO)); + uint32_t srcTokenId = static_cast(srcInfoLocal(index + TOKEN_IDX_OFFSET_IN_SRC_INFO)); + uint32_t srcTopkId = static_cast(srcInfoLocal(index + TOPK_IDX_OFFSET_IN_SRC_INFO)); + int64_t sendStartCycle = GetSystemCycle(); + + CopyBufferToShare(srcRankId, srcTokenId, srcTopkId, tokenIndex); + PipeBarrier(); + SetStatusBySrcInfo(srcRankId, srcTokenId, srcTopkId); + + if (isEnableDiagnose_) { + SyncFunc(); + int32_t durationTime = static_cast((GetSystemCycle() - sendStartCycle) / CYCLE_TO_TIME); // us + int32_t preTime = sendCostStatsTensor.GetValue(srcRankId); + sendCostStatsTensor.SetValue(srcRankId, preTime + durationTime); + } + } + + if (isEnableDiagnose_) { + SyncFunc(); + AscendC::SetAtomicAdd(); + DataCopyExtParams statsCopyOutParams = {1U, static_cast(epWorldSize_ * sizeof(int32_t)), 0U, 0U, 0U}; + DataCopyPad(sendCostStatsGT_, sendCostStatsTensor, statsCopyOutParams); + AscendC::SetAtomicNone(); + sendCostStatsOutQueue_.FreeTensor(sendCostStatsTensor); + } + + SyncFunc(); +} + +template +__aicore__ inline void CamMoeCombineNormal::CopyBufferToShare(uint32_t srcRankId, + uint32_t srcTokenId, + uint32_t srcTopkId, uint32_t tkIndex) +{ + uint32_t tokenOffset = tkIndex * axisH_; + GM_ADDR dstGM = GetBufferAddrByRankId(srcRankId) + (srcTokenId * axisK_ + srcTopkId) * h512AlignRecvXLen_; + GlobalTensor dstWindow; + dstWindow.SetGlobalBuffer((__gm__ XType *)dstGM); + DataCopyExtParams xOutCopyParams{1U, static_cast(hRecvXTypeLen_), 0U, 0U, 0U}; + DataCopyPadExtParams copyPadExtParams{false, 0U, 0U, 0U}; + + LocalTensor localCopyTensor; + localCopyTensor = localCopyQueue_.AllocTensor(); + DataCopyPad(localCopyTensor, recvXGM_[tokenOffset], xOutCopyParams, copyPadExtParams); + localCopyQueue_.EnQue(localCopyTensor); + localCopyTensor = localCopyQueue_.DeQue(); + DataCopyPad(dstWindow, localCopyTensor, xOutCopyParams); + localCopyQueue_.FreeTensor(localCopyTensor); +} + +template +__aicore__ inline void CamMoeCombineNormal::SetStatusBySrcInfo(uint32_t srcRankId, + uint32_t srcTokenId, + uint32_t srcTopkId) +{ + LocalTensor statusTensor = stateBuf_.AllocTensor(); + GM_ADDR stateGM = GetStateAddrByRankId(srcRankId) + (srcTokenId * axisK_ + srcTopkId) * UB_32_ALIGN; + GlobalTensor stateGMTensor; + stateGMTensor.SetGlobalBuffer((__gm__ uint32_t *)stateGM); + DataCopy(stateGMTensor, statusTensor, FLOAT_NUM_PER_ALIGN); +} + +template +__aicore__ inline void CamMoeCombineNormal::WaitBuffCopy(uint32_t tokenIndex) +{ + uint32_t calCount = axisK_ * FLOAT_NUM_PER_ALIGN; + GM_ADDR stateGM = GetStateAddrByRankId(epRankId_) + tokenIndex * axisK_ * UB_32_ALIGN; // 计算地址偏移 + GlobalTensor stateGMTensor; + stateGMTensor.SetGlobalBuffer((__gm__ float *)stateGM); + float current = (float)0.0; + float target = (float)1.0 * axisK_ * FLOAT_NUM_PER_ALIGN; + SumParams sumPerKParams{1, calCount, calCount}; + LocalTensor stateTensorLocal = stateBuf_.Get(); + LocalTensor tempStateTensorLocal = tempStateBuf_.Get(); + while (current != target) { + SyncFunc(); + DataCopy(stateTensorLocal, stateGMTensor, calCount); + SyncFunc(); + Sum(tempStateTensorLocal, stateTensorLocal, sumPerKParams); + SyncFunc(); + current = tempStateTensorLocal(0); + } + SyncFunc(); + Duplicate(tempStateTensorLocal, (float)0.0, calCount); + SyncFunc(); + DataCopy(stateGMTensor, tempStateTensorLocal, calCount); +} + +template +__aicore__ inline void CamMoeCombineNormal::ReadBufferAndWeightedSum(uint32_t tokenIndex, + uint32_t startTokenIndex) +{ + LocalTensor tokenFloatLocal = tokenFloatBuf_.Get(); + LocalTensor weightedMulBufLocal = weightedMulBuf_.Get(); + LocalTensor sumFloatBufLocal = sumFloatBuf_.Get(); + LocalTensor topkWeightsLocal = topkWeightsBuf_.Get(); + LocalTensor stateTensorLocal = stateBuf_.Get(); + Duplicate(sumFloatBufLocal, static_cast(0), axisH_); + const DataCopyExtParams xOutCopyParams{1U, static_cast(hRecvXTypeLen_), 0U, 0U, 0U}; + + for (uint32_t topkId = 0U; topkId < axisK_; topkId++) { + float scale = topkWeightsLocal.GetValue((tokenIndex - startTokenIndex) * axisK_ + topkId); + GM_ADDR localTokenAddr = localRankGM_ + (tokenIndex * axisK_ + topkId) * h512AlignRecvXLen_; + GlobalTensor localTokenTensor; + localTokenTensor.SetGlobalBuffer((__gm__ XType *)localTokenAddr); + + LocalTensor tmpToken = weightedSumQueue_.AllocTensor(); + const DataCopyPadExtParams copyPadExtParams{false, 0U, 0U, 0U}; + DataCopyPad(tmpToken, localTokenTensor, xOutCopyParams, copyPadExtParams); + weightedSumQueue_.EnQue(tmpToken); + tmpToken = weightedSumQueue_.DeQue(); + Cast(tokenFloatLocal, tmpToken, AscendC::RoundMode::CAST_NONE, axisH_); + PipeBarrier(); + AscendC::Muls(weightedMulBufLocal, tokenFloatLocal, scale, axisH_); + PipeBarrier(); + AscendC::Add(sumFloatBufLocal, sumFloatBufLocal, weightedMulBufLocal, axisH_); + weightedSumQueue_.FreeTensor(tmpToken); + } + PipeBarrier(); + LocalTensor xOutLocal = xOutBuf_.Get(); + Cast(xOutLocal, sumFloatBufLocal, AscendC::RoundMode::CAST_RINT, axisH_); + SyncFunc(); + DataCopyPad(xOutGlobal_[tokenIndex * axisH_], xOutLocal, xOutCopyParams); +} + +template +__aicore__ inline void CamMoeCombineNormal::ReadBufferFromRemote() +{ + if (axisBS_ == 0U) { + return; + } + uint32_t tokenPerBlock = 0U, startTokenIndex = 0U, endTokenIndex = 0U; + SplitCoreCal(axisBS_, tokenPerBlock, startTokenIndex, endTokenIndex); + + if (tokenPerBlock == 0U) { + return; + } + + tpipe_->Reset(); + tpipe_->InitBuffer(xOutBuf_, h32AlignRecvXLen_); + tpipe_->InitBuffer(tokenFloatBuf_, h32AlignFloatLen_); + tpipe_->InitBuffer(weightedMulBuf_, h256AlignFloatLen_); + tpipe_->InitBuffer(sumFloatBuf_, h32AlignFloatLen_); + tpipe_->InitBuffer(weightedSumQueue_, DOUBLE_BUFFER, h32AlignRecvXLen_); + tpipe_->InitBuffer(stateBuf_, (axisK_)*UB_32_ALIGN); + tpipe_->InitBuffer(tempStateBuf_, (axisK_)*UB_32_ALIGN); + tpipe_->InitBuffer(topkWeightsBuf_, tokenPerBlock * axisK_ * sizeof(float)); + + LocalTensor topkWeightsLocal = topkWeightsBuf_.Get(); + const DataCopyExtParams bskParams{1U, static_cast(tokenPerBlock * axisK_ * sizeof(float)), 0U, 0U, 0U}; + const DataCopyPadExtParams copyPadFloatParams{false, 0U, 0U, 0U}; + DataCopyPad(topkWeightsLocal, topkWeightsGM_[startTokenIndex * axisK_], bskParams, copyPadFloatParams); + SyncFunc(); + + for (uint32_t tokenIndex = startTokenIndex; tokenIndex < endTokenIndex; tokenIndex++) { + WaitBuffCopy(tokenIndex); + SyncFunc(); // 与结果搬出datacopy同tensor + ReadBufferAndWeightedSum(tokenIndex, startTokenIndex); + } +} + +template +__aicore__ inline void CamMoeCombineNormal::Process() +{ + if ASCEND_IS_AIV { // 全aiv处理 + CopyBufferToShareAndSetStatus(); + ReadBufferFromRemote(); + } +} + +} // namespace CamMoeCombineNormalImpl +#endif // MOE_COMBINE_IMPL_H diff --git a/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal_tiling.h b/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..49b7cafecebaa82372664f44d6f985a28ed04a16 --- /dev/null +++ b/src/cam/comm_operator/ascend_kernels/moe_combine_normal/op_kernel/cam_moe_combine_normal_tiling.h @@ -0,0 +1,42 @@ +/* + * SPDX-License-Identifier: MIT + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * Description: Normal combine tiling header file + * Create: 2025-11-28 + * Note: + * History: 2025-11-28 create normal combine tiling header file + */ + +#ifndef CAM_MOE_COMBINE_NORMAL_TILING_H +#define CAM_MOE_COMBINE_NORMAL_TILING_H + +#include +#include "kernel_tiling/kernel_tiling.h" + +struct CamMoeCombineNormalInfo { + uint32_t epWorldSize; + uint32_t tpWorldSize; + uint32_t epRankId; + uint32_t tpRankId; + uint32_t expertShardType; + uint32_t moeExpertNum; + uint32_t moeExpertPerRankNum; + uint32_t globalBs; + uint32_t bs; + uint32_t k; + uint32_t h; + uint32_t aivNum; + uint64_t totalUbSize; + uint64_t totalWinSize; + float armAvgFactor; + float epsilon; + bool isEnableDiagnose; +}; +struct CamMoeCombineNormalTilingData { + Mc2InitTiling mc2InitTiling; + Mc2CcTiling mc2CcTiling1; + Mc2CcTiling mc2CcTiling2; + CamMoeCombineNormalInfo camMoeCombineNormalInfo; +}; + +#endif // CAM_MOE_COMBINE_NORMAL_TILING_H