From 148f1302a5a94fd4c1dc22992c69a297d576eb57 Mon Sep 17 00:00:00 2001 From: caoyujuan Date: Wed, 3 Sep 2025 15:41:05 +0800 Subject: [PATCH] FusedDeepMoe --- ccsrc/ops/ascendc/CMakeLists.txt | 2 +- .../ascendc/fused_deep_moe/fused_deep_moe.cc | 106 ++++++++++++++++++ yaml/ascendc/fused_deep_moe_op.yaml | 47 ++++++++ yaml/doc/fused_deep_moe.md | 106 ++++++++++++++++++ 4 files changed, 260 insertions(+), 1 deletion(-) create mode 100644 ccsrc/ops/ascendc/fused_deep_moe/fused_deep_moe.cc create mode 100644 yaml/ascendc/fused_deep_moe_op.yaml create mode 100644 yaml/doc/fused_deep_moe.md diff --git a/ccsrc/ops/ascendc/CMakeLists.txt b/ccsrc/ops/ascendc/CMakeLists.txt index f76f27a9..4429cffd 100644 --- a/ccsrc/ops/ascendc/CMakeLists.txt +++ b/ccsrc/ops/ascendc/CMakeLists.txt @@ -23,7 +23,7 @@ set(ASCENDC_SRC_FILES ${SRC_FILES} PARENT_SCOPE) set(OP_COMPILER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/../../../scripts/op_compiler.py") -if(SRC_FILES) +if(ASCENDC_OP_DIRS) include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/compile_ascendc_ops.cmake) endif() diff --git a/ccsrc/ops/ascendc/fused_deep_moe/fused_deep_moe.cc b/ccsrc/ops/ascendc/fused_deep_moe/fused_deep_moe.cc new file mode 100644 index 00000000..cd1c2770 --- /dev/null +++ b/ccsrc/ops/ascendc/fused_deep_moe/fused_deep_moe.cc @@ -0,0 +1,106 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// ============================================================================= +// GRAPH MODE IMPLEMENTATION +// ============================================================================= + +#include "ascendc_kernel_mod.h" +#include "ms_extension/api.h" +#include +#include + +namespace ms_custom_ops { +class OPS_API FusedDeepMoeCustomOpFuncImpl : public OpFuncImpl { +public: + ShapeArray InferShape(const PrimitivePtr &primitive, + const InferInfoPtrList &input_infos) const override { + auto out_shape = input_infos[0]->GetShape(); + return {out_shape}; + } + std::vector + InferType(const PrimitivePtr &primitive, + const InferInfoPtrList &input_infos) const override { + return {input_infos[0]->GetType()}; + } + + bool GeneralInferRegistered() const override { return true; } +}; + +class FusedDeepMoeCustomAscend : public AscendCKernelMod { +public: + FusedDeepMoeCustomAscend() : AscendCKernelMod(std::move("aclnnFusedDeepMoe")) {} + ~FusedDeepMoeCustomAscend() = default; + + bool Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override { + MS_EXCEPTION_IF_NULL(stream_ptr); + RunOp(stream_ptr, workspace, inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], + inputs[5], inputs[6], inputs[7], inputs[8], inputs[9], inputs[10], inputs[11], + inputs[12], inputs[13], inputs[14], inputs[15], outputs[0]); + return true; + } + + void GetWorkSpaceInfo(const std::vector &inputs, + const std::vector &outputs) override { + GetWorkspaceForResize(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5], + inputs[6], inputs[7], inputs[8], inputs[9], inputs[10], inputs[11], + inputs[12], inputs[13], inputs[14], inputs[15], outputs[0]); + } + +private: + DEFINE_GET_WORKSPACE_FOR_RESIZE(); +}; +} // namespace ms_custom_ops + +REG_GRAPH_MODE_OP(fused_deep_moe, ms_custom_ops::FusedDeepMoeCustomOpFuncImpl, + ms_custom_ops::FusedDeepMoeCustomAscend); + +// 
============================================================================= +// PYBOOST MODE IMPLEMENTATION +// ============================================================================= + +namespace ms_custom_ops { +using namespace mindspore; +using namespace mindspore::device::ascend; +constexpr size_t kFusedDeepMoeOutputNum = 1; + +ms::Tensor custom_fused_deep_moe(const ms::Tensor &x, const ms::Tensor &expert_ids, + const ms::Tensor &gmm1_permuted_weight, const ms::Tensor &gmm1_permuted_weight_scale, + const ms::Tensor &gmm2_weight, const ms::Tensor &gmm2_weight_scale, + const ms::Tensor &expert_smooth_scales, const ms::Tensor &expert_scales, + const string &group_ep, const int64_t &ep_rank_size, + const int64_t &ep_rank_id, const int64_t &moe_expert_num, + const int64_t &share_expert_num, const int64_t &share_expert_rank_num, + const int64_t &quant_mode, const int64_t &global_bs) { + MS_LOG(WARNING)<<"-------line90-------"; + auto out = ms::Tensor(x.data_type(), x.shape()); + auto runner = std::make_shared("FusedDeepMoe"); + runner->SetLaunchFunc(LAUNCH_ACLNN_FUNC(aclnnFusedDeepMoe, x, expert_ids, gmm1_permuted_weight, gmm1_permuted_weight_scale, + gmm2_weight, gmm2_weight_scale, expert_smooth_scales, expert_scales, group_ep, ep_rank_size, + ep_rank_id, moe_expert_num, share_expert_num, share_expert_rank_num, quant_mode, global_bs, out)); + MS_LOG(WARNING)<<"-------line96------"; + runner->Run({x, expert_ids, gmm1_permuted_weight, gmm1_permuted_weight_scale, gmm2_weight, gmm2_weight_scale, expert_smooth_scales, expert_scales}, {out}); + return out; +} +} // namespace ms_custom_ops + +MS_CUSTOM_OPS_EXTENSION_MODULE(m) { + m.def("fused_deep_moe", + PYBOOST_CALLER(ms_custom_ops::kFusedDeepMoeOutputNum, ms_custom_ops::custom_fused_deep_moe)); +} + diff --git a/yaml/ascendc/fused_deep_moe_op.yaml b/yaml/ascendc/fused_deep_moe_op.yaml new file mode 100644 index 00000000..aa40cb30 --- /dev/null +++ b/yaml/ascendc/fused_deep_moe_op.yaml @@ -0,0 +1,47 @@ +# 
operator fused_deep_moe
fused_deep_moe:
  args:
    x:
      dtype: tensor
    expert_ids:
      dtype: tensor
    gmm1_permuted_weight:
      dtype: tensor
    # Renamed from 'gmm1_permuted_weightScale': every sibling argument, the C++
    # implementation and the .md doc use snake_case (gmm1_permuted_weight_scale).
    gmm1_permuted_weight_scale:
      dtype: tensor
    gmm2_weight:
      dtype: tensor
    gmm2_weight_scale:
      dtype: tensor
    # NOTE(review): this argument sits 7th here but 12th in the C++ pyboost
    # entry and in the doc example -- confirm the binding maps arguments by
    # name rather than by position.
    moe_expert_num:
      dtype: int
    expert_smooth_scales:
      dtype: tensor
      default: None
    expert_scales:
      dtype: tensor
      default: None
    group_ep:
      dtype: str
      default: None
    ep_rank_size:
      dtype: int
      default: 1
    ep_rank_id:
      dtype: int
      default: 0
    share_expert_num:
      dtype: int
      default: 1
    share_expert_rank_num:
      dtype: int
      default: 0
    quant_mode:
      dtype: int
      default: 0
    global_bs:
      dtype: int
      default: 0
  returns:
    output:
      dtype: tensor
diff --git a/yaml/doc/fused_deep_moe.md b/yaml/doc/fused_deep_moe.md
new file mode 100644
index 00000000..00d3b651
--- /dev/null
+++ b/yaml/doc/fused_deep_moe.md
@@ -0,0 +1,106 @@
+# fused_deep_moe算子

## 描述

fused_deep_moe算子是Dispatch+FFN(GmmDeqSwigluQuantGmmDeq)+Combine子融合。

## 输入参数

| Name | DType | Shape | Description |
|------------------------------------|-----------------|----------------------------------------------------------|------------------------------------------------|
| x (required) | Tensor[float16/bfloat16] | ND:(bs, token_length) | dispatch输入 |
| expertIds (required) | Tensor[int32] | ND:(bs, topk) | dispatch发送表 |
| gmm1PermutedWeight (required) | Tensor[int8] | NZ: (group_num, token_length, gmm1_hidden_size) | GMM1的权重矩阵 【重排、NZ格式】 |
| gmm1PermutedWeightScale (required) | Tensor[float32] | ND: (group_num, gmm1_hidden_size) | GMM1的权重矩阵量化时使用的缩放系数【重排】 |
| gmm2Weight (required) | Tensor[int8] | NZ: (group_num, token_length, gmm2_hidden_size) | GMM2的权重矩阵 【转置、NZ格式】 |
| gmm2WeightScale (required) | Tensor[float32] | ND: (group_num, token_length) | GMM2的权重矩阵量化时使用的缩放系数 |
| expertSmoothScales (optional) | Tensor[float] | NZ: (maxRecvTokenNum, token_length) | - |
| expertScales (optional) | Tensor[float] | ND: (bs, topk) | 
每个Token的topK个专家权重 |
| groupEp (required) | char | - | 通信域名称 |
| epRankSize (required) | int64 | - | 通信域大小 |
| epRankId (required) | int64 | - | 本卡在通信域的rankId |
| moeExpertNum (required) | int64 | - | 路由专家数量 |
| shareExpertNum (required) | int64 | - | 共享专家数量 【当前固定传1】 |
| shareExpertRankNum (required) | int64 | - | 共享专家卡数 |
| quantMode (optional) | int64 | - | 量化模式【0:非量化,2:动态量化】 |
| globalBs (optional) | int64 | - | EP域全局的batch size大小 |

## 输出参数

| Name | DType | Shape | Description |
|--------|-----------------|--------------------------------------|-------------|
| output | Tensor[float16/bfloat16] | (bs, token_length) | 处理后的token |

## 使用示例

```python
import mindspore as ms
import ms_custom_ops
from mindspore.communication import init, get_rank, create_group

# 创建输入张量
bs = 16
token_length = 7168
topk = 8
gmm1_hidden_size = 7168
gmm2_hidden_size = 2048
ep_rank_size = 16
moe_expert_num = 64
share_expert_num = 1
share_expert_rank_num = 0
quant_mode = 0
globalBs = bs * ep_rank_size
moe_rank_num = ep_rank_size - share_expert_rank_num
moe_expert_num_per_rank = moe_expert_num // moe_rank_num
tile_n = 128

init()
rank_id = get_rank()
ep_rank_list = [[i for i in range(ep_rank_size)]]
group_ep = None
for ep_rank in ep_rank_list:
    if rank_id in ep_rank:
        group_ep = "group_name_" + "_".join([str(i) for i in ep_rank])
        create_group(group_ep, ep_rank)


def generate_uniform_random_tensor(lower: float, upper: float, size, dtype: ms.dtype) -> ms.Tensor:
    return ms.mint.rand(size, dtype=dtype) * (upper - lower) + lower


def permute_weight(w: ms.Tensor, tile_n):
    *dims, n = w.shape
    order = list(range(len(dims))) + [-2, -3, -1]
    return w.reshape(*dims, 2, n // tile_n, tile_n // 2).permute(order).reshape(*dims, n).contiguous()


x = ms.mint.rand((bs, token_length), dtype=ms.float16)
# expert_ids 形状需为 (bs, topk)、dtype int32(见上方输入参数表)
expert_ids = ((ms.mint.arange(0, bs * topk) + rank_id) % moe_expert_num).reshape(bs, topk).astype(ms.int32)
expert_scales = ms.ops.ones((bs, topk), dtype=ms.float32)
expert_smooth_scales = None

if rank_id < share_expert_rank_num:
    group_num = 1
else:
    group_num = moe_expert_num_per_rank

if rank_id < share_expert_rank_num:
    gmm_origin_weight1 = ms.mint.ones((group_num, token_length, gmm1_hidden_size), dtype=ms.int8)
    gmm_origin_scale1 = ms.mint.ones((group_num, gmm1_hidden_size), dtype=ms.float32) * 0.004
else:
    gmm_origin_weight1 = ms.mint.randint(-16, 16, size=(group_num, token_length, gmm1_hidden_size), dtype=ms.int8)
    gmm_origin_scale1 = generate_uniform_random_tensor(0.004, 0.005, size=(group_num, gmm1_hidden_size,), dtype=ms.float32)
gmm_permute_weight1 = ms.ops.auto_generate.format_cast(permute_weight(gmm_origin_weight1, tile_n), 29)
gmm_permute_scale1 = permute_weight(gmm_origin_scale1, tile_n)

if rank_id < share_expert_rank_num:
    gmm_origin_weight2 = ms.mint.ones((group_num, gmm2_hidden_size, token_length), dtype=ms.int8)
    gmm_scale2 = ms.mint.ones((group_num, token_length), dtype=ms.float32) * 0.004
else:
    gmm_origin_weight2 = ms.mint.randint(-16, 16, size=(group_num, gmm2_hidden_size, token_length), dtype=ms.int8)
    gmm_scale2 = generate_uniform_random_tensor(0.004, 0.005, size=(group_num, token_length,), dtype=ms.float32)
gmm_weight2 = ms.ops.auto_generate.format_cast(gmm_origin_weight2.transpose(0, 2, 1), 29)

# 调用算子
outputs = ms_custom_ops.fused_deep_moe(x, expert_ids, gmm_permute_weight1, gmm_permute_scale1,
                                       gmm_weight2, gmm_scale2, expert_smooth_scales, expert_scales,
                                       group_ep, ep_rank_size, rank_id, moe_expert_num,
                                       share_expert_num, share_expert_rank_num, quant_mode, globalBs)

``` -- Gitee