From 148f1302a5a94fd4c1dc22992c69a297d576eb57 Mon Sep 17 00:00:00 2001 From: caoyujuan Date: Wed, 3 Sep 2025 15:41:05 +0800 Subject: [PATCH] FusedDeepMoe --- ccsrc/ops/ascendc/CMakeLists.txt | 2 +- .../ascendc/fused_deep_moe/fused_deep_moe.cc | 106 ++++++++++++++++++ yaml/ascendc/fused_deep_moe_op.yaml | 47 ++++++++ yaml/doc/fused_deep_moe.md | 106 ++++++++++++++++++ 4 files changed, 260 insertions(+), 1 deletion(-) create mode 100644 ccsrc/ops/ascendc/fused_deep_moe/fused_deep_moe.cc create mode 100644 yaml/ascendc/fused_deep_moe_op.yaml create mode 100644 yaml/doc/fused_deep_moe.md diff --git a/ccsrc/ops/ascendc/CMakeLists.txt b/ccsrc/ops/ascendc/CMakeLists.txt index f76f27a9..4429cffd 100644 --- a/ccsrc/ops/ascendc/CMakeLists.txt +++ b/ccsrc/ops/ascendc/CMakeLists.txt @@ -23,7 +23,7 @@ set(ASCENDC_SRC_FILES ${SRC_FILES} PARENT_SCOPE) set(OP_COMPILER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/../../../scripts/op_compiler.py") -if(SRC_FILES) +if(ASCENDC_OP_DIRS) include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/compile_ascendc_ops.cmake) endif() diff --git a/ccsrc/ops/ascendc/fused_deep_moe/fused_deep_moe.cc b/ccsrc/ops/ascendc/fused_deep_moe/fused_deep_moe.cc new file mode 100644 index 00000000..cd1c2770 --- /dev/null +++ b/ccsrc/ops/ascendc/fused_deep_moe/fused_deep_moe.cc @@ -0,0 +1,106 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// ============================================================================= +// GRAPH MODE IMPLEMENTATION +// ============================================================================= + +#include "ascendc_kernel_mod.h" +#include "ms_extension/api.h" +#include +#include + +namespace ms_custom_ops { +class OPS_API FusedDeepMoeCustomOpFuncImpl : public OpFuncImpl { +public: + ShapeArray InferShape(const PrimitivePtr &primitive, + const InferInfoPtrList &input_infos) const override { + auto out_shape = input_infos[0]->GetShape(); + return {out_shape}; + } + std::vector + InferType(const PrimitivePtr &primitive, + const InferInfoPtrList &input_infos) const override { + return {input_infos[0]->GetType()}; + } + + bool GeneralInferRegistered() const override { return true; } +}; + +class FusedDeepMoeCustomAscend : public AscendCKernelMod { +public: + FusedDeepMoeCustomAscend() : AscendCKernelMod(std::move("aclnnFusedDeepMoe")) {} + ~FusedDeepMoeCustomAscend() = default; + + bool Launch(const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override { + MS_EXCEPTION_IF_NULL(stream_ptr); + RunOp(stream_ptr, workspace, inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], + inputs[5], inputs[6], inputs[7], inputs[8], inputs[9], inputs[10], inputs[11], + inputs[12], inputs[13], inputs[14], inputs[15], outputs[0]); + return true; + } + + void GetWorkSpaceInfo(const std::vector &inputs, + const std::vector &outputs) override { + GetWorkspaceForResize(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5], + inputs[6], inputs[7], inputs[8], inputs[9], inputs[10], inputs[11], + inputs[12], inputs[13], inputs[14], inputs[15], outputs[0]); + } + +private: + DEFINE_GET_WORKSPACE_FOR_RESIZE(); +}; +} // namespace ms_custom_ops + +REG_GRAPH_MODE_OP(fused_deep_moe, ms_custom_ops::FusedDeepMoeCustomOpFuncImpl, + ms_custom_ops::FusedDeepMoeCustomAscend); + +// 
============================================================================= +// PYBOOST MODE IMPLEMENTATION +// ============================================================================= + +namespace ms_custom_ops { +using namespace mindspore; +using namespace mindspore::device::ascend; +constexpr size_t kFusedDeepMoeOutputNum = 1; + +ms::Tensor custom_fused_deep_moe(const ms::Tensor &x, const ms::Tensor &expert_ids, + const ms::Tensor &gmm1_permuted_weight, const ms::Tensor &gmm1_permuted_weight_scale, + const ms::Tensor &gmm2_weight, const ms::Tensor &gmm2_weight_scale, + const ms::Tensor &expert_smooth_scales, const ms::Tensor &expert_scales, + const string &group_ep, const int64_t &ep_rank_size, + const int64_t &ep_rank_id, const int64_t &moe_expert_num, + const int64_t &share_expert_num, const int64_t &share_expert_rank_num, + const int64_t &quant_mode, const int64_t &global_bs) { + MS_LOG(WARNING)<<"-------line90-------"; + auto out = ms::Tensor(x.data_type(), x.shape()); + auto runner = std::make_shared("FusedDeepMoe"); + runner->SetLaunchFunc(LAUNCH_ACLNN_FUNC(aclnnFusedDeepMoe, x, expert_ids, gmm1_permuted_weight, gmm1_permuted_weight_scale, + gmm2_weight, gmm2_weight_scale, expert_smooth_scales, expert_scales, group_ep, ep_rank_size, + ep_rank_id, moe_expert_num, share_expert_num, share_expert_rank_num, quant_mode, global_bs, out)); + MS_LOG(WARNING)<<"-------line96------"; + runner->Run({x, expert_ids, gmm1_permuted_weight, gmm1_permuted_weight_scale, gmm2_weight, gmm2_weight_scale, expert_smooth_scales, expert_scales}, {out}); + return out; +} +} // namespace ms_custom_ops + +MS_CUSTOM_OPS_EXTENSION_MODULE(m) { + m.def("fused_deep_moe", + PYBOOST_CALLER(ms_custom_ops::kFusedDeepMoeOutputNum, ms_custom_ops::custom_fused_deep_moe)); +} + diff --git a/yaml/ascendc/fused_deep_moe_op.yaml b/yaml/ascendc/fused_deep_moe_op.yaml new file mode 100644 index 00000000..aa40cb30 --- /dev/null +++ b/yaml/ascendc/fused_deep_moe_op.yaml @@ -0,0 +1,47 @@ +# 
operator fused_deep_moe
fused_deep_moe:
  args:
    x:
      dtype: tensor
    expert_ids:
      dtype: tensor
    gmm1_permuted_weight:
      dtype: tensor
    # Renamed from 'gmm1_permuted_weightScale': every sibling argument, the C++
    # implementation and the .md doc use snake_case (gmm1_permuted_weight_scale).
    gmm1_permuted_weight_scale:
      dtype: tensor
    gmm2_weight:
      dtype: tensor
    gmm2_weight_scale:
      dtype: tensor
    # NOTE(review): this argument sits 7th here but 12th in the C++ pyboost
    # entry and in the doc example -- confirm the binding maps arguments by
    # name rather than by position.
    moe_expert_num:
      dtype: int
    expert_smooth_scales:
      dtype: tensor
      default: None
    expert_scales:
      dtype: tensor
      default: None
    group_ep:
      dtype: str
      default: None
    ep_rank_size:
      dtype: int
      default: 1
    ep_rank_id:
      dtype: int
      default: 0
    share_expert_num:
      dtype: int
      default: 1
    share_expert_rank_num:
      dtype: int
      default: 0
    quant_mode:
      dtype: int
      default: 0
    global_bs:
      dtype: int
      default: 0
  returns:
    output:
      dtype: tensor
diff --git a/yaml/doc/fused_deep_moe.md b/yaml/doc/fused_deep_moe.md
new file mode 100644
index 00000000..00d3b651
--- /dev/null
+++ b/yaml/doc/fused_deep_moe.md
@@ -0,0 +1,106 @@
+# fused_deep_moe算子

## 描述

fused_deep_moe算子是Dispatch+FFN(GmmDeqSwigluQuantGmmDeq)+Combine子融合。

## 输入参数

| Name | DType | Shape | Description |
|------------------------------------|-----------------|----------------------------------------------------------|------------------------------------------------|
| x (required) | Tensor[float16/bfloat16] | ND:(bs, token_length) | dispatch输入 |
| expertIds (required) | Tensor[int32] | ND:(bs, topk) | dispatch发送表 |
| gmm1PermutedWeight (required) | Tensor[int8] | NZ: (group_num, token_length, gmm1_hidden_size) | GMM1的权重矩阵 【重排、NZ格式】 |
| gmm1PermutedWeightScale (required) | Tensor[float32] | ND: (group_num, gmm1_hidden_size) | GMM1的权重矩阵量化时使用的缩放系数【重排】 |
| gmm2Weight (required) | Tensor[int8] | NZ: (group_num, token_length, gmm2_hidden_size) | GMM2的权重矩阵 【转置、NZ格式】 |
| gmm2WeightScale (required) | Tensor[float32] | ND: (group_num, token_length) | GMM2的权重矩阵量化时使用的缩放系数 |
| expertSmoothScales (optional) | Tensor[float] | NZ: (maxRecvTokenNum, token_length) | - |
| expertScales (optional) | Tensor[float] | ND: (bs, topk) | 
每个Token的topK个专家权重 |
| groupEp (required) | char | - | 通信域名称 |
| epRankSize (required) | int64 | - | 通信域大小 |
| epRankId (required) | int64 | - | 本卡在通信域的rankId |
| moeExpertNum (required) | int64 | - | 路由专家数量 |
| shareExpertNum (required) | int64 | - | 共享专家数量 【当前固定传1】 |
| shareExpertRankNum (required) | int64 | - | 共享专家卡数 |
| quantMode (optional) | int64 | - | 量化模式【0:非量化,2:动态量化】 |
| globalBs (optional) | int64 | - | EP域全局的batch size大小 |

## 输出参数

| Name | DType | Shape | Description |
|--------|-----------------|--------------------------------------|-------------|
| output | Tensor[float16/bfloat16] | (bs, token_length) | 处理后的token |

## 使用示例

```python
import mindspore as ms
import ms_custom_ops
from mindspore.communication import init, get_rank, create_group

# 创建输入张量
bs = 16
token_length = 7168
topk = 8
gmm1_hidden_size = 7168
gmm2_hidden_size = 2048
ep_rank_size = 16
moe_expert_num = 64
share_expert_num = 1
share_expert_rank_num = 0
quant_mode = 0
globalBs = bs * ep_rank_size
moe_rank_num = ep_rank_size - share_expert_rank_num
moe_expert_num_per_rank = moe_expert_num // moe_rank_num
tile_n = 128

init()
rank_id = get_rank()
ep_rank_list = [[i for i in range(ep_rank_size)]]
group_ep = None
for ep_rank in ep_rank_list:
    if rank_id in ep_rank:
        group_ep = "group_name_" + "_".join([str(i) for i in ep_rank])
        create_group(group_ep, ep_rank)


def generate_uniform_random_tensor(lower: float, upper: float, size, dtype: ms.dtype) -> ms.Tensor:
    return ms.mint.rand(size, dtype=dtype) * (upper - lower) + lower


def permute_weight(w: ms.Tensor, tile_n):
    *dims, n = w.shape
    order = list(range(len(dims))) + [-2, -3, -1]
    return w.reshape(*dims, 2, n // tile_n, tile_n // 2).permute(order).reshape(*dims, n).contiguous()


x = ms.mint.rand((bs, token_length), dtype=ms.float16)
# expert_ids 形状需为 (bs, topk)、dtype int32(见上方输入参数表)
expert_ids = ((ms.mint.arange(0, bs * topk) + rank_id) % moe_expert_num).reshape(bs, topk).astype(ms.int32)
expert_scales = ms.ops.ones((bs, topk), dtype=ms.float32)
expert_smooth_scales = None

if rank_id < share_expert_rank_num:
    group_num = 1
else:
    group_num = moe_expert_num_per_rank

if rank_id < share_expert_rank_num:
    gmm_origin_weight1 = ms.mint.ones((group_num, token_length, gmm1_hidden_size), dtype=ms.int8)
    gmm_origin_scale1 = ms.mint.ones((group_num, gmm1_hidden_size), dtype=ms.float32) * 0.004
else:
    gmm_origin_weight1 = ms.mint.randint(-16, 16, size=(group_num, token_length, gmm1_hidden_size), dtype=ms.int8)
    gmm_origin_scale1 = generate_uniform_random_tensor(0.004, 0.005, size=(group_num, gmm1_hidden_size,), dtype=ms.float32)
gmm_permute_weight1 = ms.ops.auto_generate.format_cast(permute_weight(gmm_origin_weight1, tile_n), 29)
gmm_permute_scale1 = permute_weight(gmm_origin_scale1, tile_n)

if rank_id < share_expert_rank_num:
    gmm_origin_weight2 = ms.mint.ones((group_num, gmm2_hidden_size, token_length), dtype=ms.int8)
    gmm_scale2 = ms.mint.ones((group_num, token_length), dtype=ms.float32) * 0.004
else:
    gmm_origin_weight2 = ms.mint.randint(-16, 16, size=(group_num, gmm2_hidden_size, token_length), dtype=ms.int8)
    gmm_scale2 = generate_uniform_random_tensor(0.004, 0.005, size=(group_num, token_length,), dtype=ms.float32)
gmm_weight2 = ms.ops.auto_generate.format_cast(gmm_origin_weight2.transpose(0, 2, 1), 29)

# 调用算子
outputs = ms_custom_ops.fused_deep_moe(x, expert_ids, gmm_permute_weight1, gmm_permute_scale1,
                                       gmm_weight2, gmm_scale2, expert_smooth_scales, expert_scales,
                                       group_ep, ep_rank_size, rank_id, moe_expert_num,
                                       share_expert_num, share_expert_rank_num, quant_mode, globalBs)

``` -- Gitee