diff --git a/ads/common/__init__.py b/ads/common/__init__.py
index 8e59829b5732c061d84fec33f3401d83ca9257a3..021418f12d15c708b799729a39fd351db043dd69 100644
--- a/ads/common/__init__.py
+++ b/ads/common/__init__.py
@@ -21,3 +21,4 @@ from .ops.npu_bounding_box_encode import npu_bounding_box_encode
 from .ops.npu_batch_nms import npu_batch_nms
 from .ops.npu_confusion_transpose import npu_confusion_transpose
 from .ops.npu_broadcast import npu_broadcast
+from .ops.npu_moe_tutel import npu_moe_tutel
diff --git a/ads/common/ops/csrc/MoeTutelOpApi.cpp b/ads/common/ops/csrc/MoeTutelOpApi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6ec9fa354b375b96b8cc2f61bba993cbed69a3f0
--- /dev/null
+++ b/ads/common/ops/csrc/MoeTutelOpApi.cpp
@@ -0,0 +1,87 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
+#include "torch_npu/csrc/framework/utils/NpuUtils.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/aten/CustomFunctions.h"
+#include "functions.h"
+#include "common.h"
+#include "OpApiCommon.h"
+
+using npu_preparation = at_npu::native::OpPreparation;
+using torch::autograd::Function;
+using torch::autograd::AutogradContext;
+using tensor_tuple = std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>;
+
+namespace {
+inline void npu_moe_tutel_check(
+    const at::Tensor& self,
+    const at::Tensor& gates,
+    const at::Tensor& indices,
+    const at::Tensor& locations)
+{
+    TORCH_CHECK(self.dim() == 2, "The dim of input tensor [x] should equal to (sample, hidden).");
+    TORCH_CHECK(gates.dim() == 2, "The dim of gates tensor [x] should equal to (batch, sample).");
+    TORCH_CHECK(self.sizes()[0] == gates.sizes()[1], "input's sample size should equal to gates's samples size.");
+    TORCH_CHECK((gates.sizes() == indices.sizes()) && (indices.sizes() == locations.sizes()),
+        "Shape of gates should match shape of indices and locations.");
+}
+} // namespace
+
+at::Tensor npu_moe_tutel(
+    const at::Tensor& self,
+    const at::Tensor& gates,
+    const at::Tensor& indices,
+    const at::Tensor& locations,
+    int64_t capacity)
+{
+    npu_moe_tutel_check(self, gates, indices, locations);
+    auto gates_size = gates.sizes();
+    auto self_size = self.sizes();
+    auto output_size = {gates_size[0], capacity, self_size[1]};
+    at::Tensor result = at::zeros(output_size, self.options());
+    EXEC_NPU_CMD(aclnnMoeTutelDispatch, self, gates, indices, locations, capacity, result);
+    return result;
+}
+
+at::Tensor npu_moe_tutel_data_backward(
+    const at::Tensor& y_grad,
+    const at::Tensor& gates,
+    const at::Tensor& indices,
+    const at::Tensor& locations)
+{
+    auto gates_size = gates.sizes();
+    auto grad_size = y_grad.sizes();
+    auto output_size = {gates_size[1], grad_size[2]};
+    at::Tensor result = at::zeros(output_size, y_grad.options());
+    EXEC_NPU_CMD(aclnnMoeTutelCombineX, y_grad, gates, indices, locations, result);
+    return result;
+}
+
+at::Tensor npu_moe_tutel_gate_backward(
+    const at::Tensor& self,
+    const at::Tensor& y_grad,
+    const at::Tensor& indices,
+    const at::Tensor& locations)
+{
+    at::Tensor result = at::zeros(indices.sizes(), y_grad.options());
+    EXEC_NPU_CMD(aclnnMoeTutelCombineGates, self, y_grad, indices, locations, result);
+    return result;
+}
diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h
index 0afa2edce5b9de5e41969fbd8b7d91855377f509..f713b98400c6d2168727ff675f61b2c73a086b74 100644
--- a/ads/common/ops/csrc/functions.h
+++ b/ads/common/ops/csrc/functions.h
@@ -116,5 +116,21 @@ at::Tensor npu_conv_transpose2d(
     int64_t groups);
 at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size);
 at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result);
+at::Tensor npu_moe_tutel(
+    const at::Tensor &self,
+    const at::Tensor &gates,
+    const at::Tensor &indices,
+    const at::Tensor &locations,
+    int64_t capacity);
+at::Tensor npu_moe_tutel_data_backward(
+    const at::Tensor &y_grad,
+    const at::Tensor &gates,
+    const at::Tensor &indices,
+    const at::Tensor &locations);
+at::Tensor npu_moe_tutel_gate_backward(
+    const at::Tensor &self,
+    const at::Tensor &y_grad,
+    const at::Tensor &indices,
+    const at::Tensor &locations);
 
 #endif // __FUNCTIONS_H__
diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp
index b8ebe3f5250e9add59301ffab14216dcf2b18539..c91e4093ea2c93a19f213e43557b3cdd6d25cb1e 100644
--- a/ads/common/ops/csrc/pybind.cpp
+++ b/ads/common/ops/csrc/pybind.cpp
@@ -66,4 +66,9 @@ void init_common(pybind11::module &m)
 
     // npu_broadcast
     m.def("npu_broadcast", &npu_broadcast);
+
+    // npu_moe_tutel
+    m.def("npu_moe_tutel", &npu_moe_tutel, "npu_moe_tutel NPU version");
+    m.def("npu_moe_tutel_data_backward", &npu_moe_tutel_data_backward, "npu_moe_tutel_data_backward NPU version");
+    m.def("npu_moe_tutel_gate_backward", &npu_moe_tutel_gate_backward, "npu_moe_tutel_gate_backward NPU version");
 }
diff --git a/ads/common/ops/npu_moe_tutel.py b/ads/common/ops/npu_moe_tutel.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba98c3574f4de65ed8c014d40a6981bce529a606
--- /dev/null
+++ b/ads/common/ops/npu_moe_tutel.py
@@ -0,0 +1,27 @@
+import torch
+from torch.autograd import Function
+from torch.nn import Module
+
+import torch_npu
+import ads_c
+
+
+class MoeTutelFunction(Function):
+    @staticmethod
+    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+    def forward(ctx, x, gates, indices, locations, capacity):
+        result = ads_c.npu_moe_tutel(x, gates, indices, locations, capacity)
+        ctx.save_for_backward(x, gates, indices, locations)
+        return result
+
+    @staticmethod
+    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+    # 'pylint: disable=too-many-return-arguments,huawei-too-many-return-arguments
+    def backward(ctx, y_grad):
+        x0, gates, indices, locations = ctx.saved_tensors
+        x_grad = ads_c.npu_moe_tutel_data_backward(y_grad, gates, indices, locations)
+        gates_grad = ads_c.npu_moe_tutel_gate_backward(x0, y_grad, indices, locations)
+        return x_grad, gates_grad, None, None, None
+
+
+npu_moe_tutel = MoeTutelFunction.apply
diff --git a/tests/test_npu_moe_tutel.py b/tests/test_npu_moe_tutel.py
new file mode 100644
index 0000000000000000000000000000000000000000..8723ced23c99a89456458a9e98d16309387c7760
--- /dev/null
+++ b/tests/test_npu_moe_tutel.py
@@ -0,0 +1,88 @@
+import unittest
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
+
+
+class TestMoeTutel(TestCase):
+    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+    def cpu_to_exec(self, x, gates, indices, locations, capacity, batch_size, sample_size, hidden, dtype):
+        result = torch.zeros([batch_size, capacity, hidden]).to(dtype)
+        for tensor_idx in range(batch_size):
+            for i in range(sample_size):
+                if locations[tensor_idx, i] < capacity and indices[tensor_idx, i] >= 0:
+                    result[int(indices[tensor_idx, i]), int(locations[tensor_idx, i]), :] = gates[tensor_idx, i] * x[i,
+                                                                                                                   :]
+        return result
+
+    def npu_to_exec(self, x, gates, indices, locations, capacity):
+        out = ads.common.npu_moe_tutel(x, gates, indices, locations, capacity)
+        return out.cpu()
+
+    def gen_data(self, shape, dtype):
+        cpu_input = torch.rand(shape, dtype=dtype)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_gates(self, shape, dtype):
+        cpu_input = torch.rand(shape).bool().to(dtype)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_indices(self, shape):
+        batch_size = shape[0]
+        sample_size = shape[1]
+        cpu_input = torch.zeros((1, sample_size)).int()
+        indices = torch.ones((1, sample_size)).int()
+        for i in range(batch_size - 1):
+            cpu_input = torch.cat((cpu_input, torch.mul(indices, torch.tensor(i + 1, dtype=torch.int32))), 0)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_locations(self, shape):
+        batch_size = shape[0]
+        sample_size = shape[1]
+        cpu_input = torch.arange(0, sample_size).reshape(1, sample_size).int()
+        cpu_input = cpu_input.repeat(batch_size, 1)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `MoeTutel` is only supported on 910B, skip this ut!")
+    def test_moe_tutel(self):
+        dtype_list = [torch.float16, torch.float32, torch.bfloat16]
+        shape_list = [
+            [[2, 5], [5, 16], 6],
+            [[3, 6], [6, 16], 6],
+            [[4, 7], [7, 32], 12],
+            [[5, 8], [8, 32], 12],
+            [[2, 16384], [16384, 32], 16384],
+        ]
+        items = [
+            [shape, dtype]
+            for shape in shape_list
+            for dtype in dtype_list
+        ]
+        for shape, dtype in items:
+            capacity = shape[2]
+            batch_size = shape[0][0]
+            sample_size = shape[0][1]
+            hidden = shape[1][1]
+            cpu_x, npu_x = self.gen_data(shape[1], dtype)
+            cpu_gates, npu_gates = self.gen_data_gates(shape[0], dtype)
+            cpu_indices, npu_indices = self.gen_data_indices(shape[0])
+            cpu_locations, npu_locations = self.gen_data_locations(shape[0])
+            cpu_out = self.cpu_to_exec(cpu_x, cpu_gates, cpu_indices, cpu_locations, capacity, batch_size, sample_size,
+                                       hidden, dtype)
+            npu_out = self.npu_to_exec(npu_x, npu_gates, npu_indices, npu_locations, capacity)
+            if dtype == torch.bfloat16 or dtype == torch.float16:
+                npu_out = npu_out.to(torch.float32)
+                cpu_out = cpu_out.to(torch.float32)
+            self.assertRtolEqual(npu_out.numpy(), cpu_out.numpy())
+
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/tests/test_npu_moe_tutel_backward.py b/tests/test_npu_moe_tutel_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee401777c0b2976d82fc19a78de73d0187a2b960
--- /dev/null
+++ b/tests/test_npu_moe_tutel_backward.py
@@ -0,0 +1,100 @@
+import unittest
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
+
+
+class TestMoeTutel(TestCase):
+    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+    def cpu_to_exec(self, x, gates, indices, locations, capacity, batch_size, sample_size, hidden, dtype):
+        x.requires_grad = True
+        gates.requires_grad = True
+        out = torch.zeros([batch_size, capacity, hidden]).to(dtype)
+        for tensor_idx in range(batch_size):
+            for i in range(sample_size):
+                if locations[tensor_idx, i] < capacity and indices[tensor_idx, i] >= 0:
+                    out[indices[tensor_idx, i], locations[tensor_idx, i], :] = gates[tensor_idx, i] * x[i, :]
+        out.backward(torch.ones_like(out))
+        x_grad = x.grad
+        gates_grad = gates.grad
+        return x_grad, gates_grad
+
+    def npu_to_exec(self, x, gates, indices, locations, capacity):
+        x.requires_grad = True
+        gates.requires_grad = True
+        out = ads.common.npu_moe_tutel(x, gates, indices, locations, capacity)
+        out.backward(torch.ones_like(out))
+        x_grad = x.grad
+        gates_grad = gates.grad
+        return x_grad, gates_grad
+
+    def gen_data(self, shape, dtype):
+        cpu_input = torch.rand(shape, dtype=dtype)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_gates(self, shape, dtype):
+        cpu_input = torch.rand(shape).bool().to(dtype)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_indices(self, shape):
+        batch_size = shape[0]
+        sample_size = shape[1]
+        cpu_input = torch.zeros((1, sample_size)).int()
+        indices = torch.ones((1, sample_size)).int()
+        for i in range(batch_size - 1):
+            cpu_input = torch.cat((cpu_input, torch.mul(indices, torch.tensor(i + 1, dtype=torch.int32))), 0)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_locations(self, shape):
+        batch_size = shape[0]
+        sample_size = shape[1]
+        cpu_input = torch.arange(0, sample_size).reshape(1, sample_size).int()
+        cpu_input = cpu_input.repeat(batch_size, 1)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `MoeTutel` is only supported on 910B, skip this ut!")
+    def test_moe_tutel(self):
+        dtype_list = [torch.float16, torch.float32, torch.bfloat16]
+        shape_list = [
+            [[2, 5], [5, 16], 6],
+            [[3, 6], [6, 16], 6],
+            [[4, 7], [7, 32], 12],
+            [[5, 8], [8, 32], 12],
+            [[2, 16384], [16384, 32], 16384],
+        ]
+        items = [
+            [shape, dtype]
+            for shape in shape_list
+            for dtype in dtype_list
+        ]
+        for shape, dtype in items:
+            capacity = shape[2]
+            batch_size = shape[0][0]
+            sample_size = shape[0][1]
+            hidden = shape[1][1]
+            cpu_x, npu_x = self.gen_data(shape[1], dtype)
+            cpu_gates, npu_gates = self.gen_data_gates(shape[0], dtype)
+            cpu_indices, npu_indices = self.gen_data_indices(shape[0])
+            cpu_locations, npu_locations = self.gen_data_locations(shape[0])
+            cpu_grad1, cpu_grad2 = self.cpu_to_exec(cpu_x, cpu_gates, cpu_indices, cpu_locations, capacity, batch_size,
+                                                    sample_size, hidden, dtype)
+            npu_grad1, npu_grad2 = self.npu_to_exec(npu_x, npu_gates, npu_indices, npu_locations, capacity)
+            if dtype == torch.bfloat16 or dtype == torch.float16:
+                cpu_grad1 = cpu_grad1.to(torch.float32)
+                cpu_grad2 = cpu_grad2.to(torch.float32)
+                npu_grad1 = npu_grad1.to(torch.float32)
+                npu_grad2 = npu_grad2.to(torch.float32)
+            self.assertRtolEqual(npu_grad1.detach().cpu().numpy(), cpu_grad1.numpy())
+            self.assertRtolEqual(npu_grad2.detach().cpu().numpy(), cpu_grad2.numpy())
+
+
+if __name__ == '__main__':
+    run_tests()