diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..602507e06123f48e0d3bfa8dfc4cb3fe365a1d0b
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
@@ -0,0 +1,181 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft04/matmulfusion.h"
+#include <algorithm>
+#include <string>
+#include "src/litert/kernel_registry.h"
+#include "schema/inner/model_generated.h"
+#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_MatMulFusion;
+
+namespace mindspore::kernel {
+
+int MatMulFusionDSPKernel::Prepare() { return RET_OK; }  // nothing to set up here; always reports RET_OK
+
+int MatMulFusionDSPKernel::CheckSpecs() {
+  // Accepts inputs A, B and an optional bias plus exactly one output; bias and output must be M x N.
+  if (!(in_tensors_.size() == INPUT_TENSOR_SIZE_2 || in_tensors_.size() == INPUT_TENSOR_SIZE_3)) {
+    MS_LOG(WARNING) << "MatMulFusion expects 2 or 3 inputs, got " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
+    MS_LOG(WARNING) << "MatMulFusion expects 1 output, got " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  int rows = 0, cols = 0, depth = 0;
+  if (GetMNK(&rows, &cols, &depth) != RET_OK) {
+    MS_LOG(WARNING) << "MatMulFusion shape inference failed.";
+    return RET_ERROR;
+  }
+  const auto is_rows_by_cols = [rows, cols](const auto &shape) {  // rank-2 and exactly rows x cols
+    return shape.size() == 2 && shape[0] == rows && shape[1] == cols;
+  };
+  if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
+    auto bias_shape = in_tensors_[2]->shape();
+    if (!is_rows_by_cols(bias_shape)) {
+      MS_LOG(WARNING) << "Bias shape mismatch MxN: got " << bias_shape;
+      return RET_ERROR;
+    }
+  }
+  if (!is_rows_by_cols(out_tensors_[0]->shape())) {
+    MS_LOG(WARNING) << "Output shape mismatch expected (" << rows << "," << cols << ")";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const {
+  // Reads M, N, K from A (M x K, input 0) and B (K x N, input 1); outputs are written only on success.
+  // Run() calls this without re-running CheckSpecs(), so the tensors themselves are validated here as well.
+  if (M == nullptr || N == nullptr || K == nullptr) return RET_ERROR;
+  if (in_tensors_.size() < 2 || in_tensors_[0] == nullptr || in_tensors_[1] == nullptr) return RET_ERROR;
+  const auto &a_shape = in_tensors_[0]->shape();
+  const auto &b_shape = in_tensors_[1]->shape();
+  if (a_shape.size() != 2 || b_shape.size() != 2) {
+    MS_LOG(WARNING) << "A/B must be rank-2";
+    return RET_ERROR;
+  }
+  if (a_shape[0] < 0 || a_shape[1] < 0 || b_shape[1] < 0 || a_shape[1] != b_shape[0]) {  // negative would wrap in Run()
+    MS_LOG(WARNING) << "Negative dimension or inner dimension mismatch: A(" << a_shape[0] << "," << a_shape[1]
+                    << "), B(" << b_shape[0] << "," << b_shape[1] << ")";
+    return RET_ERROR;
+  }
+  *M = a_shape[0];
+  *K = a_shape[1];
+  *N = b_shape[1];
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::GetActTypeCode(int *code) const {
+  // Converts the nnacl activation type held in the op parameter into the code used by the DSP functions
+  // (NONE=0, RELU=1, RELU6=2). Any other type, or a missing parameter, maps to NONE.
+  if (code == nullptr) {
+    return RET_ERROR;
+  }
+  constexpr int kDspActNone = 0;
+  constexpr int kDspActRelu = 1;
+  constexpr int kDspActRelu6 = 2;  // DSP uses 2 for RELU6, nnacl uses enum value 3
+  const auto *matmul_param = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  if (matmul_param == nullptr) {
+    *code = kDspActNone;
+  } else if (matmul_param->act_type_ == ActType_Relu) {
+    *code = kDspActRelu;
+  } else if (matmul_param->act_type_ == ActType_Relu6) {
+    *code = kDspActRelu6;
+  } else {
+    *code = kDspActNone;
+  }
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::RunFp32() {
+  kernel_name_ = "fp_matmulfusion_s";  // symbol used when Run() sees kNumberTypeFloat32
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+int MatMulFusionDSPKernel::RunFp16() {
+  kernel_name_ = "hp_matmulfusion_s";  // symbol used when Run() sees kNumberTypeFloat16
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+int MatMulFusionDSPKernel::RunInt32() {
+  kernel_name_ = "i32_matmulfusion_s";  // symbol used when Run() sees kNumberTypeInt32
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+int MatMulFusionDSPKernel::RunInt16() {
+  kernel_name_ = "i16_matmulfusion_s";  // symbol used when Run() sees kNumberTypeInt16
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+int MatMulFusionDSPKernel::RunComplex64() {
+  kernel_name_ = "c64_matmulfusion_s";  // symbol used when Run() sees kNumberTypeComplex64
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::Run() {
+  // Resolves M/N/K and the activation, stages the kernel arguments, then launches by the dtype of input 0.
+  int rows = 0, cols = 0, depth = 0;
+  if (GetMNK(&rows, &cols, &depth) != RET_OK) {
+    MS_LOG(ERROR) << "MatMulFusion GetMNK failed";
+    return RET_ERROR;
+  }
+  int activation = 0;
+  (void)GetActTypeCode(&activation);  // return value deliberately ignored
+  auto mem_allocator = dsp_runtime_->GetAllocator();
+  uint64_t lhs_addr = mem_allocator->GetDeviceMemPtr(in_tensors_[0]->data());
+  uint64_t rhs_addr = mem_allocator->GetDeviceMemPtr(in_tensors_[1]->data());
+  uint64_t dst_addr = mem_allocator->GetDeviceMemPtr(out_tensors_[0]->data());
+  uint64_t bias_addr = 0;  // stays 0 when no bias tensor is supplied
+  if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
+    bias_addr = mem_allocator->GetDeviceMemPtr(in_tensors_[2]->data());
+  }
+  // Arg order must match DSP symbol prototype: A,B,C,bias,M,N,K,act_type
+  SetKernelArg({lhs_addr, rhs_addr, dst_addr, bias_addr, static_cast<uint64_t>(rows), static_cast<uint64_t>(cols),
+                static_cast<uint64_t>(depth), static_cast<uint64_t>(activation)});
+
+  const auto dtype = in_tensors_[0]->data_type();
+  int status = RET_ERROR;
+  if (dtype == kNumberTypeFloat32) {
+    status = RunFp32();
+  } else if (dtype == kNumberTypeFloat16) {
+    status = RunFp16();
+  } else if (dtype == kNumberTypeInt32) {
+    status = RunInt32();
+  } else if (dtype == kNumberTypeInt16) {
+    status = RunInt16();
+  } else if (dtype == kNumberTypeComplex64) {
+    status = RunComplex64();
+  } else {
+    MS_LOG(ERROR) << "MatMulFusion unsupported dtype: " << static_cast<int>(dtype);
+    return RET_ERROR;
+  }
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "MatMulFusion DSP run failed";
+  return RET_ERROR;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)  // see Run()
+REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a487f089c878e328cd1a9200b923716978b91d0
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
+
+#include <vector>
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+class MatMulFusionDSPKernel : public DSPKernel {  // MatMulFusion kernel wrapper for the ft04 DSP backend
+ public:
+  using DSPKernel::DSPKernel;  // inherit the base-class constructors
+  ~MatMulFusionDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;  // validates input/output counts and the A, B, bias and output shapes
+  int Run() override;         // stages kernel arguments and dispatches on the dtype of input 0
+
+ private:
+  int RunFp32();  // each Run*() picks one DSP symbol name and launches it
+  int RunFp16();
+  int RunInt32();
+  int RunInt16();
+  int RunComplex64();
+
+  // shape and activation helpers shared by CheckSpecs() and Run()
+  int GetMNK(int *M, int *N, int *K) const;  // reads M, N, K from the shapes of inputs 0 and 1
+  int GetActTypeCode(int *code) const;       // maps the parameter's activation type to the DSP code
+
+ private:
+  std::string kernel_name_;  // DSP symbol name set by the most recent Run*() call
+  uint64_t core_mask_{0xF};  // forwarded to RunKernel(); presumably a DSP core bitmask - TODO confirm
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d17ab5738380822ad3a301dac924baaf5e73d298
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.cc
@@ -0,0 +1,223 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft78/matmulfusion.h"
+#include <algorithm>
+#include <string>
+#include <cstring>
+#include "src/litert/kernel_registry.h"
+#include "schema/inner/model_generated.h"
+#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_MatMulFusion;
+
+namespace mindspore::kernel {
+
+int MatMulFusionDSPKernel::Prepare() { return RET_OK; }  // nothing to set up here; always reports RET_OK
+
+int MatMulFusionDSPKernel::CheckSpecs() {
+  // Valid layouts are {A, B}, {A, B, bias} and {A, B, bias, fourth input}, each with a single output.
+  // The bias (when present) and the output must both be rank-2 with the M x N shape reported by GetMNK().
+  const auto input_num = in_tensors_.size();
+  const bool has_bias = (input_num == INPUT_TENSOR_SIZE_3 || input_num == INPUT_TENSOR_SIZE_4);
+  if (input_num != INPUT_TENSOR_SIZE_2 && !has_bias) {
+    MS_LOG(WARNING) << "MatMulFusion expects 2, 3 or 4 inputs, got " << input_num;
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
+    MS_LOG(WARNING) << "MatMulFusion expects 1 output, got " << out_tensors_.size();
+    return RET_ERROR;
+  }
+
+  int rows = 0;
+  int cols = 0;
+  int depth = 0;
+  if (GetMNK(&rows, &cols, &depth) != RET_OK) {
+    MS_LOG(WARNING) << "MatMulFusion shape inference failed.";
+    return RET_ERROR;
+  }
+  const auto is_rows_by_cols = [rows, cols](const auto &shape) {
+    return shape.size() == 2 && shape[0] == rows && shape[1] == cols;
+  };
+  if (has_bias && !is_rows_by_cols(in_tensors_[INPUT_TENSOR_SIZE_2]->shape())) {
+    MS_LOG(WARNING) << "Bias shape mismatch MxN.";
+    return RET_ERROR;
+  }
+  if (!is_rows_by_cols(out_tensors_[0]->shape())) {
+    MS_LOG(WARNING) << "Output shape mismatch expected (" << rows << "," << cols << ").";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const {
+  // Reads M, N, K from A (M x K, input 0) and B (K x N, input 1); outputs are written only on success.
+  // Run() calls this without re-running CheckSpecs(), so the tensors themselves are validated here as well.
+  if (M == nullptr || N == nullptr || K == nullptr || in_tensors_.size() < INPUT_TENSOR_SIZE_2 ||
+      in_tensors_[0] == nullptr || in_tensors_[1] == nullptr) {
+    return RET_ERROR;
+  }
+  const auto &a_shape = in_tensors_[0]->shape();
+  const auto &b_shape = in_tensors_[1]->shape();
+  if (a_shape.size() != 2 || b_shape.size() != 2) {
+    MS_LOG(WARNING) << "A/B must be rank-2.";
+    return RET_ERROR;
+  }
+  if (a_shape[0] < 0 || a_shape[1] < 0 || b_shape[1] < 0 || a_shape[1] != b_shape[0]) {  // negative would wrap in Run()
+    MS_LOG(WARNING) << "Negative dimension or inner dimension mismatch: A(" << a_shape[0] << "," << a_shape[1]
+                    << "), B(" << b_shape[0] << "," << b_shape[1] << ")";
+    return RET_ERROR;
+  }
+  *M = a_shape[0];
+  *K = a_shape[1];
+  *N = b_shape[1];
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::GetActTypeCode(int *code) const {
+  // Writes the activation code derived from the op parameter:
+  // 1 for ActType_Relu, 2 for ActType_Relu6, and 0 for every other activation type
+  // or when no op parameter is attached. Fails only when `code` is null.
+  if (code == nullptr) {
+    return RET_ERROR;
+  }
+  constexpr int kNoActivation = 0;
+  constexpr int kRelu = 1;
+  constexpr int kRelu6 = 2;
+  const auto *matmul_param = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  if (matmul_param == nullptr) {
+    *code = kNoActivation;
+  } else if (matmul_param->act_type_ == ActType_Relu) {
+    *code = kRelu;
+  } else if (matmul_param->act_type_ == ActType_Relu6) {
+    *code = kRelu6;
+  } else {
+    *code = kNoActivation;
+  }
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::RunFp32() {
+  kernel_name_ = "fp_matmulfusion_s";  // symbol used when Run() sees kNumberTypeFloat32
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::RunFp64() {
+  kernel_name_ = "dp_matmulfusion_s";  // symbol used when Run() sees kNumberTypeFloat64
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::RunInt32() {
+  kernel_name_ = "i32_matmulfusion_s";  // symbol used when Run() sees kNumberTypeInt32
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::RunInt16() {
+  kernel_name_ = "i16_matmulfusion_s";  // symbol used when Run() sees kNumberTypeInt16
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::RunInt8() {
+  kernel_name_ = "i8_matmulfusion_s";  // symbol used when Run() sees kNumberTypeInt8
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::RunComplex64() {
+  kernel_name_ = "c64_matmulfusion_s";  // symbol used when Run() sees kNumberTypeComplex64
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::RunComplex128() {
+  kernel_name_ = "c128_matmulfusion_s";  // symbol used when Run() sees kNumberTypeComplex128
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::Run() {
+  // Stages the kernel arguments and launches the DSP symbol selected by the dtype of input 0.
+  // With four inputs the last tensor is forwarded by address in place of the scalar M, N, K and
+  // activation arguments; otherwise those four scalars are passed directly.
+  int rows = 0;
+  int cols = 0;
+  int depth = 0;
+  if (GetMNK(&rows, &cols, &depth) != RET_OK) {
+    MS_LOG(ERROR) << "MatMulFusion GetMNK failed";
+    return RET_ERROR;
+  }
+  int activation = 0;
+  (void)GetActTypeCode(&activation);
+
+  // Ask the runtime allocator for the device-side pointer of every tensor buffer.
+  auto mem_allocator = dsp_runtime_->GetAllocator();
+  uint64_t lhs_addr = mem_allocator->GetDeviceMemPtr(in_tensors_[0]->data());
+  uint64_t rhs_addr = mem_allocator->GetDeviceMemPtr(in_tensors_[1]->data());
+  uint64_t dst_addr = mem_allocator->GetDeviceMemPtr(out_tensors_[0]->data());
+  uint64_t bias_addr = 0;  // stays 0 when only A and B are supplied
+  if (in_tensors_.size() >= INPUT_TENSOR_SIZE_3) {
+    bias_addr = mem_allocator->GetDeviceMemPtr(in_tensors_[INPUT_TENSOR_SIZE_2]->data());
+  }
+
+  // Two argument layouts: eight values with scalar sizes, or five values ending in the fourth tensor.
+  if (in_tensors_.size() != INPUT_TENSOR_SIZE_4) {
+    SetKernelArg({lhs_addr, rhs_addr, dst_addr, bias_addr, static_cast<uint64_t>(rows), static_cast<uint64_t>(cols),
+                  static_cast<uint64_t>(depth), static_cast<uint64_t>(activation)});
+  } else {
+    uint64_t dims_addr = mem_allocator->GetDeviceMemPtr(in_tensors_[INPUT_TENSOR_SIZE_4 - 1]->data());
+    SetKernelArg({lhs_addr, rhs_addr, dst_addr, bias_addr, dims_addr});
+  }
+
+  // Every supported dtype has its own launcher; anything else is rejected here.
+  const auto dtype = in_tensors_[0]->data_type();
+  int status = RET_ERROR;
+  if (dtype == kNumberTypeFloat32) {
+    status = RunFp32();
+  } else if (dtype == kNumberTypeFloat64) {
+    status = RunFp64();
+  } else if (dtype == kNumberTypeInt32) {
+    status = RunInt32();
+  } else if (dtype == kNumberTypeInt16) {
+    status = RunInt16();
+  } else if (dtype == kNumberTypeInt8) {
+    status = RunInt8();
+  } else if (dtype == kNumberTypeComplex64) {
+    status = RunComplex64();
+  } else if (dtype == kNumberTypeComplex128) {
+    status = RunComplex128();
+  } else {
+    MS_LOG(ERROR) << "MatMulFusion unsupported dtype: " << static_cast<int>(dtype);
+    return RET_ERROR;
+  }
+
+  // Any non-OK launch status is reported to the caller as RET_ERROR.
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "MatMulFusion DSP run failed";
+  return RET_ERROR;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)  // see Run()
+REG_KERNEL(kDSP, kNumberTypeFloat64, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt8, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeComplex128, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.h b/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.h
new file mode 100644
index 0000000000000000000000000000000000000000..905db563b7569308cb620c921a7fdb40b06b5214
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_MATMULFUSION_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_MATMULFUSION_H_
+
+#include <string>
+#include <vector>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+class MatMulFusionDSPKernel : public DSPKernel {  // MatMulFusion kernel wrapper for the ft78 DSP backend
+ public:
+  using DSPKernel::DSPKernel;  // inherit the base-class constructors
+  ~MatMulFusionDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;  // validates input/output counts and the A, B, bias and output shapes
+  int Run() override;         // stages kernel arguments and dispatches on the dtype of input 0
+
+ private:
+  int RunFp32();  // each Run*() picks one DSP symbol name and launches it
+  int RunFp64();
+  int RunInt32();
+  int RunInt16();
+  int RunInt8();
+  int RunComplex64();
+  int RunComplex128();
+
+  int GetMNK(int *M, int *N, int *K) const;  // reads M, N, K from the shapes of inputs 0 and 1
+  int GetActTypeCode(int *code) const;       // maps the parameter's activation type to the DSP code
+
+ private:
+  std::string kernel_name_;   // DSP symbol name set by the most recent Run*() call
+  uint64_t core_mask_{0xff};  // forwarded to RunKernel(); presumably a DSP core bitmask - TODO confirm
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_MATMULFUSION_H_
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..597856545417702914bea083d7f080e363cfbb60
--- /dev/null
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
@@ -0,0 +1,1031 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+#include <cstring>
+#include <cmath>
+#include <limits>
+#include "ut/src/runtime/kernel/dsp/dsp_test.h"
+#include "include/api/context.h"
+#include "include/api/data_type.h"
+#include "include/api/model.h"
+#include "schema/inner/model_generated.h"
+#include "src/litert/kernel_registry.h"
+#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h"
+
+namespace mindspore::lite::dsp::test {
+
+class TestDSP_MatMulFusion : public DSPCommonTest {};  // gtest fixture; adds nothing on top of DSPCommonTest
+
+static void FillFloat(float *data, int size, float base = 0.1f) {  // data[i] = base * (i % 10) for i in [0, size)
+  for (int idx = 0, digit = 0; idx < size; ++idx, digit = (digit + 1) % 10) {
+    data[idx] = base * static_cast<float>(digit);
+  }
+}
+
+typedef uint16_t float16_t_u;
+// Converts an IEEE-754 binary32 value to binary16 bits using round-to-nearest-even.
+// Overflow gives +/-inf, NaN gives a quiet NaN, and magnitudes too small for a half
+// subnormal flush to a signed zero.
+static inline float16_t_u Fp32ToFp16Bits(float v) {
+  uint32_t bits;
+  std::memcpy(&bits, &v, sizeof(bits));
+  uint32_t sign = (bits >> 31) & 0x1;
+  int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15;  // re-biased from 127 (fp32) to 15 (fp16)
+  uint32_t mantissa = bits & 0x007FFFFF;
+  uint16_t result;
+  if (exponent <= 0) {
+    if (exponent < -10) {
+      result = static_cast<uint16_t>(sign << 15);
+    } else {
+      mantissa |= 0x00800000;  // make the implicit leading 1 explicit before shifting it down
+      int shift = 14 - exponent;
+      uint32_t mantissa_shifted = mantissa >> shift;
+      uint32_t remainder = mantissa & ((1U << shift) - 1);
+      if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) {
+        mantissa_shifted++;
+      }
+      // Rounding can reach 0x400, which is the smallest normal half; masking with 0x3FF would turn it into 0.
+      result = static_cast<uint16_t>((sign << 15) | mantissa_shifted);
+    }
+  } else if (exponent == 0xFF - 127 + 15) {
+    result = static_cast<uint16_t>((sign << 15) | (mantissa == 0 ? 0x7C00 : 0x7E00));
+  } else if (exponent > 30) {
+    result = static_cast<uint16_t>((sign << 15) | 0x7C00);
+  } else {
+    uint32_t mantissa_rounded = mantissa >> 13;
+    uint32_t remainder = mantissa & 0x1FFF;
+    if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) {
+      mantissa_rounded++;
+    }
+    // Adding instead of OR-ing lets a mantissa carry (0x400) bump the exponent; from exponent 30 it becomes inf.
+    result = static_cast<uint16_t>((sign << 15) + (static_cast<uint32_t>(exponent) << 10) + mantissa_rounded);
+  }
+  return result;
+}
+
+#ifdef SUPPORT_FT78
+static inline int GetActCode(int act_type) {  // ActType_Relu -> 1, ActType_Relu6 -> 2, anything else -> 0
+  int act_code = 0;
+  if (act_type == ActType_Relu) {
+    act_code = 1;
+  } else if (act_type == ActType_Relu6) {
+    act_code = 2;
+  }
+  return act_code;
+}
+
+static lite::Tensor *CreateFT78ParamTensor(const std::shared_ptr<DSPAllocator> &allocator, int M, int N, int K,
+                                           int act_code) {
+  // Builds a 1-D int32 const tensor of four elements holding {M, N, K, act_code}; MallocData(allocator)
+  // provides its buffer. The tensor is created with `new` and returned raw, so the caller must delete it.
+  std::vector<int> dims = {4};
+  auto *param_tensor = new lite::Tensor(kNumberTypeInt32, dims, NHWC, lite::Category::CONST_TENSOR);
+  param_tensor->MallocData(allocator);
+  const int32_t values[] = {M, N, K, act_code};
+  auto *dst = param_tensor->MutableData();
+  std::memcpy(dst, values, sizeof(values));
+  return param_tensor;
+}
+#endif
+
+#ifndef SUPPORT_FT78
+// Large size tests (M=N=K=256) across dtypes
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) {  // fp32, 256x256x256, bias + ReLU
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};  // bias is a full M x N matrix, not a length-N vector
+  auto t_A = new lite::Tensor(kNumberTypeFloat32, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  FillFloat(reinterpret_cast<float *>(t_A->MutableData()), M * K, 0.02f);  // deterministic ramp inputs
+  FillFloat(reinterpret_cast<float *>(t_B->MutableData()), K * N, 0.03f);
+  FillFloat(reinterpret_cast<float *>(t_bias->MutableData()), M * N, 0.005f);
+  std::memset(t_out->MutableData(), 0, M * N * sizeof(float));  // start from a known output state
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();  // NOTE(review): not deleted here; presumably owned by the kernel
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
+  auto A = reinterpret_cast<float *>(t_A->MutableData());
+  auto B = reinterpret_cast<float *>(t_B->MutableData());
+  auto bias = reinterpret_cast<float *>(t_bias->MutableData());
+  auto C = reinterpret_cast<float *>(t_out->MutableData());
+  std::vector<float> expect(M * N, 0.f);  // host-side reference: relu(A * B + bias)
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float sum = 0.f;
+      for (int k = 0; k < K; ++k) {
+        sum += A[m * K + k] * B[k * N + n];
+      }
+      sum += bias[m * N + n];
+      expect[m * N + n] = sum > 0.f ? sum : 0.f;  // ReLU
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3));  // 1e-3 is the tolerance argument
+  UninitDSPRuntime();
+  delete ctx;  // cleanup below is skipped if any ASSERT_* above fails
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) {  // fp16, 256x256x256, bias + ReLU
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};  // bias is a full M x N matrix, not a length-N vector
+  auto t_A = new lite::Tensor(kNumberTypeFloat16, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeFloat16, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeFloat16, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeFloat16, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  auto A16 = reinterpret_cast<uint16_t *>(t_A->MutableData());  // half values are handled as raw 16-bit patterns
+  auto B16 = reinterpret_cast<uint16_t *>(t_B->MutableData());
+  auto bias16 = reinterpret_cast<uint16_t *>(t_bias->MutableData());
+  auto C16 = reinterpret_cast<uint16_t *>(t_out->MutableData());
+  for (int i = 0; i < M * K; ++i) {
+    A16[i] = Fp32ToFp16Bits(0.01f * static_cast<float>(i % 13));
+  }
+  for (int i = 0; i < K * N; ++i) {
+    B16[i] = Fp32ToFp16Bits(0.02f * static_cast<float>(i % 17));
+  }
+  for (int i = 0; i < M * N; ++i) {
+    bias16[i] = Fp32ToFp16Bits(0.003f * static_cast<float>(i % 11));
+  }
+  std::memset(C16, 0, M * N * sizeof(uint16_t));  // start from a known output state
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();  // NOTE(review): not deleted here; presumably owned by the kernel
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
+  auto Fp16ToFp32 = [&](uint16_t h) {  // decodes binary16 bits to float for the host-side reference
+    uint32_t sign = (h & 0x8000) << 16;
+    uint32_t exp = (h & 0x7C00) >> 10;
+    uint32_t frac = (h & 0x03FF);
+    uint32_t fexp, ffrac;
+    if (exp == 0) {
+      if (frac == 0) {
+        fexp = 0;
+        ffrac = 0;
+      } else {
+        int shift = 0;
+        while ((frac & 0x0400) == 0) {  // normalise: shift until the leading 1 reaches the implicit bit 10
+          frac <<= 1;
+          ++shift;
+        }
+        frac &= 0x03FF;  // drop the now-implicit leading 1
+        fexp = 127 - 15 + 1 - shift;  // subnormals use the minimum exponent (2^-14), less the normalisation shift
+        ffrac = frac << 13;
+      }
+    } else if (exp == 0x1F) {  // inf / NaN
+      fexp = 255;
+      ffrac = frac << 13;
+    } else {
+      fexp = exp - 15 + 127;
+      ffrac = frac << 13;
+    }
+    uint32_t bits = sign | (fexp << 23) | ffrac;
+    float out;
+    std::memcpy(&out, &bits, sizeof(out));
+    return out;
+  };
+  std::vector<float> expect_fp32(M * N, 0.f);  // reference accumulated in fp32 from the decoded half inputs
+  std::vector<float> actual_fp32(M * N, 0.f);
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float sum = 0.f;
+      for (int k = 0; k < K; ++k) {
+        float a = Fp16ToFp32(A16[m * K + k]);
+        float b = Fp16ToFp32(B16[k * N + n]);
+        sum += a * b;
+      }
+      sum += Fp16ToFp32(bias16[m * N + n]);
+      expect_fp32[m * N + n] = sum > 0.f ? sum : 0.f;  // ReLU
+      actual_fp32[m * N + n] = Fp16ToFp32(C16[m * N + n]);
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(actual_fp32.data(), expect_fp32.data(), M * N, 5e-2));  // looser fp16 tolerance
+  UninitDSPRuntime();
+  delete ctx;  // cleanup below is skipped if any ASSERT_* above fails
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) {  // int32, 256x256x256, bias + ReLU
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};  // bias is a full M x N matrix, not a length-N vector
+  auto t_A = new lite::Tensor(kNumberTypeInt32, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeInt32, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeInt32, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeInt32, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  auto A = reinterpret_cast<int32_t *>(t_A->MutableData());
+  auto B = reinterpret_cast<int32_t *>(t_B->MutableData());
+  auto bias = reinterpret_cast<int32_t *>(t_bias->MutableData());
+  auto C = reinterpret_cast<int32_t *>(t_out->MutableData());
+  for (int i = 0; i < M * K; ++i) {
+    A[i] = (i % 11) - 5;  // small signed values in [-5, 5]
+  }
+  for (int i = 0; i < K * N; ++i) {
+    B[i] = (i % 13) - 6;  // small signed values in [-6, 6]
+  }
+  for (int i = 0; i < M * N; ++i) {
+    bias[i] = (i % 9) - 4;  // small signed values in [-4, 4]
+  }
+  std::memset(C, 0, M * N * sizeof(int32_t));  // start from a known output state
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();  // NOTE(review): not deleted here; presumably owned by the kernel
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
+  std::vector<int32_t> expect(M * N, 0);  // host-side reference: relu(A * B + bias)
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      int64_t sum = 0;  // accumulate in 64 bits, narrowed to int32 when stored
+      for (int k = 0; k < K; ++k) {
+        sum += static_cast<int64_t>(A[m * K + k]) * B[k * N + n];
+      }
+      sum += static_cast<int64_t>(bias[m * N + n]);
+      expect[m * N + n] = static_cast<int32_t>(sum > 0 ? sum : 0);  // ReLU
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f));  // zero tolerance: integer results must match
+  UninitDSPRuntime();
+  delete ctx;  // cleanup below is skipped if any ASSERT_* above fails
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) {
+  // Runs the DSP int16 MatMulFusion kernel on a 256x256 by 256x256 product with an [M, N] bias and
+  // ReLU, then compares the output exactly against a host-side reference accumulated in int64.
+  InitDSPRuntime();
+  const int rows = 256;
+  const int depth = 256;
+  const int cols = 256;
+  const int lhs_count = rows * depth;
+  const int rhs_count = depth * cols;
+  const int out_count = rows * cols;
+  std::vector<int> lhs_dims = {rows, depth};
+  std::vector<int> rhs_dims = {depth, cols};
+  std::vector<int> result_dims = {rows, cols};
+  std::vector<int> bias_dims = {rows, cols};
+  auto *lhs_tensor = new lite::Tensor(kNumberTypeInt16, lhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *rhs_tensor = new lite::Tensor(kNumberTypeInt16, rhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *bias_tensor = new lite::Tensor(kNumberTypeInt16, bias_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *result_tensor = new lite::Tensor(kNumberTypeInt16, result_dims, NHWC, lite::Category::CONST_TENSOR);
+  lhs_tensor->MallocData(allocator_);
+  rhs_tensor->MallocData(allocator_);
+  bias_tensor->MallocData(allocator_);
+  result_tensor->MallocData(allocator_);
+  int16_t *lhs = reinterpret_cast<int16_t *>(lhs_tensor->MutableData());
+  int16_t *rhs = reinterpret_cast<int16_t *>(rhs_tensor->MutableData());
+  int16_t *bias = reinterpret_cast<int16_t *>(bias_tensor->MutableData());
+  int16_t *result = reinterpret_cast<int16_t *>(result_tensor->MutableData());
+  // Deterministic patterns: lhs in [-10, 10], rhs in [-9, 9], bias in [0, 14].
+  for (int idx = 0; idx < lhs_count; ++idx) {
+    lhs[idx] = static_cast<int16_t>((idx % 21) - 10);
+  }
+  for (int idx = 0; idx < rhs_count; ++idx) {
+    rhs[idx] = static_cast<int16_t>((idx % 19) - 9);
+  }
+  for (int idx = 0; idx < out_count; ++idx) {
+    bias[idx] = static_cast<int16_t>(idx % 15);
+  }
+  std::memset(result, 0, out_count * sizeof(int16_t));
+  std::vector<lite::Tensor *> in_tensors{lhs_tensor, rhs_tensor, bias_tensor};
+  std::vector<lite::Tensor *> out_tensors{result_tensor};
+  auto *inner_ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, inner_ctx->Init());
+  auto *matmul_param = new MatMulParameter();
+  matmul_param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  matmul_param->row_ = rows;
+  matmul_param->col_ = cols;
+  matmul_param->deep_ = depth;
+  matmul_param->has_bias_ = true;
+  matmul_param->act_type_ = ActType_Relu;
+  kernel::KernelKey kernel_key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC,
+                                  schema::PrimitiveType_MatMulFusion};
+  auto make_kernel = KernelRegistry::GetInstance()->GetCreator(kernel_key);
+  ASSERT_NE(make_kernel, nullptr);
+  auto dsp_kernel =
+    make_kernel(in_tensors, out_tensors, reinterpret_cast<OpParameter *>(matmul_param), inner_ctx, kernel_key);
+  ASSERT_NE(dsp_kernel, nullptr);
+  ASSERT_EQ(dsp_kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(dsp_kernel->Run(), lite::RET_OK);
+  // Host reference: int64 accumulation, bias add, ReLU, then saturation at the int16 maximum.
+  std::vector<int16_t> expect(out_count, 0);
+  const int64_t int16_ceiling = std::numeric_limits<int16_t>::max();
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      const int out_idx = row * cols + col;
+      int64_t acc = 0;
+      for (int k = 0; k < depth; ++k) {
+        acc += static_cast<int64_t>(lhs[row * depth + k]) * rhs[k * cols + col];
+      }
+      acc += static_cast<int64_t>(bias[out_idx]);
+      if (acc < 0) {
+        acc = 0;
+      } else if (acc > int16_ceiling) {
+        acc = int16_ceiling;
+      }
+      expect[out_idx] = static_cast<int16_t>(acc);
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(result, expect.data(), out_count, 0.f));
+  UninitDSPRuntime();
+  delete inner_ctx;
+  delete dsp_kernel;
+  delete lhs_tensor;
+  delete rhs_tensor;
+  delete bias_tensor;
+  delete result_tensor;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) {
+  // Runs the DSP complex64 MatMulFusion kernel on a 256x256 by 256x256 product with an [M, N] bias
+  // and ReLU, then compares it with a host-side float reference. Buffers are accessed as float
+  // arrays with interleaved (real, imag) pairs, so element i lives at [2 * i] and [2 * i + 1].
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};
+  auto t_A = new lite::Tensor(kNumberTypeComplex64, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeComplex64, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeComplex64, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeComplex64, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  auto A = reinterpret_cast<float *>(t_A->MutableData());
+  auto B = reinterpret_cast<float *>(t_B->MutableData());
+  auto bias = reinterpret_cast<float *>(t_bias->MutableData());
+  auto C = reinterpret_cast<float *>(t_out->MutableData());  // complex64 stored as interleaved real,imag
+  // Deterministic, non-negative real and imaginary parts for both operands and the bias.
+  for (int i = 0; i < M * K; ++i) {
+    A[2 * i] = 0.01f * (i % 17);
+    A[2 * i + 1] = 0.02f * (i % 19);
+  }
+  for (int i = 0; i < K * N; ++i) {
+    B[2 * i] = 0.03f * (i % 23);
+    B[2 * i + 1] = 0.01f * (i % 29);
+  }
+  for (int i = 0; i < M * N; ++i) {
+    bias[2 * i] = 0.002f * (i % 31);
+    bias[2 * i + 1] = 0.001f * (i % 37);
+  }
+  std::memset(C, 0, M * N * 2 * sizeof(float));
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
+  // Host reference: complex multiply-accumulate, bias add, then ReLU on the real part only.
+  // The kernel output is copied into `actual` so both sides are compared as flat float arrays.
+  std::vector<float> expect(2 * M * N, 0.f);
+  std::vector<float> actual(2 * M * N, 0.f);
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float real = 0.f;
+      float imag = 0.f;
+      for (int k = 0; k < K; ++k) {
+        float ar = A[2 * (m * K + k)];
+        float ai = A[2 * (m * K + k) + 1];
+        float br = B[2 * (k * N + n)];
+        float bi = B[2 * (k * N + n) + 1];
+        real += ar * br - ai * bi;
+        imag += ar * bi + ai * br;
+      }
+      real += bias[2 * (m * N + n)];
+      imag += bias[2 * (m * N + n) + 1];
+      if (real < 0.f) real = 0.f;
+      expect[2 * (m * N + n)] = real;
+      expect[2 * (m * N + n) + 1] = imag;
+      actual[2 * (m * N + n)] = C[2 * (m * N + n)];
+      actual[2 * (m * N + n) + 1] = C[2 * (m * N + n) + 1];
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2));
+  UninitDSPRuntime();
+  // Release everything allocated in this test, including the output tensor.
+  delete ctx;
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
+}
+#endif
+
+#ifdef SUPPORT_FT78
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu_FT78) {
+  // FT78 variant of the float32 MatMulFusion test: 256x256 by 256x256 with an [M, N] bias and ReLU.
+  // Besides A, B and bias, the kernel receives a fourth input tensor built by CreateFT78ParamTensor
+  // from M, N, K and the activation code.
+  InitDSPRuntime();
+  const int rows = 256;
+  const int depth = 256;
+  const int cols = 256;
+  const int out_count = rows * cols;
+  std::vector<int> lhs_dims = {rows, depth};
+  std::vector<int> rhs_dims = {depth, cols};
+  std::vector<int> result_dims = {rows, cols};
+  std::vector<int> bias_dims = {rows, cols};
+  auto *lhs_tensor = new lite::Tensor(kNumberTypeFloat32, lhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *rhs_tensor = new lite::Tensor(kNumberTypeFloat32, rhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *bias_tensor = new lite::Tensor(kNumberTypeFloat32, bias_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *result_tensor = new lite::Tensor(kNumberTypeFloat32, result_dims, NHWC, lite::Category::CONST_TENSOR);
+  lhs_tensor->MallocData(allocator_);
+  rhs_tensor->MallocData(allocator_);
+  bias_tensor->MallocData(allocator_);
+  result_tensor->MallocData(allocator_);
+  FillFloat(reinterpret_cast<float *>(lhs_tensor->MutableData()), rows * depth, 0.02f);
+  FillFloat(reinterpret_cast<float *>(rhs_tensor->MutableData()), depth * cols, 0.03f);
+  FillFloat(reinterpret_cast<float *>(bias_tensor->MutableData()), out_count, 0.005f);
+  std::memset(result_tensor->MutableData(), 0, out_count * sizeof(float));
+  auto *matmul_param = new MatMulParameter();
+  matmul_param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  matmul_param->row_ = rows;
+  matmul_param->col_ = cols;
+  matmul_param->deep_ = depth;
+  matmul_param->has_bias_ = true;
+  matmul_param->act_type_ = ActType_Relu;
+  const int activation_code = GetActCode(matmul_param->act_type_);
+  auto param_tensor = CreateFT78ParamTensor(allocator_, rows, cols, depth, activation_code);
+  std::vector<lite::Tensor *> in_tensors{lhs_tensor, rhs_tensor, bias_tensor, param_tensor};
+  std::vector<lite::Tensor *> out_tensors{result_tensor};
+  auto *inner_ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, inner_ctx->Init());
+  kernel::KernelKey kernel_key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC,
+                                  schema::PrimitiveType_MatMulFusion};
+  auto make_kernel = KernelRegistry::GetInstance()->GetCreator(kernel_key);
+  ASSERT_NE(make_kernel, nullptr);
+  auto dsp_kernel =
+    make_kernel(in_tensors, out_tensors, reinterpret_cast<OpParameter *>(matmul_param), inner_ctx, kernel_key);
+  ASSERT_NE(dsp_kernel, nullptr);
+  ASSERT_EQ(dsp_kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(dsp_kernel->Run(), lite::RET_OK);
+  float *lhs = reinterpret_cast<float *>(lhs_tensor->MutableData());
+  float *rhs = reinterpret_cast<float *>(rhs_tensor->MutableData());
+  float *bias = reinterpret_cast<float *>(bias_tensor->MutableData());
+  float *result = reinterpret_cast<float *>(result_tensor->MutableData());
+  // Host reference in float: dot product over k, bias add, then ReLU. `expect` starts at zero,
+  // so only strictly positive results need to be stored.
+  std::vector<float> expect(out_count, 0.f);
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      const int out_idx = row * cols + col;
+      float acc = 0.f;
+      for (int k = 0; k < depth; ++k) {
+        acc += lhs[row * depth + k] * rhs[k * cols + col];
+      }
+      acc += bias[out_idx];
+      if (acc > 0.f) {
+        expect[out_idx] = acc;
+      }
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(result, expect.data(), out_count, 1e-3));
+  UninitDSPRuntime();
+  delete inner_ctx;
+  delete dsp_kernel;
+  delete lhs_tensor;
+  delete rhs_tensor;
+  delete bias_tensor;
+  delete param_tensor;
+  delete result_tensor;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu_FT78) {
+  // FT78 variant of the int32 MatMulFusion test: 256x256 by 256x256 with an [M, N] bias and ReLU,
+  // compared exactly against a host reference accumulated in int64.
+  InitDSPRuntime();
+  const int rows = 256;
+  const int depth = 256;
+  const int cols = 256;
+  const int lhs_count = rows * depth;
+  const int rhs_count = depth * cols;
+  const int out_count = rows * cols;
+  std::vector<int> lhs_dims = {rows, depth};
+  std::vector<int> rhs_dims = {depth, cols};
+  std::vector<int> result_dims = {rows, cols};
+  std::vector<int> bias_dims = {rows, cols};
+  auto *lhs_tensor = new lite::Tensor(kNumberTypeInt32, lhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *rhs_tensor = new lite::Tensor(kNumberTypeInt32, rhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *bias_tensor = new lite::Tensor(kNumberTypeInt32, bias_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *result_tensor = new lite::Tensor(kNumberTypeInt32, result_dims, NHWC, lite::Category::CONST_TENSOR);
+  lhs_tensor->MallocData(allocator_);
+  rhs_tensor->MallocData(allocator_);
+  bias_tensor->MallocData(allocator_);
+  result_tensor->MallocData(allocator_);
+  int32_t *lhs = reinterpret_cast<int32_t *>(lhs_tensor->MutableData());
+  int32_t *rhs = reinterpret_cast<int32_t *>(rhs_tensor->MutableData());
+  int32_t *bias = reinterpret_cast<int32_t *>(bias_tensor->MutableData());
+  int32_t *result = reinterpret_cast<int32_t *>(result_tensor->MutableData());
+  // Deterministic patterns: lhs in [-5, 5], rhs in [-6, 6], bias in [-4, 4].
+  for (int idx = 0; idx < lhs_count; ++idx) {
+    lhs[idx] = (idx % 11) - 5;
+  }
+  for (int idx = 0; idx < rhs_count; ++idx) {
+    rhs[idx] = (idx % 13) - 6;
+  }
+  for (int idx = 0; idx < out_count; ++idx) {
+    bias[idx] = (idx % 9) - 4;
+  }
+  std::memset(result, 0, out_count * sizeof(int32_t));
+  auto *matmul_param = new MatMulParameter();
+  matmul_param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  matmul_param->row_ = rows;
+  matmul_param->col_ = cols;
+  matmul_param->deep_ = depth;
+  matmul_param->has_bias_ = true;
+  matmul_param->act_type_ = ActType_Relu;
+  // The extra FT78 input tensor is built from M, N, K and the activation code.
+  const int activation_code = GetActCode(matmul_param->act_type_);
+  auto param_tensor = CreateFT78ParamTensor(allocator_, rows, cols, depth, activation_code);
+  std::vector<lite::Tensor *> in_tensors{lhs_tensor, rhs_tensor, bias_tensor, param_tensor};
+  std::vector<lite::Tensor *> out_tensors{result_tensor};
+  auto *inner_ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, inner_ctx->Init());
+  kernel::KernelKey kernel_key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC,
+                                  schema::PrimitiveType_MatMulFusion};
+  auto make_kernel = KernelRegistry::GetInstance()->GetCreator(kernel_key);
+  ASSERT_NE(make_kernel, nullptr);
+  auto dsp_kernel =
+    make_kernel(in_tensors, out_tensors, reinterpret_cast<OpParameter *>(matmul_param), inner_ctx, kernel_key);
+  ASSERT_NE(dsp_kernel, nullptr);
+  ASSERT_EQ(dsp_kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(dsp_kernel->Run(), lite::RET_OK);
+  // Host reference: `expect` starts at zero, so ReLU only needs to store strictly positive sums.
+  std::vector<int32_t> expect(out_count, 0);
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      const int out_idx = row * cols + col;
+      int64_t acc = 0;
+      for (int k = 0; k < depth; ++k) {
+        acc += static_cast<int64_t>(lhs[row * depth + k]) * rhs[k * cols + col];
+      }
+      acc += static_cast<int64_t>(bias[out_idx]);
+      if (acc > 0) {
+        expect[out_idx] = static_cast<int32_t>(acc);
+      }
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(result, expect.data(), out_count, 0.f));
+  UninitDSPRuntime();
+  delete inner_ctx;
+  delete dsp_kernel;
+  delete lhs_tensor;
+  delete rhs_tensor;
+  delete bias_tensor;
+  delete param_tensor;
+  delete result_tensor;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu_FT78) {
+  // FT78 variant of the int16 MatMulFusion test: 256x256 by 256x256 with an [M, N] bias and ReLU,
+  // compared exactly against a host reference accumulated in int64 and saturated to int16.
+  InitDSPRuntime();
+  const int rows = 256;
+  const int depth = 256;
+  const int cols = 256;
+  const int lhs_count = rows * depth;
+  const int rhs_count = depth * cols;
+  const int out_count = rows * cols;
+  std::vector<int> lhs_dims = {rows, depth};
+  std::vector<int> rhs_dims = {depth, cols};
+  std::vector<int> result_dims = {rows, cols};
+  std::vector<int> bias_dims = {rows, cols};
+  auto *lhs_tensor = new lite::Tensor(kNumberTypeInt16, lhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *rhs_tensor = new lite::Tensor(kNumberTypeInt16, rhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *bias_tensor = new lite::Tensor(kNumberTypeInt16, bias_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *result_tensor = new lite::Tensor(kNumberTypeInt16, result_dims, NHWC, lite::Category::CONST_TENSOR);
+  lhs_tensor->MallocData(allocator_);
+  rhs_tensor->MallocData(allocator_);
+  bias_tensor->MallocData(allocator_);
+  result_tensor->MallocData(allocator_);
+  int16_t *lhs = reinterpret_cast<int16_t *>(lhs_tensor->MutableData());
+  int16_t *rhs = reinterpret_cast<int16_t *>(rhs_tensor->MutableData());
+  int16_t *bias = reinterpret_cast<int16_t *>(bias_tensor->MutableData());
+  int16_t *result = reinterpret_cast<int16_t *>(result_tensor->MutableData());
+  // Deterministic patterns: lhs in [-10, 10], rhs in [-9, 9], bias in [0, 14].
+  for (int idx = 0; idx < lhs_count; ++idx) {
+    lhs[idx] = static_cast<int16_t>((idx % 21) - 10);
+  }
+  for (int idx = 0; idx < rhs_count; ++idx) {
+    rhs[idx] = static_cast<int16_t>((idx % 19) - 9);
+  }
+  for (int idx = 0; idx < out_count; ++idx) {
+    bias[idx] = static_cast<int16_t>(idx % 15);
+  }
+  std::memset(result, 0, out_count * sizeof(int16_t));
+  auto *matmul_param = new MatMulParameter();
+  matmul_param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  matmul_param->row_ = rows;
+  matmul_param->col_ = cols;
+  matmul_param->deep_ = depth;
+  matmul_param->has_bias_ = true;
+  matmul_param->act_type_ = ActType_Relu;
+  // The extra FT78 input tensor is built from M, N, K and the activation code.
+  const int activation_code = GetActCode(matmul_param->act_type_);
+  auto param_tensor = CreateFT78ParamTensor(allocator_, rows, cols, depth, activation_code);
+  std::vector<lite::Tensor *> in_tensors{lhs_tensor, rhs_tensor, bias_tensor, param_tensor};
+  std::vector<lite::Tensor *> out_tensors{result_tensor};
+  auto *inner_ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, inner_ctx->Init());
+  kernel::KernelKey kernel_key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC,
+                                  schema::PrimitiveType_MatMulFusion};
+  auto make_kernel = KernelRegistry::GetInstance()->GetCreator(kernel_key);
+  ASSERT_NE(make_kernel, nullptr);
+  auto dsp_kernel =
+    make_kernel(in_tensors, out_tensors, reinterpret_cast<OpParameter *>(matmul_param), inner_ctx, kernel_key);
+  ASSERT_NE(dsp_kernel, nullptr);
+  ASSERT_EQ(dsp_kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(dsp_kernel->Run(), lite::RET_OK);
+  // Host reference: int64 accumulation, bias add, ReLU, then saturation at the int16 maximum.
+  std::vector<int16_t> expect(out_count, 0);
+  const int64_t int16_ceiling = std::numeric_limits<int16_t>::max();
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      const int out_idx = row * cols + col;
+      int64_t acc = 0;
+      for (int k = 0; k < depth; ++k) {
+        acc += static_cast<int64_t>(lhs[row * depth + k]) * rhs[k * cols + col];
+      }
+      acc += static_cast<int64_t>(bias[out_idx]);
+      if (acc < 0) {
+        acc = 0;
+      } else if (acc > int16_ceiling) {
+        acc = int16_ceiling;
+      }
+      expect[out_idx] = static_cast<int16_t>(acc);
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(result, expect.data(), out_count, 0.f));
+  UninitDSPRuntime();
+  delete inner_ctx;
+  delete dsp_kernel;
+  delete lhs_tensor;
+  delete rhs_tensor;
+  delete bias_tensor;
+  delete param_tensor;
+  delete result_tensor;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu_FT78) {
+  // FT78 variant of the complex64 MatMulFusion test: 256x256 by 256x256 with an [M, N] bias and
+  // ReLU. Buffers are accessed as float arrays holding interleaved (real, imag) pairs.
+  InitDSPRuntime();
+  const int rows = 256;
+  const int depth = 256;
+  const int cols = 256;
+  const int lhs_count = rows * depth;
+  const int rhs_count = depth * cols;
+  const int out_count = rows * cols;
+  std::vector<int> lhs_dims = {rows, depth};
+  std::vector<int> rhs_dims = {depth, cols};
+  std::vector<int> result_dims = {rows, cols};
+  std::vector<int> bias_dims = {rows, cols};
+  auto *lhs_tensor = new lite::Tensor(kNumberTypeComplex64, lhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *rhs_tensor = new lite::Tensor(kNumberTypeComplex64, rhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *bias_tensor = new lite::Tensor(kNumberTypeComplex64, bias_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *result_tensor = new lite::Tensor(kNumberTypeComplex64, result_dims, NHWC, lite::Category::CONST_TENSOR);
+  lhs_tensor->MallocData(allocator_);
+  rhs_tensor->MallocData(allocator_);
+  bias_tensor->MallocData(allocator_);
+  result_tensor->MallocData(allocator_);
+  float *lhs = reinterpret_cast<float *>(lhs_tensor->MutableData());
+  float *rhs = reinterpret_cast<float *>(rhs_tensor->MutableData());
+  float *bias = reinterpret_cast<float *>(bias_tensor->MutableData());
+  float *result = reinterpret_cast<float *>(result_tensor->MutableData());
+  // Deterministic, non-negative real and imaginary parts for both operands and the bias.
+  for (int idx = 0; idx < lhs_count; ++idx) {
+    float *cell = lhs + 2 * idx;
+    cell[0] = 0.01f * (idx % 17);
+    cell[1] = 0.02f * (idx % 19);
+  }
+  for (int idx = 0; idx < rhs_count; ++idx) {
+    float *cell = rhs + 2 * idx;
+    cell[0] = 0.03f * (idx % 23);
+    cell[1] = 0.01f * (idx % 29);
+  }
+  for (int idx = 0; idx < out_count; ++idx) {
+    float *cell = bias + 2 * idx;
+    cell[0] = 0.002f * (idx % 31);
+    cell[1] = 0.001f * (idx % 37);
+  }
+  std::memset(result, 0, out_count * 2 * sizeof(float));
+  auto *matmul_param = new MatMulParameter();
+  matmul_param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  matmul_param->row_ = rows;
+  matmul_param->col_ = cols;
+  matmul_param->deep_ = depth;
+  matmul_param->has_bias_ = true;
+  matmul_param->act_type_ = ActType_Relu;
+  // The extra FT78 input tensor is built from M, N, K and the activation code.
+  const int activation_code = GetActCode(matmul_param->act_type_);
+  auto param_tensor = CreateFT78ParamTensor(allocator_, rows, cols, depth, activation_code);
+  std::vector<lite::Tensor *> in_tensors{lhs_tensor, rhs_tensor, bias_tensor, param_tensor};
+  std::vector<lite::Tensor *> out_tensors{result_tensor};
+  auto *inner_ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, inner_ctx->Init());
+  kernel::KernelKey kernel_key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC,
+                                  schema::PrimitiveType_MatMulFusion};
+  auto make_kernel = KernelRegistry::GetInstance()->GetCreator(kernel_key);
+  ASSERT_NE(make_kernel, nullptr);
+  auto dsp_kernel =
+    make_kernel(in_tensors, out_tensors, reinterpret_cast<OpParameter *>(matmul_param), inner_ctx, kernel_key);
+  ASSERT_NE(dsp_kernel, nullptr);
+  ASSERT_EQ(dsp_kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(dsp_kernel->Run(), lite::RET_OK);
+  // Host reference: complex multiply-accumulate, bias add, then ReLU on the real part only.
+  // The kernel output is copied into `actual` so both sides are compared as flat float arrays.
+  std::vector<float> expect(2 * out_count, 0.f);
+  std::vector<float> actual(2 * out_count, 0.f);
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      const int re_idx = 2 * (row * cols + col);
+      const int im_idx = re_idx + 1;
+      float re_acc = 0.f;
+      float im_acc = 0.f;
+      for (int k = 0; k < depth; ++k) {
+        const int a_idx = 2 * (row * depth + k);
+        const int b_idx = 2 * (k * cols + col);
+        const float ar = lhs[a_idx];
+        const float ai = lhs[a_idx + 1];
+        const float br = rhs[b_idx];
+        const float bi = rhs[b_idx + 1];
+        re_acc += ar * br - ai * bi;
+        im_acc += ar * bi + ai * br;
+      }
+      re_acc += bias[re_idx];
+      im_acc += bias[im_idx];
+      expect[re_idx] = re_acc < 0.f ? 0.f : re_acc;
+      expect[im_idx] = im_acc;
+      actual[re_idx] = result[re_idx];
+      actual[im_idx] = result[im_idx];
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * out_count, 5e-2));
+  UninitDSPRuntime();
+  delete inner_ctx;
+  delete dsp_kernel;
+  delete lhs_tensor;
+  delete rhs_tensor;
+  delete bias_tensor;
+  delete param_tensor;
+  delete result_tensor;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp64_Large_BiasRelu_FT78) {
+  // FT78 float64 MatMulFusion test: 256x256 by 256x256 with an [M, N] bias and ReLU, compared
+  // against a host reference computed in double.
+  InitDSPRuntime();
+  const int rows = 256;
+  const int depth = 256;
+  const int cols = 256;
+  const int lhs_count = rows * depth;
+  const int rhs_count = depth * cols;
+  const int out_count = rows * cols;
+  std::vector<int> lhs_dims = {rows, depth};
+  std::vector<int> rhs_dims = {depth, cols};
+  std::vector<int> result_dims = {rows, cols};
+  std::vector<int> bias_dims = {rows, cols};
+  auto *lhs_tensor = new lite::Tensor(kNumberTypeFloat64, lhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *rhs_tensor = new lite::Tensor(kNumberTypeFloat64, rhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *bias_tensor = new lite::Tensor(kNumberTypeFloat64, bias_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *result_tensor = new lite::Tensor(kNumberTypeFloat64, result_dims, NHWC, lite::Category::CONST_TENSOR);
+  lhs_tensor->MallocData(allocator_);
+  rhs_tensor->MallocData(allocator_);
+  bias_tensor->MallocData(allocator_);
+  result_tensor->MallocData(allocator_);
+  double *lhs = reinterpret_cast<double *>(lhs_tensor->MutableData());
+  double *rhs = reinterpret_cast<double *>(rhs_tensor->MutableData());
+  double *bias = reinterpret_cast<double *>(bias_tensor->MutableData());
+  double *result = reinterpret_cast<double *>(result_tensor->MutableData());
+  // Deterministic, non-negative inputs built from small periodic index patterns.
+  for (int idx = 0; idx < lhs_count; ++idx) {
+    lhs[idx] = 0.015 * static_cast<double>(idx % 13);
+  }
+  for (int idx = 0; idx < rhs_count; ++idx) {
+    rhs[idx] = 0.018 * static_cast<double>(idx % 17);
+  }
+  for (int idx = 0; idx < out_count; ++idx) {
+    bias[idx] = 0.004 * static_cast<double>(idx % 19);
+  }
+  std::fill_n(result, out_count, 0.0);
+  auto *matmul_param = new MatMulParameter();
+  matmul_param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  matmul_param->row_ = rows;
+  matmul_param->col_ = cols;
+  matmul_param->deep_ = depth;
+  matmul_param->has_bias_ = true;
+  matmul_param->act_type_ = ActType_Relu;
+  // The extra FT78 input tensor is built from M, N, K and the activation code.
+  const int activation_code = GetActCode(matmul_param->act_type_);
+  auto param_tensor = CreateFT78ParamTensor(allocator_, rows, cols, depth, activation_code);
+  std::vector<lite::Tensor *> in_tensors{lhs_tensor, rhs_tensor, bias_tensor, param_tensor};
+  std::vector<lite::Tensor *> out_tensors{result_tensor};
+  auto *inner_ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, inner_ctx->Init());
+  kernel::KernelKey kernel_key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat64, NHWC,
+                                  schema::PrimitiveType_MatMulFusion};
+  auto make_kernel = KernelRegistry::GetInstance()->GetCreator(kernel_key);
+  ASSERT_NE(make_kernel, nullptr);
+  auto dsp_kernel =
+    make_kernel(in_tensors, out_tensors, reinterpret_cast<OpParameter *>(matmul_param), inner_ctx, kernel_key);
+  ASSERT_NE(dsp_kernel, nullptr);
+  ASSERT_EQ(dsp_kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(dsp_kernel->Run(), lite::RET_OK);
+  // Host reference: `expect` starts at zero, so ReLU only needs to store strictly positive sums.
+  std::vector<double> expect(out_count, 0.0);
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      const int out_idx = row * cols + col;
+      double acc = 0.0;
+      for (int k = 0; k < depth; ++k) {
+        acc += lhs[row * depth + k] * rhs[k * cols + col];
+      }
+      acc += bias[out_idx];
+      if (acc > 0.0) {
+        expect[out_idx] = acc;
+      }
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(result, expect.data(), out_count, 1e-6));
+  UninitDSPRuntime();
+  delete inner_ctx;
+  delete dsp_kernel;
+  delete lhs_tensor;
+  delete rhs_tensor;
+  delete bias_tensor;
+  delete param_tensor;
+  delete result_tensor;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int8_Large_BiasRelu_FT78) {
+  // FT78 int8 MatMulFusion test: 32x32 by 32x32 with an [M, N] bias and ReLU, compared exactly
+  // against a host reference accumulated in int32 and saturated to the int8 maximum.
+  InitDSPRuntime();
+  const int rows = 32;
+  const int depth = 32;
+  const int cols = 32;
+  const int lhs_count = rows * depth;
+  const int rhs_count = depth * cols;
+  const int out_count = rows * cols;
+  std::vector<int> lhs_dims = {rows, depth};
+  std::vector<int> rhs_dims = {depth, cols};
+  std::vector<int> result_dims = {rows, cols};
+  std::vector<int> bias_dims = {rows, cols};
+  auto *lhs_tensor = new lite::Tensor(kNumberTypeInt8, lhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *rhs_tensor = new lite::Tensor(kNumberTypeInt8, rhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *bias_tensor = new lite::Tensor(kNumberTypeInt8, bias_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *result_tensor = new lite::Tensor(kNumberTypeInt8, result_dims, NHWC, lite::Category::CONST_TENSOR);
+  lhs_tensor->MallocData(allocator_);
+  rhs_tensor->MallocData(allocator_);
+  bias_tensor->MallocData(allocator_);
+  result_tensor->MallocData(allocator_);
+  int8_t *lhs = reinterpret_cast<int8_t *>(lhs_tensor->MutableData());
+  int8_t *rhs = reinterpret_cast<int8_t *>(rhs_tensor->MutableData());
+  int8_t *bias = reinterpret_cast<int8_t *>(bias_tensor->MutableData());
+  int8_t *result = reinterpret_cast<int8_t *>(result_tensor->MutableData());
+  // Deterministic patterns: lhs in [-3, 3], rhs in [-4, 4], bias in [-2, 2].
+  for (int idx = 0; idx < lhs_count; ++idx) {
+    lhs[idx] = static_cast<int8_t>((idx % 7) - 3);
+  }
+  for (int idx = 0; idx < rhs_count; ++idx) {
+    rhs[idx] = static_cast<int8_t>((idx % 9) - 4);
+  }
+  for (int idx = 0; idx < out_count; ++idx) {
+    bias[idx] = static_cast<int8_t>(idx % 5 - 2);
+  }
+  std::fill_n(result, out_count, static_cast<int8_t>(0));
+  auto *matmul_param = new MatMulParameter();
+  matmul_param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  matmul_param->row_ = rows;
+  matmul_param->col_ = cols;
+  matmul_param->deep_ = depth;
+  matmul_param->has_bias_ = true;
+  matmul_param->act_type_ = ActType_Relu;
+  // The extra FT78 input tensor is built from M, N, K and the activation code.
+  const int activation_code = GetActCode(matmul_param->act_type_);
+  auto param_tensor = CreateFT78ParamTensor(allocator_, rows, cols, depth, activation_code);
+  std::vector<lite::Tensor *> in_tensors{lhs_tensor, rhs_tensor, bias_tensor, param_tensor};
+  std::vector<lite::Tensor *> out_tensors{result_tensor};
+  auto *inner_ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, inner_ctx->Init());
+  kernel::KernelKey kernel_key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt8, NHWC,
+                                  schema::PrimitiveType_MatMulFusion};
+  auto make_kernel = KernelRegistry::GetInstance()->GetCreator(kernel_key);
+  ASSERT_NE(make_kernel, nullptr);
+  auto dsp_kernel =
+    make_kernel(in_tensors, out_tensors, reinterpret_cast<OpParameter *>(matmul_param), inner_ctx, kernel_key);
+  ASSERT_NE(dsp_kernel, nullptr);
+  ASSERT_EQ(dsp_kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(dsp_kernel->Run(), lite::RET_OK);
+  // Host reference: int32 accumulation, bias add, ReLU, then saturation at the int8 maximum.
+  std::vector<int8_t> expect(out_count, static_cast<int8_t>(0));
+  const int32_t int8_ceiling = std::numeric_limits<int8_t>::max();
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      const int out_idx = row * cols + col;
+      int32_t acc = 0;
+      for (int k = 0; k < depth; ++k) {
+        const int32_t a_val = static_cast<int32_t>(lhs[row * depth + k]);
+        const int32_t b_val = static_cast<int32_t>(rhs[k * cols + col]);
+        acc += a_val * b_val;
+      }
+      acc += static_cast<int32_t>(bias[out_idx]);
+      if (acc < 0) {
+        acc = 0;
+      } else if (acc > int8_ceiling) {
+        acc = int8_ceiling;
+      }
+      expect[out_idx] = static_cast<int8_t>(acc);
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(result, expect.data(), out_count, 0.0f));
+  UninitDSPRuntime();
+  delete inner_ctx;
+  delete dsp_kernel;
+  delete lhs_tensor;
+  delete rhs_tensor;
+  delete bias_tensor;
+  delete param_tensor;
+  delete result_tensor;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex128_Large_BiasRelu_FT78) {
+  // FT78 complex128 MatMulFusion test: 256x256 by 256x256 with an [M, N] bias and ReLU. Buffers
+  // are accessed as double arrays holding interleaved (real, imag) pairs.
+  InitDSPRuntime();
+  const int rows = 256;
+  const int depth = 256;
+  const int cols = 256;
+  const int lhs_count = rows * depth;
+  const int rhs_count = depth * cols;
+  const int out_count = rows * cols;
+  std::vector<int> lhs_dims = {rows, depth};
+  std::vector<int> rhs_dims = {depth, cols};
+  std::vector<int> result_dims = {rows, cols};
+  std::vector<int> bias_dims = {rows, cols};
+  auto *lhs_tensor = new lite::Tensor(kNumberTypeComplex128, lhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *rhs_tensor = new lite::Tensor(kNumberTypeComplex128, rhs_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *bias_tensor = new lite::Tensor(kNumberTypeComplex128, bias_dims, NHWC, lite::Category::CONST_TENSOR);
+  auto *result_tensor = new lite::Tensor(kNumberTypeComplex128, result_dims, NHWC, lite::Category::CONST_TENSOR);
+  lhs_tensor->MallocData(allocator_);
+  rhs_tensor->MallocData(allocator_);
+  bias_tensor->MallocData(allocator_);
+  result_tensor->MallocData(allocator_);
+  double *lhs = reinterpret_cast<double *>(lhs_tensor->MutableData());
+  double *rhs = reinterpret_cast<double *>(rhs_tensor->MutableData());
+  double *bias = reinterpret_cast<double *>(bias_tensor->MutableData());
+  double *result = reinterpret_cast<double *>(result_tensor->MutableData());
+  // The fill values are computed in float (note the `f` suffixes) and widened to double on store.
+  for (int idx = 0; idx < lhs_count; ++idx) {
+    double *cell = lhs + 2 * idx;
+    cell[0] = 0.01f * (idx % 17);
+    cell[1] = 0.02f * (idx % 19);
+  }
+  for (int idx = 0; idx < rhs_count; ++idx) {
+    double *cell = rhs + 2 * idx;
+    cell[0] = 0.03f * (idx % 23);
+    cell[1] = 0.01f * (idx % 29);
+  }
+  for (int idx = 0; idx < out_count; ++idx) {
+    double *cell = bias + 2 * idx;
+    cell[0] = 0.002f * (idx % 31);
+    cell[1] = 0.001f * (idx % 37);
+  }
+  std::memset(result, 0, out_count * 2 * sizeof(double));
+  std::vector<lite::Tensor *> in_tensors{lhs_tensor, rhs_tensor, bias_tensor};
+  std::vector<lite::Tensor *> out_tensors{result_tensor};
+  auto *inner_ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, inner_ctx->Init());
+  auto *matmul_param = new MatMulParameter();
+  matmul_param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  matmul_param->row_ = rows;
+  matmul_param->col_ = cols;
+  matmul_param->deep_ = depth;
+  matmul_param->has_bias_ = true;
+  matmul_param->act_type_ = ActType_Relu;
+  // The extra FT78 input tensor is built from M, N, K and the activation code, then appended
+  // as the fourth kernel input.
+  const int activation_code = GetActCode(matmul_param->act_type_);
+  auto param_tensor = CreateFT78ParamTensor(allocator_, rows, cols, depth, activation_code);
+  in_tensors.push_back(param_tensor);
+  kernel::KernelKey kernel_key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex128, NHWC,
+                                  schema::PrimitiveType_MatMulFusion};
+  auto make_kernel = KernelRegistry::GetInstance()->GetCreator(kernel_key);
+  ASSERT_NE(make_kernel, nullptr);
+  auto dsp_kernel =
+    make_kernel(in_tensors, out_tensors, reinterpret_cast<OpParameter *>(matmul_param), inner_ctx, kernel_key);
+  ASSERT_NE(dsp_kernel, nullptr);
+  ASSERT_EQ(dsp_kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(dsp_kernel->Run(), lite::RET_OK);
+  // Host reference: complex multiply-accumulate, bias add, then ReLU on the real part only.
+  // The kernel output is copied into `actual` so both sides are compared as flat double arrays.
+  std::vector<double> expect(2 * out_count, 0.0);
+  std::vector<double> actual(2 * out_count, 0.0);
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      const int re_idx = 2 * (row * cols + col);
+      const int im_idx = re_idx + 1;
+      double re_acc = 0.0;
+      double im_acc = 0.0;
+      for (int k = 0; k < depth; ++k) {
+        const int a_idx = 2 * (row * depth + k);
+        const int b_idx = 2 * (k * cols + col);
+        const double ar = lhs[a_idx];
+        const double ai = lhs[a_idx + 1];
+        const double br = rhs[b_idx];
+        const double bi = rhs[b_idx + 1];
+        re_acc += ar * br - ai * bi;
+        im_acc += ar * bi + ai * br;
+      }
+      re_acc += bias[re_idx];
+      im_acc += bias[im_idx];
+      expect[re_idx] = re_acc < 0.0 ? 0.0 : re_acc;
+      expect[im_idx] = im_acc;
+      actual[re_idx] = result[re_idx];
+      actual[im_idx] = result[im_idx];
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * out_count, 1e-3));
+  UninitDSPRuntime();
+  delete inner_ctx;
+  delete dsp_kernel;
+  delete lhs_tensor;
+  delete rhs_tensor;
+  delete bias_tensor;
+  delete param_tensor;
+  delete result_tensor;
+}
+#endif
+
+}  // namespace mindspore::lite::dsp::test