From 69dea2162c7e88c4adaf5f2b843b56aed11cb29e Mon Sep 17 00:00:00 2001
From: mzy <929449726@qq.com>
Date: Thu, 6 Nov 2025 13:27:45 +0000
Subject: [PATCH 1/7] add ft04 raggedrange

---
 .../litert/kernel/dsp/ft04/ragged_range.cc    | 132 ++++++
 .../src/litert/kernel/dsp/ft04/ragged_range.h |  48 +++
 .../runtime/kernel/dsp/ragged_range_tests.cc  | 391 ++++++++++++++++++
 3 files changed, 571 insertions(+)
 create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc
 create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h
 create mode 100644 mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc
new file mode 100644
index 00000000..8dbd1c29
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc
@@ -0,0 +1,132 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft04/ragged_range.h"
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include "src/litert/kernel_registry.h"
+#include "schema/inner/model_generated.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_RaggedRange;
+
+namespace mindspore::kernel {
+
+int RaggedRangeDSPKernel::CheckSpecs() {
+  // inputs: starts, limits, deltas; outputs: splits, values
+  if (in_tensors_.size() != 3 || out_tensors_.size() != 2) {
+    MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size() << ", out: "
+                    << out_tensors_.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}  // NOTE(review): only the tensor counts are validated above; dtypes and shapes are not checked here
+
+int RaggedRangeDSPKernel::Prepare() { return mindspore::lite::RET_OK; }  // nothing to set up; always succeeds
+
+int RaggedRangeDSPKernel::CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar) {
+  // Refuse to run when any out-parameter is missing.
+  if (!rows || !starts_scalar || !limits_scalar || !deltas_scalar) return RET_ERROR;
+  // An input with an empty shape is flagged as a scalar (order: starts, limits, deltas).
+  bool *const scalar_flags[] = {starts_scalar, limits_scalar, deltas_scalar};
+  for (int idx = 0; idx < 3; ++idx) {
+    *scalar_flags[idx] = in_tensors_[idx]->shape().empty();
+  }
+  // -1 doubles as "no non-scalar input seen yet"; the first one seen fixes the row count.
+  int common_rows = -1;
+  for (int idx = 0; idx < 3; ++idx) {
+    if (*scalar_flags[idx]) continue;
+    const int lead_dim = in_tensors_[idx]->shape()[0];
+    if (common_rows == -1) {
+      common_rows = lead_dim;
+    } else if (common_rows != lead_dim) {
+      return RET_ERROR;  // non-scalar inputs disagree on their leading dimension
+    }
+  }
+  // When every input is a scalar there is exactly one row.
+  *rows = (common_rows == -1) ? 1 : common_rows;
+  return RET_OK;
+}
+
+int RaggedRangeDSPKernel::RunFp32() {
+  this->kernel_name_.assign("fp_raggedrange_s");  // entry point Run() selects for float32 values
+  return this->dsp_runtime_->RunKernel(this->kernel_name_, this->kernel_args_, this->core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunFp16() {
+  this->kernel_name_.assign("hp_raggedrange_s");  // entry point Run() selects for float16 values
+  return this->dsp_runtime_->RunKernel(this->kernel_name_, this->kernel_args_, this->core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt32() {
+  this->kernel_name_.assign("i32_raggedrange_s");  // entry point Run() selects for int32 values
+  return this->dsp_runtime_->RunKernel(this->kernel_name_, this->kernel_args_, this->core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt16() {
+  this->kernel_name_.assign("i16_raggedrange_s");  // entry point Run() selects for int16 values
+  return this->dsp_runtime_->RunKernel(this->kernel_name_, this->kernel_args_, this->core_mask_);
+}
+
+// Stages the device addresses of all five tensors and launches the DSP entry point matching the values dtype.
+int RaggedRangeDSPKernel::Run() {
+  int rows = 0;
+  bool starts_scalar = false, limits_scalar = false, deltas_scalar = false;
+  if (CalcRows(&rows, &starts_scalar, &limits_scalar, &deltas_scalar) != RET_OK) {
+    MS_LOG(ERROR) << "RaggedRange rows check failed.";
+    return RET_ERROR;
+  }
+  // The device code gets one row count and no scalar flags, so a scalar input cannot be broadcast when rows > 1.
+  if (rows > 1 && (starts_scalar || limits_scalar || deltas_scalar)) {
+    MS_LOG(ERROR) << "RaggedRange DSP kernel cannot broadcast scalar inputs, rows: " << rows;
+    return RET_ERROR;
+  }
+  auto allocator = dsp_runtime_->GetAllocator();
+  // inputs: starts, limits, deltas; outputs: [0] splits (int32), [1] values (same type as inputs)
+  uint64_t starts_dev = allocator->GetDeviceMemPtr(in_tensors_[0]->data());
+  uint64_t limits_dev = allocator->GetDeviceMemPtr(in_tensors_[1]->data());
+  uint64_t deltas_dev = allocator->GetDeviceMemPtr(in_tensors_[2]->data());
+  uint64_t splits_dev = allocator->GetDeviceMemPtr(out_tensors_[0]->data());
+  uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data());
+  // Note: s-variant core mask passed as separate arg by runtime; do not include in args.
+  // Arg order: starts, limits, deltas, range_count, values, splits
+  SetKernelArg({starts_dev, limits_dev, deltas_dev, static_cast<uint64_t>(rows), values_dev, splits_dev});
+  auto out_dt = out_tensors_[1]->data_type();
+  switch (out_dt) {
+    case kNumberTypeFloat32:
+      return RunFp32();
+    case kNumberTypeFloat16:
+      return RunFp16();
+    case kNumberTypeInt32:
+      return RunInt32();
+    case kNumberTypeInt16:
+      return RunInt16();
+    default:
+      MS_LOG(ERROR) << "RaggedRange unsupported output dtype: " << static_cast<int>(out_dt);
+      return RET_ERROR;
+  }
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)  // fp32
+REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)  // fp16
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)  // int32
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)  // int16
+
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h
new file mode 100644
index 00000000..3c177940
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_
+
+#include <string>
+#include <vector>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+// RaggedRange kernel for the DSP backend; declares one launcher per values dtype handled by Run().
+class RaggedRangeDSPKernel : public DSPKernel {
+ public:
+  using DSPKernel::DSPKernel;
+  ~RaggedRangeDSPKernel() override = default;
+  // Kernel interface overrides.
+  int CheckSpecs() override;
+  int Prepare() override;
+  int Run() override;
+
+ private:
+  // Reports the row count and which of the three inputs have an empty (scalar) shape.
+  int CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar);
+  // Launchers: each one stores its entry-point name and calls the runtime.
+  int RunFp32();
+  int RunFp16();
+  int RunInt32();
+  int RunInt16();
+  std::string kernel_name_;  // entry-point name set by the launcher that ran last
+  uint64_t core_mask_{0xF};  // handed to RunKernel by every launcher
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
new file mode 100644
index 00000000..220e45ba
--- /dev/null
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
@@ -0,0 +1,391 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include "ut/src/runtime/kernel/dsp/dsp_test.h"
+#include "include/api/context.h"
+#include "include/api/data_type.h"
+#include "include/api/model.h"
+#include "schema/inner/model_generated.h"
+#include "src/litert/kernel/dsp/dsp_subgraph.h"
+#include "src/litert/kernel_registry.h"
+
+namespace mindspore::lite::dsp::test {
+
+class TestDSP_RaggedRange : public DSPCommonTest {};
+
+// fp16 helpers (consistent with other tests)
+typedef short float16;
+static inline float fp16_to_fp32(float16 h) {
+  uint32_t sign = (h & 0x8000) << 16;
+  uint32_t exp = (h & 0x7C00) >> 10;
+  uint32_t frac = (h & 0x03FF);  // NOTE(review): subnormal branch is off, e.g. 0x0200 yields 1.5*2^-15, not 2^-15
+  uint32_t f_exp, f_frac;
+  if (exp == 0) {
+    if (frac == 0) {
+      f_exp = 0; f_frac = 0;
+    } else {
+      int shift = 0;
+      while ((frac & 0x0200) == 0) { frac <<= 1; ++shift; }
+      frac &= 0x03FF;
+      f_exp = 127 - 15 - shift;
+      f_frac = frac << 13;
+    }
+  } else if (exp == 0x1F) {
+    f_exp = 255; f_frac = frac << 13;
+  } else {
+    f_exp = exp - 15 + 127; f_frac = frac << 13;
+  }
+  uint32_t f_bits = sign | (f_exp << 23) | f_frac;
+  float result; std::memcpy(&result, &f_bits, sizeof(result));
+  return result;
+}
+[[maybe_unused]] static inline float16 fp32_to_fp16(float v) {
+  uint32_t bits; std::memcpy(&bits, &v, sizeof(bits));
+  uint32_t sign = (bits >> 31) & 0x1;
+  int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15;
+  uint32_t mantissa = bits & 0x007FFFFF;
+  float16 result;  // half-precision bit pattern: sign << 15 | exponent << 10 | 10-bit mantissa
+  if (exponent <= 0) {  // NOTE(review): a round-up to 0x400 in this branch is masked to 0 by "& 0x3FF"
+    if (exponent < -10) {
+      result = static_cast<float16>(sign << 15);
+    } else {
+      mantissa |= 0x00800000; int shift = 14 - exponent; uint32_t mantissa_shifted = mantissa >> shift;
+      uint32_t remainder = mantissa & ((1U << shift) - 1);
+      if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) {
+        mantissa_shifted++; }
+      result = static_cast<float16>((sign << 15) | (mantissa_shifted & 0x3FF));
+    }
+  } else if (exponent == 0xFF - 127 + 15) {
+    result = (mantissa == 0) ? static_cast<float16>((sign << 15) | 0x7C00) : static_cast<float16>((sign << 15) | 0x7E00);
+  } else if (exponent > 30) {
+    result = static_cast<float16>((sign << 15) | 0x7C00);
+  } else {
+    uint32_t mantissa_rounded = mantissa >> 13; uint32_t remainder = mantissa & 0x1FFF;
+    if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) {
+      mantissa_rounded++; if (mantissa_rounded == 0x400) { mantissa_rounded = 0; exponent++; if (exponent > 30) {
+        return static_cast<float16>((sign << 15) | 0x7C00); } } }
+    result = static_cast<float16>((sign << 15) | (static_cast<uint32_t>(exponent) << 10) | (mantissa_rounded & 0x3FF));
+  }
+  return result;
+}
+
+TEST_F(TestDSP_RaggedRange, RaggedRange_Fp32) {
+  InitDSPRuntime();
+  std::vector<lite::Tensor *> inputs_;
+  std::vector<lite::Tensor *> outputs_;
+  // Five fp32 rows, every input is a vector; the expected result is rebuilt on the host further down.
+  // starts=[0,10,-5,100,7], limits=[50,60,5,110,27], deltas=[1,2,3,1,4]
+  std::vector<int> vec5 = {5};
+  auto t_starts = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_starts->MallocData(allocator_);
+  auto t_limits = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_limits->MallocData(allocator_);
+  auto t_deltas = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_deltas->MallocData(allocator_);
+  inputs_.push_back(t_starts);
+  inputs_.push_back(t_limits);
+  inputs_.push_back(t_deltas);
+
+  auto starts_data = reinterpret_cast<float *>(t_starts->MutableData());
+  auto limits_data = reinterpret_cast<float *>(t_limits->MutableData());
+  auto deltas_data = reinterpret_cast<float *>(t_deltas->MutableData());
+  float starts_host[5] = {0.f, 10.f, -5.f, 100.f, 7.f};
+  float limits_host[5] = {50.f, 60.f, 5.f, 110.f, 27.f};
+  float deltas_host[5] = {1.f, 2.f, 3.f, 1.f, 4.f};
+  std::memcpy(starts_data, starts_host, sizeof(starts_host));
+  std::memcpy(limits_data, limits_host, sizeof(limits_host));
+  std::memcpy(deltas_data, deltas_host, sizeof(deltas_host));
+
+  // outputs (splits size rows+1, values computed below)
+  auto t_splits = new lite::Tensor(kNumberTypeInt32, {6}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_splits->MallocData(allocator_);
+  // rough upper bound for values, we'll only compare first computed_len elements
+  auto t_values = new lite::Tensor(kNumberTypeFloat32, {200}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_values->MallocData(allocator_);
+  outputs_.push_back(t_splits);
+  outputs_.push_back(t_values);
+
+  std::fill_n(reinterpret_cast<int32_t *>(t_splits->MutableData()), 6, 0);
+  std::fill_n(reinterpret_cast<float *>(t_values->MutableData()), 200, 0.0f);
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_RaggedRange};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_TRUE(creator != nullptr);  // fail cleanly when no kernel is registered for this key
+  auto *param = new OpParameter();
+  param->type_ = static_cast<int>(schema::PrimitiveType_RaggedRange);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_TRUE(kernel != nullptr);  // never dereference a kernel that could not be created
+  EXPECT_EQ(0, kernel->Prepare());
+  EXPECT_EQ(0, kernel->Run());
+
+  // build expected: row r holds start, start + delta, ... for as long as the value stays below limit
+  std::vector<int32_t> expect_splits(6, 0);
+  std::vector<float> expect_values;
+  int32_t acc = 0;
+  for (int r = 0; r < 5; ++r) {
+    expect_splits[r] = acc;
+    for (float v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) {
+      expect_values.push_back(v);
+    }
+    acc = static_cast<int32_t>(expect_values.size());
+  }
+  expect_splits[5] = acc;
+
+  // compare splits
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int32_t *>(outputs_[0]->MutableData()), expect_splits.data(),
+                                 6));
+  // compare first acc values
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[1]->MutableData()), expect_values.data(),
+                                 acc));
+
+  UninitDSPRuntime();
+  delete ctx;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+  delete kernel;
+}
+
+TEST_F(TestDSP_RaggedRange, RaggedRange_Int32) {
+  InitDSPRuntime();
+  std::vector<lite::Tensor *> inputs_;
+  std::vector<lite::Tensor *> outputs_;
+  // Four int32 rows, every input is a vector; the expected result is rebuilt on the host further down.
+  std::vector<int> vec4 = {4};
+  auto t_starts = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_starts->MallocData(allocator_);
+  auto t_limits = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_limits->MallocData(allocator_);
+  auto t_deltas = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_deltas->MallocData(allocator_);
+  inputs_.push_back(t_starts);
+  inputs_.push_back(t_limits);
+  inputs_.push_back(t_deltas);
+
+  auto starts_data = reinterpret_cast<int32_t *>(t_starts->MutableData());
+  auto limits_data = reinterpret_cast<int32_t *>(t_limits->MutableData());
+  auto deltas_data = reinterpret_cast<int32_t *>(t_deltas->MutableData());
+  int32_t starts_host[4] = {0, -100, 5, 1000};
+  int32_t limits_host[4] = {200, -50, 50, 1010};
+  int32_t deltas_host[4] = {2, 5, 3, 1};
+  std::memcpy(starts_data, starts_host, sizeof(starts_host));
+  std::memcpy(limits_data, limits_host, sizeof(limits_host));
+  std::memcpy(deltas_data, deltas_host, sizeof(deltas_host));
+
+  auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_splits->MallocData(allocator_);
+  auto t_values = new lite::Tensor(kNumberTypeInt32, {300}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_values->MallocData(allocator_);
+  outputs_.push_back(t_splits);
+  outputs_.push_back(t_values);
+
+  std::fill_n(reinterpret_cast<int32_t *>(t_splits->MutableData()), 5, 0);
+  std::fill_n(reinterpret_cast<int32_t *>(t_values->MutableData()), 300, 0);
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_RaggedRange};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_TRUE(creator != nullptr);  // fail cleanly when no kernel is registered for this key
+  auto *param = new OpParameter();
+  param->type_ = static_cast<int>(schema::PrimitiveType_RaggedRange);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_TRUE(kernel != nullptr);  // never dereference a kernel that could not be created
+  EXPECT_EQ(0, kernel->Prepare());
+  EXPECT_EQ(0, kernel->Run());
+
+  std::vector<int32_t> expect_splits(5, 0);
+  std::vector<int32_t> expect_values;
+  int32_t acc = 0;
+  for (int r = 0; r < 4; ++r) {
+    expect_splits[r] = acc;
+    for (int32_t v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) {
+      expect_values.push_back(v);
+    }
+    acc = static_cast<int32_t>(expect_values.size());
+  }
+  expect_splits[4] = acc;
+
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int32_t *>(outputs_[0]->MutableData()), expect_splits.data(), 5));
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int32_t *>(outputs_[1]->MutableData()), expect_values.data(), acc));
+
+  UninitDSPRuntime();
+  delete ctx;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+  delete kernel;
+}
+
+TEST_F(TestDSP_RaggedRange, RaggedRange_Fp16) {
+  InitDSPRuntime();
+  std::vector<lite::Tensor *> inputs_;
+  std::vector<lite::Tensor *> outputs_;
+  // Three rows with fp32 inputs and fp16 values output; values are compared after decoding to fp32.
+  std::vector<int> vec3 = {3};
+  auto t_starts = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_starts->MallocData(allocator_);
+  auto t_limits = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_limits->MallocData(allocator_);
+  auto t_deltas = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_deltas->MallocData(allocator_);
+  inputs_.push_back(t_starts);
+  inputs_.push_back(t_limits);
+  inputs_.push_back(t_deltas);
+
+  auto starts_f = reinterpret_cast<float *>(t_starts->MutableData());
+  auto limits_f = reinterpret_cast<float *>(t_limits->MutableData());
+  auto deltas_f = reinterpret_cast<float *>(t_deltas->MutableData());
+  float starts_host[3] = {-10.f, 0.f, 1.5f};
+  float limits_host[3] = {0.f, 50.f, 6.f};
+  float deltas_host[3] = {0.5f, 1.f, 1.25f};
+  std::memcpy(starts_f, starts_host, sizeof(starts_host));
+  std::memcpy(limits_f, limits_host, sizeof(limits_host));
+  std::memcpy(deltas_f, deltas_host, sizeof(deltas_host));
+
+  // outputs
+  auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_splits->MallocData(allocator_);
+  auto t_values = new lite::Tensor(kNumberTypeFloat16, {200}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_values->MallocData(allocator_);
+  outputs_.push_back(t_splits);
+  outputs_.push_back(t_values);
+
+  std::fill_n(reinterpret_cast<int32_t *>(t_splits->MutableData()), 4, 0);
+  std::memset(t_values->MutableData(), 0, 200 * sizeof(uint16_t));
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_RaggedRange};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_TRUE(creator != nullptr);  // fail cleanly when no kernel is registered for this key
+  auto *param = new OpParameter();
+  param->type_ = static_cast<int>(schema::PrimitiveType_RaggedRange);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_TRUE(kernel != nullptr);  // never dereference a kernel that could not be created
+  EXPECT_EQ(0, kernel->Prepare());
+  EXPECT_EQ(0, kernel->Run());
+
+  // expected
+  std::vector<int32_t> expect_splits(4, 0);
+  std::vector<float> expect_values;
+  int32_t acc = 0;
+  for (int r = 0; r < 3; ++r) {
+    expect_splits[r] = acc;
+    for (float v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) {
+      expect_values.push_back(v);
+    }
+    acc = static_cast<int32_t>(expect_values.size());
+  }
+  expect_splits[3] = acc;
+
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int32_t *>(outputs_[0]->MutableData()), expect_splits.data(), 4));
+
+  auto out_fp16 = reinterpret_cast<uint16_t *>(outputs_[1]->MutableData());
+  std::vector<float> actual(acc);
+  for (int i = 0; i < acc; ++i) actual[i] = fp16_to_fp32(static_cast<float16>(out_fp16[i]));
+  std::vector<float> correct(acc);
+  for (int i = 0; i < acc; ++i) correct[i] = fp16_to_fp32(fp32_to_fp16(expect_values[i]));
+  ASSERT_EQ(0, CompareOutputData(actual.data(), correct.data(), acc, 1e-3));
+
+  UninitDSPRuntime();
+  delete ctx;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+  delete kernel;
+}
+
+TEST_F(TestDSP_RaggedRange, RaggedRange_Int16) {
+  InitDSPRuntime();
+  std::vector<lite::Tensor *> inputs_;
+  std::vector<lite::Tensor *> outputs_;
+  // Three rows with int32 inputs and int16 values output; expected values are narrowed to int16 on the host.
+  std::vector<int> vec3 = {3};
+  auto t_starts = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_starts->MallocData(allocator_);
+  auto t_limits = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_limits->MallocData(allocator_);
+  auto t_deltas = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_deltas->MallocData(allocator_);
+  inputs_.push_back(t_starts);
+  inputs_.push_back(t_limits);
+  inputs_.push_back(t_deltas);
+
+  auto starts_d32 = reinterpret_cast<int32_t *>(t_starts->MutableData());
+  auto limits_d32 = reinterpret_cast<int32_t *>(t_limits->MutableData());
+  auto deltas_d32 = reinterpret_cast<int32_t *>(t_deltas->MutableData());
+  int32_t starts_host[3] = {-10, 0, 100};
+  int32_t limits_host[3] = {10, 100, 110};
+  int32_t deltas_host[3] = {2, 3, 1};
+  std::memcpy(starts_d32, starts_host, sizeof(starts_host));
+  std::memcpy(limits_d32, limits_host, sizeof(limits_host));
+  std::memcpy(deltas_d32, deltas_host, sizeof(deltas_host));
+
+  auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_splits->MallocData(allocator_);
+  auto t_values = new lite::Tensor(kNumberTypeInt16, {300}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_values->MallocData(allocator_);
+  outputs_.push_back(t_splits);
+  outputs_.push_back(t_values);
+
+  std::fill_n(reinterpret_cast<int32_t *>(t_splits->MutableData()), 4, 0);
+  std::fill_n(reinterpret_cast<int16_t *>(t_values->MutableData()), 300, 0);
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_RaggedRange};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_TRUE(creator != nullptr);  // fail cleanly when no kernel is registered for this key
+  auto *param = new OpParameter();
+  param->type_ = static_cast<int>(schema::PrimitiveType_RaggedRange);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_TRUE(kernel != nullptr);  // never dereference a kernel that could not be created
+  EXPECT_EQ(0, kernel->Prepare());
+  EXPECT_EQ(0, kernel->Run());
+
+  std::vector<int32_t> expect_splits(4, 0);
+  std::vector<int16_t> expect_values;
+  int32_t acc = 0;
+  for (int r = 0; r < 3; ++r) {
+    expect_splits[r] = acc;
+    for (int32_t v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) {
+      expect_values.push_back(static_cast<int16_t>(v));
+    }
+    acc = static_cast<int32_t>(expect_values.size());
+  }
+  expect_splits[3] = acc;
+
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int32_t *>(outputs_[0]->MutableData()), expect_splits.data(), 4));
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int16_t *>(outputs_[1]->MutableData()), expect_values.data(), acc));
+
+  UninitDSPRuntime();
+  delete ctx;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+  delete kernel;
+}
+
+}  // namespace mindspore::lite::dsp::test
-- 
Gitee


From ec9da3165278e9181accfa3b59a0ddf7f3808051 Mon Sep 17 00:00:00 2001
From: mzy <929449726@qq.com>
Date: Fri, 7 Nov 2025 15:01:21 +0000
Subject: [PATCH 2/7] code format

---
 .../litert/kernel/dsp/ft04/ragged_range.cc    |  4 +-
 .../runtime/kernel/dsp/ragged_range_tests.cc  | 52 +++++++++++++------
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc
index 8dbd1c29..1acc8965 100644
--- a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc
@@ -32,8 +32,8 @@ namespace mindspore::kernel {
 int RaggedRangeDSPKernel::CheckSpecs() {
   // inputs: starts, limits, deltas; outputs: splits, values
   if (in_tensors_.size() != 3 || out_tensors_.size() != 2) {
-    MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size() << ", out: "
-                    << out_tensors_.size();
+    MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size()
+                    << ", out: " << out_tensors_.size();
     return RET_ERROR;
   }
   return RET_OK;
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
index 220e45ba..a3047238 100644
--- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
@@ -33,7 +33,7 @@ namespace mindspore::lite::dsp::test {
 class TestDSP_RaggedRange : public DSPCommonTest {};
 
 // fp16 helpers (consistent with other tests)
-typedef short float16;
+typedef uint16_t float16;
 static inline float fp16_to_fp32(float16 h) {
   uint32_t sign = (h & 0x8000) << 16;
   uint32_t exp = (h & 0x7C00) >> 10;
@@ -41,25 +41,33 @@ static inline float fp16_to_fp32(float16 h) {
   uint32_t f_exp, f_frac;
   if (exp == 0) {
     if (frac == 0) {
-      f_exp = 0; f_frac = 0;
+      f_exp = 0;
+      f_frac = 0;
     } else {
       int shift = 0;
-      while ((frac & 0x0200) == 0) { frac <<= 1; ++shift; }
+      while ((frac & 0x0200) == 0) {
+        frac <<= 1;
+        ++shift;
+      }
       frac &= 0x03FF;
       f_exp = 127 - 15 - shift;
       f_frac = frac << 13;
     }
   } else if (exp == 0x1F) {
-    f_exp = 255; f_frac = frac << 13;
+    f_exp = 255;
+    f_frac = frac << 13;
   } else {
-    f_exp = exp - 15 + 127; f_frac = frac << 13;
+    f_exp = exp - 15 + 127;
+    f_frac = frac << 13;
   }
   uint32_t f_bits = sign | (f_exp << 23) | f_frac;
-  float result; std::memcpy(&result, &f_bits, sizeof(result));
+  float result;
+  std::memcpy(&result, &f_bits, sizeof(result));
   return result;
 }
 [[maybe_unused]] static inline float16 fp32_to_fp16(float v) {
-  uint32_t bits; std::memcpy(&bits, &v, sizeof(bits));
+  uint32_t bits;
+  std::memcpy(&bits, &v, sizeof(bits));
   uint32_t sign = (bits >> 31) & 0x1;
   int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15;
   uint32_t mantissa = bits & 0x007FFFFF;
@@ -68,21 +76,33 @@ static inline float fp16_to_fp32(float16 h) {
     if (exponent < -10) {
       result = static_cast<float16>(sign << 15);
     } else {
-      mantissa |= 0x00800000; int shift = 14 - exponent; uint32_t mantissa_shifted = mantissa >> shift;
+      mantissa |= 0x00800000;
+      int shift = 14 - exponent;
+      uint32_t mantissa_shifted = mantissa >> shift;
       uint32_t remainder = mantissa & ((1U << shift) - 1);
       if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) {
-        mantissa_shifted++; }
+        mantissa_shifted++;
+      }
       result = static_cast<float16>((sign << 15) | (mantissa_shifted & 0x3FF));
     }
   } else if (exponent == 0xFF - 127 + 15) {
-    result = (mantissa == 0) ? static_cast<float16>((sign << 15) | 0x7C00) : static_cast<float16>((sign << 15) | 0x7E00);
+    result =
+      (mantissa == 0) ? static_cast<float16>((sign << 15) | 0x7C00) : static_cast<float16>((sign << 15) | 0x7E00);
   } else if (exponent > 30) {
     result = static_cast<float16>((sign << 15) | 0x7C00);
   } else {
-    uint32_t mantissa_rounded = mantissa >> 13; uint32_t remainder = mantissa & 0x1FFF;
+    uint32_t mantissa_rounded = mantissa >> 13;
+    uint32_t remainder = mantissa & 0x1FFF;
     if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) {
-      mantissa_rounded++; if (mantissa_rounded == 0x400) { mantissa_rounded = 0; exponent++; if (exponent > 30) {
-        return static_cast<float16>((sign << 15) | 0x7C00); } } }
+      mantissa_rounded++;
+      if (mantissa_rounded == 0x400) {
+        mantissa_rounded = 0;
+        exponent++;
+        if (exponent > 30) {
+          return static_cast<float16>((sign << 15) | 0x7C00);
+        }
+      }
+    }
     result = static_cast<float16>((sign << 15) | (static_cast<uint32_t>(exponent) << 10) | (mantissa_rounded & 0x3FF));
   }
   return result;
@@ -154,11 +174,9 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Fp32) {
   expect_splits[5] = acc;
 
   // compare splits
-  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int32_t *>(outputs_[0]->MutableData()), expect_splits.data(),
-                                 6));
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int32_t *>(outputs_[0]->MutableData()), expect_splits.data(), 6));
   // compare first acc values
-  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[1]->MutableData()), expect_values.data(),
-                                 acc));
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[1]->MutableData()), expect_values.data(), acc));
 
   UninitDSPRuntime();
   delete ctx;
-- 
Gitee


From 3c30419d70f3628d598348e46de864a3e6973619 Mon Sep 17 00:00:00 2001
From: mzy <929449726@qq.com>
Date: Fri, 7 Nov 2025 16:19:57 +0000
Subject: [PATCH 3/7] add matmulfusion

---
 .../litert/kernel/dsp/ft04/matmulfusion.cc    | 176 ++++++++++++
 .../src/litert/kernel/dsp/ft04/matmulfusion.h |  51 ++++
 .../runtime/kernel/dsp/matmulfusion_tests.cc  | 260 ++++++++++++++++++
 3 files changed, 487 insertions(+)
 create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
 create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h
 create mode 100644 mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc

diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
new file mode 100644
index 00000000..4982c9d1
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
@@ -0,0 +1,176 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft04/matmulfusion.h"
+#include <algorithm>
+#include <string>
+#include "src/litert/kernel_registry.h"
+#include "schema/inner/model_generated.h"
+#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_MatMulFusion;
+
+namespace mindspore::kernel {
+
+int MatMulFusionDSPKernel::Prepare() { return RET_OK; }  // No preparation work is done here; always succeeds.
+
+int MatMulFusionDSPKernel::CheckSpecs() {  // Validates tensor counts and shapes; RET_OK when all checks pass.
+  // Exactly two inputs (A, B) or three (A, B, bias) are accepted, together with exactly one output (C).
+  if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 && in_tensors_.size() != INPUT_TENSOR_SIZE_3) {
+    MS_LOG(WARNING) << "MatMulFusion expects 2 or 3 inputs, got " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
+    MS_LOG(WARNING) << "MatMulFusion expects 1 output, got " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  int M = 0, N = 0, K = 0;  // filled in by GetMNK from the shapes of A and B
+  if (GetMNK(&M, &N, &K) != RET_OK) {
+    MS_LOG(WARNING) << "MatMulFusion shape inference failed.";
+    return RET_ERROR;
+  }
+  // A third input, when present, must be a rank-2 (M, N) tensor; any other bias shape is rejected.
+  if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
+    auto bias_shape = in_tensors_[2]->shape();
+    if (bias_shape.size() != 2 || bias_shape[0] != M || bias_shape[1] != N) {
+      MS_LOG(WARNING) << "Bias shape mismatch MxN: got " << bias_shape;
+      return RET_ERROR;
+    }
+  }
+  // The output must also be rank-2 with shape (M, N).
+  auto out_shape = out_tensors_[0]->shape();
+  if (out_shape.size() != 2 || out_shape[0] != M || out_shape[1] != N) {
+    MS_LOG(WARNING) << "Output shape mismatch expected (" << M << "," << N << ")";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const {  // Reads M, N, K from A (M x K) and B (K x N).
+  if (M == nullptr || N == nullptr || K == nullptr) return RET_ERROR;  // every output pointer is required
+  const auto &a_shape = in_tensors_[0]->shape();  // shapes are taken as stored; no transpose flag is consulted
+  const auto &b_shape = in_tensors_[1]->shape();
+  if (a_shape.size() != 2 || b_shape.size() != 2) {
+    MS_LOG(WARNING) << "A/B must be rank-2";
+    return RET_ERROR;
+  }
+  int aM = a_shape[0];  // rows of A
+  int aK = a_shape[1];  // columns of A
+  int bK = b_shape[0];  // rows of B, must equal aK
+  int bN = b_shape[1];  // columns of B
+  if (aK != bK) {
+    MS_LOG(WARNING) << "Inner dimension mismatch: " << aK << " vs " << bK;
+    return RET_ERROR;
+  }
+  *M = aM; *K = aK; *N = bN;
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::GetActTypeCode(int *code) const {  // Writes the DSP activation code for this op to *code.
+  if (code == nullptr) return RET_ERROR;  // the only failure case
+  // Translate the nnacl ActType into the DSP-side code (NONE=0, RELU=1, RELU6=2); any other type becomes NONE.
+  int act = 0;  // default NONE
+  auto *param = reinterpret_cast<MatMulParameter *>(op_parameter_);  // a null parameter keeps the NONE default
+  if (param != nullptr) {
+    switch (param->act_type_) {
+      case ActType_Relu:
+        act = 1; break;
+      case ActType_Relu6:
+        act = 2; break;  // DSP uses 2 for RELU6, nnacl uses enum value 3
+      default:
+        act = 0; break;
+    }
+  }
+  *code = act;
+  return RET_OK;
+}
+
+int MatMulFusionDSPKernel::RunFp32() {  // float32 launcher; every Run<Type>() sets kernel_name_ and then runs it
+  kernel_name_ = "fp_matmulfusion_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+int MatMulFusionDSPKernel::RunFp16() {  // float16 launcher
+  kernel_name_ = "hp_matmulfusion_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+int MatMulFusionDSPKernel::RunInt32() {  // int32 launcher
+  kernel_name_ = "i32_matmulfusion_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+int MatMulFusionDSPKernel::RunInt16() {  // int16 launcher
+  kernel_name_ = "i16_matmulfusion_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+int MatMulFusionDSPKernel::RunComplex64() {  // complex64 launcher
+  kernel_name_ = "c64_matmulfusion_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int MatMulFusionDSPKernel::Run() {  // Packs the kernel arguments and launches the variant matching input 0's dtype.
+  int M = 0, N = 0, K = 0;
+  if (GetMNK(&M, &N, &K) != RET_OK) {
+    MS_LOG(ERROR) << "MatMulFusion GetMNK failed";
+    return RET_ERROR;
+  }
+  int act_code = 0;
+  (void)GetActTypeCode(&act_code);  // status ignored: act_code stays 0 unless a Relu/Relu6 code is written
+
+  auto allocator = dsp_runtime_->GetAllocator();
+  uint64_t a_ptr = allocator->GetDeviceMemPtr(in_tensors_[0]->data());
+  uint64_t b_ptr = allocator->GetDeviceMemPtr(in_tensors_[1]->data());
+  uint64_t out_ptr = allocator->GetDeviceMemPtr(out_tensors_[0]->data());
+  uint64_t bias_ptr = 0;  // stays 0 when no bias input is supplied
+  if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
+    bias_ptr = allocator->GetDeviceMemPtr(in_tensors_[2]->data());
+  }
+  // Arg order must match DSP symbol prototype: A,B,C,bias,M,N,K,act_type
+  SetKernelArg({a_ptr, b_ptr, out_ptr, bias_ptr, static_cast<uint64_t>(M), static_cast<uint64_t>(N),
+                static_cast<uint64_t>(K), static_cast<uint64_t>(act_code)});
+
+  int ret = RET_ERROR;
+  auto dtype = in_tensors_[0]->data_type();  // the dtype of A alone selects the kernel variant
+  if (dtype == kNumberTypeFloat32) {
+    ret = RunFp32();
+  } else if (dtype == kNumberTypeFloat16) {
+    ret = RunFp16();
+  } else if (dtype == kNumberTypeInt32) {
+    ret = RunInt32();
+  } else if (dtype == kNumberTypeInt16) {
+    ret = RunInt16();
+  } else if (dtype == kNumberTypeComplex64) {
+    ret = RunComplex64();
+  } else {
+    MS_LOG(ERROR) << "MatMulFusion unsupported dtype: " << static_cast<int>(dtype);
+    return RET_ERROR;
+  }
+  if (ret != RET_OK) {  // any non-OK launcher status is collapsed to RET_ERROR
+    MS_LOG(ERROR) << "MatMulFusion DSP run failed";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
+
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h
new file mode 100644
index 00000000..1a487f08
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
+
+#include <vector>
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+class MatMulFusionDSPKernel : public DSPKernel {  // DSP kernel for MatMulFusion on rank-2 inputs, optional bias.
+ public:
+  using DSPKernel::DSPKernel;
+  ~MatMulFusionDSPKernel() override = default;
+
+  int Prepare() override;  // performs no work
+  int CheckSpecs() override;  // validates input/output counts and shapes
+  int Run() override;  // builds the argument list and dispatches on the dtype of input 0
+
+ private:
+  int RunFp32();  // one launcher per supported dtype; each sets kernel_name_ and runs it
+  int RunFp16();
+  int RunInt32();
+  int RunInt16();
+  int RunComplex64();
+
+  // helpers: GetMNK reads M/N/K from the shapes of inputs 0 and 1; GetActTypeCode maps act_type_ to a DSP code
+  int GetMNK(int *M, int *N, int *K) const;
+  int GetActTypeCode(int *code) const;
+
+ private:
+  std::string kernel_name_;  // name passed to RunKernel, set by the Run<Type>() helpers
+  uint64_t core_mask_{0xF};  // forwarded to RunKernel; presumably a bitmask of DSP cores -- TODO confirm
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
new file mode 100644
index 00000000..07eff1bd
--- /dev/null
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
@@ -0,0 +1,260 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+#include <cstring>
+#include <cmath>
+#include <limits>
+#include "ut/src/runtime/kernel/dsp/dsp_test.h"
+#include "include/api/context.h"
+#include "include/api/data_type.h"
+#include "include/api/model.h"
+#include "schema/inner/model_generated.h"
+#include "src/litert/kernel_registry.h"
+#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h"
+
+namespace mindspore::lite::dsp::test {
+
+class TestDSP_MatMulFusion : public DSPCommonTest {};
+
+static void FillFloat(float *data, int size, float base = 0.1f) {
+  for (int i = 0; i < size; ++i) { data[i] = base * static_cast<float>((i % 10)); }
+}
+
+typedef uint16_t float16_t_u;
+static inline float16_t_u Fp32ToFp16Bits(float v) {  // float -> IEEE binary16 bit pattern, round-to-nearest-even
+  uint32_t bits;
+  std::memcpy(&bits, &v, sizeof(bits));
+  uint32_t sign = (bits >> 31) & 0x1;
+  int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15;  // re-biased for binary16
+  uint32_t mantissa = bits & 0x007FFFFF;
+  uint16_t result;
+  if (exponent <= 0) {  // magnitude is below the smallest normal half
+    if (exponent < -10) {  // rounds to zero even as a subnormal
+      result = static_cast<uint16_t>(sign << 15);
+    } else {
+      mantissa |= 0x00800000;  // make the implicit leading 1 explicit before shifting
+      int shift = 14 - exponent;
+      uint32_t mantissa_shifted = mantissa >> shift;
+      uint32_t remainder = mantissa & ((1U << shift) - 1);
+      if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) {
+        mantissa_shifted++;
+      }
+      result = static_cast<uint16_t>((sign << 15) | mantissa_shifted);  // a carry to 0x400 is the min normal
+    }
+  } else if (exponent == 0xFF - 127 + 15) {  // Inf or NaN input
+    result = static_cast<uint16_t>((sign << 15) | (mantissa == 0 ? 0x7C00 : 0x7E00));
+  } else if (exponent > 30) {  // too large for binary16: signed infinity
+    result = static_cast<uint16_t>((sign << 15) | 0x7C00);
+  } else {
+    uint32_t mantissa_rounded = mantissa >> 13;
+    uint32_t remainder = mantissa & 0x1FFF;
+    if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) {
+      mantissa_rounded++;
+      if (mantissa_rounded == 0x400) { mantissa_rounded = 0; exponent++; if (exponent > 30) return static_cast<uint16_t>((sign << 15) | 0x7C00); }
+    }
+    result = static_cast<uint16_t>((sign << 15) | (static_cast<uint32_t>(exponent) << 10) | (mantissa_rounded & 0x3FF));
+  }
+  return result;
+}
+
+// Large size tests (M=N=K=256) across dtypes
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) {
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};
+  auto t_A = new lite::Tensor(kNumberTypeFloat32, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
+  FillFloat(reinterpret_cast<float*>(t_A->MutableData()), M*K, 0.02f);
+  FillFloat(reinterpret_cast<float*>(t_B->MutableData()), K*N, 0.03f);
+  FillFloat(reinterpret_cast<float*>(t_bias->MutableData()), M*N, 0.005f);
+  std::memset(t_out->MutableData(), 0, M*N*sizeof(float));
+  std::vector<lite::Tensor*> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor*> outputs_{t_out};
+  auto ctx = new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter(); param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu; param->has_bias_ = true; param->row_=M; param->col_=N; param->deep_=K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter*>(param), ctx, key); ASSERT_NE(kernel,nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK); ASSERT_EQ(kernel->Run(), lite::RET_OK);
+  auto A = reinterpret_cast<float*>(t_A->MutableData());
+  auto B = reinterpret_cast<float*>(t_B->MutableData());
+  auto bias = reinterpret_cast<float*>(t_bias->MutableData());
+  auto C = reinterpret_cast<float*>(t_out->MutableData());
+  std::vector<float> expect(M * N, 0.f);
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float sum = 0.f;
+      for (int k = 0; k < K; ++k) {
+        sum += A[m * K + k] * B[k * N + n];
+      }
+      sum += bias[m * N + n];
+      expect[m * N + n] = sum > 0.f ? sum : 0.f;
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3));
+  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) {
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape={M,K}; std::vector<int> b_shape={K,N}; std::vector<int> out_shape={M,N};
+  std::vector<int> bias_shape={M,N};
+  auto t_A = new lite::Tensor(kNumberTypeFloat16, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeFloat16, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeFloat16, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeFloat16, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
+  auto A16 = reinterpret_cast<uint16_t*>(t_A->MutableData()); auto B16 = reinterpret_cast<uint16_t*>(t_B->MutableData());
+  auto bias16 = reinterpret_cast<uint16_t*>(t_bias->MutableData()); auto C16 = reinterpret_cast<uint16_t*>(t_out->MutableData());
+  for(int i=0;i<M*K;++i){ A16[i] = Fp32ToFp16Bits(0.01f * static_cast<float>(i % 13)); }
+  for(int i=0;i<K*N;++i){ B16[i] = Fp32ToFp16Bits(0.02f * static_cast<float>(i % 17)); }
+  for(int i=0;i<M*N;++i){ bias16[i] = Fp32ToFp16Bits(0.003f * static_cast<float>(i % 11)); }
+  std::memset(C16,0,M*N*sizeof(uint16_t));
+  std::vector<lite::Tensor*> inputs_{t_A,t_B,t_bias}; std::vector<lite::Tensor*> outputs_{t_out};
+  auto ctx = new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param=new MatMulParameter(); param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K;
+  kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeFloat16,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast<OpParameter*>(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK);
+  auto Fp16ToFp32=[&](uint16_t h){ uint32_t sign=(h & 0x8000) << 16; uint32_t exp=(h & 0x7C00)>>10; uint32_t frac=(h & 0x03FF); uint32_t fexp,ffrac; if(exp==0){ if(frac==0){ fexp=0; ffrac=0;} else { int shift=0; while((frac & 0x0200)==0){ frac <<=1; ++shift;} frac &=0x03FF; fexp=127-15-shift; ffrac=frac<<13; } } else if(exp==0x1F){ fexp=255; ffrac=frac<<13; } else { fexp=exp-15+127; ffrac=frac<<13; } uint32_t bits= sign | (fexp<<23) | ffrac; float out; std::memcpy(&out,&bits,sizeof(out)); return out; };
+  std::vector<float> expect_fp32(M * N, 0.f);
+  std::vector<float> actual_fp32(M * N, 0.f);
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float sum = 0.f;
+      for (int k = 0; k < K; ++k) {
+        float a = Fp16ToFp32(A16[m * K + k]);
+        float b = Fp16ToFp32(B16[k * N + n]);
+        sum += a * b;
+      }
+      sum += Fp16ToFp32(bias16[m * N + n]);
+      expect_fp32[m * N + n] = sum > 0.f ? sum : 0.f;
+      actual_fp32[m * N + n] = Fp16ToFp32(C16[m * N + n]);
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(actual_fp32.data(), expect_fp32.data(), M * N, 5e-2));
+  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) {
+  InitDSPRuntime(); const int M=256,K=256,N=256; std::vector<int> a_shape={M,K}; std::vector<int> b_shape={K,N}; std::vector<int> out_shape={M,N};
+  std::vector<int> bias_shape={M,N};
+  auto t_A=new lite::Tensor(kNumberTypeInt32,a_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_B=new lite::Tensor(kNumberTypeInt32,b_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_bias=new lite::Tensor(kNumberTypeInt32,bias_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_out=new lite::Tensor(kNumberTypeInt32,out_shape,NHWC,lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
+  auto A=reinterpret_cast<int32_t*>(t_A->MutableData()); auto B=reinterpret_cast<int32_t*>(t_B->MutableData());
+  auto bias=reinterpret_cast<int32_t*>(t_bias->MutableData()); auto C=reinterpret_cast<int32_t*>(t_out->MutableData());
+  for(int i=0;i<M*K;++i){ A[i]=(i%11)-5; }
+  for(int i=0;i<K*N;++i){ B[i]=(i%13)-6; }
+  for(int i=0;i<M*N;++i){ bias[i]=(i%9)-4; }
+  std::memset(C,0,M*N*sizeof(int32_t));
+  std::vector<lite::Tensor*> inputs_{t_A,t_B,t_bias}; std::vector<lite::Tensor*> outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast<int>(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeInt32,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast<OpParameter*>(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK);
+  std::vector<int32_t> expect(M * N, 0);
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      long long sum = 0;
+      for (int k = 0; k < K; ++k) {
+        sum += static_cast<long long>(A[m * K + k]) * B[k * N + n];
+      }
+      sum += static_cast<long long>(bias[m * N + n]);
+      expect[m * N + n] = static_cast<int32_t>(sum > 0 ? sum : 0);
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f));
+  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) {
+  InitDSPRuntime(); const int M=256,K=256,N=256; std::vector<int> a_shape={M,K}; std::vector<int> b_shape={K,N}; std::vector<int> out_shape={M,N}; std::vector<int> bias_shape={M,N};
+  auto t_A=new lite::Tensor(kNumberTypeInt16,a_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_B=new lite::Tensor(kNumberTypeInt16,b_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_bias=new lite::Tensor(kNumberTypeInt16,bias_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_out=new lite::Tensor(kNumberTypeInt16,out_shape,NHWC,lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
+  auto A=reinterpret_cast<int16_t*>(t_A->MutableData()); auto B=reinterpret_cast<int16_t*>(t_B->MutableData());
+  auto bias=reinterpret_cast<int16_t*>(t_bias->MutableData()); auto C=reinterpret_cast<int16_t*>(t_out->MutableData());
+  for(int i=0;i<M*K;++i){ A[i]=static_cast<int16_t>((i%21)-10);} for(int i=0;i<K*N;++i){ B[i]=static_cast<int16_t>((i%19)-9);} for(int i=0;i<M*N;++i){ bias[i]=static_cast<int16_t>(i%15); }
+  std::memset(C,0,M*N*sizeof(int16_t));
+  std::vector<lite::Tensor*> inputs_{t_A,t_B,t_bias}; std::vector<lite::Tensor*> outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast<int>(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeInt16,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast<OpParameter*>(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK);
+  std::vector<int16_t> expect(M * N, 0);
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      long long sum = 0;
+      for (int k = 0; k < K; ++k) {
+        sum += static_cast<long long>(A[m * K + k]) * B[k * N + n];
+      }
+      sum += static_cast<long long>(bias[m * N + n]);
+      sum = sum > 0 ? sum : 0;
+      if (sum > std::numeric_limits<int16_t>::max()) sum = std::numeric_limits<int16_t>::max();
+      expect[m * N + n] = static_cast<int16_t>(sum);
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f));
+  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+}
+
+TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) {
+  InitDSPRuntime(); const int M=256,K=256,N=256; std::vector<int> a_shape={M,K}; std::vector<int> b_shape={K,N}; std::vector<int> out_shape={M,N}; std::vector<int> bias_shape={M,N};
+  auto t_A=new lite::Tensor(kNumberTypeComplex64,a_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_B=new lite::Tensor(kNumberTypeComplex64,b_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_bias=new lite::Tensor(kNumberTypeComplex64,bias_shape,NHWC,lite::Category::CONST_TENSOR);
+  auto t_out=new lite::Tensor(kNumberTypeComplex64,out_shape,NHWC,lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
+  auto A=reinterpret_cast<float*>(t_A->MutableData()); auto B=reinterpret_cast<float*>(t_B->MutableData());
+  auto bias=reinterpret_cast<float*>(t_bias->MutableData()); auto C=reinterpret_cast<float*>(t_out->MutableData()); // complex64 stored as interleaved real,imag
+  for(int i=0;i<M*K;++i){ A[2*i] = 0.01f * (i%17); A[2*i+1] = 0.02f * (i%19); }
+  for(int i=0;i<K*N;++i){ B[2*i] = 0.03f * (i%23); B[2*i+1] = 0.01f * (i%29); }
+  for(int i=0;i<M*N;++i){ bias[2*i] = 0.002f * (i%31); bias[2*i+1] = 0.001f * (i%37); }
+  std::memset(C,0,M*N*2*sizeof(float));
+  std::vector<lite::Tensor*> inputs_{t_A,t_B,t_bias}; std::vector<lite::Tensor*> outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast<int>(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeComplex64,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast<OpParameter*>(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK);
+  std::vector<float> expect(2 * M * N, 0.f);
+  std::vector<float> actual(2 * M * N, 0.f);
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float real = 0.f;
+      float imag = 0.f;
+      for (int k = 0; k < K; ++k) {
+        float ar = A[2 * (m * K + k)];
+        float ai = A[2 * (m * K + k) + 1];
+        float br = B[2 * (k * N + n)];
+        float bi = B[2 * (k * N + n) + 1];
+        real += ar * br - ai * bi;
+        imag += ar * bi + ai * br;
+      }
+      real += bias[2 * (m * N + n)];
+      imag += bias[2 * (m * N + n) + 1];
+      if (real < 0.f) real = 0.f;
+      expect[2 * (m * N + n)] = real;
+      expect[2 * (m * N + n) + 1] = imag;
+      actual[2 * (m * N + n)] = C[2 * (m * N + n)];
+      actual[2 * (m * N + n) + 1] = C[2 * (m * N + n) + 1];
+    }
+  }
+  ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2));
+  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+}
+
+} // namespace mindspore::lite::dsp::test
-- 
Gitee


From b0c155949899390a967098a955b49d67a01492c4 Mon Sep 17 00:00:00 2001
From: mzy <929449726@qq.com>
Date: Fri, 7 Nov 2025 16:21:20 +0000
Subject: [PATCH 4/7] reformat matmulfusion kernel and tests

---
 .../litert/kernel/dsp/ft04/matmulfusion.cc    |  13 +-
 .../runtime/kernel/dsp/matmulfusion_tests.cc  | 363 ++++++++++++++----
 2 files changed, 294 insertions(+), 82 deletions(-)

diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
index 4982c9d1..602507e0 100644
--- a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
@@ -79,7 +79,9 @@ int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const {
     MS_LOG(WARNING) << "Inner dimension mismatch: " << aK << " vs " << bK;
     return RET_ERROR;
   }
-  *M = aM; *K = aK; *N = bN;
+  *M = aM;
+  *K = aK;
+  *N = bN;
   return RET_OK;
 }
 
@@ -91,11 +93,14 @@ int MatMulFusionDSPKernel::GetActTypeCode(int *code) const {
   if (param != nullptr) {
     switch (param->act_type_) {
       case ActType_Relu:
-        act = 1; break;
+        act = 1;
+        break;
       case ActType_Relu6:
-        act = 2; break;  // DSP uses 2 for RELU6, nnacl uses enum value 3
+        act = 2;
+        break;  // DSP uses 2 for RELU6, nnacl uses enum value 3
       default:
-        act = 0; break;
+        act = 0;
+        break;
     }
   }
   *code = act;
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
index 07eff1bd..42508223 100644
--- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
@@ -31,7 +31,9 @@ namespace mindspore::lite::dsp::test {
 class TestDSP_MatMulFusion : public DSPCommonTest {};
 
 static void FillFloat(float *data, int size, float base = 0.1f) {
-  for (int i = 0; i < size; ++i) { data[i] = base * static_cast<float>((i % 10)); }
+  for (int i = 0; i < size; ++i) {
+    data[i] = base * static_cast<float>((i % 10));
+  }
 }
 
 typedef uint16_t float16_t_u;
@@ -64,7 +66,11 @@ static inline float16_t_u Fp32ToFp16Bits(float v) {
     uint32_t remainder = mantissa & 0x1FFF;
     if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) {
       mantissa_rounded++;
-      if (mantissa_rounded == 0x400) { mantissa_rounded = 0; exponent++; if (exponent > 30) return static_cast<uint16_t>((sign << 15) | 0x7C00); }
+      if (mantissa_rounded == 0x400) {
+        mantissa_rounded = 0;
+        exponent++;
+        if (exponent > 30) return static_cast<uint16_t>((sign << 15) | 0x7C00);
+      }
     }
     result = static_cast<uint16_t>((sign << 15) | (static_cast<uint32_t>(exponent) << 10) | (mantissa_rounded & 0x3FF));
   }
@@ -83,24 +89,36 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) {
   auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR);
   auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR);
   auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
-  FillFloat(reinterpret_cast<float*>(t_A->MutableData()), M*K, 0.02f);
-  FillFloat(reinterpret_cast<float*>(t_B->MutableData()), K*N, 0.03f);
-  FillFloat(reinterpret_cast<float*>(t_bias->MutableData()), M*N, 0.005f);
-  std::memset(t_out->MutableData(), 0, M*N*sizeof(float));
-  std::vector<lite::Tensor*> inputs_{t_A, t_B, t_bias};
-  std::vector<lite::Tensor*> outputs_{t_out};
-  auto ctx = new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init());
-  auto *param = new MatMulParameter(); param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
-  param->act_type_ = ActType_Relu; param->has_bias_ = true; param->row_=M; param->col_=N; param->deep_=K;
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  FillFloat(reinterpret_cast<float *>(t_A->MutableData()), M * K, 0.02f);
+  FillFloat(reinterpret_cast<float *>(t_B->MutableData()), K * N, 0.03f);
+  FillFloat(reinterpret_cast<float *>(t_bias->MutableData()), M * N, 0.005f);
+  std::memset(t_out->MutableData(), 0, M * N * sizeof(float));
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
   kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion};
-  auto creator = KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr);
-  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter*>(param), ctx, key); ASSERT_NE(kernel,nullptr);
-  ASSERT_EQ(kernel->Prepare(), lite::RET_OK); ASSERT_EQ(kernel->Run(), lite::RET_OK);
-  auto A = reinterpret_cast<float*>(t_A->MutableData());
-  auto B = reinterpret_cast<float*>(t_B->MutableData());
-  auto bias = reinterpret_cast<float*>(t_bias->MutableData());
-  auto C = reinterpret_cast<float*>(t_out->MutableData());
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
+  auto A = reinterpret_cast<float *>(t_A->MutableData());
+  auto B = reinterpret_cast<float *>(t_B->MutableData());
+  auto bias = reinterpret_cast<float *>(t_bias->MutableData());
+  auto C = reinterpret_cast<float *>(t_out->MutableData());
   std::vector<float> expect(M * N, 0.f);
   for (int m = 0; m < M; ++m) {
     for (int n = 0; n < N; ++n) {
@@ -113,31 +131,93 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) {
     }
   }
   ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3));
-  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
 }
 
 TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) {
   InitDSPRuntime();
   const int M = 256, K = 256, N = 256;
-  std::vector<int> a_shape={M,K}; std::vector<int> b_shape={K,N}; std::vector<int> out_shape={M,N};
-  std::vector<int> bias_shape={M,N};
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};
   auto t_A = new lite::Tensor(kNumberTypeFloat16, a_shape, NHWC, lite::Category::CONST_TENSOR);
   auto t_B = new lite::Tensor(kNumberTypeFloat16, b_shape, NHWC, lite::Category::CONST_TENSOR);
   auto t_bias = new lite::Tensor(kNumberTypeFloat16, bias_shape, NHWC, lite::Category::CONST_TENSOR);
   auto t_out = new lite::Tensor(kNumberTypeFloat16, out_shape, NHWC, lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
-  auto A16 = reinterpret_cast<uint16_t*>(t_A->MutableData()); auto B16 = reinterpret_cast<uint16_t*>(t_B->MutableData());
-  auto bias16 = reinterpret_cast<uint16_t*>(t_bias->MutableData()); auto C16 = reinterpret_cast<uint16_t*>(t_out->MutableData());
-  for(int i=0;i<M*K;++i){ A16[i] = Fp32ToFp16Bits(0.01f * static_cast<float>(i % 13)); }
-  for(int i=0;i<K*N;++i){ B16[i] = Fp32ToFp16Bits(0.02f * static_cast<float>(i % 17)); }
-  for(int i=0;i<M*N;++i){ bias16[i] = Fp32ToFp16Bits(0.003f * static_cast<float>(i % 11)); }
-  std::memset(C16,0,M*N*sizeof(uint16_t));
-  std::vector<lite::Tensor*> inputs_{t_A,t_B,t_bias}; std::vector<lite::Tensor*> outputs_{t_out};
-  auto ctx = new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init());
-  auto *param=new MatMulParameter(); param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
-  param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K;
-  kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeFloat16,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast<OpParameter*>(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK);
-  auto Fp16ToFp32=[&](uint16_t h){ uint32_t sign=(h & 0x8000) << 16; uint32_t exp=(h & 0x7C00)>>10; uint32_t frac=(h & 0x03FF); uint32_t fexp,ffrac; if(exp==0){ if(frac==0){ fexp=0; ffrac=0;} else { int shift=0; while((frac & 0x0200)==0){ frac <<=1; ++shift;} frac &=0x03FF; fexp=127-15-shift; ffrac=frac<<13; } } else if(exp==0x1F){ fexp=255; ffrac=frac<<13; } else { fexp=exp-15+127; ffrac=frac<<13; } uint32_t bits= sign | (fexp<<23) | ffrac; float out; std::memcpy(&out,&bits,sizeof(out)); return out; };
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  auto A16 = reinterpret_cast<uint16_t *>(t_A->MutableData());
+  auto B16 = reinterpret_cast<uint16_t *>(t_B->MutableData());
+  auto bias16 = reinterpret_cast<uint16_t *>(t_bias->MutableData());
+  auto C16 = reinterpret_cast<uint16_t *>(t_out->MutableData());
+  for (int i = 0; i < M * K; ++i) {
+    A16[i] = Fp32ToFp16Bits(0.01f * static_cast<float>(i % 13));
+  }
+  for (int i = 0; i < K * N; ++i) {
+    B16[i] = Fp32ToFp16Bits(0.02f * static_cast<float>(i % 17));
+  }
+  for (int i = 0; i < M * N; ++i) {
+    bias16[i] = Fp32ToFp16Bits(0.003f * static_cast<float>(i % 11));
+  }
+  std::memset(C16, 0, M * N * sizeof(uint16_t));
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
+  auto Fp16ToFp32 = [&](uint16_t h) {
+    uint32_t sign = (h & 0x8000) << 16;
+    uint32_t exp = (h & 0x7C00) >> 10;
+    uint32_t frac = (h & 0x03FF);
+    uint32_t fexp, ffrac;
+    if (exp == 0) {
+      if (frac == 0) {
+        fexp = 0;
+        ffrac = 0;
+      } else {
+        int shift = 0;
+        while ((frac & 0x0200) == 0) {
+          frac <<= 1;
+          ++shift;
+        }
+        frac &= 0x03FF;
+        fexp = 127 - 15 - shift;
+        ffrac = frac << 13;
+      }
+    } else if (exp == 0x1F) {
+      fexp = 255;
+      ffrac = frac << 13;
+    } else {
+      fexp = exp - 15 + 127;
+      ffrac = frac << 13;
+    }
+    uint32_t bits = sign | (fexp << 23) | ffrac;
+    float out;
+    std::memcpy(&out, &bits, sizeof(out));
+    return out;
+  };
   std::vector<float> expect_fp32(M * N, 0.f);
   std::vector<float> actual_fp32(M * N, 0.f);
   for (int m = 0; m < M; ++m) {
@@ -154,24 +234,62 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) {
     }
   }
   ASSERT_EQ(0, CompareOutputData(actual_fp32.data(), expect_fp32.data(), M * N, 5e-2));
-  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
 }
 
 TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) {
-  InitDSPRuntime(); const int M=256,K=256,N=256; std::vector<int> a_shape={M,K}; std::vector<int> b_shape={K,N}; std::vector<int> out_shape={M,N};
-  std::vector<int> bias_shape={M,N};
-  auto t_A=new lite::Tensor(kNumberTypeInt32,a_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_B=new lite::Tensor(kNumberTypeInt32,b_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_bias=new lite::Tensor(kNumberTypeInt32,bias_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_out=new lite::Tensor(kNumberTypeInt32,out_shape,NHWC,lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
-  auto A=reinterpret_cast<int32_t*>(t_A->MutableData()); auto B=reinterpret_cast<int32_t*>(t_B->MutableData());
-  auto bias=reinterpret_cast<int32_t*>(t_bias->MutableData()); auto C=reinterpret_cast<int32_t*>(t_out->MutableData());
-  for(int i=0;i<M*K;++i){ A[i]=(i%11)-5; }
-  for(int i=0;i<K*N;++i){ B[i]=(i%13)-6; }
-  for(int i=0;i<M*N;++i){ bias[i]=(i%9)-4; }
-  std::memset(C,0,M*N*sizeof(int32_t));
-  std::vector<lite::Tensor*> inputs_{t_A,t_B,t_bias}; std::vector<lite::Tensor*> outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast<int>(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeInt32,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast<OpParameter*>(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK);
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};
+  auto t_A = new lite::Tensor(kNumberTypeInt32, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeInt32, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeInt32, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeInt32, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  auto A = reinterpret_cast<int32_t *>(t_A->MutableData());
+  auto B = reinterpret_cast<int32_t *>(t_B->MutableData());
+  auto bias = reinterpret_cast<int32_t *>(t_bias->MutableData());
+  auto C = reinterpret_cast<int32_t *>(t_out->MutableData());
+  for (int i = 0; i < M * K; ++i) {
+    A[i] = (i % 11) - 5;
+  }
+  for (int i = 0; i < K * N; ++i) {
+    B[i] = (i % 13) - 6;
+  }
+  for (int i = 0; i < M * N; ++i) {
+    bias[i] = (i % 9) - 4;
+  }
+  std::memset(C, 0, M * N * sizeof(int32_t));
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
   std::vector<int32_t> expect(M * N, 0);
   for (int m = 0; m < M; ++m) {
     for (int n = 0; n < N; ++n) {
@@ -184,21 +302,62 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) {
     }
   }
   ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f));
-  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
 }
 
 TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) {
-  InitDSPRuntime(); const int M=256,K=256,N=256; std::vector<int> a_shape={M,K}; std::vector<int> b_shape={K,N}; std::vector<int> out_shape={M,N}; std::vector<int> bias_shape={M,N};
-  auto t_A=new lite::Tensor(kNumberTypeInt16,a_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_B=new lite::Tensor(kNumberTypeInt16,b_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_bias=new lite::Tensor(kNumberTypeInt16,bias_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_out=new lite::Tensor(kNumberTypeInt16,out_shape,NHWC,lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
-  auto A=reinterpret_cast<int16_t*>(t_A->MutableData()); auto B=reinterpret_cast<int16_t*>(t_B->MutableData());
-  auto bias=reinterpret_cast<int16_t*>(t_bias->MutableData()); auto C=reinterpret_cast<int16_t*>(t_out->MutableData());
-  for(int i=0;i<M*K;++i){ A[i]=static_cast<int16_t>((i%21)-10);} for(int i=0;i<K*N;++i){ B[i]=static_cast<int16_t>((i%19)-9);} for(int i=0;i<M*N;++i){ bias[i]=static_cast<int16_t>(i%15); }
-  std::memset(C,0,M*N*sizeof(int16_t));
-  std::vector<lite::Tensor*> inputs_{t_A,t_B,t_bias}; std::vector<lite::Tensor*> outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast<int>(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeInt16,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast<OpParameter*>(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK);
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};
+  auto t_A = new lite::Tensor(kNumberTypeInt16, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeInt16, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeInt16, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeInt16, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  auto A = reinterpret_cast<int16_t *>(t_A->MutableData());
+  auto B = reinterpret_cast<int16_t *>(t_B->MutableData());
+  auto bias = reinterpret_cast<int16_t *>(t_bias->MutableData());
+  auto C = reinterpret_cast<int16_t *>(t_out->MutableData());
+  for (int i = 0; i < M * K; ++i) {
+    A[i] = static_cast<int16_t>((i % 21) - 10);
+  }
+  for (int i = 0; i < K * N; ++i) {
+    B[i] = static_cast<int16_t>((i % 19) - 9);
+  }
+  for (int i = 0; i < M * N; ++i) {
+    bias[i] = static_cast<int16_t>(i % 15);
+  }
+  std::memset(C, 0, M * N * sizeof(int16_t));
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
   std::vector<int16_t> expect(M * N, 0);
   for (int m = 0; m < M; ++m) {
     for (int n = 0; n < N; ++n) {
@@ -213,23 +372,65 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) {
     }
   }
   ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f));
-  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
 }
 
 TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) {
-  InitDSPRuntime(); const int M=256,K=256,N=256; std::vector<int> a_shape={M,K}; std::vector<int> b_shape={K,N}; std::vector<int> out_shape={M,N}; std::vector<int> bias_shape={M,N};
-  auto t_A=new lite::Tensor(kNumberTypeComplex64,a_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_B=new lite::Tensor(kNumberTypeComplex64,b_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_bias=new lite::Tensor(kNumberTypeComplex64,bias_shape,NHWC,lite::Category::CONST_TENSOR);
-  auto t_out=new lite::Tensor(kNumberTypeComplex64,out_shape,NHWC,lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_);
-  auto A=reinterpret_cast<float*>(t_A->MutableData()); auto B=reinterpret_cast<float*>(t_B->MutableData());
-  auto bias=reinterpret_cast<float*>(t_bias->MutableData()); auto C=reinterpret_cast<float*>(t_out->MutableData()); // complex64 stored as interleaved real,imag
-  for(int i=0;i<M*K;++i){ A[2*i] = 0.01f * (i%17); A[2*i+1] = 0.02f * (i%19); }
-  for(int i=0;i<K*N;++i){ B[2*i] = 0.03f * (i%23); B[2*i+1] = 0.01f * (i%29); }
-  for(int i=0;i<M*N;++i){ bias[2*i] = 0.002f * (i%31); bias[2*i+1] = 0.001f * (i%37); }
-  std::memset(C,0,M*N*2*sizeof(float));
-  std::vector<lite::Tensor*> inputs_{t_A,t_B,t_bias}; std::vector<lite::Tensor*> outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast<int>(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeComplex64,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast<OpParameter*>(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK);
+  InitDSPRuntime();
+  const int M = 256, K = 256, N = 256;
+  std::vector<int> a_shape = {M, K};
+  std::vector<int> b_shape = {K, N};
+  std::vector<int> out_shape = {M, N};
+  std::vector<int> bias_shape = {M, N};
+  auto t_A = new lite::Tensor(kNumberTypeComplex64, a_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_B = new lite::Tensor(kNumberTypeComplex64, b_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_bias = new lite::Tensor(kNumberTypeComplex64, bias_shape, NHWC, lite::Category::CONST_TENSOR);
+  auto t_out = new lite::Tensor(kNumberTypeComplex64, out_shape, NHWC, lite::Category::CONST_TENSOR);
+  t_A->MallocData(allocator_);
+  t_B->MallocData(allocator_);
+  t_bias->MallocData(allocator_);
+  t_out->MallocData(allocator_);
+  auto A = reinterpret_cast<float *>(t_A->MutableData());
+  auto B = reinterpret_cast<float *>(t_B->MutableData());
+  auto bias = reinterpret_cast<float *>(t_bias->MutableData());
+  auto C = reinterpret_cast<float *>(t_out->MutableData());  // complex64 stored as interleaved real,imag
+  for (int i = 0; i < M * K; ++i) {
+    A[2 * i] = 0.01f * (i % 17);
+    A[2 * i + 1] = 0.02f * (i % 19);
+  }
+  for (int i = 0; i < K * N; ++i) {
+    B[2 * i] = 0.03f * (i % 23);
+    B[2 * i + 1] = 0.01f * (i % 29);
+  }
+  for (int i = 0; i < M * N; ++i) {
+    bias[2 * i] = 0.002f * (i % 31);
+    bias[2 * i + 1] = 0.001f * (i % 37);
+  }
+  std::memset(C, 0, M * N * 2 * sizeof(float));
+  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
+  std::vector<lite::Tensor *> outputs_{t_out};
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = new MatMulParameter();
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
+  param->act_type_ = ActType_Relu;
+  param->has_bias_ = true;
+  param->row_ = M;
+  param->col_ = N;
+  param->deep_ = K;
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC, schema::PrimitiveType_MatMulFusion};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
+  ASSERT_EQ(kernel->Run(), lite::RET_OK);
   std::vector<float> expect(2 * M * N, 0.f);
   std::vector<float> actual(2 * M * N, 0.f);
   for (int m = 0; m < M; ++m) {
@@ -254,7 +455,13 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) {
     }
   }
   ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2));
-  UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out;
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete t_A;
+  delete t_B;
+  delete t_bias;
+  delete t_out;
 }
 
-} // namespace mindspore::lite::dsp::test
+}  // namespace mindspore::lite::dsp::test
-- 
Gitee


From 0903f9f32c1aeed1436ac40c1ac43b427a2f10e1 Mon Sep 17 00:00:00 2001
From: mzy <929449726@qq.com>
Date: Sat, 8 Nov 2025 10:14:46 +0000
Subject: [PATCH 5/7] add ft78 ragged_range

---
 .../litert/kernel/dsp/ft78/ragged_range.cc    | 142 +++++++++++
 .../src/litert/kernel/dsp/ft78/ragged_range.h |  50 ++++
 .../runtime/kernel/dsp/ragged_range_tests.cc  | 234 ++++++++++++++++++
 3 files changed, 426 insertions(+)
 create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc
 create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h

diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc
new file mode 100644
index 00000000..610644ad
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc
@@ -0,0 +1,142 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft78/ragged_range.h"
+#include <cstdint>
+#include <string>
+#include "src/litert/kernel_registry.h"
+#include "schema/inner/model_generated.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_RaggedRange;
+
+namespace mindspore::kernel {
+// Validates tensor arity only: exactly three inputs and two outputs are accepted.
+int RaggedRangeDSPKernel::CheckSpecs() {
+  const bool io_ok = in_tensors_.size() == 3 && out_tensors_.size() == 2;
+  if (io_ok) return RET_OK;
+  MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size()
+                  << ", out: " << out_tensors_.size();
+  return RET_ERROR;
+}
+
+int RaggedRangeDSPKernel::Prepare() { return RET_OK; }  // no-op: always reports success
+
+// Works out how many ragged rows to generate. An input with an empty shape counts as a scalar; all
+// non-scalar inputs must share the same leading dimension, otherwise RET_ERROR is returned.
+int RaggedRangeDSPKernel::CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar) {
+  if (rows == nullptr || starts_scalar == nullptr || limits_scalar == nullptr || deltas_scalar == nullptr) {
+    return RET_ERROR;
+  }
+  constexpr int kInputNum = 3;
+  constexpr int kUnset = -1;  // no non-scalar input seen so far
+  bool *const is_scalar[kInputNum] = {starts_scalar, limits_scalar, deltas_scalar};
+  // Publish every scalar flag first so the flags are written even when the row check below fails.
+  for (int i = 0; i < kInputNum; ++i) {
+    *is_scalar[i] = in_tensors_[i]->shape().empty();
+  }
+  int common_rows = kUnset;
+  for (int i = 0; i < kInputNum; ++i) {
+    if (*is_scalar[i]) {
+      continue;
+    }
+    const int dim0 = in_tensors_[i]->shape()[0];
+    if (common_rows == kUnset) {
+      common_rows = dim0;
+    } else if (common_rows != dim0) {
+      return RET_ERROR;
+    }
+  }
+  // With only scalar inputs a single row is produced.
+  *rows = (common_rows != kUnset) ? common_rows : 1;
+  return RET_OK;
+}
+
+int RaggedRangeDSPKernel::RunFp32() {  // launcher chosen by Run() for kNumberTypeFloat32 values
+  kernel_name_ = "fp_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunFp64() {  // launcher chosen by Run() for kNumberTypeFloat64 values
+  kernel_name_ = "dp_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt32() {  // launcher chosen by Run() for kNumberTypeInt32 values
+  kernel_name_ = "i32_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt16() {  // launcher chosen by Run() for kNumberTypeInt16 values
+  kernel_name_ = "i16_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt8() {  // launcher chosen by Run() for kNumberTypeInt8 values
+  kernel_name_ = "i8_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+// Stages the kernel arguments (device addresses of the inputs/outputs plus the row count) and launches the
+// launcher matching the dtype of output 1. Returns RET_ERROR on inconsistent shapes or an unsupported dtype.
+int RaggedRangeDSPKernel::Run() {
+  int rows = 0;
+  bool starts_scalar = false;
+  bool limits_scalar = false;
+  bool deltas_scalar = false;
+  if (CalcRows(&rows, &starts_scalar, &limits_scalar, &deltas_scalar) != RET_OK) {
+    MS_LOG(ERROR) << "RaggedRange rows check failed.";
+    return RET_ERROR;
+  }
+  auto allocator = dsp_runtime_->GetAllocator();
+  uint64_t starts_dev = allocator->GetDeviceMemPtr(in_tensors_[0]->data());
+  uint64_t limits_dev = allocator->GetDeviceMemPtr(in_tensors_[1]->data());
+  uint64_t deltas_dev = allocator->GetDeviceMemPtr(in_tensors_[2]->data());
+  uint64_t splits_dev = allocator->GetDeviceMemPtr(out_tensors_[0]->data());
+  uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data());
+  // Zero-extend the 32-bit row count into the 64-bit argument slot. The explicit cast yields the same bits as
+  // the former std::memcpy on little-endian hosts, without needing <cstring> or depending on host byte order.
+  const uint64_t rows_hex = static_cast<uint64_t>(static_cast<uint32_t>(rows));
+  // NOTE(review): values precedes splits here although splits is output 0 - presumably the DSP kernel ABI.
+  SetKernelArg({starts_dev, limits_dev, deltas_dev, rows_hex, values_dev, splits_dev});
+  auto out_dt = out_tensors_[1]->data_type();
+  switch (out_dt) {
+    case kNumberTypeFloat32:
+      return RunFp32();
+    case kNumberTypeFloat64:
+      return RunFp64();
+    case kNumberTypeInt32:
+      return RunInt32();
+    case kNumberTypeInt16:
+      return RunInt16();
+    case kNumberTypeInt8:
+      return RunInt8();
+    default:
+      MS_LOG(ERROR) << "RaggedRange unsupported output dtype: " << static_cast<int>(out_dt);
+      return RET_ERROR;
+  }
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat64, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt8, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+}  // namespace mindspore::kernel
+
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h
new file mode 100644
index 00000000..8ce03e76
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_
+
+#include <string>
+#include <vector>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+class RaggedRangeDSPKernel : public DSPKernel {  // RaggedRange operator kernel for the FT78 DSP backend
+ public:
+  RaggedRangeDSPKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : DSPKernel(parameter, inputs, outputs, ctx) {}
+  ~RaggedRangeDSPKernel() override = default;
+
+  int CheckSpecs() override;  // requires exactly 3 inputs and 2 outputs
+  int Prepare() override;     // currently a no-op that returns RET_OK
+  int Run() override;         // stages kernel args, then dispatches on the dtype of output 1
+
+ private:
+  int RunFp32();   // each RunXxx helper sets kernel_name_ for its dtype and calls RunKernel
+  int RunFp64();
+  int RunInt32();
+  int RunInt16();
+  int RunInt8();
+
+  int CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar);  // rows from input shapes
+
+  std::string kernel_name_;    // kernel name handed to RunKernel; assigned by the RunXxx helpers
+  uint64_t core_mask_{0xff};   // passed to RunKernel; NOTE(review): 0xff presumably enables 8 cores - confirm
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
index a3047238..6d05fb6e 100644
--- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
@@ -256,6 +256,7 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Int32) {
   delete kernel;
 }
 
+#ifdef SUPPORT_FT04
 TEST_F(TestDSP_RaggedRange, RaggedRange_Fp16) {
   InitDSPRuntime();
   std::vector<lite::Tensor *> inputs_;
@@ -334,7 +335,9 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Fp16) {
   for (auto t : outputs_) delete t;
   delete kernel;
 }
+#endif
 
+#ifdef SUPPORT_FT04
 TEST_F(TestDSP_RaggedRange, RaggedRange_Int16) {
   InitDSPRuntime();
   std::vector<lite::Tensor *> inputs_;
@@ -405,5 +408,236 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Int16) {
   for (auto t : outputs_) delete t;
   delete kernel;
 }
+#endif
+
+#ifdef SUPPORT_FT78
+TEST_F(TestDSP_RaggedRange, RaggedRange_Int16_FT78) {
+  // Runs the int16 RaggedRange DSP kernel on three rows and checks the row splits and the
+  // flattened values against a reference sequence built on the host.
+  InitDSPRuntime();
+  std::vector<lite::Tensor *> inputs_;
+  std::vector<lite::Tensor *> outputs_;
+  std::vector<int> vec3 = {3};
+  auto t_starts = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_starts->MallocData(allocator_);
+  auto t_limits = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_limits->MallocData(allocator_);
+  auto t_deltas = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_deltas->MallocData(allocator_);
+  inputs_ = {t_starts, t_limits, t_deltas};
+
+  int16_t starts_host[3] = {-12, 0, 90};
+  int16_t limits_host[3] = {-2, 30, 100};
+  int16_t deltas_host[3] = {3, 5, 2};
+  std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host));
+  std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host));
+  std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host));
+  // Outputs: 4 (rows + 1) int32 split offsets and an oversized 256-element int16 values buffer.
+  auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_splits->MallocData(allocator_);
+  auto t_values = new lite::Tensor(kNumberTypeInt16, {256}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_values->MallocData(allocator_);
+  outputs_.push_back(t_splits);
+  outputs_.push_back(t_values);
+  // Zero-fill both outputs before the kernel runs.
+  std::fill_n(reinterpret_cast<int32_t *>(t_splits->MutableData()), 4, 0);
+  std::fill_n(reinterpret_cast<int16_t *>(t_values->MutableData()), 256, static_cast<int16_t>(0));
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  // ASSERT on creator/kernel so a missing registration fails the test instead of dereferencing null.
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_RaggedRange};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto *param = new OpParameter();
+  param->type_ = static_cast<int>(schema::PrimitiveType_RaggedRange);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(lite::RET_OK, kernel->Prepare());
+  ASSERT_EQ(lite::RET_OK, kernel->Run());
+  // Host reference: splits[r] is the number of values emitted before row r; splits[3] is the total.
+  std::vector<int32_t> expect_splits(4, 0);
+  std::vector<int16_t> expect_values;
+  int32_t acc = 0;
+  for (int r = 0; r < 3; ++r) {
+    expect_splits[r] = acc;
+    for (int v = static_cast<int>(starts_host[r]);
+         deltas_host[r] > 0 ? v < static_cast<int>(limits_host[r]) : v > static_cast<int>(limits_host[r]);
+         v += static_cast<int>(deltas_host[r])) {
+      expect_values.push_back(static_cast<int16_t>(v));
+    }
+    acc = static_cast<int32_t>(expect_values.size());
+  }
+  expect_splits[3] = acc;
+
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int32_t *>(outputs_[0]->MutableData()), expect_splits.data(), 4));
+  ASSERT_EQ(0, CompareOutputData(reinterpret_cast<int16_t *>(outputs_[1]->MutableData()), expect_values.data(), acc));
+
+  UninitDSPRuntime();
+  delete ctx;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+  delete kernel;
+}
+#endif
+
+#ifdef SUPPORT_FT78
+TEST_F(TestDSP_RaggedRange, RaggedRange_Fp64) {
+  // Runs the float64 RaggedRange DSP kernel on four identical rows (-5.0 up to, not including, 0.0
+  // in steps of 0.25) and checks splits exactly and values within 1e-6 of a host reference.
+  InitDSPRuntime();
+  std::vector<lite::Tensor *> inputs_;
+  std::vector<lite::Tensor *> outputs_;
+
+  std::vector<int> vec4 = {4};
+  auto t_starts = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_starts->MallocData(allocator_);
+  auto t_limits = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_limits->MallocData(allocator_);
+  auto t_deltas = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_deltas->MallocData(allocator_);
+  inputs_ = {t_starts, t_limits, t_deltas};
+
+  double starts_host[4] = {-5.0, -5.0, -5.0, -5.0};
+  double limits_host[4] = {0.0, 0.0, 0.0, 0.0};
+  double deltas_host[4] = {0.25, 0.25, 0.25, 0.25};
+  std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host));
+  std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host));
+  std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host));
+  // Outputs: 5 (rows + 1) int32 split offsets and an oversized 512-element float64 values buffer.
+  auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_splits->MallocData(allocator_);
+  auto t_values = new lite::Tensor(kNumberTypeFloat64, {512}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_values->MallocData(allocator_);
+  outputs_.push_back(t_splits);
+  outputs_.push_back(t_values);
+  // Zero-fill both outputs before the kernel runs.
+  std::fill_n(reinterpret_cast<int32_t *>(t_splits->MutableData()), 5, 0);
+  std::fill_n(reinterpret_cast<double *>(t_values->MutableData()), 512, 0.0);
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  // ASSERT on creator/kernel so a missing registration fails the test instead of dereferencing null.
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat64, NHWC, schema::PrimitiveType_RaggedRange};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto *param = new OpParameter();
+  param->type_ = static_cast<int>(schema::PrimitiveType_RaggedRange);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(lite::RET_OK, kernel->Prepare());
+  ASSERT_EQ(lite::RET_OK, kernel->Run());
+  // Host reference: splits[r] is the number of values emitted before row r; splits[4] is the total.
+  std::vector<int32_t> expect_splits(5, 0);
+  std::vector<double> expect_values;
+  int32_t acc = 0;
+  for (int r = 0; r < 4; ++r) {
+    expect_splits[r] = acc;
+    for (double v = starts_host[r]; deltas_host[r] > 0 ? v < limits_host[r] : v > limits_host[r];
+         v += deltas_host[r]) {
+      expect_values.push_back(v);
+    }
+    acc = static_cast<int32_t>(expect_values.size());
+  }
+  expect_splits[4] = acc;
+
+  auto actual_splits_ptr = reinterpret_cast<int32_t *>(outputs_[0]->MutableData());
+  std::vector<int32_t> actual_splits(actual_splits_ptr, actual_splits_ptr + 5);
+  for (size_t i = 0; i < actual_splits.size(); ++i) {
+    EXPECT_EQ(expect_splits[i], actual_splits[i]) << "split index " << i;
+  }
+  // Only the first acc values are compared; the rest of the 512-element buffer is not inspected.
+  auto actual_values_ptr = reinterpret_cast<double *>(outputs_[1]->MutableData());
+  std::vector<double> actual_values(actual_values_ptr, actual_values_ptr + acc);
+  for (int i = 0; i < acc; ++i) {
+    EXPECT_NEAR(expect_values[i], actual_values[i], 1e-6) << "value index " << i;
+  }
+
+  UninitDSPRuntime();
+  delete ctx;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+  delete kernel;
+}
+
+TEST_F(TestDSP_RaggedRange, RaggedRange_Int8) {
+  // Runs the int8 RaggedRange DSP kernel on four rows with different starts, limits and steps and
+  // checks the row splits and the flattened values against a reference built on the host.
+  InitDSPRuntime();
+  std::vector<lite::Tensor *> inputs_;
+  std::vector<lite::Tensor *> outputs_;
+
+  std::vector<int> vec4 = {4};
+  auto t_starts = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_starts->MallocData(allocator_);
+  auto t_limits = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_limits->MallocData(allocator_);
+  auto t_deltas = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_deltas->MallocData(allocator_);
+  inputs_ = {t_starts, t_limits, t_deltas};
+
+  int8_t starts_host[4] = {-20, -10, 5, 100};
+  int8_t limits_host[4] = {-5, 10, 20, 110};
+  int8_t deltas_host[4] = {3, 4, 5, 1};
+  std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host));
+  std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host));
+  std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host));
+  // Outputs: 5 (rows + 1) int32 split offsets and an oversized 256-element int8 values buffer.
+  auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_splits->MallocData(allocator_);
+  auto t_values = new lite::Tensor(kNumberTypeInt8, {256}, mindspore::NHWC, lite::Category::CONST_TENSOR);
+  t_values->MallocData(allocator_);
+  outputs_.push_back(t_splits);
+  outputs_.push_back(t_values);
+  // Zero-fill both outputs before the kernel runs.
+  std::fill_n(reinterpret_cast<int32_t *>(t_splits->MutableData()), 5, 0);
+  std::fill_n(reinterpret_cast<int8_t *>(t_values->MutableData()), 256, static_cast<int8_t>(0));
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  // ASSERT on creator/kernel so a missing registration fails the test instead of dereferencing null.
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt8, NHWC, schema::PrimitiveType_RaggedRange};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto *param = new OpParameter();
+  param->type_ = static_cast<int>(schema::PrimitiveType_RaggedRange);
+  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+  ASSERT_EQ(lite::RET_OK, kernel->Prepare());
+  ASSERT_EQ(lite::RET_OK, kernel->Run());
+  // Host reference: splits[r] is the number of values emitted before row r; splits[4] is the total.
+  std::vector<int32_t> expect_splits(5, 0);
+  std::vector<int8_t> expect_values;
+  int32_t acc = 0;
+  for (int r = 0; r < 4; ++r) {
+    expect_splits[r] = acc;
+    for (int v = static_cast<int>(starts_host[r]);
+         deltas_host[r] > 0 ? v < static_cast<int>(limits_host[r]) : v > static_cast<int>(limits_host[r]);
+         v += static_cast<int>(deltas_host[r])) {
+      expect_values.push_back(static_cast<int8_t>(v));
+    }
+    acc = static_cast<int32_t>(expect_values.size());
+  }
+  expect_splits[4] = acc;
+
+  auto actual_splits_ptr = reinterpret_cast<int32_t *>(outputs_[0]->MutableData());
+  std::vector<int32_t> actual_splits(actual_splits_ptr, actual_splits_ptr + 5);
+  for (size_t i = 0; i < actual_splits.size(); ++i) {
+    EXPECT_EQ(expect_splits[i], actual_splits[i]) << "split index " << i;
+  }
+  // Only the first acc values are compared; the rest of the 256-element buffer is not inspected.
+  auto actual_values_ptr = reinterpret_cast<int8_t *>(outputs_[1]->MutableData());
+  std::vector<int8_t> actual_values(actual_values_ptr, actual_values_ptr + acc);
+  for (int i = 0; i < acc; ++i) {
+    EXPECT_EQ(expect_values[i], actual_values[i]) << "value index " << i;
+  }
+
+  UninitDSPRuntime();
+  delete ctx;
+  for (auto t : inputs_) delete t;
+  for (auto t : outputs_) delete t;
+  delete kernel;
+}
+#endif
 
 }  // namespace mindspore::lite::dsp::test
-- 
Gitee


From ac9083ff5c2809e02c31f2c29e41667b51e70cd4 Mon Sep 17 00:00:00 2001
From: mzy <929449726@qq.com>
Date: Sat, 8 Nov 2025 10:15:36 +0000
Subject: [PATCH 6/7] add ft78 ragged_range

---
 mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc      | 3 +--
 .../test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc       | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc
index 610644ad..ffb2966f 100644
--- a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc
@@ -109,7 +109,7 @@ int RaggedRangeDSPKernel::Run() {
   uint64_t limits_dev = allocator->GetDeviceMemPtr(in_tensors_[1]->data());
   uint64_t deltas_dev = allocator->GetDeviceMemPtr(in_tensors_[2]->data());
   uint64_t splits_dev = allocator->GetDeviceMemPtr(out_tensors_[0]->data());
-  uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data()); 
+  uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data());
   uint64_t rows_hex = 0;
   std::memcpy(&rows_hex, &rows, sizeof(int));
 
@@ -139,4 +139,3 @@ REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_RaggedRange, DSPKernelCreator<R
 REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
 REG_KERNEL(kDSP, kNumberTypeInt8, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
 }  // namespace mindspore::kernel
-
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
index 6d05fb6e..858718b7 100644
--- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
@@ -533,8 +533,7 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Fp64) {
   int32_t acc = 0;
   for (int r = 0; r < 4; ++r) {
     expect_splits[r] = acc;
-    for (double v = starts_host[r]; deltas_host[r] > 0 ? v < limits_host[r] : v > limits_host[r];
-         v += deltas_host[r]) {
+    for (double v = starts_host[r]; deltas_host[r] > 0 ? v < limits_host[r] : v > limits_host[r]; v += deltas_host[r]) {
       expect_values.push_back(v);
     }
     acc = static_cast<int32_t>(expect_values.size());
-- 
Gitee


From a32c2867698fb9d027c8a8f0b3837d32937ecf30 Mon Sep 17 00:00:00 2001
From: mzy <929449726@qq.com>
Date: Sat, 8 Nov 2025 17:19:23 +0000
Subject: [PATCH 7/7] remove matmulfusion

---
 .../litert/kernel/dsp/ft04/matmulfusion.cc    | 181 -------
 .../src/litert/kernel/dsp/ft04/matmulfusion.h |  51 --
 .../runtime/kernel/dsp/matmulfusion_tests.cc  | 467 ------------------
 3 files changed, 699 deletions(-)
 delete mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
 delete mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h
 delete mode 100644 mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc

diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
deleted file mode 100644
index 602507e0..00000000
--- a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-/**
- * Copyright 2025 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "src/litert/kernel/dsp/ft04/matmulfusion.h"
-#include <algorithm>
-#include <string>
-#include "src/litert/kernel_registry.h"
-#include "schema/inner/model_generated.h"
-#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h"
-
-using mindspore::kernel::KERNEL_ARCH::kDSP;
-using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-using mindspore::schema::PrimitiveType_MatMulFusion;
-
-namespace mindspore::kernel {
-
-int MatMulFusionDSPKernel::Prepare() { return RET_OK; }
-
-int MatMulFusionDSPKernel::CheckSpecs() {
-  // inputs: A, B, (optional) bias; output: C
-  if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 && in_tensors_.size() != INPUT_TENSOR_SIZE_3) {
-    MS_LOG(WARNING) << "MatMulFusion expects 2 or 3 inputs, got " << in_tensors_.size();
-    return RET_ERROR;
-  }
-  if (out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) {
-    MS_LOG(WARNING) << "MatMulFusion expects 1 output, got " << out_tensors_.size();
-    return RET_ERROR;
-  }
-  int M = 0, N = 0, K = 0;
-  if (GetMNK(&M, &N, &K) != RET_OK) {
-    MS_LOG(WARNING) << "MatMulFusion shape inference failed.";
-    return RET_ERROR;
-  }
-  // Bias check if present
-  if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
-    auto bias_shape = in_tensors_[2]->shape();
-    if (bias_shape.size() != 2 || bias_shape[0] != M || bias_shape[1] != N) {
-      MS_LOG(WARNING) << "Bias shape mismatch MxN: got " << bias_shape;
-      return RET_ERROR;
-    }
-  }
-  // Output shape check
-  auto out_shape = out_tensors_[0]->shape();
-  if (out_shape.size() != 2 || out_shape[0] != M || out_shape[1] != N) {
-    MS_LOG(WARNING) << "Output shape mismatch expected (" << M << "," << N << ")";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const {
-  if (M == nullptr || N == nullptr || K == nullptr) return RET_ERROR;
-  const auto &a_shape = in_tensors_[0]->shape();
-  const auto &b_shape = in_tensors_[1]->shape();
-  if (a_shape.size() != 2 || b_shape.size() != 2) {
-    MS_LOG(WARNING) << "A/B must be rank-2";
-    return RET_ERROR;
-  }
-  int aM = a_shape[0];
-  int aK = a_shape[1];
-  int bK = b_shape[0];
-  int bN = b_shape[1];
-  if (aK != bK) {
-    MS_LOG(WARNING) << "Inner dimension mismatch: " << aK << " vs " << bK;
-    return RET_ERROR;
-  }
-  *M = aM;
-  *K = aK;
-  *N = bN;
-  return RET_OK;
-}
-
-int MatMulFusionDSPKernel::GetActTypeCode(int *code) const {
-  if (code == nullptr) return RET_ERROR;
-  // Map ActType (nnacl) -> DSP activation code used in DSP functions (NONE=0, RELU=1, RELU6=2)
-  int act = 0;  // default NONE
-  auto *param = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  if (param != nullptr) {
-    switch (param->act_type_) {
-      case ActType_Relu:
-        act = 1;
-        break;
-      case ActType_Relu6:
-        act = 2;
-        break;  // DSP uses 2 for RELU6, nnacl uses enum value 3
-      default:
-        act = 0;
-        break;
-    }
-  }
-  *code = act;
-  return RET_OK;
-}
-
-int MatMulFusionDSPKernel::RunFp32() {
-  kernel_name_ = "fp_matmulfusion_s";
-  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
-}
-int MatMulFusionDSPKernel::RunFp16() {
-  kernel_name_ = "hp_matmulfusion_s";
-  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
-}
-int MatMulFusionDSPKernel::RunInt32() {
-  kernel_name_ = "i32_matmulfusion_s";
-  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
-}
-int MatMulFusionDSPKernel::RunInt16() {
-  kernel_name_ = "i16_matmulfusion_s";
-  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
-}
-int MatMulFusionDSPKernel::RunComplex64() {
-  kernel_name_ = "c64_matmulfusion_s";
-  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
-}
-
-int MatMulFusionDSPKernel::Run() {
-  int M = 0, N = 0, K = 0;
-  if (GetMNK(&M, &N, &K) != RET_OK) {
-    MS_LOG(ERROR) << "MatMulFusion GetMNK failed";
-    return RET_ERROR;
-  }
-  int act_code = 0;
-  (void)GetActTypeCode(&act_code);  // default 0 if not set
-
-  auto allocator = dsp_runtime_->GetAllocator();
-  uint64_t a_ptr = allocator->GetDeviceMemPtr(in_tensors_[0]->data());
-  uint64_t b_ptr = allocator->GetDeviceMemPtr(in_tensors_[1]->data());
-  uint64_t out_ptr = allocator->GetDeviceMemPtr(out_tensors_[0]->data());
-  uint64_t bias_ptr = 0;
-  if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) {
-    bias_ptr = allocator->GetDeviceMemPtr(in_tensors_[2]->data());
-  }
-  // Arg order must match DSP symbol prototype: A,B,C,bias,M,N,K,act_type
-  SetKernelArg({a_ptr, b_ptr, out_ptr, bias_ptr, static_cast<uint64_t>(M), static_cast<uint64_t>(N),
-                static_cast<uint64_t>(K), static_cast<uint64_t>(act_code)});
-
-  int ret = RET_ERROR;
-  auto dtype = in_tensors_[0]->data_type();
-  if (dtype == kNumberTypeFloat32) {
-    ret = RunFp32();
-  } else if (dtype == kNumberTypeFloat16) {
-    ret = RunFp16();
-  } else if (dtype == kNumberTypeInt32) {
-    ret = RunInt32();
-  } else if (dtype == kNumberTypeInt16) {
-    ret = RunInt16();
-  } else if (dtype == kNumberTypeComplex64) {
-    ret = RunComplex64();
-  } else {
-    MS_LOG(ERROR) << "MatMulFusion unsupported dtype: " << static_cast<int>(dtype);
-    return RET_ERROR;
-  }
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "MatMulFusion DSP run failed";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
-REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
-REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
-REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
-REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_MatMulFusion, DSPKernelCreator<MatMulFusionDSPKernel>)
-
-}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h
deleted file mode 100644
index 1a487f08..00000000
--- a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Copyright 2025 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
-
-#include <vector>
-#include <string>
-#include "src/litert/kernel/dsp/dsp_kernel.h"
-
-namespace mindspore::kernel {
-class MatMulFusionDSPKernel : public DSPKernel {
- public:
-  using DSPKernel::DSPKernel;
-  ~MatMulFusionDSPKernel() override = default;
-
-  int Prepare() override;
-  int CheckSpecs() override;
-  int Run() override;
-
- private:
-  int RunFp32();
-  int RunFp16();
-  int RunInt32();
-  int RunInt16();
-  int RunComplex64();
-
-  // helpers
-  int GetMNK(int *M, int *N, int *K) const;
-  int GetActTypeCode(int *code) const;
-
- private:
-  std::string kernel_name_;
-  uint64_t core_mask_{0xF};
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
deleted file mode 100644
index 42508223..00000000
--- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc
+++ /dev/null
@@ -1,467 +0,0 @@
-/**
- * Copyright 2025 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-#include <cstring>
-#include <cmath>
-#include <limits>
-#include "ut/src/runtime/kernel/dsp/dsp_test.h"
-#include "include/api/context.h"
-#include "include/api/data_type.h"
-#include "include/api/model.h"
-#include "schema/inner/model_generated.h"
-#include "src/litert/kernel_registry.h"
-#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h"
-
-namespace mindspore::lite::dsp::test {
-
-class TestDSP_MatMulFusion : public DSPCommonTest {};
-
-static void FillFloat(float *data, int size, float base = 0.1f) {
-  for (int i = 0; i < size; ++i) {
-    data[i] = base * static_cast<float>((i % 10));
-  }
-}
-
-typedef uint16_t float16_t_u;
-static inline float16_t_u Fp32ToFp16Bits(float v) {
-  uint32_t bits;
-  std::memcpy(&bits, &v, sizeof(bits));
-  uint32_t sign = (bits >> 31) & 0x1;
-  int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15;
-  uint32_t mantissa = bits & 0x007FFFFF;
-  uint16_t result;
-  if (exponent <= 0) {
-    if (exponent < -10) {
-      result = static_cast<uint16_t>(sign << 15);
-    } else {
-      mantissa |= 0x00800000;
-      int shift = 14 - exponent;
-      uint32_t mantissa_shifted = mantissa >> shift;
-      uint32_t remainder = mantissa & ((1U << shift) - 1);
-      if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) {
-        mantissa_shifted++;
-      }
-      result = static_cast<uint16_t>((sign << 15) | (mantissa_shifted & 0x3FF));
-    }
-  } else if (exponent == 0xFF - 127 + 15) {
-    result = static_cast<uint16_t>((sign << 15) | (mantissa == 0 ? 0x7C00 : 0x7E00));
-  } else if (exponent > 30) {
-    result = static_cast<uint16_t>((sign << 15) | 0x7C00);
-  } else {
-    uint32_t mantissa_rounded = mantissa >> 13;
-    uint32_t remainder = mantissa & 0x1FFF;
-    if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) {
-      mantissa_rounded++;
-      if (mantissa_rounded == 0x400) {
-        mantissa_rounded = 0;
-        exponent++;
-        if (exponent > 30) return static_cast<uint16_t>((sign << 15) | 0x7C00);
-      }
-    }
-    result = static_cast<uint16_t>((sign << 15) | (static_cast<uint32_t>(exponent) << 10) | (mantissa_rounded & 0x3FF));
-  }
-  return result;
-}
-
-// Large size tests (M=N=K=256) across dtypes
-TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) {
-  InitDSPRuntime();
-  const int M = 256, K = 256, N = 256;
-  std::vector<int> a_shape = {M, K};
-  std::vector<int> b_shape = {K, N};
-  std::vector<int> out_shape = {M, N};
-  std::vector<int> bias_shape = {M, N};
-  auto t_A = new lite::Tensor(kNumberTypeFloat32, a_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_);
-  t_B->MallocData(allocator_);
-  t_bias->MallocData(allocator_);
-  t_out->MallocData(allocator_);
-  FillFloat(reinterpret_cast<float *>(t_A->MutableData()), M * K, 0.02f);
-  FillFloat(reinterpret_cast<float *>(t_B->MutableData()), K * N, 0.03f);
-  FillFloat(reinterpret_cast<float *>(t_bias->MutableData()), M * N, 0.005f);
-  std::memset(t_out->MutableData(), 0, M * N * sizeof(float));
-  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
-  std::vector<lite::Tensor *> outputs_{t_out};
-  auto ctx = new lite::InnerContext;
-  ASSERT_EQ(lite::RET_OK, ctx->Init());
-  auto *param = new MatMulParameter();
-  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
-  param->act_type_ = ActType_Relu;
-  param->has_bias_ = true;
-  param->row_ = M;
-  param->col_ = N;
-  param->deep_ = K;
-  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion};
-  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
-  ASSERT_NE(creator, nullptr);
-  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
-  ASSERT_NE(kernel, nullptr);
-  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
-  ASSERT_EQ(kernel->Run(), lite::RET_OK);
-  auto A = reinterpret_cast<float *>(t_A->MutableData());
-  auto B = reinterpret_cast<float *>(t_B->MutableData());
-  auto bias = reinterpret_cast<float *>(t_bias->MutableData());
-  auto C = reinterpret_cast<float *>(t_out->MutableData());
-  std::vector<float> expect(M * N, 0.f);
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      float sum = 0.f;
-      for (int k = 0; k < K; ++k) {
-        sum += A[m * K + k] * B[k * N + n];
-      }
-      sum += bias[m * N + n];
-      expect[m * N + n] = sum > 0.f ? sum : 0.f;
-    }
-  }
-  ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3));
-  UninitDSPRuntime();
-  delete ctx;
-  delete kernel;
-  delete t_A;
-  delete t_B;
-  delete t_bias;
-  delete t_out;
-}
-
-TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) {
-  InitDSPRuntime();
-  const int M = 256, K = 256, N = 256;
-  std::vector<int> a_shape = {M, K};
-  std::vector<int> b_shape = {K, N};
-  std::vector<int> out_shape = {M, N};
-  std::vector<int> bias_shape = {M, N};
-  auto t_A = new lite::Tensor(kNumberTypeFloat16, a_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_B = new lite::Tensor(kNumberTypeFloat16, b_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_bias = new lite::Tensor(kNumberTypeFloat16, bias_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_out = new lite::Tensor(kNumberTypeFloat16, out_shape, NHWC, lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_);
-  t_B->MallocData(allocator_);
-  t_bias->MallocData(allocator_);
-  t_out->MallocData(allocator_);
-  auto A16 = reinterpret_cast<uint16_t *>(t_A->MutableData());
-  auto B16 = reinterpret_cast<uint16_t *>(t_B->MutableData());
-  auto bias16 = reinterpret_cast<uint16_t *>(t_bias->MutableData());
-  auto C16 = reinterpret_cast<uint16_t *>(t_out->MutableData());
-  for (int i = 0; i < M * K; ++i) {
-    A16[i] = Fp32ToFp16Bits(0.01f * static_cast<float>(i % 13));
-  }
-  for (int i = 0; i < K * N; ++i) {
-    B16[i] = Fp32ToFp16Bits(0.02f * static_cast<float>(i % 17));
-  }
-  for (int i = 0; i < M * N; ++i) {
-    bias16[i] = Fp32ToFp16Bits(0.003f * static_cast<float>(i % 11));
-  }
-  std::memset(C16, 0, M * N * sizeof(uint16_t));
-  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
-  std::vector<lite::Tensor *> outputs_{t_out};
-  auto ctx = new lite::InnerContext;
-  ASSERT_EQ(lite::RET_OK, ctx->Init());
-  auto *param = new MatMulParameter();
-  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
-  param->act_type_ = ActType_Relu;
-  param->has_bias_ = true;
-  param->row_ = M;
-  param->col_ = N;
-  param->deep_ = K;
-  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_MatMulFusion};
-  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
-  ASSERT_NE(creator, nullptr);
-  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
-  ASSERT_NE(kernel, nullptr);
-  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
-  ASSERT_EQ(kernel->Run(), lite::RET_OK);
-  auto Fp16ToFp32 = [&](uint16_t h) {
-    uint32_t sign = (h & 0x8000) << 16;
-    uint32_t exp = (h & 0x7C00) >> 10;
-    uint32_t frac = (h & 0x03FF);
-    uint32_t fexp, ffrac;
-    if (exp == 0) {
-      if (frac == 0) {
-        fexp = 0;
-        ffrac = 0;
-      } else {
-        int shift = 0;
-        while ((frac & 0x0200) == 0) {
-          frac <<= 1;
-          ++shift;
-        }
-        frac &= 0x03FF;
-        fexp = 127 - 15 - shift;
-        ffrac = frac << 13;
-      }
-    } else if (exp == 0x1F) {
-      fexp = 255;
-      ffrac = frac << 13;
-    } else {
-      fexp = exp - 15 + 127;
-      ffrac = frac << 13;
-    }
-    uint32_t bits = sign | (fexp << 23) | ffrac;
-    float out;
-    std::memcpy(&out, &bits, sizeof(out));
-    return out;
-  };
-  std::vector<float> expect_fp32(M * N, 0.f);
-  std::vector<float> actual_fp32(M * N, 0.f);
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      float sum = 0.f;
-      for (int k = 0; k < K; ++k) {
-        float a = Fp16ToFp32(A16[m * K + k]);
-        float b = Fp16ToFp32(B16[k * N + n]);
-        sum += a * b;
-      }
-      sum += Fp16ToFp32(bias16[m * N + n]);
-      expect_fp32[m * N + n] = sum > 0.f ? sum : 0.f;
-      actual_fp32[m * N + n] = Fp16ToFp32(C16[m * N + n]);
-    }
-  }
-  ASSERT_EQ(0, CompareOutputData(actual_fp32.data(), expect_fp32.data(), M * N, 5e-2));
-  UninitDSPRuntime();
-  delete ctx;
-  delete kernel;
-  delete t_A;
-  delete t_B;
-  delete t_bias;
-  delete t_out;
-}
-
-TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) {
-  InitDSPRuntime();
-  const int M = 256, K = 256, N = 256;
-  std::vector<int> a_shape = {M, K};
-  std::vector<int> b_shape = {K, N};
-  std::vector<int> out_shape = {M, N};
-  std::vector<int> bias_shape = {M, N};
-  auto t_A = new lite::Tensor(kNumberTypeInt32, a_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_B = new lite::Tensor(kNumberTypeInt32, b_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_bias = new lite::Tensor(kNumberTypeInt32, bias_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_out = new lite::Tensor(kNumberTypeInt32, out_shape, NHWC, lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_);
-  t_B->MallocData(allocator_);
-  t_bias->MallocData(allocator_);
-  t_out->MallocData(allocator_);
-  auto A = reinterpret_cast<int32_t *>(t_A->MutableData());
-  auto B = reinterpret_cast<int32_t *>(t_B->MutableData());
-  auto bias = reinterpret_cast<int32_t *>(t_bias->MutableData());
-  auto C = reinterpret_cast<int32_t *>(t_out->MutableData());
-  for (int i = 0; i < M * K; ++i) {
-    A[i] = (i % 11) - 5;
-  }
-  for (int i = 0; i < K * N; ++i) {
-    B[i] = (i % 13) - 6;
-  }
-  for (int i = 0; i < M * N; ++i) {
-    bias[i] = (i % 9) - 4;
-  }
-  std::memset(C, 0, M * N * sizeof(int32_t));
-  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
-  std::vector<lite::Tensor *> outputs_{t_out};
-  auto ctx = new lite::InnerContext;
-  ASSERT_EQ(lite::RET_OK, ctx->Init());
-  auto *param = new MatMulParameter();
-  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
-  param->act_type_ = ActType_Relu;
-  param->has_bias_ = true;
-  param->row_ = M;
-  param->col_ = N;
-  param->deep_ = K;
-  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_MatMulFusion};
-  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
-  ASSERT_NE(creator, nullptr);
-  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
-  ASSERT_NE(kernel, nullptr);
-  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
-  ASSERT_EQ(kernel->Run(), lite::RET_OK);
-  std::vector<int32_t> expect(M * N, 0);
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      long long sum = 0;
-      for (int k = 0; k < K; ++k) {
-        sum += static_cast<long long>(A[m * K + k]) * B[k * N + n];
-      }
-      sum += static_cast<long long>(bias[m * N + n]);
-      expect[m * N + n] = static_cast<int32_t>(sum > 0 ? sum : 0);
-    }
-  }
-  ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f));
-  UninitDSPRuntime();
-  delete ctx;
-  delete kernel;
-  delete t_A;
-  delete t_B;
-  delete t_bias;
-  delete t_out;
-}
-
-TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) {
-  InitDSPRuntime();
-  const int M = 256, K = 256, N = 256;
-  std::vector<int> a_shape = {M, K};
-  std::vector<int> b_shape = {K, N};
-  std::vector<int> out_shape = {M, N};
-  std::vector<int> bias_shape = {M, N};
-  auto t_A = new lite::Tensor(kNumberTypeInt16, a_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_B = new lite::Tensor(kNumberTypeInt16, b_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_bias = new lite::Tensor(kNumberTypeInt16, bias_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_out = new lite::Tensor(kNumberTypeInt16, out_shape, NHWC, lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_);
-  t_B->MallocData(allocator_);
-  t_bias->MallocData(allocator_);
-  t_out->MallocData(allocator_);
-  auto A = reinterpret_cast<int16_t *>(t_A->MutableData());
-  auto B = reinterpret_cast<int16_t *>(t_B->MutableData());
-  auto bias = reinterpret_cast<int16_t *>(t_bias->MutableData());
-  auto C = reinterpret_cast<int16_t *>(t_out->MutableData());
-  for (int i = 0; i < M * K; ++i) {
-    A[i] = static_cast<int16_t>((i % 21) - 10);
-  }
-  for (int i = 0; i < K * N; ++i) {
-    B[i] = static_cast<int16_t>((i % 19) - 9);
-  }
-  for (int i = 0; i < M * N; ++i) {
-    bias[i] = static_cast<int16_t>(i % 15);
-  }
-  std::memset(C, 0, M * N * sizeof(int16_t));
-  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
-  std::vector<lite::Tensor *> outputs_{t_out};
-  auto ctx = new lite::InnerContext;
-  ASSERT_EQ(lite::RET_OK, ctx->Init());
-  auto *param = new MatMulParameter();
-  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
-  param->act_type_ = ActType_Relu;
-  param->has_bias_ = true;
-  param->row_ = M;
-  param->col_ = N;
-  param->deep_ = K;
-  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_MatMulFusion};
-  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
-  ASSERT_NE(creator, nullptr);
-  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
-  ASSERT_NE(kernel, nullptr);
-  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
-  ASSERT_EQ(kernel->Run(), lite::RET_OK);
-  std::vector<int16_t> expect(M * N, 0);
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      long long sum = 0;
-      for (int k = 0; k < K; ++k) {
-        sum += static_cast<long long>(A[m * K + k]) * B[k * N + n];
-      }
-      sum += static_cast<long long>(bias[m * N + n]);
-      sum = sum > 0 ? sum : 0;
-      if (sum > std::numeric_limits<int16_t>::max()) sum = std::numeric_limits<int16_t>::max();
-      expect[m * N + n] = static_cast<int16_t>(sum);
-    }
-  }
-  ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f));
-  UninitDSPRuntime();
-  delete ctx;
-  delete kernel;
-  delete t_A;
-  delete t_B;
-  delete t_bias;
-  delete t_out;
-}
-
-TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) {
-  InitDSPRuntime();
-  const int M = 256, K = 256, N = 256;
-  std::vector<int> a_shape = {M, K};
-  std::vector<int> b_shape = {K, N};
-  std::vector<int> out_shape = {M, N};
-  std::vector<int> bias_shape = {M, N};
-  auto t_A = new lite::Tensor(kNumberTypeComplex64, a_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_B = new lite::Tensor(kNumberTypeComplex64, b_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_bias = new lite::Tensor(kNumberTypeComplex64, bias_shape, NHWC, lite::Category::CONST_TENSOR);
-  auto t_out = new lite::Tensor(kNumberTypeComplex64, out_shape, NHWC, lite::Category::CONST_TENSOR);
-  t_A->MallocData(allocator_);
-  t_B->MallocData(allocator_);
-  t_bias->MallocData(allocator_);
-  t_out->MallocData(allocator_);
-  auto A = reinterpret_cast<float *>(t_A->MutableData());
-  auto B = reinterpret_cast<float *>(t_B->MutableData());
-  auto bias = reinterpret_cast<float *>(t_bias->MutableData());
-  auto C = reinterpret_cast<float *>(t_out->MutableData());  // complex64 stored as interleaved real,imag
-  for (int i = 0; i < M * K; ++i) {
-    A[2 * i] = 0.01f * (i % 17);
-    A[2 * i + 1] = 0.02f * (i % 19);
-  }
-  for (int i = 0; i < K * N; ++i) {
-    B[2 * i] = 0.03f * (i % 23);
-    B[2 * i + 1] = 0.01f * (i % 29);
-  }
-  for (int i = 0; i < M * N; ++i) {
-    bias[2 * i] = 0.002f * (i % 31);
-    bias[2 * i + 1] = 0.001f * (i % 37);
-  }
-  std::memset(C, 0, M * N * 2 * sizeof(float));
-  std::vector<lite::Tensor *> inputs_{t_A, t_B, t_bias};
-  std::vector<lite::Tensor *> outputs_{t_out};
-  auto ctx = new lite::InnerContext;
-  ASSERT_EQ(lite::RET_OK, ctx->Init());
-  auto *param = new MatMulParameter();
-  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_MatMulFusion);
-  param->act_type_ = ActType_Relu;
-  param->has_bias_ = true;
-  param->row_ = M;
-  param->col_ = N;
-  param->deep_ = K;
-  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC, schema::PrimitiveType_MatMulFusion};
-  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
-  ASSERT_NE(creator, nullptr);
-  auto kernel = creator(inputs_, outputs_, reinterpret_cast<OpParameter *>(param), ctx, key);
-  ASSERT_NE(kernel, nullptr);
-  ASSERT_EQ(kernel->Prepare(), lite::RET_OK);
-  ASSERT_EQ(kernel->Run(), lite::RET_OK);
-  std::vector<float> expect(2 * M * N, 0.f);
-  std::vector<float> actual(2 * M * N, 0.f);
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      float real = 0.f;
-      float imag = 0.f;
-      for (int k = 0; k < K; ++k) {
-        float ar = A[2 * (m * K + k)];
-        float ai = A[2 * (m * K + k) + 1];
-        float br = B[2 * (k * N + n)];
-        float bi = B[2 * (k * N + n) + 1];
-        real += ar * br - ai * bi;
-        imag += ar * bi + ai * br;
-      }
-      real += bias[2 * (m * N + n)];
-      imag += bias[2 * (m * N + n) + 1];
-      if (real < 0.f) real = 0.f;
-      expect[2 * (m * N + n)] = real;
-      expect[2 * (m * N + n) + 1] = imag;
-      actual[2 * (m * N + n)] = C[2 * (m * N + n)];
-      actual[2 * (m * N + n) + 1] = C[2 * (m * N + n) + 1];
-    }
-  }
-  ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2));
-  UninitDSPRuntime();
-  delete ctx;
-  delete kernel;
-  delete t_A;
-  delete t_B;
-  delete t_bias;
-  delete t_out;
-}
-
-}  // namespace mindspore::lite::dsp::test
-- 
Gitee