From 69dea2162c7e88c4adaf5f2b843b56aed11cb29e Mon Sep 17 00:00:00 2001 From: mzy <929449726@qq.com> Date: Thu, 6 Nov 2025 13:27:45 +0000 Subject: [PATCH 1/7] add ft04 raggedrange --- .../litert/kernel/dsp/ft04/ragged_range.cc | 132 ++++++ .../src/litert/kernel/dsp/ft04/ragged_range.h | 48 +++ .../runtime/kernel/dsp/ragged_range_tests.cc | 391 ++++++++++++++++++ 3 files changed, 571 insertions(+) create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h create mode 100644 mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc new file mode 100644 index 00000000..8dbd1c29 --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc @@ -0,0 +1,132 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/litert/kernel/dsp/ft04/ragged_range.h" +#include +#include +#include +#include "src/litert/kernel_registry.h" +#include "schema/inner/model_generated.h" + +using mindspore::kernel::KERNEL_ARCH::kDSP; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_RaggedRange; + +namespace mindspore::kernel { + +int RaggedRangeDSPKernel::CheckSpecs() { + // inputs: starts, limits, deltas; outputs: splits, values + if (in_tensors_.size() != 3 || out_tensors_.size() != 2) { + MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size() << ", out: " + << out_tensors_.size(); + return RET_ERROR; + } + return RET_OK; +} + +int RaggedRangeDSPKernel::Prepare() { return RET_OK; } + +int RaggedRangeDSPKernel::CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar) { + if (rows == nullptr || starts_scalar == nullptr || limits_scalar == nullptr || deltas_scalar == nullptr) { + return RET_ERROR; + } + const auto &s0 = in_tensors_[0]->shape(); + const auto &s1 = in_tensors_[1]->shape(); + const auto &s2 = in_tensors_[2]->shape(); + *starts_scalar = s0.empty(); + *limits_scalar = s1.empty(); + *deltas_scalar = s2.empty(); + int non_scalar_rows = -1; + if (!*starts_scalar) non_scalar_rows = s0[0]; + if (!*limits_scalar) { + if (non_scalar_rows == -1) non_scalar_rows = s1[0]; + if (non_scalar_rows != s1[0]) return RET_ERROR; + } + if (!*deltas_scalar) { + if (non_scalar_rows == -1) non_scalar_rows = s2[0]; + if (non_scalar_rows != s2[0]) return RET_ERROR; + } + *rows = (non_scalar_rows == -1) ? 1 : non_scalar_rows; + return RET_OK; +} + +int RaggedRangeDSPKernel::RunFp32() { + kernel_name_ = "fp_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::RunFp16() { + kernel_name_ = "hp_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::RunInt32() { + kernel_name_ = "i32_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::RunInt16() { + kernel_name_ = "i16_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::Run() { + int rows = 0; + bool starts_scalar = false, limits_scalar = false, deltas_scalar = false; + int ret = CalcRows(&rows, &starts_scalar, &limits_scalar, &deltas_scalar); + if (ret != RET_OK) { + MS_LOG(ERROR) << "RaggedRange rows check failed."; + return RET_ERROR; + } + + auto allocator = dsp_runtime_->GetAllocator(); + // device pointers for inputs/outputs + uint64_t starts_dev = allocator->GetDeviceMemPtr(in_tensors_[0]->data()); + uint64_t limits_dev = allocator->GetDeviceMemPtr(in_tensors_[1]->data()); + uint64_t deltas_dev = allocator->GetDeviceMemPtr(in_tensors_[2]->data()); + + // outputs: [0] splits (int32), [1] values (same type as inputs) + uint64_t splits_dev = allocator->GetDeviceMemPtr(out_tensors_[0]->data()); + uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data()); + + // Note: s-variant core mask passed as separate arg by runtime; do not include in args. + // Arg order: starts, limits, deltas, range_count, values, splits + SetKernelArg({starts_dev, limits_dev, deltas_dev, static_cast(rows), values_dev, splits_dev}); + + auto out_dt = out_tensors_[1]->data_type(); + switch (out_dt) { + case kNumberTypeFloat32: + return RunFp32(); + case kNumberTypeFloat16: + return RunFp16(); + case kNumberTypeInt32: + return RunInt32(); + case kNumberTypeInt16: + return RunInt16(); + default: + MS_LOG(ERROR) << "RaggedRange unsupported output dtype: " << static_cast(out_dt); + return RET_ERROR; + } +} + +REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_RaggedRange, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_RaggedRange, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_RaggedRange, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_RaggedRange, DSPKernelCreator) + +} // namespace mindspore::kernel diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h new file mode 100644 index 00000000..3c177940 --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h @@ -0,0 +1,48 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_ + +#include +#include +#include "src/litert/kernel/dsp/dsp_kernel.h" + +namespace mindspore::kernel { +class RaggedRangeDSPKernel : public DSPKernel { + public: + using DSPKernel::DSPKernel; + ~RaggedRangeDSPKernel() override = default; + + int Prepare() override; + int CheckSpecs() override; + int Run() override; + + private: + int RunFp32(); + int RunFp16(); + int RunInt32(); + int RunInt16(); + + int CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar); + + private: + std::string kernel_name_; + uint64_t core_mask_{0xF}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_ diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc new file mode 100644 index 00000000..220e45ba --- /dev/null +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc @@ -0,0 +1,391 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include "ut/src/runtime/kernel/dsp/dsp_test.h" +#include "include/api/context.h" +#include "include/api/data_type.h" +#include "include/api/model.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel/dsp/dsp_subgraph.h" +#include "src/litert/kernel_registry.h" + +namespace mindspore::lite::dsp::test { + +class TestDSP_RaggedRange : public DSPCommonTest {}; + +// fp16 helpers (consistent with other tests) +typedef short float16; +static inline float fp16_to_fp32(float16 h) { + uint32_t sign = (h & 0x8000) << 16; + uint32_t exp = (h & 0x7C00) >> 10; + uint32_t frac = (h & 0x03FF); + uint32_t f_exp, f_frac; + if (exp == 0) { + if (frac == 0) { + f_exp = 0; f_frac = 0; + } else { + int shift = 0; + while ((frac & 0x0200) == 0) { frac <<= 1; ++shift; } + frac &= 0x03FF; + f_exp = 127 - 15 - shift; + f_frac = frac << 13; + } + } else if (exp == 0x1F) { + f_exp = 255; f_frac = frac << 13; + } else { + f_exp = exp - 15 + 127; f_frac = frac << 13; + } + uint32_t f_bits = sign | (f_exp << 23) | f_frac; + float result; std::memcpy(&result, &f_bits, sizeof(result)); + return result; +} +[[maybe_unused]] static inline float16 fp32_to_fp16(float v) { + uint32_t bits; std::memcpy(&bits, &v, sizeof(bits)); + uint32_t sign = (bits >> 31) & 0x1; + int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15; + uint32_t mantissa = bits & 0x007FFFFF; + float16 result; + if (exponent <= 0) { + if (exponent < -10) { + result = static_cast(sign << 15); + } else { + mantissa |= 0x00800000; int shift = 14 - exponent; uint32_t mantissa_shifted = mantissa >> shift; + uint32_t remainder = mantissa & ((1U << shift) - 1); + if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) { + mantissa_shifted++; } + result = static_cast((sign << 15) | (mantissa_shifted & 0x3FF)); + } + } else if (exponent == 0xFF - 127 + 15) { + result = (mantissa == 0) ? static_cast((sign << 15) | 0x7C00) : static_cast((sign << 15) | 0x7E00); + } else if (exponent > 30) { + result = static_cast((sign << 15) | 0x7C00); + } else { + uint32_t mantissa_rounded = mantissa >> 13; uint32_t remainder = mantissa & 0x1FFF; + if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) { + mantissa_rounded++; if (mantissa_rounded == 0x400) { mantissa_rounded = 0; exponent++; if (exponent > 30) { + return static_cast((sign << 15) | 0x7C00); } } } + result = static_cast((sign << 15) | (static_cast(exponent) << 10) | (mantissa_rounded & 0x3FF)); + } + return result; +} + +TEST_F(TestDSP_RaggedRange, RaggedRange_Fp32) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + // Larger dataset: rows=5 + // starts=[0,10,-5,100,7], limits=[50,60,5,110,27], deltas=[1,2,3,1,4] + std::vector vec5 = {5}; + auto t_starts = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + auto starts_data = reinterpret_cast(t_starts->MutableData()); + auto limits_data = reinterpret_cast(t_limits->MutableData()); + auto deltas_data = reinterpret_cast(t_deltas->MutableData()); + float starts_host[5] = {0.f, 10.f, -5.f, 100.f, 7.f}; + float limits_host[5] = {50.f, 60.f, 5.f, 110.f, 27.f}; + float deltas_host[5] = {1.f, 2.f, 3.f, 1.f, 4.f}; + std::memcpy(starts_data, starts_host, sizeof(starts_host)); + std::memcpy(limits_data, limits_host, sizeof(limits_host)); + std::memcpy(deltas_data, deltas_host, sizeof(deltas_host)); + + // outputs (splits size rows+1, values computed below) + auto t_splits = new lite::Tensor(kNumberTypeInt32, {6}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + // rough upper bound for values, we'll only compare first computed_len elements + auto t_values = new lite::Tensor(kNumberTypeFloat32, {200}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 6, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 200, 0.0f); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + // build expected + std::vector expect_splits(6, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 5; ++r) { + expect_splits[r] = acc; + for (float v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(v); + } + acc = static_cast(expect_values.size()); + } + expect_splits[5] = acc; + + // compare splits + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), + 6)); + // compare first acc values + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), + acc)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} + +TEST_F(TestDSP_RaggedRange, RaggedRange_Int32) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + // Larger dataset: rows=4 + std::vector vec4 = {4}; + auto t_starts = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + auto starts_data = reinterpret_cast(t_starts->MutableData()); + auto limits_data = reinterpret_cast(t_limits->MutableData()); + auto deltas_data = reinterpret_cast(t_deltas->MutableData()); + int32_t starts_host[4] = {0, -100, 5, 1000}; + int32_t limits_host[4] = {200, -50, 50, 1010}; + int32_t deltas_host[4] = {2, 5, 3, 1}; + std::memcpy(starts_data, starts_host, sizeof(starts_host)); + std::memcpy(limits_data, limits_host, sizeof(limits_host)); + std::memcpy(deltas_data, deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeInt32, {300}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 5, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 300, 0); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(5, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 4; ++r) { + expect_splits[r] = acc; + for (int32_t v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(v); + } + acc = static_cast(expect_values.size()); + } + expect_splits[4] = acc; + + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 5)); + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), acc)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} + +TEST_F(TestDSP_RaggedRange, RaggedRange_Fp16) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + // Larger dataset with fp32 inputs and fp16 outputs + std::vector vec3 = {3}; + auto t_starts = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + auto starts_f = reinterpret_cast(t_starts->MutableData()); + auto limits_f = reinterpret_cast(t_limits->MutableData()); + auto deltas_f = reinterpret_cast(t_deltas->MutableData()); + float starts_host[3] = {-10.f, 0.f, 1.5f}; + float limits_host[3] = {0.f, 50.f, 6.f}; + float deltas_host[3] = {0.5f, 1.f, 1.25f}; + std::memcpy(starts_f, starts_host, sizeof(starts_host)); + std::memcpy(limits_f, limits_host, sizeof(limits_host)); + std::memcpy(deltas_f, deltas_host, sizeof(deltas_host)); + + // outputs + auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeFloat16, {200}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 4, 0); + std::memset(t_values->MutableData(), 0, 200 * sizeof(uint16_t)); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + // expected + std::vector expect_splits(4, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 3; ++r) { + expect_splits[r] = acc; + for (float v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(v); + } + acc = static_cast(expect_values.size()); + } + expect_splits[3] = acc; + + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 4)); + + auto out_fp16 = reinterpret_cast(outputs_[1]->MutableData()); + std::vector actual(acc); + for (int i = 0; i < acc; ++i) actual[i] = fp16_to_fp32(static_cast(out_fp16[i])); + std::vector correct(acc); + for (int i = 0; i < acc; ++i) correct[i] = fp16_to_fp32(fp32_to_fp16(expect_values[i])); + ASSERT_EQ(0, CompareOutputData(actual.data(), correct.data(), acc, 1e-3)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} + +TEST_F(TestDSP_RaggedRange, RaggedRange_Int16) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + // Larger dataset with int32 inputs and int16 outputs + std::vector vec3 = {3}; + auto t_starts = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + auto starts_d32 = reinterpret_cast(t_starts->MutableData()); + auto limits_d32 = reinterpret_cast(t_limits->MutableData()); + auto deltas_d32 = reinterpret_cast(t_deltas->MutableData()); + int32_t starts_host[3] = {-10, 0, 100}; + int32_t limits_host[3] = {10, 100, 110}; + int32_t deltas_host[3] = {2, 3, 1}; + std::memcpy(starts_d32, starts_host, sizeof(starts_host)); + std::memcpy(limits_d32, limits_host, sizeof(limits_host)); + std::memcpy(deltas_d32, deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeInt16, {300}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 4, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 300, 0); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(4, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 3; ++r) { + expect_splits[r] = acc; + for (int32_t v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(static_cast(v)); + } + acc = static_cast(expect_values.size()); + } + expect_splits[3] = acc; + + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 4)); + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), acc)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} + +} // namespace mindspore::lite::dsp::test -- Gitee From ec9da3165278e9181accfa3b59a0ddf7f3808051 Mon Sep 17 00:00:00 2001 From: mzy <929449726@qq.com> Date: Fri, 7 Nov 2025 15:01:21 +0000 Subject: [PATCH 2/7] code format --- .../litert/kernel/dsp/ft04/ragged_range.cc | 4 +- .../runtime/kernel/dsp/ragged_range_tests.cc | 52 +++++++++++++------ 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc index 8dbd1c29..1acc8965 100644 --- a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc +++ b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc @@ -32,8 +32,8 @@ namespace mindspore::kernel { int RaggedRangeDSPKernel::CheckSpecs() { // inputs: starts, limits, deltas; outputs: splits, values if (in_tensors_.size() != 3 || out_tensors_.size() != 2) { - MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size() << ", out: " - << out_tensors_.size(); + MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size() + << ", out: " << out_tensors_.size(); return RET_ERROR; } return RET_OK; diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc index 220e45ba..a3047238 100644 --- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc @@ -33,7 +33,7 @@ namespace mindspore::lite::dsp::test { class TestDSP_RaggedRange : public DSPCommonTest {}; // fp16 helpers (consistent with other tests) -typedef short float16; +typedef uint16_t float16; static inline float fp16_to_fp32(float16 h) { uint32_t sign = (h & 0x8000) << 16; uint32_t exp = (h & 0x7C00) >> 10; @@ -41,25 +41,33 @@ static inline float fp16_to_fp32(float16 h) { uint32_t f_exp, f_frac; if (exp == 0) { if (frac == 0) { - f_exp = 0; f_frac = 0; + f_exp = 0; + f_frac = 0; } else { int shift = 0; - while ((frac & 0x0200) == 0) { frac <<= 1; ++shift; } + while ((frac & 0x0200) == 0) { + frac <<= 1; + ++shift; + } frac &= 0x03FF; f_exp = 127 - 15 - shift; f_frac = frac << 13; } } else if (exp == 0x1F) { - f_exp = 255; f_frac = frac << 13; + f_exp = 255; + f_frac = frac << 13; } else { - f_exp = exp - 15 + 127; f_frac = frac << 13; + f_exp = exp - 15 + 127; + f_frac = frac << 13; } uint32_t f_bits = sign | (f_exp << 23) | f_frac; - float result; std::memcpy(&result, &f_bits, sizeof(result)); + float result; + std::memcpy(&result, &f_bits, sizeof(result)); return result; } [[maybe_unused]] static inline float16 fp32_to_fp16(float v) { - uint32_t bits; std::memcpy(&bits, &v, sizeof(bits)); + uint32_t bits; + std::memcpy(&bits, &v, sizeof(bits)); uint32_t sign = (bits >> 31) & 0x1; int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15; uint32_t mantissa = bits & 0x007FFFFF; @@ -68,21 +76,33 @@ static inline float fp16_to_fp32(float16 h) { if (exponent < -10) { result = static_cast(sign << 15); } else { - mantissa |= 0x00800000; int shift = 14 - exponent; uint32_t mantissa_shifted = mantissa >> shift; + mantissa |= 0x00800000; + int shift = 14 - exponent; + uint32_t mantissa_shifted = mantissa >> shift; uint32_t remainder = mantissa & ((1U << shift) - 1); if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) { - mantissa_shifted++; } + mantissa_shifted++; + } result = static_cast((sign << 15) | (mantissa_shifted & 0x3FF)); } } else if (exponent == 0xFF - 127 + 15) { - result = (mantissa == 0) ? static_cast((sign << 15) | 0x7C00) : static_cast((sign << 15) | 0x7E00); + result = + (mantissa == 0) ? static_cast((sign << 15) | 0x7C00) : static_cast((sign << 15) | 0x7E00); } else if (exponent > 30) { result = static_cast((sign << 15) | 0x7C00); } else { - uint32_t mantissa_rounded = mantissa >> 13; uint32_t remainder = mantissa & 0x1FFF; + uint32_t mantissa_rounded = mantissa >> 13; + uint32_t remainder = mantissa & 0x1FFF; if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) { - mantissa_rounded++; if (mantissa_rounded == 0x400) { mantissa_rounded = 0; exponent++; if (exponent > 30) { - return static_cast((sign << 15) | 0x7C00); } } } + mantissa_rounded++; + if (mantissa_rounded == 0x400) { + mantissa_rounded = 0; + exponent++; + if (exponent > 30) { + return static_cast((sign << 15) | 0x7C00); + } + } + } result = static_cast((sign << 15) | (static_cast(exponent) << 10) | (mantissa_rounded & 0x3FF)); } return result; @@ -154,11 +174,9 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Fp32) { expect_splits[5] = acc; // compare splits - ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), - 6)); + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 6)); // compare first acc values - ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), - acc)); + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), acc)); UninitDSPRuntime(); delete ctx; -- Gitee From 3c30419d70f3628d598348e46de864a3e6973619 Mon Sep 17 00:00:00 2001 From: mzy <929449726@qq.com> Date: Fri, 7 Nov 2025 16:19:57 +0000 Subject: [PATCH 3/7] add matmulfusion --- .../litert/kernel/dsp/ft04/matmulfusion.cc | 176 ++++++++++++ .../src/litert/kernel/dsp/ft04/matmulfusion.h | 51 ++++ .../runtime/kernel/dsp/matmulfusion_tests.cc | 260 ++++++++++++++++++ 3 files changed, 487 insertions(+) create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h create mode 100644 mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc new file mode 100644 index 00000000..4982c9d1 --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc @@ -0,0 +1,176 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/litert/kernel/dsp/ft04/matmulfusion.h" +#include +#include +#include "src/litert/kernel_registry.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h" + +using mindspore::kernel::KERNEL_ARCH::kDSP; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_MatMulFusion; + +namespace mindspore::kernel { + +int MatMulFusionDSPKernel::Prepare() { return RET_OK; } + +int MatMulFusionDSPKernel::CheckSpecs() { + // inputs: A, B, (optional) bias; output: C + if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 && in_tensors_.size() != INPUT_TENSOR_SIZE_3) { + MS_LOG(WARNING) << "MatMulFusion expects 2 or 3 inputs, got " << in_tensors_.size(); + return RET_ERROR; + } + if (out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) { + MS_LOG(WARNING) << "MatMulFusion expects 1 output, got " << out_tensors_.size(); + return RET_ERROR; + } + int M = 0, N = 0, K = 0; + if (GetMNK(&M, &N, &K) != RET_OK) { + MS_LOG(WARNING) << "MatMulFusion shape inference failed."; + return RET_ERROR; + } + // Bias check if present + if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) { + auto bias_shape = in_tensors_[2]->shape(); + if (bias_shape.size() != 2 || bias_shape[0] != M || bias_shape[1] != N) { + MS_LOG(WARNING) << "Bias shape mismatch MxN: got " << bias_shape; + return RET_ERROR; + } + } + // Output shape check + auto out_shape = out_tensors_[0]->shape(); + if (out_shape.size() != 2 || out_shape[0] != M || out_shape[1] != N) { + MS_LOG(WARNING) << "Output shape mismatch expected (" << M << "," << N << ")"; + return RET_ERROR; + } + return RET_OK; +} + +int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const { + if (M == nullptr || N == nullptr || K == nullptr) return RET_ERROR; + const auto &a_shape = in_tensors_[0]->shape(); + const auto &b_shape = in_tensors_[1]->shape(); + if (a_shape.size() != 2 || b_shape.size() != 2) { + MS_LOG(WARNING) << "A/B must be rank-2"; + return RET_ERROR; + } + int aM = a_shape[0]; + int aK = a_shape[1]; + int bK = b_shape[0]; + int bN = b_shape[1]; + if (aK != bK) { + MS_LOG(WARNING) << "Inner dimension mismatch: " << aK << " vs " << bK; + return RET_ERROR; + } + *M = aM; *K = aK; *N = bN; + return RET_OK; +} + +int MatMulFusionDSPKernel::GetActTypeCode(int *code) const { + if (code == nullptr) return RET_ERROR; + // Map ActType (nnacl) -> DSP activation code used in DSP functions (NONE=0, RELU=1, RELU6=2) + int act = 0; // default NONE + auto *param = reinterpret_cast(op_parameter_); + if (param != nullptr) { + switch (param->act_type_) { + case ActType_Relu: + act = 1; break; + case ActType_Relu6: + act = 2; break; // DSP uses 2 for RELU6, nnacl uses enum value 3 + default: + act = 0; break; + } + } + *code = act; + return RET_OK; +} + +int MatMulFusionDSPKernel::RunFp32() { + kernel_name_ = "fp_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} +int MatMulFusionDSPKernel::RunFp16() { + kernel_name_ = "hp_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} +int MatMulFusionDSPKernel::RunInt32() { + kernel_name_ = "i32_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} +int MatMulFusionDSPKernel::RunInt16() { + kernel_name_ = "i16_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} +int MatMulFusionDSPKernel::RunComplex64() { + kernel_name_ = "c64_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::Run() { + int M = 0, N = 0, K = 0; + if (GetMNK(&M, &N, &K) != RET_OK) { + MS_LOG(ERROR) << "MatMulFusion GetMNK failed"; + return RET_ERROR; + } + int act_code = 0; + (void)GetActTypeCode(&act_code); // default 0 if not set + + auto allocator = dsp_runtime_->GetAllocator(); + uint64_t a_ptr = allocator->GetDeviceMemPtr(in_tensors_[0]->data()); + uint64_t b_ptr = allocator->GetDeviceMemPtr(in_tensors_[1]->data()); + uint64_t out_ptr = allocator->GetDeviceMemPtr(out_tensors_[0]->data()); + uint64_t bias_ptr = 0; + if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) { + bias_ptr = allocator->GetDeviceMemPtr(in_tensors_[2]->data()); + } + // Arg order must match DSP symbol prototype: A,B,C,bias,M,N,K,act_type + SetKernelArg({a_ptr, b_ptr, out_ptr, bias_ptr, static_cast(M), static_cast(N), + static_cast(K), static_cast(act_code)}); + + int ret = RET_ERROR; + auto dtype = in_tensors_[0]->data_type(); + if (dtype == kNumberTypeFloat32) { + ret = RunFp32(); + } else if (dtype == kNumberTypeFloat16) { + ret = RunFp16(); + } else if (dtype == kNumberTypeInt32) { + ret = RunInt32(); + } else if (dtype == kNumberTypeInt16) { + ret = RunInt16(); + } else if (dtype == kNumberTypeComplex64) { + ret = RunComplex64(); + } else { + MS_LOG(ERROR) << "MatMulFusion unsupported dtype: " << static_cast(dtype); + return RET_ERROR; + } + if (ret != RET_OK) { + MS_LOG(ERROR) << "MatMulFusion DSP run failed"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_MatMulFusion, DSPKernelCreator) + +} // namespace mindspore::kernel diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h new file mode 100644 index 00000000..1a487f08 --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h @@ -0,0 +1,51 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ + +#include +#include +#include "src/litert/kernel/dsp/dsp_kernel.h" + +namespace mindspore::kernel { +class MatMulFusionDSPKernel : public DSPKernel { + public: + using DSPKernel::DSPKernel; + ~MatMulFusionDSPKernel() override = default; + + int Prepare() override; + int CheckSpecs() override; + int Run() override; + + private: + int RunFp32(); + int RunFp16(); + int RunInt32(); + int RunInt16(); + int RunComplex64(); + + // helpers + int GetMNK(int *M, int *N, int *K) const; + int GetActTypeCode(int *code) const; + + private: + std::string kernel_name_; + uint64_t core_mask_{0xF}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc new file mode 100644 index 00000000..07eff1bd --- /dev/null +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc @@ -0,0 +1,260 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "ut/src/runtime/kernel/dsp/dsp_test.h" +#include "include/api/context.h" +#include "include/api/data_type.h" +#include "include/api/model.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel_registry.h" +#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h" + +namespace mindspore::lite::dsp::test { + +class TestDSP_MatMulFusion : public DSPCommonTest {}; + +static void FillFloat(float *data, int size, float base = 0.1f) { + for (int i = 0; i < size; ++i) { data[i] = base * static_cast((i % 10)); } +} + +typedef uint16_t float16_t_u; +static inline float16_t_u Fp32ToFp16Bits(float v) { + uint32_t bits; + std::memcpy(&bits, &v, sizeof(bits)); + uint32_t sign = (bits >> 31) & 0x1; + int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15; + uint32_t mantissa = bits & 0x007FFFFF; + uint16_t result; + if (exponent <= 0) { + if (exponent < -10) { + result = static_cast(sign << 15); + } else { + mantissa |= 0x00800000; + int shift = 14 - exponent; + uint32_t mantissa_shifted = mantissa >> shift; + uint32_t remainder = mantissa & ((1U << shift) - 1); + if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) { + mantissa_shifted++; + } + result = static_cast((sign << 15) | (mantissa_shifted & 0x3FF)); + } + } else if (exponent == 0xFF - 127 + 15) { + result = static_cast((sign << 15) | (mantissa == 0 ? 0x7C00 : 0x7E00)); + } else if (exponent > 30) { + result = static_cast((sign << 15) | 0x7C00); + } else { + uint32_t mantissa_rounded = mantissa >> 13; + uint32_t remainder = mantissa & 0x1FFF; + if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) { + mantissa_rounded++; + if (mantissa_rounded == 0x400) { mantissa_rounded = 0; exponent++; if (exponent > 30) return static_cast((sign << 15) | 0x7C00); } + } + result = static_cast((sign << 15) | (static_cast(exponent) << 10) | (mantissa_rounded & 0x3FF)); + } + return result; +} + +// Large size tests (M=N=K=256) across dtypes +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeFloat32, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); + FillFloat(reinterpret_cast(t_A->MutableData()), M*K, 0.02f); + FillFloat(reinterpret_cast(t_B->MutableData()), K*N, 0.03f); + FillFloat(reinterpret_cast(t_bias->MutableData()), M*N, 0.005f); + std::memset(t_out->MutableData(), 0, M*N*sizeof(float)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; param->has_bias_ = true; param->row_=M; param->col_=N; param->deep_=K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); ASSERT_NE(kernel,nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); ASSERT_EQ(kernel->Run(), lite::RET_OK); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + std::vector expect(M * N, 0.f); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float sum = 0.f; + for (int k = 0; k < K; ++k) { + sum += A[m * K + k] * B[k * N + n]; + } + sum += bias[m * N + n]; + expect[m * N + n] = sum > 0.f ? sum : 0.f; + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3)); + UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape={M,K}; std::vector b_shape={K,N}; std::vector out_shape={M,N}; + std::vector bias_shape={M,N}; + auto t_A = new lite::Tensor(kNumberTypeFloat16, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeFloat16, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeFloat16, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeFloat16, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); + auto A16 = reinterpret_cast(t_A->MutableData()); auto B16 = reinterpret_cast(t_B->MutableData()); + auto bias16 = reinterpret_cast(t_bias->MutableData()); auto C16 = reinterpret_cast(t_out->MutableData()); + for(int i=0;i(i % 13)); } + for(int i=0;i(i % 17)); } + for(int i=0;i(i % 11)); } + std::memset(C16,0,M*N*sizeof(uint16_t)); + std::vector inputs_{t_A,t_B,t_bias}; std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param=new MatMulParameter(); param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; + kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeFloat16,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK); + auto Fp16ToFp32=[&](uint16_t h){ uint32_t sign=(h & 0x8000) << 16; uint32_t exp=(h & 0x7C00)>>10; uint32_t frac=(h & 0x03FF); uint32_t fexp,ffrac; if(exp==0){ if(frac==0){ fexp=0; ffrac=0;} else { int shift=0; while((frac & 0x0200)==0){ frac <<=1; ++shift;} frac &=0x03FF; fexp=127-15-shift; ffrac=frac<<13; } } else if(exp==0x1F){ fexp=255; ffrac=frac<<13; } else { fexp=exp-15+127; ffrac=frac<<13; } uint32_t bits= sign | (fexp<<23) | ffrac; float out; std::memcpy(&out,&bits,sizeof(out)); return out; }; + std::vector expect_fp32(M * N, 0.f); + std::vector actual_fp32(M * N, 0.f); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float sum = 0.f; + for (int k = 0; k < K; ++k) { + float a = Fp16ToFp32(A16[m * K + k]); + float b = Fp16ToFp32(B16[k * N + n]); + sum += a * b; + } + sum += Fp16ToFp32(bias16[m * N + n]); + expect_fp32[m * N + n] = sum > 0.f ? sum : 0.f; + actual_fp32[m * N + n] = Fp16ToFp32(C16[m * N + n]); + } + } + ASSERT_EQ(0, CompareOutputData(actual_fp32.data(), expect_fp32.data(), M * N, 5e-2)); + UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) { + InitDSPRuntime(); const int M=256,K=256,N=256; std::vector a_shape={M,K}; std::vector b_shape={K,N}; std::vector out_shape={M,N}; + std::vector bias_shape={M,N}; + auto t_A=new lite::Tensor(kNumberTypeInt32,a_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_B=new lite::Tensor(kNumberTypeInt32,b_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_bias=new lite::Tensor(kNumberTypeInt32,bias_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_out=new lite::Tensor(kNumberTypeInt32,out_shape,NHWC,lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); + auto A=reinterpret_cast(t_A->MutableData()); auto B=reinterpret_cast(t_B->MutableData()); + auto bias=reinterpret_cast(t_bias->MutableData()); auto C=reinterpret_cast(t_out->MutableData()); + for(int i=0;i inputs_{t_A,t_B,t_bias}; std::vector outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeInt32,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK); + std::vector expect(M * N, 0); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + long long sum = 0; + for (int k = 0; k < K; ++k) { + sum += static_cast(A[m * K + k]) * B[k * N + n]; + } + sum += static_cast(bias[m * N + n]); + expect[m * N + n] = static_cast(sum > 0 ? sum : 0); + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); + UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) { + InitDSPRuntime(); const int M=256,K=256,N=256; std::vector a_shape={M,K}; std::vector b_shape={K,N}; std::vector out_shape={M,N}; std::vector bias_shape={M,N}; + auto t_A=new lite::Tensor(kNumberTypeInt16,a_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_B=new lite::Tensor(kNumberTypeInt16,b_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_bias=new lite::Tensor(kNumberTypeInt16,bias_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_out=new lite::Tensor(kNumberTypeInt16,out_shape,NHWC,lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); + auto A=reinterpret_cast(t_A->MutableData()); auto B=reinterpret_cast(t_B->MutableData()); + auto bias=reinterpret_cast(t_bias->MutableData()); auto C=reinterpret_cast(t_out->MutableData()); + for(int i=0;i((i%21)-10);} for(int i=0;i((i%19)-9);} for(int i=0;i(i%15); } + std::memset(C,0,M*N*sizeof(int16_t)); + std::vector inputs_{t_A,t_B,t_bias}; std::vector outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeInt16,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK); + std::vector expect(M * N, 0); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + long long sum = 0; + for (int k = 0; k < K; ++k) { + sum += static_cast(A[m * K + k]) * B[k * N + n]; + } + sum += static_cast(bias[m * N + n]); + sum = sum > 0 ? sum : 0; + if (sum > std::numeric_limits::max()) sum = std::numeric_limits::max(); + expect[m * N + n] = static_cast(sum); + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); + UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) { + InitDSPRuntime(); const int M=256,K=256,N=256; std::vector a_shape={M,K}; std::vector b_shape={K,N}; std::vector out_shape={M,N}; std::vector bias_shape={M,N}; + auto t_A=new lite::Tensor(kNumberTypeComplex64,a_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_B=new lite::Tensor(kNumberTypeComplex64,b_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_bias=new lite::Tensor(kNumberTypeComplex64,bias_shape,NHWC,lite::Category::CONST_TENSOR); + auto t_out=new lite::Tensor(kNumberTypeComplex64,out_shape,NHWC,lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); + auto A=reinterpret_cast(t_A->MutableData()); auto B=reinterpret_cast(t_B->MutableData()); + auto bias=reinterpret_cast(t_bias->MutableData()); auto C=reinterpret_cast(t_out->MutableData()); // complex64 stored as interleaved real,imag + for(int i=0;i inputs_{t_A,t_B,t_bias}; std::vector outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeComplex64,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK); + std::vector expect(2 * M * N, 0.f); + std::vector actual(2 * M * N, 0.f); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float real = 0.f; + float imag = 0.f; + for (int k = 0; k < K; ++k) { + float ar = A[2 * (m * K + k)]; + float ai = A[2 * (m * K + k) + 1]; + float br = B[2 * (k * N + n)]; + float bi = B[2 * (k * N + n) + 1]; + real += ar * br - ai * bi; + imag += ar * bi + ai * br; + } + real += bias[2 * (m * N + n)]; + imag += bias[2 * (m * N + n) + 1]; + if (real < 0.f) real = 0.f; + expect[2 * (m * N + n)] = real; + expect[2 * (m * N + n) + 1] = imag; + actual[2 * (m * N + n)] = C[2 * (m * N + n)]; + actual[2 * (m * N + n) + 1] = C[2 * (m * N + n) + 1]; + } + } + ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2)); + UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; +} + +} // namespace mindspore::lite::dsp::test -- Gitee From b0c155949899390a967098a955b49d67a01492c4 Mon Sep 17 00:00:00 2001 From: mzy <929449726@qq.com> Date: Fri, 7 Nov 2025 16:21:20 +0000 Subject: [PATCH 4/7] a --- .../litert/kernel/dsp/ft04/matmulfusion.cc | 13 +- .../runtime/kernel/dsp/matmulfusion_tests.cc | 363 ++++++++++++++---- 2 files changed, 294 insertions(+), 82 deletions(-) diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc index 4982c9d1..602507e0 100644 --- a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc +++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc @@ -79,7 +79,9 @@ int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const { MS_LOG(WARNING) << "Inner dimension mismatch: " << aK << " vs " << bK; return RET_ERROR; } - *M = aM; *K = aK; *N = bN; + *M = aM; + *K = aK; + *N = bN; return RET_OK; } @@ -91,11 +93,14 @@ int MatMulFusionDSPKernel::GetActTypeCode(int *code) const { if (param != nullptr) { switch (param->act_type_) { case ActType_Relu: - act = 1; break; + act = 1; + break; case ActType_Relu6: - act = 2; break; // DSP uses 2 for RELU6, nnacl uses enum value 3 + act = 2; + break; // DSP uses 2 for RELU6, nnacl uses enum value 3 default: - act = 0; break; + act = 0; + break; } } *code = act; diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc index 07eff1bd..42508223 100644 --- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc @@ -31,7 +31,9 @@ namespace mindspore::lite::dsp::test { class TestDSP_MatMulFusion : public DSPCommonTest {}; static void FillFloat(float *data, int size, float base = 0.1f) { - for (int i = 0; i < size; ++i) { data[i] = base * static_cast((i % 10)); } + for (int i = 0; i < size; ++i) { + data[i] = base * static_cast((i % 10)); + } } typedef uint16_t float16_t_u; @@ -64,7 +66,11 @@ static inline float16_t_u Fp32ToFp16Bits(float v) { uint32_t remainder = mantissa & 0x1FFF; if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) { mantissa_rounded++; - if (mantissa_rounded == 0x400) { mantissa_rounded = 0; exponent++; if (exponent > 30) return static_cast((sign << 15) | 0x7C00); } + if (mantissa_rounded == 0x400) { + mantissa_rounded = 0; + exponent++; + if (exponent > 30) return static_cast((sign << 15) | 0x7C00); + } } result = static_cast((sign << 15) | (static_cast(exponent) << 10) | (mantissa_rounded & 0x3FF)); } @@ -83,24 +89,36 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) { auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR); auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR); auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); - FillFloat(reinterpret_cast(t_A->MutableData()), M*K, 0.02f); - FillFloat(reinterpret_cast(t_B->MutableData()), K*N, 0.03f); - FillFloat(reinterpret_cast(t_bias->MutableData()), M*N, 0.005f); - std::memset(t_out->MutableData(), 0, M*N*sizeof(float)); - std::vector inputs_{t_A, t_B, t_bias}; - std::vector outputs_{t_out}; - auto ctx = new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); - auto *param = new MatMulParameter(); param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); - param->act_type_ = ActType_Relu; param->has_bias_ = true; param->row_=M; param->col_=N; param->deep_=K; + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + FillFloat(reinterpret_cast(t_A->MutableData()), M * K, 0.02f); + FillFloat(reinterpret_cast(t_B->MutableData()), K * N, 0.03f); + FillFloat(reinterpret_cast(t_bias->MutableData()), M * N, 0.005f); + std::memset(t_out->MutableData(), 0, M * N * sizeof(float)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion}; - auto creator = KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); - auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); ASSERT_NE(kernel,nullptr); - ASSERT_EQ(kernel->Prepare(), lite::RET_OK); ASSERT_EQ(kernel->Run(), lite::RET_OK); - auto A = reinterpret_cast(t_A->MutableData()); - auto B = reinterpret_cast(t_B->MutableData()); - auto bias = reinterpret_cast(t_bias->MutableData()); - auto C = reinterpret_cast(t_out->MutableData()); + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); std::vector expect(M * N, 0.f); for (int m = 0; m < M; ++m) { for (int n = 0; n < N; ++n) { @@ -113,31 +131,93 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) { } } ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3)); - UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; } TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) { InitDSPRuntime(); const int M = 256, K = 256, N = 256; - std::vector a_shape={M,K}; std::vector b_shape={K,N}; std::vector out_shape={M,N}; - std::vector bias_shape={M,N}; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; auto t_A = new lite::Tensor(kNumberTypeFloat16, a_shape, NHWC, lite::Category::CONST_TENSOR); auto t_B = new lite::Tensor(kNumberTypeFloat16, b_shape, NHWC, lite::Category::CONST_TENSOR); auto t_bias = new lite::Tensor(kNumberTypeFloat16, bias_shape, NHWC, lite::Category::CONST_TENSOR); auto t_out = new lite::Tensor(kNumberTypeFloat16, out_shape, NHWC, lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); - auto A16 = reinterpret_cast(t_A->MutableData()); auto B16 = reinterpret_cast(t_B->MutableData()); - auto bias16 = reinterpret_cast(t_bias->MutableData()); auto C16 = reinterpret_cast(t_out->MutableData()); - for(int i=0;i(i % 13)); } - for(int i=0;i(i % 17)); } - for(int i=0;i(i % 11)); } - std::memset(C16,0,M*N*sizeof(uint16_t)); - std::vector inputs_{t_A,t_B,t_bias}; std::vector outputs_{t_out}; - auto ctx = new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); - auto *param=new MatMulParameter(); param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); - param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; - kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeFloat16,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK); - auto Fp16ToFp32=[&](uint16_t h){ uint32_t sign=(h & 0x8000) << 16; uint32_t exp=(h & 0x7C00)>>10; uint32_t frac=(h & 0x03FF); uint32_t fexp,ffrac; if(exp==0){ if(frac==0){ fexp=0; ffrac=0;} else { int shift=0; while((frac & 0x0200)==0){ frac <<=1; ++shift;} frac &=0x03FF; fexp=127-15-shift; ffrac=frac<<13; } } else if(exp==0x1F){ fexp=255; ffrac=frac<<13; } else { fexp=exp-15+127; ffrac=frac<<13; } uint32_t bits= sign | (fexp<<23) | ffrac; float out; std::memcpy(&out,&bits,sizeof(out)); return out; }; + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A16 = reinterpret_cast(t_A->MutableData()); + auto B16 = reinterpret_cast(t_B->MutableData()); + auto bias16 = reinterpret_cast(t_bias->MutableData()); + auto C16 = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A16[i] = Fp32ToFp16Bits(0.01f * static_cast(i % 13)); + } + for (int i = 0; i < K * N; ++i) { + B16[i] = Fp32ToFp16Bits(0.02f * static_cast(i % 17)); + } + for (int i = 0; i < M * N; ++i) { + bias16[i] = Fp32ToFp16Bits(0.003f * static_cast(i % 11)); + } + std::memset(C16, 0, M * N * sizeof(uint16_t)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + auto Fp16ToFp32 = [&](uint16_t h) { + uint32_t sign = (h & 0x8000) << 16; + uint32_t exp = (h & 0x7C00) >> 10; + uint32_t frac = (h & 0x03FF); + uint32_t fexp, ffrac; + if (exp == 0) { + if (frac == 0) { + fexp = 0; + ffrac = 0; + } else { + int shift = 0; + while ((frac & 0x0200) == 0) { + frac <<= 1; + ++shift; + } + frac &= 0x03FF; + fexp = 127 - 15 - shift; + ffrac = frac << 13; + } + } else if (exp == 0x1F) { + fexp = 255; + ffrac = frac << 13; + } else { + fexp = exp - 15 + 127; + ffrac = frac << 13; + } + uint32_t bits = sign | (fexp << 23) | ffrac; + float out; + std::memcpy(&out, &bits, sizeof(out)); + return out; + }; std::vector expect_fp32(M * N, 0.f); std::vector actual_fp32(M * N, 0.f); for (int m = 0; m < M; ++m) { @@ -154,24 +234,62 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) { } } ASSERT_EQ(0, CompareOutputData(actual_fp32.data(), expect_fp32.data(), M * N, 5e-2)); - UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; } TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) { - InitDSPRuntime(); const int M=256,K=256,N=256; std::vector a_shape={M,K}; std::vector b_shape={K,N}; std::vector out_shape={M,N}; - std::vector bias_shape={M,N}; - auto t_A=new lite::Tensor(kNumberTypeInt32,a_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_B=new lite::Tensor(kNumberTypeInt32,b_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_bias=new lite::Tensor(kNumberTypeInt32,bias_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_out=new lite::Tensor(kNumberTypeInt32,out_shape,NHWC,lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); - auto A=reinterpret_cast(t_A->MutableData()); auto B=reinterpret_cast(t_B->MutableData()); - auto bias=reinterpret_cast(t_bias->MutableData()); auto C=reinterpret_cast(t_out->MutableData()); - for(int i=0;i inputs_{t_A,t_B,t_bias}; std::vector outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeInt32,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK); + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeInt32, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeInt32, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeInt32, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeInt32, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[i] = (i % 11) - 5; + } + for (int i = 0; i < K * N; ++i) { + B[i] = (i % 13) - 6; + } + for (int i = 0; i < M * N; ++i) { + bias[i] = (i % 9) - 4; + } + std::memset(C, 0, M * N * sizeof(int32_t)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); std::vector expect(M * N, 0); for (int m = 0; m < M; ++m) { for (int n = 0; n < N; ++n) { @@ -184,21 +302,62 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) { } } ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); - UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; } TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) { - InitDSPRuntime(); const int M=256,K=256,N=256; std::vector a_shape={M,K}; std::vector b_shape={K,N}; std::vector out_shape={M,N}; std::vector bias_shape={M,N}; - auto t_A=new lite::Tensor(kNumberTypeInt16,a_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_B=new lite::Tensor(kNumberTypeInt16,b_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_bias=new lite::Tensor(kNumberTypeInt16,bias_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_out=new lite::Tensor(kNumberTypeInt16,out_shape,NHWC,lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); - auto A=reinterpret_cast(t_A->MutableData()); auto B=reinterpret_cast(t_B->MutableData()); - auto bias=reinterpret_cast(t_bias->MutableData()); auto C=reinterpret_cast(t_out->MutableData()); - for(int i=0;i((i%21)-10);} for(int i=0;i((i%19)-9);} for(int i=0;i(i%15); } - std::memset(C,0,M*N*sizeof(int16_t)); - std::vector inputs_{t_A,t_B,t_bias}; std::vector outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeInt16,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK); + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeInt16, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeInt16, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeInt16, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeInt16, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[i] = static_cast((i % 21) - 10); + } + for (int i = 0; i < K * N; ++i) { + B[i] = static_cast((i % 19) - 9); + } + for (int i = 0; i < M * N; ++i) { + bias[i] = static_cast(i % 15); + } + std::memset(C, 0, M * N * sizeof(int16_t)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); std::vector expect(M * N, 0); for (int m = 0; m < M; ++m) { for (int n = 0; n < N; ++n) { @@ -213,23 +372,65 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) { } } ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); - UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; } TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) { - InitDSPRuntime(); const int M=256,K=256,N=256; std::vector a_shape={M,K}; std::vector b_shape={K,N}; std::vector out_shape={M,N}; std::vector bias_shape={M,N}; - auto t_A=new lite::Tensor(kNumberTypeComplex64,a_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_B=new lite::Tensor(kNumberTypeComplex64,b_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_bias=new lite::Tensor(kNumberTypeComplex64,bias_shape,NHWC,lite::Category::CONST_TENSOR); - auto t_out=new lite::Tensor(kNumberTypeComplex64,out_shape,NHWC,lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); t_B->MallocData(allocator_); t_bias->MallocData(allocator_); t_out->MallocData(allocator_); - auto A=reinterpret_cast(t_A->MutableData()); auto B=reinterpret_cast(t_B->MutableData()); - auto bias=reinterpret_cast(t_bias->MutableData()); auto C=reinterpret_cast(t_out->MutableData()); // complex64 stored as interleaved real,imag - for(int i=0;i inputs_{t_A,t_B,t_bias}; std::vector outputs_{t_out}; auto ctx=new lite::InnerContext; ASSERT_EQ(lite::RET_OK, ctx->Init()); auto *param=new MatMulParameter(); param->op_parameter_.type_=static_cast(schema::PrimitiveType_MatMulFusion); param->act_type_=ActType_Relu; param->has_bias_=true; param->row_=M; param->col_=N; param->deep_=K; kernel::KernelKey key={kernel::KERNEL_ARCH::kDSP,kNumberTypeComplex64,NHWC,schema::PrimitiveType_MatMulFusion}; auto creator=KernelRegistry::GetInstance()->GetCreator(key); ASSERT_NE(creator,nullptr); auto kernel=creator(inputs_,outputs_,reinterpret_cast(param),ctx,key); ASSERT_NE(kernel,nullptr); ASSERT_EQ(kernel->Prepare(),lite::RET_OK); ASSERT_EQ(kernel->Run(),lite::RET_OK); + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeComplex64, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeComplex64, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeComplex64, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeComplex64, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); // complex64 stored as interleaved real,imag + for (int i = 0; i < M * K; ++i) { + A[2 * i] = 0.01f * (i % 17); + A[2 * i + 1] = 0.02f * (i % 19); + } + for (int i = 0; i < K * N; ++i) { + B[2 * i] = 0.03f * (i % 23); + B[2 * i + 1] = 0.01f * (i % 29); + } + for (int i = 0; i < M * N; ++i) { + bias[2 * i] = 0.002f * (i % 31); + bias[2 * i + 1] = 0.001f * (i % 37); + } + std::memset(C, 0, M * N * 2 * sizeof(float)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); std::vector expect(2 * M * N, 0.f); std::vector actual(2 * M * N, 0.f); for (int m = 0; m < M; ++m) { @@ -254,7 +455,13 @@ TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) { } } ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2)); - UninitDSPRuntime(); delete ctx; delete kernel; delete t_A; delete t_B; delete t_bias; delete t_out; + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; } -} // namespace mindspore::lite::dsp::test +} // namespace mindspore::lite::dsp::test -- Gitee From 0903f9f32c1aeed1436ac40c1ac43b427a2f10e1 Mon Sep 17 00:00:00 2001 From: mzy <929449726@qq.com> Date: Sat, 8 Nov 2025 10:14:46 +0000 Subject: [PATCH 5/7] add ft78 ragged_range --- .../litert/kernel/dsp/ft78/ragged_range.cc | 142 +++++++++++ .../src/litert/kernel/dsp/ft78/ragged_range.h | 50 ++++ .../runtime/kernel/dsp/ragged_range_tests.cc | 234 ++++++++++++++++++ 3 files changed, 426 insertions(+) create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc create mode 100644 mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc new file mode 100644 index 00000000..610644ad --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc @@ -0,0 +1,142 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/litert/kernel/dsp/ft78/ragged_range.h" +#include +#include +#include "src/litert/kernel_registry.h" +#include "schema/inner/model_generated.h" + +using mindspore::kernel::KERNEL_ARCH::kDSP; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_RaggedRange; + +namespace mindspore::kernel { +int RaggedRangeDSPKernel::CheckSpecs() { + if (in_tensors_.size() != 3 || out_tensors_.size() != 2) { + MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size() + << ", out: " << out_tensors_.size(); + return RET_ERROR; + } + return RET_OK; +} + +int RaggedRangeDSPKernel::Prepare() { return RET_OK; } + +int RaggedRangeDSPKernel::CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar) { + if (rows == nullptr || starts_scalar == nullptr || limits_scalar == nullptr || deltas_scalar == nullptr) { + return RET_ERROR; + } + const auto &s0 = in_tensors_[0]->shape(); + const auto &s1 = in_tensors_[1]->shape(); + const auto &s2 = in_tensors_[2]->shape(); + *starts_scalar = s0.empty(); + *limits_scalar = s1.empty(); + *deltas_scalar = s2.empty(); + int non_scalar_rows = -1; + if (!*starts_scalar) non_scalar_rows = s0[0]; + if (!*limits_scalar) { + if (non_scalar_rows == -1) { + non_scalar_rows = s1[0]; + } else if (non_scalar_rows != s1[0]) { + return RET_ERROR; + } + } + if (!*deltas_scalar) { + if (non_scalar_rows == -1) { + non_scalar_rows = s2[0]; + } else if (non_scalar_rows != s2[0]) { + return RET_ERROR; + } + } + *rows = (non_scalar_rows == -1) ? 1 : non_scalar_rows; + return RET_OK; +} + +int RaggedRangeDSPKernel::RunFp32() { + kernel_name_ = "fp_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::RunFp64() { + kernel_name_ = "dp_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::RunInt32() { + kernel_name_ = "i32_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::RunInt16() { + kernel_name_ = "i16_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::RunInt8() { + kernel_name_ = "i8_raggedrange_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int RaggedRangeDSPKernel::Run() { + int rows = 0; + bool starts_scalar = false; + bool limits_scalar = false; + bool deltas_scalar = false; + auto ret = CalcRows(&rows, &starts_scalar, &limits_scalar, &deltas_scalar); + if (ret != RET_OK) { + MS_LOG(ERROR) << "RaggedRange rows check failed."; + return RET_ERROR; + } + + auto allocator = dsp_runtime_->GetAllocator(); + uint64_t starts_dev = allocator->GetDeviceMemPtr(in_tensors_[0]->data()); + uint64_t limits_dev = allocator->GetDeviceMemPtr(in_tensors_[1]->data()); + uint64_t deltas_dev = allocator->GetDeviceMemPtr(in_tensors_[2]->data()); + uint64_t splits_dev = allocator->GetDeviceMemPtr(out_tensors_[0]->data()); + uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data()); + uint64_t rows_hex = 0; + std::memcpy(&rows_hex, &rows, sizeof(int)); + + SetKernelArg({starts_dev, limits_dev, deltas_dev, rows_hex, values_dev, splits_dev}); + + auto out_dt = out_tensors_[1]->data_type(); + switch (out_dt) { + case kNumberTypeFloat32: + return RunFp32(); + case kNumberTypeFloat64: + return RunFp64(); + case kNumberTypeInt32: + return RunInt32(); + case kNumberTypeInt16: + return RunInt16(); + case kNumberTypeInt8: + return RunInt8(); + default: + MS_LOG(ERROR) << "RaggedRange unsupported output dtype: " << static_cast(out_dt); + return RET_ERROR; + } +} + +REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_RaggedRange, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeFloat64, PrimitiveType_RaggedRange, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_RaggedRange, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_RaggedRange, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt8, PrimitiveType_RaggedRange, DSPKernelCreator) +} // namespace mindspore::kernel + diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h new file mode 100644 index 00000000..8ce03e76 --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h @@ -0,0 +1,50 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_ + +#include +#include +#include "src/litert/kernel/dsp/dsp_kernel.h" + +namespace mindspore::kernel { +class RaggedRangeDSPKernel : public DSPKernel { + public: + RaggedRangeDSPKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : DSPKernel(parameter, inputs, outputs, ctx) {} + ~RaggedRangeDSPKernel() override = default; + + int CheckSpecs() override; + int Prepare() override; + int Run() override; + + private: + int RunFp32(); + int RunFp64(); + int RunInt32(); + int RunInt16(); + int RunInt8(); + + int CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar); + + std::string kernel_name_; + uint64_t core_mask_{0xff}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_ diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc index a3047238..6d05fb6e 100644 --- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc @@ -256,6 +256,7 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Int32) { delete kernel; } +#ifdef SUPPORT_FT04 TEST_F(TestDSP_RaggedRange, RaggedRange_Fp16) { InitDSPRuntime(); std::vector inputs_; @@ -334,7 +335,9 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Fp16) { for (auto t : outputs_) delete t; delete kernel; } +#endif +#ifdef SUPPORT_FT04 TEST_F(TestDSP_RaggedRange, RaggedRange_Int16) { InitDSPRuntime(); std::vector inputs_; @@ -405,5 +408,236 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Int16) { for (auto t : outputs_) delete t; delete kernel; } +#endif + +#ifdef SUPPORT_FT78 +TEST_F(TestDSP_RaggedRange, RaggedRange_Int16_FT78) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + std::vector vec3 = {3}; + auto t_starts = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + int16_t starts_host[3] = {-12, 0, 90}; + int16_t limits_host[3] = {-2, 30, 100}; + int16_t deltas_host[3] = {3, 5, 2}; + std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host)); + std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host)); + std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeInt16, {256}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 4, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 256, static_cast(0)); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(4, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 3; ++r) { + expect_splits[r] = acc; + for (int v = static_cast(starts_host[r]); + deltas_host[r] > 0 ? v < static_cast(limits_host[r]) : v > static_cast(limits_host[r]); + v += static_cast(deltas_host[r])) { + expect_values.push_back(static_cast(v)); + } + acc = static_cast(expect_values.size()); + } + expect_splits[3] = acc; + + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 4)); + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), acc)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} +#endif + +#ifdef SUPPORT_FT78 +TEST_F(TestDSP_RaggedRange, RaggedRange_Fp64) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + + std::vector vec4 = {4}; + auto t_starts = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + double starts_host[4] = {-5.0, -5.0, -5.0, -5.0}; + double limits_host[4] = {0.0, 0.0, 0.0, 0.0}; + double deltas_host[4] = {0.25, 0.25, 0.25, 0.25}; + std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host)); + std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host)); + std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeFloat64, {512}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 5, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 512, 0.0); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat64, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(5, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 4; ++r) { + expect_splits[r] = acc; + for (double v = starts_host[r]; deltas_host[r] > 0 ? v < limits_host[r] : v > limits_host[r]; + v += deltas_host[r]) { + expect_values.push_back(v); + } + acc = static_cast(expect_values.size()); + } + expect_splits[4] = acc; + + auto actual_splits_ptr = reinterpret_cast(outputs_[0]->MutableData()); + std::vector actual_splits(actual_splits_ptr, actual_splits_ptr + 5); + for (size_t i = 0; i < actual_splits.size(); ++i) { + EXPECT_EQ(expect_splits[i], actual_splits[i]) << "split index " << i; + } + + auto actual_values_ptr = reinterpret_cast(outputs_[1]->MutableData()); + std::vector actual_values(actual_values_ptr, actual_values_ptr + acc); + for (int i = 0; i < acc; ++i) { + EXPECT_NEAR(expect_values[i], actual_values[i], 1e-6) << "value index " << i; + } + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} + +TEST_F(TestDSP_RaggedRange, RaggedRange_Int8) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + + std::vector vec4 = {4}; + auto t_starts = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + int8_t starts_host[4] = {-20, -10, 5, 100}; + int8_t limits_host[4] = {-5, 10, 20, 110}; + int8_t deltas_host[4] = {3, 4, 5, 1}; + std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host)); + std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host)); + std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeInt8, {256}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 5, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 256, static_cast(0)); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt8, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(5, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 4; ++r) { + expect_splits[r] = acc; + for (int v = static_cast(starts_host[r]); + deltas_host[r] > 0 ? v < static_cast(limits_host[r]) : v > static_cast(limits_host[r]); + v += static_cast(deltas_host[r])) { + expect_values.push_back(static_cast(v)); + } + acc = static_cast(expect_values.size()); + } + expect_splits[4] = acc; + + auto actual_splits_ptr = reinterpret_cast(outputs_[0]->MutableData()); + std::vector actual_splits(actual_splits_ptr, actual_splits_ptr + 5); + for (size_t i = 0; i < actual_splits.size(); ++i) { + EXPECT_EQ(expect_splits[i], actual_splits[i]) << "split index " << i; + } + + auto actual_values_ptr = reinterpret_cast(outputs_[1]->MutableData()); + std::vector actual_values(actual_values_ptr, actual_values_ptr + acc); + for (int i = 0; i < acc; ++i) { + EXPECT_EQ(expect_values[i], actual_values[i]) << "value index " << i; + } + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} +#endif } // namespace mindspore::lite::dsp::test -- Gitee From ac9083ff5c2809e02c31f2c29e41667b51e70cd4 Mon Sep 17 00:00:00 2001 From: mzy <929449726@qq.com> Date: Sat, 8 Nov 2025 10:15:36 +0000 Subject: [PATCH 6/7] add ft78 ragged_range --- mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc | 3 +-- .../test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc index 610644ad..ffb2966f 100644 --- a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc +++ b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc @@ -109,7 +109,7 @@ int RaggedRangeDSPKernel::Run() { uint64_t limits_dev = allocator->GetDeviceMemPtr(in_tensors_[1]->data()); uint64_t deltas_dev = allocator->GetDeviceMemPtr(in_tensors_[2]->data()); uint64_t splits_dev = allocator->GetDeviceMemPtr(out_tensors_[0]->data()); - uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data()); + uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data()); uint64_t rows_hex = 0; std::memcpy(&rows_hex, &rows, sizeof(int)); @@ -139,4 +139,3 @@ REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_RaggedRange, DSPKernelCreator) REG_KERNEL(kDSP, kNumberTypeInt8, PrimitiveType_RaggedRange, DSPKernelCreator) } // namespace mindspore::kernel - diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc index 6d05fb6e..858718b7 100644 --- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc @@ -533,8 +533,7 @@ TEST_F(TestDSP_RaggedRange, RaggedRange_Fp64) { int32_t acc = 0; for (int r = 0; r < 4; ++r) { expect_splits[r] = acc; - for (double v = starts_host[r]; deltas_host[r] > 0 ? v < limits_host[r] : v > limits_host[r]; - v += deltas_host[r]) { + for (double v = starts_host[r]; deltas_host[r] > 0 ? v < limits_host[r] : v > limits_host[r]; v += deltas_host[r]) { expect_values.push_back(v); } acc = static_cast(expect_values.size()); -- Gitee From a32c2867698fb9d027c8a8f0b3837d32937ecf30 Mon Sep 17 00:00:00 2001 From: mzy <929449726@qq.com> Date: Sat, 8 Nov 2025 17:19:23 +0000 Subject: [PATCH 7/7] remove matmulfusion --- .../litert/kernel/dsp/ft04/matmulfusion.cc | 181 ------- .../src/litert/kernel/dsp/ft04/matmulfusion.h | 51 -- .../runtime/kernel/dsp/matmulfusion_tests.cc | 467 ------------------ 3 files changed, 699 deletions(-) delete mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc delete mode 100644 mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h delete mode 100644 mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc deleted file mode 100644 index 602507e0..00000000 --- a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc +++ /dev/null @@ -1,181 +0,0 @@ -/** - * Copyright 2025 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/litert/kernel/dsp/ft04/matmulfusion.h" -#include -#include -#include "src/litert/kernel_registry.h" -#include "schema/inner/model_generated.h" -#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h" - -using mindspore::kernel::KERNEL_ARCH::kDSP; -using mindspore::lite::KernelRegistrar; -using mindspore::lite::RET_ERROR; -using mindspore::lite::RET_OK; -using mindspore::schema::PrimitiveType_MatMulFusion; - -namespace mindspore::kernel { - -int MatMulFusionDSPKernel::Prepare() { return RET_OK; } - -int MatMulFusionDSPKernel::CheckSpecs() { - // inputs: A, B, (optional) bias; output: C - if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 && in_tensors_.size() != INPUT_TENSOR_SIZE_3) { - MS_LOG(WARNING) << "MatMulFusion expects 2 or 3 inputs, got " << in_tensors_.size(); - return RET_ERROR; - } - if (out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) { - MS_LOG(WARNING) << "MatMulFusion expects 1 output, got " << out_tensors_.size(); - return RET_ERROR; - } - int M = 0, N = 0, K = 0; - if (GetMNK(&M, &N, &K) != RET_OK) { - MS_LOG(WARNING) << "MatMulFusion shape inference failed."; - return RET_ERROR; - } - // Bias check if present - if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) { - auto bias_shape = in_tensors_[2]->shape(); - if (bias_shape.size() != 2 || bias_shape[0] != M || bias_shape[1] != N) { - MS_LOG(WARNING) << "Bias shape mismatch MxN: got " << bias_shape; - return RET_ERROR; - } - } - // Output shape check - auto out_shape = out_tensors_[0]->shape(); - if (out_shape.size() != 2 || out_shape[0] != M || out_shape[1] != N) { - MS_LOG(WARNING) << "Output shape mismatch expected (" << M << "," << N << ")"; - return RET_ERROR; - } - return RET_OK; -} - -int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const { - if (M == nullptr || N == nullptr || K == nullptr) return RET_ERROR; - const auto &a_shape = in_tensors_[0]->shape(); - const auto &b_shape = in_tensors_[1]->shape(); - if (a_shape.size() != 2 || b_shape.size() != 2) { - MS_LOG(WARNING) << "A/B must be rank-2"; - return RET_ERROR; - } - int aM = a_shape[0]; - int aK = a_shape[1]; - int bK = b_shape[0]; - int bN = b_shape[1]; - if (aK != bK) { - MS_LOG(WARNING) << "Inner dimension mismatch: " << aK << " vs " << bK; - return RET_ERROR; - } - *M = aM; - *K = aK; - *N = bN; - return RET_OK; -} - -int MatMulFusionDSPKernel::GetActTypeCode(int *code) const { - if (code == nullptr) return RET_ERROR; - // Map ActType (nnacl) -> DSP activation code used in DSP functions (NONE=0, RELU=1, RELU6=2) - int act = 0; // default NONE - auto *param = reinterpret_cast(op_parameter_); - if (param != nullptr) { - switch (param->act_type_) { - case ActType_Relu: - act = 1; - break; - case ActType_Relu6: - act = 2; - break; // DSP uses 2 for RELU6, nnacl uses enum value 3 - default: - act = 0; - break; - } - } - *code = act; - return RET_OK; -} - -int MatMulFusionDSPKernel::RunFp32() { - kernel_name_ = "fp_matmulfusion_s"; - return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); -} -int MatMulFusionDSPKernel::RunFp16() { - kernel_name_ = "hp_matmulfusion_s"; - return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); -} -int MatMulFusionDSPKernel::RunInt32() { - kernel_name_ = "i32_matmulfusion_s"; - return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); -} -int MatMulFusionDSPKernel::RunInt16() { - kernel_name_ = "i16_matmulfusion_s"; - return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); -} -int MatMulFusionDSPKernel::RunComplex64() { - kernel_name_ = "c64_matmulfusion_s"; - return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); -} - -int MatMulFusionDSPKernel::Run() { - int M = 0, N = 0, K = 0; - if (GetMNK(&M, &N, &K) != RET_OK) { - MS_LOG(ERROR) << "MatMulFusion GetMNK failed"; - return RET_ERROR; - } - int act_code = 0; - (void)GetActTypeCode(&act_code); // default 0 if not set - - auto allocator = dsp_runtime_->GetAllocator(); - uint64_t a_ptr = allocator->GetDeviceMemPtr(in_tensors_[0]->data()); - uint64_t b_ptr = allocator->GetDeviceMemPtr(in_tensors_[1]->data()); - uint64_t out_ptr = allocator->GetDeviceMemPtr(out_tensors_[0]->data()); - uint64_t bias_ptr = 0; - if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) { - bias_ptr = allocator->GetDeviceMemPtr(in_tensors_[2]->data()); - } - // Arg order must match DSP symbol prototype: A,B,C,bias,M,N,K,act_type - SetKernelArg({a_ptr, b_ptr, out_ptr, bias_ptr, static_cast(M), static_cast(N), - static_cast(K), static_cast(act_code)}); - - int ret = RET_ERROR; - auto dtype = in_tensors_[0]->data_type(); - if (dtype == kNumberTypeFloat32) { - ret = RunFp32(); - } else if (dtype == kNumberTypeFloat16) { - ret = RunFp16(); - } else if (dtype == kNumberTypeInt32) { - ret = RunInt32(); - } else if (dtype == kNumberTypeInt16) { - ret = RunInt16(); - } else if (dtype == kNumberTypeComplex64) { - ret = RunComplex64(); - } else { - MS_LOG(ERROR) << "MatMulFusion unsupported dtype: " << static_cast(dtype); - return RET_ERROR; - } - if (ret != RET_OK) { - MS_LOG(ERROR) << "MatMulFusion DSP run failed"; - return RET_ERROR; - } - return RET_OK; -} - -REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_MatMulFusion, DSPKernelCreator) -REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_MatMulFusion, DSPKernelCreator) -REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_MatMulFusion, DSPKernelCreator) -REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_MatMulFusion, DSPKernelCreator) -REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_MatMulFusion, DSPKernelCreator) - -} // namespace mindspore::kernel diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h deleted file mode 100644 index 1a487f08..00000000 --- a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright 2025 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ - -#include -#include -#include "src/litert/kernel/dsp/dsp_kernel.h" - -namespace mindspore::kernel { -class MatMulFusionDSPKernel : public DSPKernel { - public: - using DSPKernel::DSPKernel; - ~MatMulFusionDSPKernel() override = default; - - int Prepare() override; - int CheckSpecs() override; - int Run() override; - - private: - int RunFp32(); - int RunFp16(); - int RunInt32(); - int RunInt16(); - int RunComplex64(); - - // helpers - int GetMNK(int *M, int *N, int *K) const; - int GetActTypeCode(int *code) const; - - private: - std::string kernel_name_; - uint64_t core_mask_{0xF}; -}; -} // namespace mindspore::kernel - -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc deleted file mode 100644 index 42508223..00000000 --- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc +++ /dev/null @@ -1,467 +0,0 @@ -/** - * Copyright 2025 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include "ut/src/runtime/kernel/dsp/dsp_test.h" -#include "include/api/context.h" -#include "include/api/data_type.h" -#include "include/api/model.h" -#include "schema/inner/model_generated.h" -#include "src/litert/kernel_registry.h" -#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h" - -namespace mindspore::lite::dsp::test { - -class TestDSP_MatMulFusion : public DSPCommonTest {}; - -static void FillFloat(float *data, int size, float base = 0.1f) { - for (int i = 0; i < size; ++i) { - data[i] = base * static_cast((i % 10)); - } -} - -typedef uint16_t float16_t_u; -static inline float16_t_u Fp32ToFp16Bits(float v) { - uint32_t bits; - std::memcpy(&bits, &v, sizeof(bits)); - uint32_t sign = (bits >> 31) & 0x1; - int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15; - uint32_t mantissa = bits & 0x007FFFFF; - uint16_t result; - if (exponent <= 0) { - if (exponent < -10) { - result = static_cast(sign << 15); - } else { - mantissa |= 0x00800000; - int shift = 14 - exponent; - uint32_t mantissa_shifted = mantissa >> shift; - uint32_t remainder = mantissa & ((1U << shift) - 1); - if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) { - mantissa_shifted++; - } - result = static_cast((sign << 15) | (mantissa_shifted & 0x3FF)); - } - } else if (exponent == 0xFF - 127 + 15) { - result = static_cast((sign << 15) | (mantissa == 0 ? 0x7C00 : 0x7E00)); - } else if (exponent > 30) { - result = static_cast((sign << 15) | 0x7C00); - } else { - uint32_t mantissa_rounded = mantissa >> 13; - uint32_t remainder = mantissa & 0x1FFF; - if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) { - mantissa_rounded++; - if (mantissa_rounded == 0x400) { - mantissa_rounded = 0; - exponent++; - if (exponent > 30) return static_cast((sign << 15) | 0x7C00); - } - } - result = static_cast((sign << 15) | (static_cast(exponent) << 10) | (mantissa_rounded & 0x3FF)); - } - return result; -} - -// Large size tests (M=N=K=256) across dtypes -TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) { - InitDSPRuntime(); - const int M = 256, K = 256, N = 256; - std::vector a_shape = {M, K}; - std::vector b_shape = {K, N}; - std::vector out_shape = {M, N}; - std::vector bias_shape = {M, N}; - auto t_A = new lite::Tensor(kNumberTypeFloat32, a_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); - t_B->MallocData(allocator_); - t_bias->MallocData(allocator_); - t_out->MallocData(allocator_); - FillFloat(reinterpret_cast(t_A->MutableData()), M * K, 0.02f); - FillFloat(reinterpret_cast(t_B->MutableData()), K * N, 0.03f); - FillFloat(reinterpret_cast(t_bias->MutableData()), M * N, 0.005f); - std::memset(t_out->MutableData(), 0, M * N * sizeof(float)); - std::vector inputs_{t_A, t_B, t_bias}; - std::vector outputs_{t_out}; - auto ctx = new lite::InnerContext; - ASSERT_EQ(lite::RET_OK, ctx->Init()); - auto *param = new MatMulParameter(); - param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); - param->act_type_ = ActType_Relu; - param->has_bias_ = true; - param->row_ = M; - param->col_ = N; - param->deep_ = K; - kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion}; - auto creator = KernelRegistry::GetInstance()->GetCreator(key); - ASSERT_NE(creator, nullptr); - auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); - ASSERT_NE(kernel, nullptr); - ASSERT_EQ(kernel->Prepare(), lite::RET_OK); - ASSERT_EQ(kernel->Run(), lite::RET_OK); - auto A = reinterpret_cast(t_A->MutableData()); - auto B = reinterpret_cast(t_B->MutableData()); - auto bias = reinterpret_cast(t_bias->MutableData()); - auto C = reinterpret_cast(t_out->MutableData()); - std::vector expect(M * N, 0.f); - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - float sum = 0.f; - for (int k = 0; k < K; ++k) { - sum += A[m * K + k] * B[k * N + n]; - } - sum += bias[m * N + n]; - expect[m * N + n] = sum > 0.f ? sum : 0.f; - } - } - ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3)); - UninitDSPRuntime(); - delete ctx; - delete kernel; - delete t_A; - delete t_B; - delete t_bias; - delete t_out; -} - -TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) { - InitDSPRuntime(); - const int M = 256, K = 256, N = 256; - std::vector a_shape = {M, K}; - std::vector b_shape = {K, N}; - std::vector out_shape = {M, N}; - std::vector bias_shape = {M, N}; - auto t_A = new lite::Tensor(kNumberTypeFloat16, a_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_B = new lite::Tensor(kNumberTypeFloat16, b_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_bias = new lite::Tensor(kNumberTypeFloat16, bias_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_out = new lite::Tensor(kNumberTypeFloat16, out_shape, NHWC, lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); - t_B->MallocData(allocator_); - t_bias->MallocData(allocator_); - t_out->MallocData(allocator_); - auto A16 = reinterpret_cast(t_A->MutableData()); - auto B16 = reinterpret_cast(t_B->MutableData()); - auto bias16 = reinterpret_cast(t_bias->MutableData()); - auto C16 = reinterpret_cast(t_out->MutableData()); - for (int i = 0; i < M * K; ++i) { - A16[i] = Fp32ToFp16Bits(0.01f * static_cast(i % 13)); - } - for (int i = 0; i < K * N; ++i) { - B16[i] = Fp32ToFp16Bits(0.02f * static_cast(i % 17)); - } - for (int i = 0; i < M * N; ++i) { - bias16[i] = Fp32ToFp16Bits(0.003f * static_cast(i % 11)); - } - std::memset(C16, 0, M * N * sizeof(uint16_t)); - std::vector inputs_{t_A, t_B, t_bias}; - std::vector outputs_{t_out}; - auto ctx = new lite::InnerContext; - ASSERT_EQ(lite::RET_OK, ctx->Init()); - auto *param = new MatMulParameter(); - param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); - param->act_type_ = ActType_Relu; - param->has_bias_ = true; - param->row_ = M; - param->col_ = N; - param->deep_ = K; - kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_MatMulFusion}; - auto creator = KernelRegistry::GetInstance()->GetCreator(key); - ASSERT_NE(creator, nullptr); - auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); - ASSERT_NE(kernel, nullptr); - ASSERT_EQ(kernel->Prepare(), lite::RET_OK); - ASSERT_EQ(kernel->Run(), lite::RET_OK); - auto Fp16ToFp32 = [&](uint16_t h) { - uint32_t sign = (h & 0x8000) << 16; - uint32_t exp = (h & 0x7C00) >> 10; - uint32_t frac = (h & 0x03FF); - uint32_t fexp, ffrac; - if (exp == 0) { - if (frac == 0) { - fexp = 0; - ffrac = 0; - } else { - int shift = 0; - while ((frac & 0x0200) == 0) { - frac <<= 1; - ++shift; - } - frac &= 0x03FF; - fexp = 127 - 15 - shift; - ffrac = frac << 13; - } - } else if (exp == 0x1F) { - fexp = 255; - ffrac = frac << 13; - } else { - fexp = exp - 15 + 127; - ffrac = frac << 13; - } - uint32_t bits = sign | (fexp << 23) | ffrac; - float out; - std::memcpy(&out, &bits, sizeof(out)); - return out; - }; - std::vector expect_fp32(M * N, 0.f); - std::vector actual_fp32(M * N, 0.f); - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - float sum = 0.f; - for (int k = 0; k < K; ++k) { - float a = Fp16ToFp32(A16[m * K + k]); - float b = Fp16ToFp32(B16[k * N + n]); - sum += a * b; - } - sum += Fp16ToFp32(bias16[m * N + n]); - expect_fp32[m * N + n] = sum > 0.f ? sum : 0.f; - actual_fp32[m * N + n] = Fp16ToFp32(C16[m * N + n]); - } - } - ASSERT_EQ(0, CompareOutputData(actual_fp32.data(), expect_fp32.data(), M * N, 5e-2)); - UninitDSPRuntime(); - delete ctx; - delete kernel; - delete t_A; - delete t_B; - delete t_bias; - delete t_out; -} - -TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) { - InitDSPRuntime(); - const int M = 256, K = 256, N = 256; - std::vector a_shape = {M, K}; - std::vector b_shape = {K, N}; - std::vector out_shape = {M, N}; - std::vector bias_shape = {M, N}; - auto t_A = new lite::Tensor(kNumberTypeInt32, a_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_B = new lite::Tensor(kNumberTypeInt32, b_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_bias = new lite::Tensor(kNumberTypeInt32, bias_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_out = new lite::Tensor(kNumberTypeInt32, out_shape, NHWC, lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); - t_B->MallocData(allocator_); - t_bias->MallocData(allocator_); - t_out->MallocData(allocator_); - auto A = reinterpret_cast(t_A->MutableData()); - auto B = reinterpret_cast(t_B->MutableData()); - auto bias = reinterpret_cast(t_bias->MutableData()); - auto C = reinterpret_cast(t_out->MutableData()); - for (int i = 0; i < M * K; ++i) { - A[i] = (i % 11) - 5; - } - for (int i = 0; i < K * N; ++i) { - B[i] = (i % 13) - 6; - } - for (int i = 0; i < M * N; ++i) { - bias[i] = (i % 9) - 4; - } - std::memset(C, 0, M * N * sizeof(int32_t)); - std::vector inputs_{t_A, t_B, t_bias}; - std::vector outputs_{t_out}; - auto ctx = new lite::InnerContext; - ASSERT_EQ(lite::RET_OK, ctx->Init()); - auto *param = new MatMulParameter(); - param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); - param->act_type_ = ActType_Relu; - param->has_bias_ = true; - param->row_ = M; - param->col_ = N; - param->deep_ = K; - kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_MatMulFusion}; - auto creator = KernelRegistry::GetInstance()->GetCreator(key); - ASSERT_NE(creator, nullptr); - auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); - ASSERT_NE(kernel, nullptr); - ASSERT_EQ(kernel->Prepare(), lite::RET_OK); - ASSERT_EQ(kernel->Run(), lite::RET_OK); - std::vector expect(M * N, 0); - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - long long sum = 0; - for (int k = 0; k < K; ++k) { - sum += static_cast(A[m * K + k]) * B[k * N + n]; - } - sum += static_cast(bias[m * N + n]); - expect[m * N + n] = static_cast(sum > 0 ? sum : 0); - } - } - ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); - UninitDSPRuntime(); - delete ctx; - delete kernel; - delete t_A; - delete t_B; - delete t_bias; - delete t_out; -} - -TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) { - InitDSPRuntime(); - const int M = 256, K = 256, N = 256; - std::vector a_shape = {M, K}; - std::vector b_shape = {K, N}; - std::vector out_shape = {M, N}; - std::vector bias_shape = {M, N}; - auto t_A = new lite::Tensor(kNumberTypeInt16, a_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_B = new lite::Tensor(kNumberTypeInt16, b_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_bias = new lite::Tensor(kNumberTypeInt16, bias_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_out = new lite::Tensor(kNumberTypeInt16, out_shape, NHWC, lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); - t_B->MallocData(allocator_); - t_bias->MallocData(allocator_); - t_out->MallocData(allocator_); - auto A = reinterpret_cast(t_A->MutableData()); - auto B = reinterpret_cast(t_B->MutableData()); - auto bias = reinterpret_cast(t_bias->MutableData()); - auto C = reinterpret_cast(t_out->MutableData()); - for (int i = 0; i < M * K; ++i) { - A[i] = static_cast((i % 21) - 10); - } - for (int i = 0; i < K * N; ++i) { - B[i] = static_cast((i % 19) - 9); - } - for (int i = 0; i < M * N; ++i) { - bias[i] = static_cast(i % 15); - } - std::memset(C, 0, M * N * sizeof(int16_t)); - std::vector inputs_{t_A, t_B, t_bias}; - std::vector outputs_{t_out}; - auto ctx = new lite::InnerContext; - ASSERT_EQ(lite::RET_OK, ctx->Init()); - auto *param = new MatMulParameter(); - param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); - param->act_type_ = ActType_Relu; - param->has_bias_ = true; - param->row_ = M; - param->col_ = N; - param->deep_ = K; - kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_MatMulFusion}; - auto creator = KernelRegistry::GetInstance()->GetCreator(key); - ASSERT_NE(creator, nullptr); - auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); - ASSERT_NE(kernel, nullptr); - ASSERT_EQ(kernel->Prepare(), lite::RET_OK); - ASSERT_EQ(kernel->Run(), lite::RET_OK); - std::vector expect(M * N, 0); - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - long long sum = 0; - for (int k = 0; k < K; ++k) { - sum += static_cast(A[m * K + k]) * B[k * N + n]; - } - sum += static_cast(bias[m * N + n]); - sum = sum > 0 ? sum : 0; - if (sum > std::numeric_limits::max()) sum = std::numeric_limits::max(); - expect[m * N + n] = static_cast(sum); - } - } - ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); - UninitDSPRuntime(); - delete ctx; - delete kernel; - delete t_A; - delete t_B; - delete t_bias; - delete t_out; -} - -TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) { - InitDSPRuntime(); - const int M = 256, K = 256, N = 256; - std::vector a_shape = {M, K}; - std::vector b_shape = {K, N}; - std::vector out_shape = {M, N}; - std::vector bias_shape = {M, N}; - auto t_A = new lite::Tensor(kNumberTypeComplex64, a_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_B = new lite::Tensor(kNumberTypeComplex64, b_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_bias = new lite::Tensor(kNumberTypeComplex64, bias_shape, NHWC, lite::Category::CONST_TENSOR); - auto t_out = new lite::Tensor(kNumberTypeComplex64, out_shape, NHWC, lite::Category::CONST_TENSOR); - t_A->MallocData(allocator_); - t_B->MallocData(allocator_); - t_bias->MallocData(allocator_); - t_out->MallocData(allocator_); - auto A = reinterpret_cast(t_A->MutableData()); - auto B = reinterpret_cast(t_B->MutableData()); - auto bias = reinterpret_cast(t_bias->MutableData()); - auto C = reinterpret_cast(t_out->MutableData()); // complex64 stored as interleaved real,imag - for (int i = 0; i < M * K; ++i) { - A[2 * i] = 0.01f * (i % 17); - A[2 * i + 1] = 0.02f * (i % 19); - } - for (int i = 0; i < K * N; ++i) { - B[2 * i] = 0.03f * (i % 23); - B[2 * i + 1] = 0.01f * (i % 29); - } - for (int i = 0; i < M * N; ++i) { - bias[2 * i] = 0.002f * (i % 31); - bias[2 * i + 1] = 0.001f * (i % 37); - } - std::memset(C, 0, M * N * 2 * sizeof(float)); - std::vector inputs_{t_A, t_B, t_bias}; - std::vector outputs_{t_out}; - auto ctx = new lite::InnerContext; - ASSERT_EQ(lite::RET_OK, ctx->Init()); - auto *param = new MatMulParameter(); - param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); - param->act_type_ = ActType_Relu; - param->has_bias_ = true; - param->row_ = M; - param->col_ = N; - param->deep_ = K; - kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC, schema::PrimitiveType_MatMulFusion}; - auto creator = KernelRegistry::GetInstance()->GetCreator(key); - ASSERT_NE(creator, nullptr); - auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); - ASSERT_NE(kernel, nullptr); - ASSERT_EQ(kernel->Prepare(), lite::RET_OK); - ASSERT_EQ(kernel->Run(), lite::RET_OK); - std::vector expect(2 * M * N, 0.f); - std::vector actual(2 * M * N, 0.f); - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - float real = 0.f; - float imag = 0.f; - for (int k = 0; k < K; ++k) { - float ar = A[2 * (m * K + k)]; - float ai = A[2 * (m * K + k) + 1]; - float br = B[2 * (k * N + n)]; - float bi = B[2 * (k * N + n) + 1]; - real += ar * br - ai * bi; - imag += ar * bi + ai * br; - } - real += bias[2 * (m * N + n)]; - imag += bias[2 * (m * N + n) + 1]; - if (real < 0.f) real = 0.f; - expect[2 * (m * N + n)] = real; - expect[2 * (m * N + n) + 1] = imag; - actual[2 * (m * N + n)] = C[2 * (m * N + n)]; - actual[2 * (m * N + n) + 1] = C[2 * (m * N + n) + 1]; - } - } - ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2)); - UninitDSPRuntime(); - delete ctx; - delete kernel; - delete t_A; - delete t_B; - delete t_bias; - delete t_out; -} - -} // namespace mindspore::lite::dsp::test -- Gitee