diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1acc8965b4570904ba3a58f3ddb81347d3c261cd
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.cc
@@ -0,0 +1,132 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft04/ragged_range.h"
+#include <string>
+#include <vector>
+#include "src/litert/kernel_registry.h"
+#include "schema/inner/model_generated.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_RaggedRange;
+
+namespace mindspore::kernel {
+
+int RaggedRangeDSPKernel::CheckSpecs() {
+  // inputs: starts, limits, deltas; outputs: splits, values
+  if (in_tensors_.size() != 3 || out_tensors_.size() != 2) {
+    MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size()
+                    << ", out: " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int RaggedRangeDSPKernel::Prepare() { return RET_OK; }
+
+int RaggedRangeDSPKernel::CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar) {
+  if (rows == nullptr || starts_scalar == nullptr || limits_scalar == nullptr || deltas_scalar == nullptr) {
+    return RET_ERROR;
+  }
+  const auto &s0 = in_tensors_[0]->shape();
+  const auto &s1 = in_tensors_[1]->shape();
+  const auto &s2 = in_tensors_[2]->shape();
+  *starts_scalar = s0.empty();
+  *limits_scalar = s1.empty();
+  *deltas_scalar = s2.empty();
+  int non_scalar_rows = -1;
+  if (!*starts_scalar) non_scalar_rows = s0[0];
+  if (!*limits_scalar) {
+    if (non_scalar_rows == -1) non_scalar_rows = s1[0];
+    if (non_scalar_rows != s1[0]) return RET_ERROR;
+  }
+  if (!*deltas_scalar) {
+    if (non_scalar_rows == -1) non_scalar_rows = s2[0];
+    if (non_scalar_rows != s2[0]) return RET_ERROR;
+  }
+  *rows = (non_scalar_rows == -1) ? 1 : non_scalar_rows;
+  return RET_OK;
+}
+
+int RaggedRangeDSPKernel::RunFp32() {
+  kernel_name_ = "fp_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunFp16() {
+  kernel_name_ = "hp_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt32() {
+  kernel_name_ = "i32_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt16() {
+  kernel_name_ = "i16_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::Run() {
+  int rows = 0;
+  bool starts_scalar = false, limits_scalar = false, deltas_scalar = false;
+  int ret = CalcRows(&rows, &starts_scalar, &limits_scalar, &deltas_scalar);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "RaggedRange rows check failed.";
+    return RET_ERROR;
+  }
+
+  auto allocator = dsp_runtime_->GetAllocator();
+  // device pointers for inputs/outputs
+  uint64_t starts_dev = allocator->GetDeviceMemPtr(in_tensors_[0]->data());
+  uint64_t limits_dev = allocator->GetDeviceMemPtr(in_tensors_[1]->data());
+  uint64_t deltas_dev = allocator->GetDeviceMemPtr(in_tensors_[2]->data());
+
+  // outputs: [0] splits (int32), [1] values (same type as inputs)
+  uint64_t splits_dev = allocator->GetDeviceMemPtr(out_tensors_[0]->data());
+  uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data());
+
+  // Note: s-variant core mask passed as separate arg by runtime; do not include in args.
+  // Arg order: starts, limits, deltas, range_count, values, splits
+  SetKernelArg({starts_dev, limits_dev, deltas_dev, static_cast<uint64_t>(rows), values_dev, splits_dev});
+
+  auto out_dt = out_tensors_[1]->data_type();
+  switch (out_dt) {
+    case kNumberTypeFloat32:
+      return RunFp32();
+    case kNumberTypeFloat16:
+      return RunFp16();
+    case kNumberTypeInt32:
+      return RunInt32();
+    case kNumberTypeInt16:
+      return RunInt16();
+    default:
+      MS_LOG(ERROR) << "RaggedRange unsupported output dtype: " << static_cast<int>(out_dt);
+      return RET_ERROR;
+  }
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c177940579d646191628b53454a98c69c16a8c8
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/ragged_range.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_
+
+#include <cstdint>
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+class RaggedRangeDSPKernel : public DSPKernel {
+ public:
+  using DSPKernel::DSPKernel;
+  ~RaggedRangeDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;
+  int Run() override;
+
+ private:
+  int RunFp32();
+  int RunFp16();
+  int RunInt32();
+  int RunInt16();
+
+  int CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar);
+
+ private:
+  std::string kernel_name_;
+  uint64_t core_mask_{0xF};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_RAGGED_RANGE_H_
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffb2966f56ad29aa5463153a8a2412fd4ea22fc7
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.cc
@@ -0,0 +1,141 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft78/ragged_range.h"
+#include <cstring>
+#include <string>
+#include "src/litert/kernel_registry.h"
+#include "schema/inner/model_generated.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_RaggedRange;
+
+namespace mindspore::kernel {
+int RaggedRangeDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != 3 || out_tensors_.size() != 2) {
+    MS_LOG(WARNING) << "RaggedRange unexpected io sizes, in: " << in_tensors_.size()
+                    << ", out: " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int RaggedRangeDSPKernel::Prepare() { return RET_OK; }
+
+int RaggedRangeDSPKernel::CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar) {
+  if (rows == nullptr || starts_scalar == nullptr || limits_scalar == nullptr || deltas_scalar == nullptr) {
+    return RET_ERROR;
+  }
+  const auto &s0 = in_tensors_[0]->shape();
+  const auto &s1 = in_tensors_[1]->shape();
+  const auto &s2 = in_tensors_[2]->shape();
+  *starts_scalar = s0.empty();
+  *limits_scalar = s1.empty();
+  *deltas_scalar = s2.empty();
+  int non_scalar_rows = -1;
+  if (!*starts_scalar) non_scalar_rows = s0[0];
+  if (!*limits_scalar) {
+    if (non_scalar_rows == -1) {
+      non_scalar_rows = s1[0];
+    } else if (non_scalar_rows != s1[0]) {
+      return RET_ERROR;
+    }
+  }
+  if (!*deltas_scalar) {
+    if (non_scalar_rows == -1) {
+      non_scalar_rows = s2[0];
+    } else if (non_scalar_rows != s2[0]) {
+      return RET_ERROR;
+    }
+  }
+  *rows = (non_scalar_rows == -1) ? 1 : non_scalar_rows;
+  return RET_OK;
+}
+
+int RaggedRangeDSPKernel::RunFp32() {
+  kernel_name_ = "fp_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunFp64() {
+  kernel_name_ = "dp_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt32() {
+  kernel_name_ = "i32_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt16() {
+  kernel_name_ = "i16_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::RunInt8() {
+  kernel_name_ = "i8_raggedrange_s";
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int RaggedRangeDSPKernel::Run() {
+  int rows = 0;
+  bool starts_scalar = false;
+  bool limits_scalar = false;
+  bool deltas_scalar = false;
+  auto ret = CalcRows(&rows, &starts_scalar, &limits_scalar, &deltas_scalar);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "RaggedRange rows check failed.";
+    return RET_ERROR;
+  }
+
+  auto allocator = dsp_runtime_->GetAllocator();
+  uint64_t starts_dev = allocator->GetDeviceMemPtr(in_tensors_[0]->data());
+  uint64_t limits_dev = allocator->GetDeviceMemPtr(in_tensors_[1]->data());
+  uint64_t deltas_dev = allocator->GetDeviceMemPtr(in_tensors_[2]->data());
+  uint64_t splits_dev = allocator->GetDeviceMemPtr(out_tensors_[0]->data());
+  uint64_t values_dev = allocator->GetDeviceMemPtr(out_tensors_[1]->data());
+  uint64_t rows_hex = 0;
+  std::memcpy(&rows_hex, &rows, sizeof(int));
+
+  SetKernelArg({starts_dev, limits_dev, deltas_dev, rows_hex, values_dev, splits_dev});
+
+  auto out_dt = out_tensors_[1]->data_type();
+  switch (out_dt) {
+    case kNumberTypeFloat32:
+      return RunFp32();
+    case kNumberTypeFloat64:
+      return RunFp64();
+    case kNumberTypeInt32:
+      return RunInt32();
+    case kNumberTypeInt16:
+      return RunInt16();
+    case kNumberTypeInt8:
+      return RunInt8();
+    default:
+      MS_LOG(ERROR) << "RaggedRange unsupported output dtype: " << static_cast<int>(out_dt);
+      return RET_ERROR;
+  }
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat64, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt8, PrimitiveType_RaggedRange, DSPKernelCreator<RaggedRangeDSPKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ce03e7616da34232be2ef852b2b99caefc1efc8
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/ragged_range.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_
+
+#include <string>
+#include <vector>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+class RaggedRangeDSPKernel : public DSPKernel {
+ public:
+  RaggedRangeDSPKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : DSPKernel(parameter, inputs, outputs, ctx) {}
+  ~RaggedRangeDSPKernel() override = default;
+
+  int CheckSpecs() override;
+  int Prepare() override;
+  int Run() override;
+
+ private:
+  int RunFp32();
+  int RunFp64();
+  int RunInt32();
+  int RunInt16();
+  int RunInt8();
+
+  int CalcRows(int *rows, bool *starts_scalar, bool *limits_scalar, bool *deltas_scalar);
+
+  std::string kernel_name_;
+  uint64_t core_mask_{0xff};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_RAGGED_RANGE_H_
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..858718b7b79a255a42281bea4f6af05cb11fa00d
--- /dev/null
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/ragged_range_tests.cc
@@ -0,0 +1,642 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#include +#include +#include +#include +#include "ut/src/runtime/kernel/dsp/dsp_test.h" +#include "include/api/context.h" +#include "include/api/data_type.h" +#include "include/api/model.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel/dsp/dsp_subgraph.h" +#include "src/litert/kernel_registry.h" + +namespace mindspore::lite::dsp::test { + +class TestDSP_RaggedRange : public DSPCommonTest {}; + +// fp16 helpers (consistent with other tests) +typedef uint16_t float16; +static inline float fp16_to_fp32(float16 h) { + uint32_t sign = (h & 0x8000) << 16; + uint32_t exp = (h & 0x7C00) >> 10; + uint32_t frac = (h & 0x03FF); + uint32_t f_exp, f_frac; + if (exp == 0) { + if (frac == 0) { + f_exp = 0; + f_frac = 0; + } else { + int shift = 0; + while ((frac & 0x0200) == 0) { + frac <<= 1; + ++shift; + } + frac &= 0x03FF; + f_exp = 127 - 15 - shift; + f_frac = frac << 13; + } + } else if (exp == 0x1F) { + f_exp = 255; + f_frac = frac << 13; + } else { + f_exp = exp - 15 + 127; + f_frac = frac << 13; + } + uint32_t f_bits = sign | (f_exp << 23) | f_frac; + float result; + std::memcpy(&result, &f_bits, sizeof(result)); + return result; +} +[[maybe_unused]] static inline float16 fp32_to_fp16(float v) { + uint32_t bits; + std::memcpy(&bits, &v, sizeof(bits)); + uint32_t sign = (bits >> 31) & 0x1; + int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15; + uint32_t mantissa = bits & 0x007FFFFF; + float16 result; + if (exponent <= 0) { + if (exponent < -10) { + result = static_cast(sign << 15); + } else { + mantissa |= 0x00800000; + int shift = 14 - exponent; + uint32_t mantissa_shifted = mantissa >> shift; + uint32_t remainder = mantissa & ((1U << shift) - 1); + if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) { + mantissa_shifted++; + } + result = static_cast((sign << 15) | (mantissa_shifted & 0x3FF)); + } + } else if (exponent == 0xFF - 127 + 15) { + result = + (mantissa == 0) 
? static_cast((sign << 15) | 0x7C00) : static_cast((sign << 15) | 0x7E00); + } else if (exponent > 30) { + result = static_cast((sign << 15) | 0x7C00); + } else { + uint32_t mantissa_rounded = mantissa >> 13; + uint32_t remainder = mantissa & 0x1FFF; + if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) { + mantissa_rounded++; + if (mantissa_rounded == 0x400) { + mantissa_rounded = 0; + exponent++; + if (exponent > 30) { + return static_cast((sign << 15) | 0x7C00); + } + } + } + result = static_cast((sign << 15) | (static_cast(exponent) << 10) | (mantissa_rounded & 0x3FF)); + } + return result; +} + +TEST_F(TestDSP_RaggedRange, RaggedRange_Fp32) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + // Larger dataset: rows=5 + // starts=[0,10,-5,100,7], limits=[50,60,5,110,27], deltas=[1,2,3,1,4] + std::vector vec5 = {5}; + auto t_starts = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeFloat32, vec5, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + auto starts_data = reinterpret_cast(t_starts->MutableData()); + auto limits_data = reinterpret_cast(t_limits->MutableData()); + auto deltas_data = reinterpret_cast(t_deltas->MutableData()); + float starts_host[5] = {0.f, 10.f, -5.f, 100.f, 7.f}; + float limits_host[5] = {50.f, 60.f, 5.f, 110.f, 27.f}; + float deltas_host[5] = {1.f, 2.f, 3.f, 1.f, 4.f}; + std::memcpy(starts_data, starts_host, sizeof(starts_host)); + std::memcpy(limits_data, limits_host, sizeof(limits_host)); + std::memcpy(deltas_data, deltas_host, sizeof(deltas_host)); + + // outputs (splits size rows+1, values computed 
below) + auto t_splits = new lite::Tensor(kNumberTypeInt32, {6}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + // rough upper bound for values, we'll only compare first computed_len elements + auto t_values = new lite::Tensor(kNumberTypeFloat32, {200}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 6, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 200, 0.0f); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + // build expected + std::vector expect_splits(6, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 5; ++r) { + expect_splits[r] = acc; + for (float v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(v); + } + acc = static_cast(expect_values.size()); + } + expect_splits[5] = acc; + + // compare splits + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 6)); + // compare first acc values + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), acc)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} + +TEST_F(TestDSP_RaggedRange, RaggedRange_Int32) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + // Larger dataset: 
rows=4 + std::vector vec4 = {4}; + auto t_starts = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeInt32, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + auto starts_data = reinterpret_cast(t_starts->MutableData()); + auto limits_data = reinterpret_cast(t_limits->MutableData()); + auto deltas_data = reinterpret_cast(t_deltas->MutableData()); + int32_t starts_host[4] = {0, -100, 5, 1000}; + int32_t limits_host[4] = {200, -50, 50, 1010}; + int32_t deltas_host[4] = {2, 5, 3, 1}; + std::memcpy(starts_data, starts_host, sizeof(starts_host)); + std::memcpy(limits_data, limits_host, sizeof(limits_host)); + std::memcpy(deltas_data, deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeInt32, {300}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 5, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 300, 0); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + 
auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(5, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 4; ++r) { + expect_splits[r] = acc; + for (int32_t v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(v); + } + acc = static_cast(expect_values.size()); + } + expect_splits[4] = acc; + + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 5)); + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), acc)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} + +#ifdef SUPPORT_FT04 +TEST_F(TestDSP_RaggedRange, RaggedRange_Fp16) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + // Larger dataset with fp32 inputs and fp16 outputs + std::vector vec3 = {3}; + auto t_starts = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeFloat32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + auto starts_f = reinterpret_cast(t_starts->MutableData()); + auto limits_f = reinterpret_cast(t_limits->MutableData()); + auto deltas_f = reinterpret_cast(t_deltas->MutableData()); + float starts_host[3] = {-10.f, 0.f, 1.5f}; + float limits_host[3] = {0.f, 50.f, 6.f}; + float deltas_host[3] = {0.5f, 1.f, 1.25f}; + std::memcpy(starts_f, starts_host, sizeof(starts_host)); + std::memcpy(limits_f, limits_host, sizeof(limits_host)); + std::memcpy(deltas_f, deltas_host, 
sizeof(deltas_host)); + + // outputs + auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeFloat16, {200}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 4, 0); + std::memset(t_values->MutableData(), 0, 200 * sizeof(uint16_t)); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + // expected + std::vector expect_splits(4, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 3; ++r) { + expect_splits[r] = acc; + for (float v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(v); + } + acc = static_cast(expect_values.size()); + } + expect_splits[3] = acc; + + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 4)); + + auto out_fp16 = reinterpret_cast(outputs_[1]->MutableData()); + std::vector actual(acc); + for (int i = 0; i < acc; ++i) actual[i] = fp16_to_fp32(static_cast(out_fp16[i])); + std::vector correct(acc); + for (int i = 0; i < acc; ++i) correct[i] = fp16_to_fp32(fp32_to_fp16(expect_values[i])); + ASSERT_EQ(0, CompareOutputData(actual.data(), correct.data(), acc, 1e-3)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete 
kernel; +} +#endif + +#ifdef SUPPORT_FT04 +TEST_F(TestDSP_RaggedRange, RaggedRange_Int16) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + // Larger dataset with int32 inputs and int16 outputs + std::vector vec3 = {3}; + auto t_starts = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeInt32, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + auto starts_d32 = reinterpret_cast(t_starts->MutableData()); + auto limits_d32 = reinterpret_cast(t_limits->MutableData()); + auto deltas_d32 = reinterpret_cast(t_deltas->MutableData()); + int32_t starts_host[3] = {-10, 0, 100}; + int32_t limits_host[3] = {10, 100, 110}; + int32_t deltas_host[3] = {2, 3, 1}; + std::memcpy(starts_d32, starts_host, sizeof(starts_host)); + std::memcpy(limits_d32, limits_host, sizeof(limits_host)); + std::memcpy(deltas_d32, deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeInt16, {300}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 4, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 300, 0); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = 
KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(4, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 3; ++r) { + expect_splits[r] = acc; + for (int32_t v = starts_host[r]; v < limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(static_cast(v)); + } + acc = static_cast(expect_values.size()); + } + expect_splits[3] = acc; + + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 4)); + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), acc)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} +#endif + +#ifdef SUPPORT_FT78 +TEST_F(TestDSP_RaggedRange, RaggedRange_Int16_FT78) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + std::vector vec3 = {3}; + auto t_starts = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeInt16, vec3, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + int16_t starts_host[3] = {-12, 0, 90}; + int16_t limits_host[3] = {-2, 30, 100}; + int16_t deltas_host[3] = {3, 5, 2}; + std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host)); + std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host)); + 
std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {4}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeInt16, {256}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 4, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 256, static_cast(0)); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(4, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 3; ++r) { + expect_splits[r] = acc; + for (int v = static_cast(starts_host[r]); + deltas_host[r] > 0 ? 
v < static_cast(limits_host[r]) : v > static_cast(limits_host[r]); + v += static_cast(deltas_host[r])) { + expect_values.push_back(static_cast(v)); + } + acc = static_cast(expect_values.size()); + } + expect_splits[3] = acc; + + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[0]->MutableData()), expect_splits.data(), 4)); + ASSERT_EQ(0, CompareOutputData(reinterpret_cast(outputs_[1]->MutableData()), expect_values.data(), acc)); + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} +#endif + +#ifdef SUPPORT_FT78 +TEST_F(TestDSP_RaggedRange, RaggedRange_Fp64) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + + std::vector vec4 = {4}; + auto t_starts = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeFloat64, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + double starts_host[4] = {-5.0, -5.0, -5.0, -5.0}; + double limits_host[4] = {0.0, 0.0, 0.0, 0.0}; + double deltas_host[4] = {0.25, 0.25, 0.25, 0.25}; + std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host)); + std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host)); + std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeFloat64, {512}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + 
outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 5, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 512, 0.0); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat64, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(5, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 4; ++r) { + expect_splits[r] = acc; + for (double v = starts_host[r]; deltas_host[r] > 0 ? v < limits_host[r] : v > limits_host[r]; v += deltas_host[r]) { + expect_values.push_back(v); + } + acc = static_cast(expect_values.size()); + } + expect_splits[4] = acc; + + auto actual_splits_ptr = reinterpret_cast(outputs_[0]->MutableData()); + std::vector actual_splits(actual_splits_ptr, actual_splits_ptr + 5); + for (size_t i = 0; i < actual_splits.size(); ++i) { + EXPECT_EQ(expect_splits[i], actual_splits[i]) << "split index " << i; + } + + auto actual_values_ptr = reinterpret_cast(outputs_[1]->MutableData()); + std::vector actual_values(actual_values_ptr, actual_values_ptr + acc); + for (int i = 0; i < acc; ++i) { + EXPECT_NEAR(expect_values[i], actual_values[i], 1e-6) << "value index " << i; + } + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} + +TEST_F(TestDSP_RaggedRange, RaggedRange_Int8) { + InitDSPRuntime(); + std::vector inputs_; + std::vector outputs_; + + std::vector vec4 = {4}; + auto t_starts = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + 
t_starts->MallocData(allocator_); + auto t_limits = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_limits->MallocData(allocator_); + auto t_deltas = new lite::Tensor(kNumberTypeInt8, vec4, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_deltas->MallocData(allocator_); + inputs_.push_back(t_starts); + inputs_.push_back(t_limits); + inputs_.push_back(t_deltas); + + int8_t starts_host[4] = {-20, -10, 5, 100}; + int8_t limits_host[4] = {-5, 10, 20, 110}; + int8_t deltas_host[4] = {3, 4, 5, 1}; + std::memcpy(t_starts->MutableData(), starts_host, sizeof(starts_host)); + std::memcpy(t_limits->MutableData(), limits_host, sizeof(limits_host)); + std::memcpy(t_deltas->MutableData(), deltas_host, sizeof(deltas_host)); + + auto t_splits = new lite::Tensor(kNumberTypeInt32, {5}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_splits->MallocData(allocator_); + auto t_values = new lite::Tensor(kNumberTypeInt8, {256}, mindspore::NHWC, lite::Category::CONST_TENSOR); + t_values->MallocData(allocator_); + outputs_.push_back(t_splits); + outputs_.push_back(t_values); + + std::fill_n(reinterpret_cast(t_splits->MutableData()), 5, 0); + std::fill_n(reinterpret_cast(t_values->MutableData()), 256, static_cast(0)); + + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt8, NHWC, schema::PrimitiveType_RaggedRange}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + auto *param = new OpParameter(); + param->type_ = static_cast(schema::PrimitiveType_RaggedRange); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + auto ret = kernel->Prepare(); + EXPECT_EQ(0, ret); + ret = kernel->Run(); + EXPECT_EQ(0, ret); + + std::vector expect_splits(5, 0); + std::vector expect_values; + int32_t acc = 0; + for (int r = 0; r < 4; ++r) { + expect_splits[r] = acc; + for (int v = static_cast(starts_host[r]); + 
deltas_host[r] > 0 ? v < static_cast(limits_host[r]) : v > static_cast(limits_host[r]); + v += static_cast(deltas_host[r])) { + expect_values.push_back(static_cast(v)); + } + acc = static_cast(expect_values.size()); + } + expect_splits[4] = acc; + + auto actual_splits_ptr = reinterpret_cast(outputs_[0]->MutableData()); + std::vector actual_splits(actual_splits_ptr, actual_splits_ptr + 5); + for (size_t i = 0; i < actual_splits.size(); ++i) { + EXPECT_EQ(expect_splits[i], actual_splits[i]) << "split index " << i; + } + + auto actual_values_ptr = reinterpret_cast(outputs_[1]->MutableData()); + std::vector actual_values(actual_values_ptr, actual_values_ptr + acc); + for (int i = 0; i < acc; ++i) { + EXPECT_EQ(expect_values[i], actual_values[i]) << "value index " << i; + } + + UninitDSPRuntime(); + delete ctx; + for (auto t : inputs_) delete t; + for (auto t : outputs_) delete t; + delete kernel; +} +#endif + +} // namespace mindspore::lite::dsp::test