diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/broadcastto.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/broadcastto.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd5481c883bd32578815da0490acd428f0769e1a
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/broadcastto.cc
@@ -0,0 +1,178 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft04/broadcastto.h"
+#include <cstring>
+#include <string>
+#include <vector>
+#include "src/litert/kernel/cpu/nnacl_c/broadcast_to_parameter.h"
+#include "src/litert/kernel_registry.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_BroadcastTo;
+
+namespace mindspore::kernel {
+namespace {
+constexpr size_t kInputTensorSize = 1;
+constexpr size_t kOutputTensorSize = 1;
+}  // namespace
+
+int BroadcastToDSPKernel::Prepare() { return RET_OK; }
+
+// Validates tensor counts, ranks, and the presence of the op parameter.
+int BroadcastToDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != kInputTensorSize) {
+    MS_LOG(WARNING) << "BroadcastTo expects one input, got: " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != kOutputTensorSize) {
+    MS_LOG(WARNING) << "BroadcastTo expects one output, got: " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  const auto input_rank = in_tensors_[0]->shape().size();
+  const auto output_rank = out_tensors_[0]->shape().size();
+  if (input_rank == 0 || output_rank == 0) {
+    MS_LOG(WARNING) << "BroadcastTo requires non-empty input/output shapes.";
+    return RET_ERROR;
+  }
+  if (input_rank > MAX_SHAPE_SIZE || output_rank > MAX_SHAPE_SIZE) {
+    MS_LOG(WARNING) << "BroadcastTo rank exceeds limit, input: " << input_rank << ", output: " << output_rank;
+    return RET_ERROR;
+  }
+  auto *param = reinterpret_cast<BroadcastToParameter *>(op_parameter_);
+  if (param == nullptr) {
+    MS_LOG(WARNING) << "BroadcastTo parameter is null.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int BroadcastToDSPKernel::BroadcastToRunFp32() {
+  kernel_name_ = "fp_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunFp16() {
+  kernel_name_ = "hp_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunInt16() {
+  kernel_name_ = "i16_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunInt32() {
+  kernel_name_ = "i32_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunComplex64() {
+  kernel_name_ = "c64_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+// Stages shape metadata into device-visible buffers, dispatches the dtype-specific
+// DSP kernel, and releases the staging buffers regardless of outcome.
+int BroadcastToDSPKernel::Run() {
+  int ret = RET_ERROR;
+  auto allocator = dsp_runtime_->GetAllocator();
+
+  auto *input_tensor = in_tensors_[0];
+  auto *output_tensor = out_tensors_[0];
+
+  const auto &in_shape_vec = input_tensor->shape();
+  const auto &out_shape_vec = output_tensor->shape();
+  const size_t input_shape_size = in_shape_vec.size();
+  const size_t output_shape_size = out_shape_vec.size();
+
+  int32_t in_shape_host[MAX_SHAPE_SIZE] = {0};
+  for (size_t i = 0; i < input_shape_size; ++i) {
+    in_shape_host[i] = static_cast<int32_t>(in_shape_vec[i]);
+  }
+
+  int32_t out_shape_host[MAX_SHAPE_SIZE] = {0};
+  for (size_t i = 0; i < output_shape_size; ++i) {
+    out_shape_host[i] = static_cast<int32_t>(out_shape_vec[i]);
+  }
+
+  void *in_shape_buf = allocator->Malloc(sizeof(int32_t) * input_shape_size);
+  void *out_shape_buf = allocator->Malloc(sizeof(int32_t) * output_shape_size);
+
+  if (in_shape_buf == nullptr || out_shape_buf == nullptr) {
+    allocator->Free(in_shape_buf);
+    allocator->Free(out_shape_buf);
+    MS_LOG(ERROR) << "BroadcastTo alloc shape buffer failed.";
+    return RET_ERROR;
+  }
+
+  std::memcpy(in_shape_buf, in_shape_host, sizeof(int32_t) * input_shape_size);
+  std::memcpy(out_shape_buf, out_shape_host, sizeof(int32_t) * output_shape_size);
+
+  uint64_t in_shape_dev = allocator->GetDeviceMemPtr(in_shape_buf);
+  uint64_t out_shape_dev = allocator->GetDeviceMemPtr(out_shape_buf);
+
+  uint64_t input_dev = allocator->GetDeviceMemPtr(input_tensor->data());
+  uint64_t output_dev = allocator->GetDeviceMemPtr(output_tensor->data());
+
+  auto data_type = input_tensor->data_type();
+  size_t data_size = lite::DataTypeSize(data_type);
+  if (data_size == 0) {
+    allocator->Free(in_shape_buf);
+    allocator->Free(out_shape_buf);
+    MS_LOG(ERROR) << "BroadcastTo unsupported dtype: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  SetKernelArg({input_dev, output_dev, in_shape_dev, static_cast<uint64_t>(input_shape_size), out_shape_dev,
+                static_cast<uint64_t>(output_shape_size), static_cast<uint64_t>(data_size)});
+
+  if (data_type == kNumberTypeFloat32) {
+    ret = BroadcastToRunFp32();
+  } else if (data_type == kNumberTypeFloat16) {
+    ret = BroadcastToRunFp16();
+  } else if (data_type == kNumberTypeInt16) {
+    ret = BroadcastToRunInt16();
+  } else if (data_type == kNumberTypeInt32) {
+    ret = BroadcastToRunInt32();
+  } else if (data_type == kNumberTypeComplex64) {
+    ret = BroadcastToRunComplex64();
+  } else {
+    MS_LOG(ERROR) << "BroadcastTo unsupported dtype: " << static_cast<int>(data_type);
+  }
+
+  allocator->Free(in_shape_buf);
+  allocator->Free(out_shape_buf);
+
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << this->name() << " Run failed!";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/broadcastto.h b/mindspore-lite/src/litert/kernel/dsp/ft04/broadcastto.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3bdb9300281c2b2062b9bd7f17a787c922b56c2
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/broadcastto.h
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_BROADCASTTO_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_BROADCASTTO_H_
+
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+// FT04 DSP implementation of the BroadcastTo operator.
+class BroadcastToDSPKernel : public DSPKernel {
+ public:
+  using DSPKernel::DSPKernel;
+
+  ~BroadcastToDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;
+  int Run() override;
+
+  int BroadcastToRunFp32();
+  int BroadcastToRunFp16();
+  int BroadcastToRunInt16();
+  int BroadcastToRunInt32();
+  int BroadcastToRunComplex64();
+
+ private:
+  std::string kernel_name_;
+  uint64_t core_mask_{0};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_BROADCASTTO_H_
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/broadcastto.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/broadcastto.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac9cfc29a13ea12421de0880b2c3a67083aa1de6
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/broadcastto.cc
@@ -0,0 +1,202 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft78/broadcastto.h"
+#include <cstring>
+#include <string>
+#include <vector>
+#include "src/litert/kernel/cpu/nnacl_c/broadcast_to_parameter.h"
+#include "src/litert/kernel_registry.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_BroadcastTo;
+
+namespace mindspore::kernel {
+namespace {
+constexpr size_t kInputTensorSize = 1;
+constexpr size_t kOutputTensorSize = 1;
+}  // namespace
+
+int BroadcastToDSPKernel::Prepare() { return RET_OK; }
+
+// Validates tensor counts, ranks, and the presence of the op parameter.
+int BroadcastToDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != kInputTensorSize) {
+    MS_LOG(WARNING) << "BroadcastTo expects one input, got: " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != kOutputTensorSize) {
+    MS_LOG(WARNING) << "BroadcastTo expects one output, got: " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  const auto input_rank = in_tensors_[0]->shape().size();
+  const auto output_rank = out_tensors_[0]->shape().size();
+  if (input_rank == 0 || output_rank == 0) {
+    MS_LOG(WARNING) << "BroadcastTo requires non-empty input/output shapes.";
+    return RET_ERROR;
+  }
+  if (input_rank > MAX_SHAPE_SIZE || output_rank > MAX_SHAPE_SIZE) {
+    MS_LOG(WARNING) << "BroadcastTo rank exceeds limit, input: " << input_rank << ", output: " << output_rank;
+    return RET_ERROR;
+  }
+  auto *param = reinterpret_cast<BroadcastToParameter *>(op_parameter_);
+  if (param == nullptr) {
+    MS_LOG(WARNING) << "BroadcastTo parameter is null.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int BroadcastToDSPKernel::BroadcastToRunFp32() {
+  kernel_name_ = "fp_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunFp64() {
+  kernel_name_ = "dp_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunInt8() {
+  kernel_name_ = "i8_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunInt16() {
+  kernel_name_ = "i16_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunInt32() {
+  kernel_name_ = "i32_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunComplex64() {
+  kernel_name_ = "c64_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int BroadcastToDSPKernel::BroadcastToRunComplex128() {
+  kernel_name_ = "c128_broadcastto_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+// Stages shape metadata (plus a rank pair for the FT78 kernel ABI) into
+// device-visible buffers, dispatches the dtype-specific DSP kernel, and
+// releases the staging buffers regardless of outcome.
+int BroadcastToDSPKernel::Run() {
+  int ret = RET_ERROR;
+  auto allocator = dsp_runtime_->GetAllocator();
+
+  auto *input_tensor = in_tensors_[0];
+  auto *output_tensor = out_tensors_[0];
+
+  const auto &in_shape_vec = input_tensor->shape();
+  const auto &out_shape_vec = output_tensor->shape();
+  const size_t input_shape_size = in_shape_vec.size();
+  const size_t output_shape_size = out_shape_vec.size();
+
+  int32_t in_shape_host[MAX_SHAPE_SIZE] = {0};
+  for (size_t i = 0; i < input_shape_size; ++i) {
+    in_shape_host[i] = static_cast<int32_t>(in_shape_vec[i]);
+  }
+
+  int32_t out_shape_host[MAX_SHAPE_SIZE] = {0};
+  for (size_t i = 0; i < output_shape_size; ++i) {
+    out_shape_host[i] = static_cast<int32_t>(out_shape_vec[i]);
+  }
+
+  void *in_shape_buf = allocator->Malloc(sizeof(int32_t) * input_shape_size);
+  void *out_shape_buf = allocator->Malloc(sizeof(int32_t) * output_shape_size);
+  void *int_addr_buf = allocator->Malloc(sizeof(int32_t) * 2);
+  int32_t int_addr_tmp[2] = {static_cast<int32_t>(input_shape_size), static_cast<int32_t>(output_shape_size)};
+
+  if (in_shape_buf == nullptr || out_shape_buf == nullptr || int_addr_buf == nullptr) {
+    allocator->Free(in_shape_buf);
+    allocator->Free(out_shape_buf);
+    allocator->Free(int_addr_buf);
+    MS_LOG(ERROR) << "BroadcastTo alloc shape buffer failed.";
+    return RET_ERROR;
+  }
+
+  std::memcpy(in_shape_buf, in_shape_host, sizeof(int32_t) * input_shape_size);
+  std::memcpy(out_shape_buf, out_shape_host, sizeof(int32_t) * output_shape_size);
+  std::memcpy(int_addr_buf, int_addr_tmp, sizeof(int_addr_tmp));
+
+  uint64_t in_shape_dev = allocator->GetDeviceMemPtr(in_shape_buf);
+  uint64_t out_shape_dev = allocator->GetDeviceMemPtr(out_shape_buf);
+  uint64_t int_addr_dev = allocator->GetDeviceMemPtr(int_addr_buf);
+
+  uint64_t input_dev = allocator->GetDeviceMemPtr(input_tensor->data());
+  uint64_t output_dev = allocator->GetDeviceMemPtr(output_tensor->data());
+
+  auto data_type = input_tensor->data_type();
+  size_t data_size = lite::DataTypeSize(data_type);
+  if (data_size == 0) {
+    allocator->Free(in_shape_buf);
+    allocator->Free(out_shape_buf);
+    allocator->Free(int_addr_buf);
+    MS_LOG(ERROR) << "BroadcastTo unsupported dtype: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  SetKernelArg({input_dev, output_dev, in_shape_dev, out_shape_dev, int_addr_dev, static_cast<uint64_t>(data_size)});
+
+  if (data_type == kNumberTypeFloat32) {
+    ret = BroadcastToRunFp32();
+  } else if (data_type == kNumberTypeFloat64) {
+    ret = BroadcastToRunFp64();
+  } else if (data_type == kNumberTypeInt8) {
+    ret = BroadcastToRunInt8();
+  } else if (data_type == kNumberTypeInt16) {
+    ret = BroadcastToRunInt16();
+  } else if (data_type == kNumberTypeInt32) {
+    ret = BroadcastToRunInt32();
+  } else if (data_type == kNumberTypeComplex64) {
+    ret = BroadcastToRunComplex64();
+  } else if (data_type == kNumberTypeComplex128) {
+    ret = BroadcastToRunComplex128();
+  } else {
+    MS_LOG(ERROR) << "BroadcastTo unsupported dtype: " << static_cast<int>(data_type);
+  }
+
+  allocator->Free(in_shape_buf);
+  allocator->Free(out_shape_buf);
+  allocator->Free(int_addr_buf);
+
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << this->name() << " Run failed!";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat64, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt8, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeComplex128, PrimitiveType_BroadcastTo, DSPKernelCreator<BroadcastToDSPKernel>)
+
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/broadcastto.h b/mindspore-lite/src/litert/kernel/dsp/ft78/broadcastto.h
new file mode 100644
index 0000000000000000000000000000000000000000..e117de7c75c566b085f0d53f123344b4cd8ad380
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/broadcastto.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_BROADCASTTO_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_BROADCASTTO_H_
+
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+// FT78 DSP implementation of the BroadcastTo operator.
+class BroadcastToDSPKernel : public DSPKernel {
+ public:
+  using DSPKernel::DSPKernel;
+
+  ~BroadcastToDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;
+  int Run() override;
+
+  int BroadcastToRunFp32();
+  int BroadcastToRunFp64();
+  int BroadcastToRunInt8();
+  int BroadcastToRunInt16();
+  int BroadcastToRunInt32();
+  int BroadcastToRunComplex64();
+  int BroadcastToRunComplex128();
+
+ private:
+  std::string kernel_name_;
+  uint64_t core_mask_{0};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_BROADCASTTO_H_
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/broadcastto_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/broadcastto_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cdc726a896c1a685d5baf1e1ddbd2e29ffd1b28c
--- /dev/null
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/broadcastto_tests.cc
@@ -0,0 +1,518 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "ut/src/runtime/kernel/dsp/dsp_test.h"
+#include "include/api/context.h"
+#include "include/api/data_type.h"
+#include "schema/inner/model_generated.h"
+#include "src/litert/kernel_registry.h"
+#include "src/litert/kernel/cpu/nnacl_c/broadcast_to_parameter.h"
+
+namespace mindspore::lite::dsp::test {
+namespace {
+// Use the same shape sets as main.c to ensure interface compatibility.
+constexpr std::array<int, 2> kCase0In = {1, 100};
+constexpr std::array<int, 2> kCase0Out = {100, 100};
+constexpr std::array<int, 2> kCase1In = {1, 10};  // NOTE: kept for parity with main.c even though dims differ.
+constexpr std::array<int, 2> kCase1Out = {10, 100};
+
+int64_t Accumulate(const std::vector<int> &shape) {
+  int64_t total = 1;
+  for (int dim : shape) {
+    total *= dim;
+  }
+  return total;
+}
+
+// Reference broadcast implementation matching main.c (broadcastto_c).
+template <typename T>
+void BroadcastToRef(const T *input, T *output, const std::vector<int> &input_shape,
+                    const std::vector<int> &output_shape, bool is_complex) {
+  size_t input_rank = input_shape.size();
+  size_t output_rank = output_shape.size();
+  std::vector<int> ext_in_shape(output_rank, 1);
+  size_t shape_gap = output_rank - input_rank;
+  for (size_t i = 0; i < input_rank; ++i) {
+    ext_in_shape[shape_gap + i] = input_shape[i];
+  }
+
+  // stride calculation
+  std::vector<int> out_stride(output_rank, 1);
+  std::vector<int> in_stride(output_rank, 1);
+  for (int i = static_cast<int>(output_rank) - 2; i >= 0; --i) {
+    out_stride[i] = out_stride[i + 1] * output_shape[i + 1];
+  }
+  in_stride[output_rank - 1] = (ext_in_shape[output_rank - 1] == 1) ? 0 : 1;
+  for (int i = static_cast<int>(output_rank) - 2; i >= 0; --i) {
+    in_stride[i] = (ext_in_shape[i] == 1) ? 0 : in_stride[i + 1] * ext_in_shape[i + 1];
+  }
+
+  int64_t out_elems = Accumulate(output_shape);
+  int factor = is_complex ? 2 : 1;
+  std::fill(output, output + out_elems * factor, static_cast<T>(0));
+
+  for (int64_t idx = 0; idx < out_elems; ++idx) {
+    int tmp = static_cast<int>(idx);
+    int in_offset = 0;
+    for (size_t axis = 0; axis < output_rank; ++axis) {
+      int pos = tmp / out_stride[axis];
+      in_offset += pos * in_stride[axis];
+      tmp %= out_stride[axis];
+    }
+    if (!is_complex) {
+      output[idx] = input[in_offset];
+    } else {
+      output[2 * idx] = input[2 * in_offset];
+      output[2 * idx + 1] = input[2 * in_offset + 1];
+    }
+  }
+}
+
+BroadcastToParameter *CreateParam(const std::vector<int> &out_shape) {
+  auto *param = reinterpret_cast<BroadcastToParameter *>(malloc(sizeof(BroadcastToParameter)));
+  if (param == nullptr) {
+    return nullptr;
+  }
+  std::memset(param, 0, sizeof(BroadcastToParameter));
+  param->op_parameter_.type_ = static_cast<int>(schema::PrimitiveType_BroadcastTo);
+  param->shape_size_ = out_shape.size();
+  for (size_t i = 0; i < out_shape.size(); ++i) {
+    param->shape_[i] = out_shape[i];
+  }
+  return param;
+}
+
+std::vector<int> ToVec(const std::array<int, 2> &shape) { return {shape[0], shape[1]}; }
+}  // namespace
+
+class TestDSP_BroadcastTo : public DSPCommonTest {};
+
+TEST_F(TestDSP_BroadcastTo, BroadcastTo_Fp32) {
+  InitDSPRuntime();
+  auto in_shape = ToVec(kCase0In);
+  auto out_shape = ToVec(kCase0Out);
+  int64_t in_elems = Accumulate(in_shape);
+  int64_t out_elems = Accumulate(out_shape);
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  auto *input = new lite::Tensor(kNumberTypeFloat32, in_shape, mindspore::NHWC, lite::Category::VAR);
+  input->MallocData(allocator_);
+  inputs.push_back(input);
+
+  auto *output = new lite::Tensor(kNumberTypeFloat32, out_shape, mindspore::NHWC, lite::Category::VAR);
+  output->MallocData(allocator_);
+  outputs.push_back(output);
+
+  for (int i = 0; i < in_elems; ++i) {
+    reinterpret_cast<float *>(input->MutableData())[i] = static_cast<float>(i + 1);
+  }
+
+  std::vector<float> expected(out_elems, 0.f);
+  BroadcastToRef(reinterpret_cast<float *>(input->MutableData()), expected.data(), in_shape, out_shape, false);
+
+  auto *ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = CreateParam(out_shape);
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_BroadcastTo};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  EXPECT_EQ(kernel->Prepare(), lite::RET_OK);
+  EXPECT_EQ(kernel->Run(), lite::RET_OK);
+
+  auto *out_data = reinterpret_cast<float *>(output->MutableData());
+  int cmp = CompareOutputData(out_data, expected.data(), out_elems, 1e-5);
+  ASSERT_EQ(0, cmp);
+
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete input;
+  delete output;
+}
+
+TEST_F(TestDSP_BroadcastTo, BroadcastTo_Int16) {
+  InitDSPRuntime();
+  auto in_shape = ToVec(kCase0In);
+  auto out_shape = ToVec(kCase0Out);
+  int64_t in_elems = Accumulate(in_shape);
+  int64_t out_elems = Accumulate(out_shape);
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  auto *input = new lite::Tensor(kNumberTypeInt16, in_shape, mindspore::NHWC, lite::Category::VAR);
+  input->MallocData(allocator_);
+  inputs.push_back(input);
+
+  auto *output = new lite::Tensor(kNumberTypeInt16, out_shape, mindspore::NHWC, lite::Category::VAR);
+  output->MallocData(allocator_);
+  outputs.push_back(output);
+
+  for (int i = 0; i < in_elems; ++i) {
+    reinterpret_cast<int16_t *>(input->MutableData())[i] = static_cast<int16_t>((i + 1) % 32760);
+  }
+
+  std::vector<int16_t> expected(out_elems, 0);
+  BroadcastToRef(reinterpret_cast<int16_t *>(input->MutableData()), expected.data(), in_shape, out_shape, false);
+
+  auto *ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = CreateParam(out_shape);
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_BroadcastTo};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  EXPECT_EQ(kernel->Prepare(), lite::RET_OK);
+  EXPECT_EQ(kernel->Run(), lite::RET_OK);
+
+  auto *out_data = reinterpret_cast<int16_t *>(output->MutableData());
+  int cmp = CompareOutputData(out_data, expected.data(), out_elems, 0.0f);
+  ASSERT_EQ(0, cmp);
+
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete input;
+  delete output;
+}
+
+TEST_F(TestDSP_BroadcastTo, BroadcastTo_Int32) {
+  InitDSPRuntime();
+  auto in_shape = ToVec(kCase0In);
+  auto out_shape = ToVec(kCase0Out);
+  int64_t in_elems = Accumulate(in_shape);
+  int64_t out_elems = Accumulate(out_shape);
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  auto *input = new lite::Tensor(kNumberTypeInt32, in_shape, mindspore::NHWC, lite::Category::VAR);
+  input->MallocData(allocator_);
+  inputs.push_back(input);
+
+  auto *output = new lite::Tensor(kNumberTypeInt32, out_shape, mindspore::NHWC, lite::Category::VAR);
+  output->MallocData(allocator_);
+  outputs.push_back(output);
+
+  for (int i = 0; i < in_elems; ++i) {
+    reinterpret_cast<int32_t *>(input->MutableData())[i] = i + 1;
+  }
+
+  std::vector<int32_t> expected(out_elems, 0);
+  BroadcastToRef(reinterpret_cast<int32_t *>(input->MutableData()), expected.data(), in_shape, out_shape, false);
+
+  auto *ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = CreateParam(out_shape);
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_BroadcastTo};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  EXPECT_EQ(kernel->Prepare(), lite::RET_OK);
+  EXPECT_EQ(kernel->Run(), lite::RET_OK);
+
+  auto *out_data = reinterpret_cast<int32_t *>(output->MutableData());
+  int cmp = CompareOutputData(out_data, expected.data(), out_elems, 0.0f);
+  ASSERT_EQ(0, cmp);
+
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete input;
+  delete output;
+}
+
+TEST_F(TestDSP_BroadcastTo, BroadcastTo_Complex64) {
+  InitDSPRuntime();
+  auto in_shape = ToVec(kCase0In);
+  auto out_shape = ToVec(kCase0Out);
+  int64_t in_elems = Accumulate(in_shape);
+  int64_t out_elems = Accumulate(out_shape);
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  auto *input = new lite::Tensor(kNumberTypeComplex64, in_shape, mindspore::NHWC, lite::Category::VAR);
+  input->MallocData(allocator_);
+  inputs.push_back(input);
+
+  auto *output = new lite::Tensor(kNumberTypeComplex64, out_shape, mindspore::NHWC, lite::Category::VAR);
+  output->MallocData(allocator_);
+  outputs.push_back(output);
+
+  auto *in_cplx = reinterpret_cast<float *>(input->MutableData());
+  for (int i = 0; i < in_elems; ++i) {
+    in_cplx[2 * i] = 0.5f * static_cast<float>(i + 1);
+    in_cplx[2 * i + 1] = 0.3f * static_cast<float>(i + 1);
+  }
+
+  std::vector<float> expected(out_elems * 2, 0.f);
+  BroadcastToRef(reinterpret_cast<float *>(input->MutableData()), expected.data(), in_shape, out_shape, true);
+
+  auto *ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = CreateParam(out_shape);
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC, schema::PrimitiveType_BroadcastTo};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  EXPECT_EQ(kernel->Prepare(), lite::RET_OK);
+  EXPECT_EQ(kernel->Run(), lite::RET_OK);
+
+  auto *out_cplx = reinterpret_cast<float *>(output->MutableData());
+  int cmp = CompareOutputData(out_cplx, expected.data(), out_elems * 2, 1e-5);
+  ASSERT_EQ(0, cmp);
+
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete input;
+  delete output;
+}
+
+#ifdef SUPPORT_FT04
+TEST_F(TestDSP_BroadcastTo, BroadcastTo_Fp16) {
+  InitDSPRuntime();
+  auto in_shape = ToVec(kCase0In);
+  auto out_shape = ToVec(kCase0Out);
+  int64_t in_elems = Accumulate(in_shape);
+  int64_t out_elems = Accumulate(out_shape);
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  auto *input = new lite::Tensor(kNumberTypeFloat16, in_shape, mindspore::NHWC, lite::Category::VAR);
+  input->MallocData(allocator_);
+  inputs.push_back(input);
+
+  auto *output = new lite::Tensor(kNumberTypeFloat16, out_shape, mindspore::NHWC, lite::Category::VAR);
+  output->MallocData(allocator_);
+  outputs.push_back(output);
+
+  auto *in_half = reinterpret_cast<uint16_t *>(input->MutableData());
+  for (int i = 0; i < in_elems; ++i) {
+    in_half[i] = fp32_to_fp16(static_cast<float>(i + 1) * 0.1f);
+  }
+
+  std::vector<uint16_t> expected_half(out_elems, 0);
+  BroadcastToRef(reinterpret_cast<uint16_t *>(input->MutableData()), expected_half.data(), in_shape, out_shape, false);
+  std::vector<float> expected(out_elems, 0.f);
+  for (int i = 0; i < out_elems; ++i) {
+    expected[i] = fp16_to_fp32(expected_half[i]);
+  }
+
+  auto *ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = CreateParam(out_shape);
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_BroadcastTo};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  EXPECT_EQ(kernel->Prepare(), lite::RET_OK);
+  EXPECT_EQ(kernel->Run(), lite::RET_OK);
+
+  auto *out_half = reinterpret_cast<uint16_t *>(output->MutableData());
+  std::vector<float> out_fp(out_elems, 0.f);
+  for (int i = 0; i < out_elems; ++i) {
+    out_fp[i] = fp16_to_fp32(out_half[i]);
+  }
+  int cmp = CompareOutputData(out_fp.data(), expected.data(), out_elems, 1e-3f);
+  ASSERT_EQ(0, cmp);
+
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete input;
+  delete output;
+}
+#endif  // SUPPORT_FT04
+
+#ifdef SUPPORT_FT78
+TEST_F(TestDSP_BroadcastTo, BroadcastTo_Fp64) {
+  InitDSPRuntime();
+  auto in_shape = ToVec(kCase0In);
+  auto out_shape = ToVec(kCase0Out);
+  int64_t in_elems = Accumulate(in_shape);
+  int64_t out_elems = Accumulate(out_shape);
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  auto *input = new lite::Tensor(kNumberTypeFloat64, in_shape, mindspore::NHWC, lite::Category::VAR);
+  input->MallocData(allocator_);
+  inputs.push_back(input);
+
+  auto *output = new lite::Tensor(kNumberTypeFloat64, out_shape, mindspore::NHWC, lite::Category::VAR);
+  output->MallocData(allocator_);
+  outputs.push_back(output);
+
+  for (int i = 0; i < in_elems; ++i) {
+    reinterpret_cast<double *>(input->MutableData())[i] = static_cast<double>(i + 1);
+  }
+
+  std::vector<double> expected(out_elems, 0.0);
+  BroadcastToRef(reinterpret_cast<double *>(input->MutableData()), expected.data(), in_shape, out_shape, false);
+
+  auto *ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = CreateParam(out_shape);
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat64, NHWC, schema::PrimitiveType_BroadcastTo};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  EXPECT_EQ(kernel->Prepare(), lite::RET_OK);
+  EXPECT_EQ(kernel->Run(), lite::RET_OK);
+
+  auto *out_data = reinterpret_cast<double *>(output->MutableData());
+  int cmp = CompareOutputData(out_data, expected.data(), out_elems, 1e-9);
+  ASSERT_EQ(0, cmp);
+
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete input;
+  delete output;
+}
+
+TEST_F(TestDSP_BroadcastTo, BroadcastTo_Int8) {
+  InitDSPRuntime();
+  auto in_shape = ToVec(kCase0In);
+  auto out_shape = ToVec(kCase0Out);
+  int64_t in_elems = Accumulate(in_shape);
+  int64_t out_elems = Accumulate(out_shape);
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  auto *input = new lite::Tensor(kNumberTypeInt8, in_shape, mindspore::NHWC, lite::Category::VAR);
+  input->MallocData(allocator_);
+  inputs.push_back(input);
+
+  auto *output = new lite::Tensor(kNumberTypeInt8, out_shape, mindspore::NHWC, lite::Category::VAR);
+  output->MallocData(allocator_);
+  outputs.push_back(output);
+
+  for (int i = 0; i < in_elems; ++i) {
+    reinterpret_cast<int8_t *>(input->MutableData())[i] = static_cast<int8_t>((i % 120) - 60);
+  }
+
+  std::vector<int8_t> expected(out_elems, 0);
+  BroadcastToRef(reinterpret_cast<int8_t *>(input->MutableData()), expected.data(), in_shape, out_shape, false);
+
+  auto *ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = CreateParam(out_shape);
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt8, NHWC, schema::PrimitiveType_BroadcastTo};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  EXPECT_EQ(kernel->Prepare(), lite::RET_OK);
+  EXPECT_EQ(kernel->Run(), lite::RET_OK);
+
+  auto *out_data = reinterpret_cast<int8_t *>(output->MutableData());
+  int cmp = CompareOutputData(out_data, expected.data(), out_elems, 0.0f);
+  ASSERT_EQ(0, cmp);
+
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete input;
+  delete output;
+}
+
+TEST_F(TestDSP_BroadcastTo, BroadcastTo_Complex128) {
+  InitDSPRuntime();
+  auto in_shape = ToVec(kCase0In);
+  auto out_shape = ToVec(kCase0Out);
+  int64_t in_elems = Accumulate(in_shape);
+  int64_t out_elems = Accumulate(out_shape);
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  auto *input = new lite::Tensor(kNumberTypeComplex128, in_shape, mindspore::NHWC, lite::Category::VAR);
+  input->MallocData(allocator_);
+  inputs.push_back(input);
+
+  auto *output = new lite::Tensor(kNumberTypeComplex128, out_shape, mindspore::NHWC, lite::Category::VAR);
+  output->MallocData(allocator_);
+  outputs.push_back(output);
+
+  auto *in_cplx = reinterpret_cast<double *>(input->MutableData());
+  for (int i = 0; i < in_elems; ++i) {
+    in_cplx[2 * i] = 0.5 * static_cast<double>(i + 1);
+    in_cplx[2 * i + 1] = 0.3 * static_cast<double>(i + 1);
+  }
+
+  std::vector<double> expected(out_elems * 2, 0.0);
+  BroadcastToRef(reinterpret_cast<double *>(input->MutableData()), expected.data(), in_shape, out_shape, true);
+
+  auto *ctx = new lite::InnerContext;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  auto *param = CreateParam(out_shape);
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex128, NHWC, schema::PrimitiveType_BroadcastTo};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  EXPECT_EQ(kernel->Prepare(), lite::RET_OK);
+  EXPECT_EQ(kernel->Run(), lite::RET_OK);
+
+  auto *out_cplx = reinterpret_cast<double *>(output->MutableData());
+  int cmp = CompareOutputData(out_cplx, expected.data(), out_elems * 2, 1e-9);
+  ASSERT_EQ(0, cmp);
+
+  UninitDSPRuntime();
+  delete ctx;
+  delete kernel;
+  delete input;
+  delete output;
+}
+#endif  // SUPPORT_FT78
+}  // namespace mindspore::lite::dsp::test
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h b/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h
index 88419f42d7e853af569ac4d207993293a3f96258..450e0d6c8c5a67f6b0d9675569fd25acd1494ba7 100644
--- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h
@@ -46,6 +46,98 @@ class DSPCommonTest : public CommonTest {
     dsp_runtime_wrapper_ = nullptr;
   }
 
+  // Local IEEE754 half <-> float converters to avoid any linkage/impl mismatch in tests.
+ float fp16_to_fp32(uint16_t h) { + uint32_t sign = (static_cast(h) & 0x8000u) << 16; + uint32_t exp = (static_cast(h) & 0x7C00u) >> 10; + uint32_t mant = static_cast(h & 0x03FFu); + uint32_t f; + if (exp == 0) { + if (mant == 0) { + f = sign; // zero + } else { + // subnormal -> normalize + exp = 1; + while ((mant & 0x0400u) == 0) { + mant <<= 1; + --exp; + } + mant &= 0x03FFu; + uint32_t fexp = (exp + (127 - 15)) << 23; + f = sign | fexp | (mant << 13); + } + } else if (exp == 0x1Fu) { // Inf/NaN + f = sign | 0x7F800000u | (mant << 13); + } else { + uint32_t fexp = (exp + (127 - 15)) << 23; + f = sign | fexp | (mant << 13); + } + float out; + std::memcpy(&out, &f, sizeof(out)); + return out; + } + + uint16_t fp32_to_fp16(float val) { + uint32_t fbits; + std::memcpy(&fbits, &val, sizeof(fbits)); + uint32_t sign = (fbits >> 16) & 0x8000u; + uint32_t fexp = (fbits >> 23) & 0xFFu; + uint32_t fmant = fbits & 0x007FFFFFu; + + // NaN/Inf handling + if (fexp == 0xFFu) { + if (fmant != 0) { + // NaN: keep a quiet NaN in half + return static_cast(sign | 0x7C00u | 0x0001u); + } + // Inf + return static_cast(sign | 0x7C00u); + } + + // Rebias exponent for half + int32_t hexp = static_cast(fexp) - 127 + 15; + + if (hexp <= 0) { + // Subnormal or underflow to zero in half + if (hexp < -10) { + return static_cast(sign); // Underflow to zero + } + // Make implicit leading 1 explicit + uint32_t mant = fmant | 0x00800000u; + // Shift to align to half subnormal mantissa (10 bits) + int shift = 1 - hexp; // shift in [1..10] + // Compute mantissa with round-to-nearest-even + uint32_t mant_rounded = mant >> (shift + 13); + uint32_t round_bit = (mant >> (shift + 12)) & 1u; + uint32_t sticky = (mant & ((1u << (shift + 12)) - 1u)) != 0u; + mant_rounded += (round_bit & (sticky | (mant_rounded & 1u))); + return static_cast(sign | static_cast(mant_rounded)); + } + + if (hexp >= 0x1F) { + // Overflow to half inf + return static_cast(sign | 0x7C00u); + } + + // Normal case: build exponent 
and mantissa with round-to-nearest-even + uint16_t hexp_field = static_cast(hexp) << 10; + uint32_t mant = fmant; + uint32_t mant_rounded = mant >> 13; + uint32_t round_bit = (mant >> 12) & 1u; + uint32_t sticky = (mant & 0xFFFu) != 0u; + mant_rounded += (round_bit & (sticky | (mant_rounded & 1u))); + if (mant_rounded == 0x400u) { + // Mantissa overflow after rounding; bump exponent, zero mantissa + mant_rounded = 0; + hexp_field = static_cast(hexp_field + 0x0400u); + if (hexp_field >= 0x7C00u) { + // Exponent overflow -> inf + return static_cast(sign | 0x7C00u); + } + } + return static_cast(sign | hexp_field | static_cast(mant_rounded)); + } + protected: dsp::DSPRuntimeInnerWrapper *dsp_runtime_wrapper_{nullptr}; std::shared_ptr allocator_;