diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/adam_weight_decay.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/adam_weight_decay.cc
new file mode 100644
index 0000000000000000000000000000000000000000..593b2516f3c1b3fa4bfe996e0b3a481c304f67f1
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/adam_weight_decay.cc
@@ -0,0 +1,193 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft04/adam_weight_decay.h"
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "src/common/utils.h"
+#include "src/litert/kernel/cpu/nnacl_c/nnacl_common.h"
+#include "src/litert/kernel_registry.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_AdamWeightDecay;
+
+namespace mindspore::kernel {
+int AdamWeightDecayDSPKernel::Prepare() { return RET_OK; }
+
+int AdamWeightDecayDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != kAdamWeightDecayInputTensorSize) {
+    MS_LOG(WARNING) << "Input size mismatch: expected " << kAdamWeightDecayInputTensorSize << ", got "
+                    << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != kAdamWeightDecayOutputTensorSize) {
+    MS_LOG(WARNING) << "Output size mismatch: expected " << kAdamWeightDecayOutputTensorSize << ", got "
+                    << out_tensors_.size();
+    return RET_ERROR;
+  }
+
+  auto weight_shape = in_tensors_[kAdamWeightIdx]->shape();
+  if (weight_shape != in_tensors_[kAdamMoment1Idx]->shape() || weight_shape != in_tensors_[kAdamMoment2Idx]->shape() ||
+      weight_shape != in_tensors_[kAdamGradientIdx]->shape()) {
+    MS_LOG(WARNING) << "Weight, moment, and gradient tensor shapes do not match.";
+    return RET_ERROR;
+  }
+
+  auto data_type = in_tensors_[kAdamWeightIdx]->data_type();
+  if (data_type != kNumberTypeFloat32 && data_type != kNumberTypeFloat16) {
+    MS_LOG(WARNING) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  auto check_scalar = [](const lite::Tensor *tensor) -> bool {
+    if (tensor == nullptr || tensor->ElementsNum() != 1) {
+      return false;
+    }
+    auto tensor_type = tensor->data_type();
+    return tensor_type == kNumberTypeFloat32 || tensor_type == kNumberTypeFloat16;
+  };
+
+  if (!check_scalar(in_tensors_[kAdamLrIdx]) || !check_scalar(in_tensors_[kAdamBeta1Idx]) ||
+      !check_scalar(in_tensors_[kAdamBeta2Idx]) || !check_scalar(in_tensors_[kAdamEpsilonIdx]) ||
+      !check_scalar(in_tensors_[kAdamDecayIdx])) {
+    MS_LOG(WARNING) << "Optimizer scalar tensors are invalid.";
+    return RET_ERROR;
+  }
+
+  return RET_OK;
+}
+
+int AdamWeightDecayDSPKernel::AdamWeightDecayRunFp32() {
+  kernel_name_ = "fp_adamweightdecay_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int AdamWeightDecayDSPKernel::AdamWeightDecayRunFp16() {
+  kernel_name_ = "hp_adamweightdecay_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int AdamWeightDecayDSPKernel::Run() {
+  auto allocator = dsp_runtime_->GetAllocator();
+  auto *weight = in_tensors_[kAdamWeightIdx];
+  int64_t elements_num = weight->ElementsNum();
+  auto data_type = weight->data_type();
+
+  uint64_t weight_device_ptr = allocator->GetDeviceMemPtr(weight->data());
+  uint64_t moment1_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kAdamMoment1Idx]->data());
+  uint64_t moment2_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kAdamMoment2Idx]->data());
+  uint64_t grad_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kAdamGradientIdx]->data());
+
+  size_t float_param_bytes = sizeof(float) * kAdamFloatParamSize;
+  if (data_type == kNumberTypeFloat16) {
+    float_param_bytes = sizeof(uint16_t) * kAdamFloatParamSize;
+  } else if (data_type != kNumberTypeFloat32) {
+    MS_LOG(ERROR) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  void *float_params_buffer = allocator->Malloc(float_param_bytes);
+  if (float_params_buffer == nullptr) {
+    MS_LOG(ERROR) << "Malloc float parameter buffer failed.";
+    return RET_ERROR;
+  }
+  auto free_buffers = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+  };
+
+  // Pack optimizer scalar parameters according to kernel data type.
+  const size_t idxs[kAdamFloatParamSize] = {kAdamLrIdx, kAdamBeta1Idx, kAdamBeta2Idx, kAdamEpsilonIdx, kAdamDecayIdx};
+  if (data_type == kNumberTypeFloat32) {
+    float float_params[kAdamFloatParamSize] = {0.f};
+    for (size_t i = 0; i < kAdamFloatParamSize; ++i) {
+      const lite::Tensor *t = in_tensors_[idxs[i]];
+      if (t->data_type() != kNumberTypeFloat32) {
+        free_buffers();
+        MS_LOG(ERROR) << "Scalar tensor type mismatch: expected FP32 scalar for FP32 kernel, got "
+                      << static_cast<int>(t->data_type());
+        return RET_ERROR;
+      }
+      float_params[i] = *(reinterpret_cast<const float *>(t->data()));
+    }
+    std::memcpy(float_params_buffer, float_params, float_param_bytes);
+  } else {  // kernel expects float16 parameters
+    uint16_t float16_params[kAdamFloatParamSize] = {0};
+    for (size_t i = 0; i < kAdamFloatParamSize; ++i) {
+      const lite::Tensor *t = in_tensors_[idxs[i]];
+      if (t->data_type() != kNumberTypeFloat16) {
+        free_buffers();
+        MS_LOG(ERROR) << "Scalar tensor type mismatch: expected FP16 scalar for FP16 kernel, got "
+                      << static_cast<int>(t->data_type());
+        return RET_ERROR;
+      }
+      // Already half precision: copy the raw representation to avoid rounding.
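+      // For reference: 0.9f stored as binary16 is 0x3B33; forwarding those 16 bits
+      // unchanged means the DSP kernel sees exactly the scalar the user provided,
+      // with no chance of a converter perturbing the value on the way down.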
+      float16_params[i] = *(reinterpret_cast<const uint16_t *>(t->data()));
+    }
+    std::memcpy(float_params_buffer, float16_params, float_param_bytes);
+  }
+  uint64_t float_params_device_ptr = allocator->GetDeviceMemPtr(float_params_buffer);
+
+  void *int_params_buffer = allocator->Malloc(sizeof(int32_t) * kAdamIntParamSize);
+  auto free_all_buffers = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+    if (int_params_buffer != nullptr) {
+      allocator->Free(int_params_buffer);
+      int_params_buffer = nullptr;
+    }
+  };
+  if (int_params_buffer == nullptr) {
+    free_all_buffers();
+    MS_LOG(ERROR) << "Malloc int parameter buffer failed.";
+    return RET_ERROR;
+  }
+
+  auto *int_params = reinterpret_cast<int32_t *>(int_params_buffer);
+  int_params[0] = 0;  // start offset
+  int_params[1] = static_cast<int32_t>(elements_num);
+  uint64_t int_params_device_ptr = allocator->GetDeviceMemPtr(int_params_buffer);
+
+  SetKernelArg({weight_device_ptr, moment1_device_ptr, moment2_device_ptr, grad_device_ptr, float_params_device_ptr,
+                int_params_device_ptr});
+
+  int ret = RET_ERROR;
+  if (data_type == kNumberTypeFloat32) {
+    ret = AdamWeightDecayRunFp32();
+  } else if (data_type == kNumberTypeFloat16) {
+    ret = AdamWeightDecayRunFp16();
+  } else {
+    free_all_buffers();
+    MS_LOG(ERROR) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  free_all_buffers();
+
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << this->name() << " Run failed!";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_AdamWeightDecay, DSPKernelCreator<AdamWeightDecayDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_AdamWeightDecay, DSPKernelCreator<AdamWeightDecayDSPKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/adam_weight_decay.h b/mindspore-lite/src/litert/kernel/dsp/ft04/adam_weight_decay.h
new file mode 100644
index 0000000000000000000000000000000000000000..38113a10f747e93ada42576fa3113e03e6c08019
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/adam_weight_decay.h
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_ADAM_WEIGHT_DECAY_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_ADAM_WEIGHT_DECAY_H_
+
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+constexpr size_t kAdamWeightDecayInputTensorSize = 9;
+constexpr size_t kAdamWeightDecayOutputTensorSize = 1;
+constexpr size_t kAdamFloatParamSize = 5;
+constexpr size_t kAdamIntParamSize = 2;
+
+constexpr size_t kAdamWeightIdx = 0;
+constexpr size_t kAdamMoment1Idx = 1;
+constexpr size_t kAdamMoment2Idx = 2;
+constexpr size_t kAdamLrIdx = 3;
+constexpr size_t kAdamBeta1Idx = 4;
+constexpr size_t kAdamBeta2Idx = 5;
+constexpr size_t kAdamEpsilonIdx = 6;
+constexpr size_t kAdamDecayIdx = 7;
+constexpr size_t kAdamGradientIdx = 8;
+
+class AdamWeightDecayDSPKernel : public DSPKernel {
+ public:
+  using DSPKernel::DSPKernel;
+
+  ~AdamWeightDecayDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;
+  int Run() override;
+
+ private:
+  int AdamWeightDecayRunFp32();
+  int AdamWeightDecayRunFp16();
+
+  std::string kernel_name_;
+  uint64_t core_mask_{0};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_ADAM_WEIGHT_DECAY_H_
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/adam_weight_decay.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/adam_weight_decay.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b4c8dceb0a49e3a997643aa608e77cec454d60b
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/adam_weight_decay.cc
@@ -0,0 +1,173 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft78/adam_weight_decay.h"
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "src/common/utils.h"
+#include "src/litert/kernel/cpu/nnacl_c/nnacl_common.h"
+#include "src/litert/kernel_registry.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_AdamWeightDecay;
+
+namespace mindspore::kernel {
+int AdamWeightDecayDSPKernel::Prepare() { return RET_OK; }
+
+int AdamWeightDecayDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != kAdamWeightDecayInputTensorSize) {
+    MS_LOG(WARNING) << "Input size mismatch: expected " << kAdamWeightDecayInputTensorSize << ", got "
+                    << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != kAdamWeightDecayOutputTensorSize) {
+    MS_LOG(WARNING) << "Output size mismatch: expected " << kAdamWeightDecayOutputTensorSize << ", got "
+                    << out_tensors_.size();
+    return RET_ERROR;
+  }
+
+  auto weight_shape = in_tensors_[kAdamWeightIdx]->shape();
+  if (weight_shape != in_tensors_[kAdamMoment1Idx]->shape() || weight_shape != in_tensors_[kAdamMoment2Idx]->shape() ||
+      weight_shape != in_tensors_[kAdamGradientIdx]->shape()) {
+    MS_LOG(WARNING) << "Weight, moment, and gradient tensor shapes do not match.";
+    return RET_ERROR;
+  }
+
+  auto data_type = in_tensors_[kAdamWeightIdx]->data_type();
+  if (data_type != kNumberTypeFloat32) {
+    MS_LOG(WARNING) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  auto check_scalar = [](const lite::Tensor *tensor) -> bool {
+    if (tensor == nullptr || tensor->ElementsNum() != 1) {
+      return false;
+    }
+    auto tensor_type = tensor->data_type();
+    return tensor_type == kNumberTypeFloat32;
+  };
+
+  if (!check_scalar(in_tensors_[kAdamLrIdx]) || !check_scalar(in_tensors_[kAdamBeta1Idx]) ||
+      !check_scalar(in_tensors_[kAdamBeta2Idx]) || !check_scalar(in_tensors_[kAdamEpsilonIdx]) ||
+      !check_scalar(in_tensors_[kAdamDecayIdx])) {
+    MS_LOG(WARNING) << "Optimizer scalar tensors are invalid.";
+    return RET_ERROR;
+  }
+
+  return RET_OK;
+}
+
+int AdamWeightDecayDSPKernel::AdamWeightDecayRunFp32() {
+  kernel_name_ = "fp_adamweightdecay_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int AdamWeightDecayDSPKernel::Run() {
+  auto allocator = dsp_runtime_->GetAllocator();
+  auto *weight = in_tensors_[kAdamWeightIdx];
+  int64_t elements_num = weight->ElementsNum();
+  auto data_type = weight->data_type();
+
+  uint64_t weight_device_ptr = allocator->GetDeviceMemPtr(weight->data());
+  uint64_t moment1_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kAdamMoment1Idx]->data());
+  uint64_t moment2_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kAdamMoment2Idx]->data());
+  uint64_t grad_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kAdamGradientIdx]->data());
+
+  size_t float_param_bytes = sizeof(float) * kAdamFloatParamSize;
+  if (data_type != kNumberTypeFloat32) {
+    MS_LOG(ERROR) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  void *float_params_buffer = allocator->Malloc(float_param_bytes);
+  if (float_params_buffer == nullptr) {
+    MS_LOG(ERROR) << "Malloc float parameter buffer failed.";
+    return RET_ERROR;
+  }
+  auto free_buffers = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+  };
+
+  // Pack optimizer scalar parameters according to kernel data type.
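+  // The buffer holds the five scalars back to back, in the order of idxs[] below:
+  // [lr, beta1, beta2, epsilon, decay]. This describes the packing done here; the
+  // DSP-side kernel is assumed to consume the scalars in the same order.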
+  const size_t idxs[kAdamFloatParamSize] = {kAdamLrIdx, kAdamBeta1Idx, kAdamBeta2Idx, kAdamEpsilonIdx, kAdamDecayIdx};
+  float float_params[kAdamFloatParamSize] = {0.f};
+  for (size_t i = 0; i < kAdamFloatParamSize; ++i) {
+    const lite::Tensor *t = in_tensors_[idxs[i]];
+    if (t->data_type() != kNumberTypeFloat32) {
+      free_buffers();
+      MS_LOG(ERROR) << "Scalar tensor type mismatch: expected FP32 scalar, got " << static_cast<int>(t->data_type());
+      return RET_ERROR;
+    }
+    float_params[i] = *(reinterpret_cast<const float *>(t->data()));
+  }
+  std::memcpy(float_params_buffer, float_params, float_param_bytes);
+  uint64_t float_params_device_ptr = allocator->GetDeviceMemPtr(float_params_buffer);
+  if (float_params_device_ptr == 0) {
+    free_buffers();
+    MS_LOG(ERROR) << "Failed to obtain device pointer for float parameter buffer.";
+    return RET_ERROR;
+  }
+
+  void *int_params_buffer = allocator->Malloc(sizeof(int32_t) * kAdamIntParamSize);
+  auto free_all_buffers = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+    if (int_params_buffer != nullptr) {
+      allocator->Free(int_params_buffer);
+      int_params_buffer = nullptr;
+    }
+  };
+  if (int_params_buffer == nullptr) {
+    free_all_buffers();
+    MS_LOG(ERROR) << "Malloc int parameter buffer failed.";
+    return RET_ERROR;
+  }
+
+  auto *int_params = reinterpret_cast<int32_t *>(int_params_buffer);
+  int_params[0] = 0;  // start offset
+  int_params[1] = static_cast<int32_t>(elements_num);
+  uint64_t int_params_device_ptr = allocator->GetDeviceMemPtr(int_params_buffer);
+  SetKernelArg({weight_device_ptr, moment1_device_ptr, moment2_device_ptr, grad_device_ptr, float_params_device_ptr,
+                int_params_device_ptr});
+
+  int ret = RET_ERROR;
+  if (data_type == kNumberTypeFloat32) {
+    ret = AdamWeightDecayRunFp32();
+  } else {
+    free_all_buffers();
+    MS_LOG(ERROR) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  free_all_buffers();
+
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << this->name() << " Run failed!";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_AdamWeightDecay, DSPKernelCreator<AdamWeightDecayDSPKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/adam_weight_decay.h b/mindspore-lite/src/litert/kernel/dsp/ft78/adam_weight_decay.h
new file mode 100644
index 0000000000000000000000000000000000000000..eeb457893f31cf65e82973a6adc14e2234b73af3
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/adam_weight_decay.h
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_ADAM_WEIGHT_DECAY_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_ADAM_WEIGHT_DECAY_H_
+
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+constexpr size_t kAdamWeightDecayInputTensorSize = 9;
+constexpr size_t kAdamWeightDecayOutputTensorSize = 1;
+constexpr size_t kAdamFloatParamSize = 5;
+constexpr size_t kAdamIntParamSize = 2;
+
+constexpr size_t kAdamWeightIdx = 0;
+constexpr size_t kAdamMoment1Idx = 1;
+constexpr size_t kAdamMoment2Idx = 2;
+constexpr size_t kAdamLrIdx = 3;
+constexpr size_t kAdamBeta1Idx = 4;
+constexpr size_t kAdamBeta2Idx = 5;
+constexpr size_t kAdamEpsilonIdx = 6;
+constexpr size_t kAdamDecayIdx = 7;
+constexpr size_t kAdamGradientIdx = 8;
+
+class AdamWeightDecayDSPKernel : public DSPKernel {
+ public:
+  using DSPKernel::DSPKernel;
+
+  ~AdamWeightDecayDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;
+  int Run() override;
+
+ private:
+  int AdamWeightDecayRunFp32();
+
+  std::string kernel_name_;
+  uint64_t core_mask_{0};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_ADAM_WEIGHT_DECAY_H_
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/adam_weight_decay_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/adam_weight_decay_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3abc17c1baebd69b2dce54d2150e6a5b8bcba9bd
--- /dev/null
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/adam_weight_decay_tests.cc
@@ -0,0 +1,277 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+#include "ut/src/runtime/kernel/dsp/dsp_test.h"
+#include "ut/src/runtime/kernel/opencl/common.h"
+#include "include/api/context.h"
+#include "include/api/data_type.h"
+#include "schema/inner/model_generated.h"
+#include "src/litert/kernel_registry.h"
+#include "src/litert/kernel/dsp/dsp_subgraph.h"
+#ifdef SUPPORT_FT78
+#include "src/litert/kernel/dsp/ft78/adam_weight_decay.h"
+#else
+#include "src/litert/kernel/dsp/ft04/adam_weight_decay.h"
+#endif
+#include "src/litert/kernel/cpu/nnacl_c/nnacl_common.h"
+#include "nnacl_c/fp32/adam_fp32.h"
+
+namespace mindspore::lite::dsp::test {
+namespace {
+constexpr int kTensorLength = 100000;
+constexpr float kLearningRate = 0.01f;
+constexpr float kBeta1 = 0.9f;
+constexpr float kBeta2 = 0.999f;
+constexpr float kEpsilon = 1e-8f;
+constexpr float kDecay = 0.01f;
+
+OpParameter *CreateAdamWeightDecayParameter() {
+  auto *param = opencl::test::CreateParameter<OpParameter>(schema::PrimitiveType_AdamWeightDecay);
+  return reinterpret_cast<OpParameter *>(param);
+}
+
+#ifdef SUPPORT_FT04
+std::vector<float> BuildSequence(float start, float step) {
+  std::vector<float> data(kTensorLength);
+  for (int i = 0; i < kTensorLength; ++i) {
+    data[i] = start + static_cast<float>(i) * step;
+  }
+  return data;
+}
+#endif
+}  // namespace
+
+class TestDSP_AdamWeightDecay : public DSPCommonTest {
+ public:
+  void CreateInputTensors(TypeId type, std::vector<lite::Tensor *> *inputs,
+                          std::vector<lite::Tensor *> *tensors_to_delete) {
+    std::vector<int> param_shape = {kTensorLength};
+    std::vector<int> scalar_shape = {1};
+
+    auto category = (type == kNumberTypeFloat16) ? lite::Category::CONST_TENSOR : lite::Category::VAR;
+    auto weight_tensor = new lite::Tensor(type, param_shape, mindspore::NHWC, category);
+    weight_tensor->MallocData(allocator_);
+    inputs->push_back(weight_tensor);
+    tensors_to_delete->push_back(weight_tensor);
+
+    auto moment1_tensor = new lite::Tensor(type, param_shape, mindspore::NHWC, lite::Category::VAR);
+    moment1_tensor->MallocData(allocator_);
+    inputs->push_back(moment1_tensor);
+    tensors_to_delete->push_back(moment1_tensor);
+
+    auto moment2_tensor = new lite::Tensor(type, param_shape, mindspore::NHWC, lite::Category::VAR);
+    moment2_tensor->MallocData(allocator_);
+    inputs->push_back(moment2_tensor);
+    tensors_to_delete->push_back(moment2_tensor);
+
+    // lr, beta1, beta2, epsilon, decay scalar tensors
+    for (int i = 0; i < 5; i++) {
+      auto tensor = new lite::Tensor(type, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR);
+      tensor->MallocData(allocator_);
+      inputs->push_back(tensor);
+      tensors_to_delete->push_back(tensor);
+    }
+
+    auto gradient_tensor = new lite::Tensor(type, param_shape, mindspore::NHWC, lite::Category::CONST_TENSOR);
+    gradient_tensor->MallocData(allocator_);
+    inputs->push_back(gradient_tensor);
+    tensors_to_delete->push_back(gradient_tensor);
+  }
+};
+
+TEST_F(TestDSP_AdamWeightDecay, AdamWeightDecay_Fp32) {
+  InitDSPRuntime();
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  std::vector<lite::Tensor *> tensors_to_delete;
+
+  CreateInputTensors(kNumberTypeFloat32, &inputs, &tensors_to_delete);
+  outputs.push_back(inputs[0]);
+
+  // Use FP16-friendly ranges to avoid overflow/NaN across large N.
+  // Keep weights slowly increasing and gradients bounded in [-0.05, 0.05].
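+  // For reference: binary16 tops out at 65504, so the data built below (weights up
+  // to ~100.5, gradients in [-0.0495, 0.05]) stays comfortably representable even
+  // if the same generator is reused for a half-precision run.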
+  std::vector<float> initial_weight(kTensorLength);
+  std::vector<float> initial_moment1(kTensorLength, 0.0f);
+  std::vector<float> initial_moment2(kTensorLength, 0.0f);
+  std::vector<float> gradients(kTensorLength);
+  for (int i = 0; i < kTensorLength; ++i) {
+    initial_weight[i] = 0.5f + 0.001f * static_cast<float>(i);  // up to ~100.5 for kTensorLength = 100000
+    int t = i % 200;                                            // 0..199
+    gradients[i] = 0.05f - 0.0005f * static_cast<float>(t);     // 0.05..-0.0495, then wraps; stays in [-0.05, 0.05]
+  }
+
+  std::copy(initial_weight.begin(), initial_weight.end(), reinterpret_cast<float *>(inputs[0]->MutableData()));
+  std::copy(initial_moment1.begin(), initial_moment1.end(), reinterpret_cast<float *>(inputs[1]->MutableData()));
+  std::copy(initial_moment2.begin(), initial_moment2.end(), reinterpret_cast<float *>(inputs[2]->MutableData()));
+  std::copy(gradients.begin(), gradients.end(), reinterpret_cast<float *>(inputs[8]->MutableData()));
+
+  reinterpret_cast<float *>(inputs[3]->MutableData())[0] = kLearningRate;
+  reinterpret_cast<float *>(inputs[4]->MutableData())[0] = kBeta1;
+  reinterpret_cast<float *>(inputs[5]->MutableData())[0] = kBeta2;
+  reinterpret_cast<float *>(inputs[6]->MutableData())[0] = kEpsilon;
+  reinterpret_cast<float *>(inputs[7]->MutableData())[0] = kDecay;
+
+  auto expected_weight = initial_weight;
+  auto expected_moment1 = initial_moment1;
+  auto expected_moment2 = initial_moment2;
+  AdamWeightDecayFp32(expected_weight.data(), expected_moment1.data(), expected_moment2.data(), kLearningRate, kBeta1,
+                      kBeta2, kEpsilon, kDecay, gradients.data(), 0, kTensorLength);
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_NE(ctx, nullptr);
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+
+  auto *param = CreateAdamWeightDecayParameter();
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_AdamWeightDecay};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+
+  auto kernel = creator(inputs, outputs, param, ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  ASSERT_EQ(lite::RET_OK, kernel->Prepare());
+  ASSERT_EQ(lite::RET_OK, kernel->Run());
+
+  auto weight_data = reinterpret_cast<float *>(inputs[kernel::kAdamWeightIdx]->MutableData());
+
+  ASSERT_EQ(0, CompareOutputData(weight_data, expected_weight.data(), kTensorLength));
+
+  UninitDSPRuntime();
+
+  delete ctx;
+  for (auto *tensor : tensors_to_delete) {
+    delete tensor;
+  }
+  delete kernel;
+}
+
+#ifdef SUPPORT_FT04
+TEST_F(TestDSP_AdamWeightDecay, AdamWeightDecay_Fp16) {
+  InitDSPRuntime();
+
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs;
+  std::vector<lite::Tensor *> tensors_to_delete;
+
+  CreateInputTensors(kNumberTypeFloat16, &inputs, &tensors_to_delete);
+  outputs.push_back(inputs[0]);
+
+  auto initial_weight = BuildSequence(0.5f, 0.1f);
+  auto initial_moment1 = std::vector<float>(kTensorLength, 0.0f);
+  auto initial_moment2 = std::vector<float>(kTensorLength, 0.0f);
+  auto gradients = BuildSequence(0.05f, -0.01f);
+
+  auto *weight_half = reinterpret_cast<uint16_t *>(inputs[0]->MutableData());
+  auto *moment1_half = reinterpret_cast<uint16_t *>(inputs[1]->MutableData());
+  auto *moment2_half = reinterpret_cast<uint16_t *>(inputs[2]->MutableData());
+  auto *gradient_half = reinterpret_cast<uint16_t *>(inputs[8]->MutableData());
+
+  std::vector<float> weight_fp32(kTensorLength);
+  std::vector<float> moment1_fp32(kTensorLength);
+  std::vector<float> moment2_fp32(kTensorLength);
+  std::vector<float> gradient_fp32(kTensorLength);
+
+  // Quantize inputs to FP16 and keep their FP32 round-trip values so the
+  // reference computation sees exactly what the kernel sees.
+  for (int i = 0; i < kTensorLength; ++i) {
+    weight_half[i] = fp32_to_fp16(initial_weight[i]);
+    weight_fp32[i] = fp16_to_fp32(weight_half[i]);
+
+    moment1_half[i] = fp32_to_fp16(initial_moment1[i]);
+    moment1_fp32[i] = fp16_to_fp32(moment1_half[i]);
+
+    moment2_half[i] = fp32_to_fp16(initial_moment2[i]);
+    moment2_fp32[i] = fp16_to_fp32(moment2_half[i]);
+
+    gradient_half[i] = fp32_to_fp16(gradients[i]);
+    gradient_fp32[i] = fp16_to_fp32(gradient_half[i]);
+  }
+
+  const float kEpsilonFp16 = 1e-4f;  // avoid underflow to zero in FP16
+  auto lr_half = fp32_to_fp16(kLearningRate);
+  auto beta1_half = fp32_to_fp16(kBeta1);
+  auto beta2_half = fp32_to_fp16(kBeta2);
+  auto epsilon_half = fp32_to_fp16(kEpsilonFp16);
+  auto decay_half = fp32_to_fp16(kDecay);
+
+  reinterpret_cast<uint16_t *>(inputs[3]->MutableData())[0] = lr_half;
+  reinterpret_cast<uint16_t *>(inputs[4]->MutableData())[0] = beta1_half;
+  reinterpret_cast<uint16_t *>(inputs[5]->MutableData())[0] = beta2_half;
+  reinterpret_cast<uint16_t *>(inputs[6]->MutableData())[0] = epsilon_half;
+  reinterpret_cast<uint16_t *>(inputs[7]->MutableData())[0] = decay_half;
+
+  const float lr_value = fp16_to_fp32(lr_half);
+  const float beta1_value = fp16_to_fp32(beta1_half);
+  const float beta2_value = fp16_to_fp32(beta2_half);
+  const float epsilon_value = fp16_to_fp32(epsilon_half);
+  const float decay_value = fp16_to_fp32(decay_half);
+
+  auto expected_weight = weight_fp32;
+  auto expected_moment1 = moment1_fp32;
+  auto expected_moment2 = moment2_fp32;
+  AdamWeightDecayFp32(expected_weight.data(), expected_moment1.data(), expected_moment2.data(), lr_value, beta1_value,
+                      beta2_value, epsilon_value, decay_value, gradient_fp32.data(), 0, kTensorLength);
+
+  std::vector<float> expected_weight_quantized(kTensorLength);
+  for (int i = 0; i < kTensorLength; ++i) {
+    expected_weight_quantized[i] = fp16_to_fp32(fp32_to_fp16(expected_weight[i]));
+  }
+
+  auto ctx = new lite::InnerContext;
+  ASSERT_NE(ctx, nullptr);
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+
+  auto *param = CreateAdamWeightDecayParameter();
+  ASSERT_NE(param, nullptr);
+
+  kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_AdamWeightDecay};
+  auto creator = KernelRegistry::GetInstance()->GetCreator(key);
+  ASSERT_NE(creator, nullptr);
+
+  auto kernel = creator(inputs, outputs, param, ctx, key);
+  ASSERT_NE(kernel, nullptr);
+
+  auto ret = kernel->Prepare();
+  EXPECT_EQ(lite::RET_OK, ret);
+  ret = kernel->Run();
+  EXPECT_EQ(lite::RET_OK, ret);
+
+  auto weight_data_half = reinterpret_cast<uint16_t *>(inputs[kernel::kAdamWeightIdx]->MutableData());
+  std::vector<float> weight_data_fp32(kTensorLength);
+  for (int i = 0; i < kTensorLength; ++i) {
+    weight_data_fp32[i] = fp16_to_fp32(weight_data_half[i]);
+  }
+
+  ASSERT_EQ(0, CompareOutputData(weight_data_fp32.data(), expected_weight_quantized.data(), kTensorLength, 1e-3f));
+
+  UninitDSPRuntime();
+  delete ctx;
+  for (auto *tensor : tensors_to_delete) {
+    delete tensor;
+  }
+  delete kernel;
+}
+#endif  // SUPPORT_FT04
+}  // namespace mindspore::lite::dsp::test
diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h b/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h
index 88419f42d7e853af569ac4d207993293a3f96258..450e0d6c8c5a67f6b0d9675569fd25acd1494ba7 100644
--- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h
+++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h
@@ -46,6 +46,98 @@ class DSPCommonTest : public CommonTest {
     dsp_runtime_wrapper_ = nullptr;
   }
 
+  // Local IEEE 754 half <-> float converters to avoid any linkage/impl mismatch in tests.
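+  // binary16 layout: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
+  // A normal half decodes as value = (-1)^sign * 2^(exp - 15) * (1 + mant / 1024);
+  // e.g. 0x3C00 (sign 0, exp 15, mant 0) -> 1.0f, which fp16_to_fp32 below reproduces.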
+  float fp16_to_fp32(uint16_t h) {
+    uint32_t sign = (static_cast<uint32_t>(h) & 0x8000u) << 16;
+    uint32_t exp = (static_cast<uint32_t>(h) & 0x7C00u) >> 10;
+    uint32_t mant = static_cast<uint32_t>(h & 0x03FFu);
+    uint32_t f;
+    if (exp == 0) {
+      if (mant == 0) {
+        f = sign;  // signed zero
+      } else {
+        // Subnormal -> normalize: shift until the implicit leading 1 appears.
+        // (exp may wrap below zero as unsigned; the wraparound is well defined
+        // and cancels out in the rebiasing addition below.)
+        exp = 1;
+        while ((mant & 0x0400u) == 0) {
+          mant <<= 1;
+          --exp;
+        }
+        mant &= 0x03FFu;
+        uint32_t fexp = (exp + (127 - 15)) << 23;
+        f = sign | fexp | (mant << 13);
+      }
+    } else if (exp == 0x1Fu) {  // Inf/NaN
+      f = sign | 0x7F800000u | (mant << 13);
+    } else {
+      uint32_t fexp = (exp + (127 - 15)) << 23;
+      f = sign | fexp | (mant << 13);
+    }
+    float out;
+    std::memcpy(&out, &f, sizeof(out));
+    return out;
+  }
+
+  uint16_t fp32_to_fp16(float val) {
+    uint32_t fbits;
+    std::memcpy(&fbits, &val, sizeof(fbits));
+    uint32_t sign = (fbits >> 16) & 0x8000u;
+    uint32_t fexp = (fbits >> 23) & 0xFFu;
+    uint32_t fmant = fbits & 0x007FFFFFu;
+
+    // NaN/Inf handling
+    if (fexp == 0xFFu) {
+      if (fmant != 0) {
+        // NaN: keep a quiet NaN in half (0x0200 is the quiet bit of the 10-bit mantissa).
+        return static_cast<uint16_t>(sign | 0x7C00u | 0x0200u);
+      }
+      // Inf
+      return static_cast<uint16_t>(sign | 0x7C00u);
+    }
+
+    // Rebias exponent for half
+    int32_t hexp = static_cast<int32_t>(fexp) - 127 + 15;
+
+    if (hexp <= 0) {
+      // Subnormal or underflow to zero in half
+      if (hexp < -10) {
+        return static_cast<uint16_t>(sign);  // underflow to zero
+      }
+      // Make the implicit leading 1 explicit
+      uint32_t mant = fmant | 0x00800000u;
+      // Shift to align with the half subnormal mantissa (10 bits)
+      int shift = 1 - hexp;  // shift in [1..11]
+      // Compute mantissa with round-to-nearest-even
+      uint32_t mant_rounded = mant >> (shift + 13);
+      uint32_t round_bit = (mant >> (shift + 12)) & 1u;
+      uint32_t sticky = (mant & ((1u << (shift + 12)) - 1u)) != 0u;
+      mant_rounded += (round_bit & (sticky | (mant_rounded & 1u)));
+      return static_cast<uint16_t>(sign | mant_rounded);
+    }
+
+    if (hexp >= 0x1F) {
+      // Overflow to half inf
+      return static_cast<uint16_t>(sign | 0x7C00u);
+    }
+
+    // Normal case: build exponent and mantissa with round-to-nearest-even
+    uint16_t hexp_field = static_cast<uint16_t>(hexp << 10);
+    uint32_t mant = fmant;
+    uint32_t mant_rounded = mant >> 13;
+    uint32_t round_bit = (mant >> 12) & 1u;
+    uint32_t sticky = (mant & 0xFFFu) != 0u;
+    mant_rounded += (round_bit & (sticky | (mant_rounded & 1u)));
+    if (mant_rounded == 0x400u) {
+      // Mantissa overflow after rounding: bump exponent, zero mantissa
+      mant_rounded = 0;
+      hexp_field = static_cast<uint16_t>(hexp_field + 0x0400u);
+      if (hexp_field >= 0x7C00u) {
+        // Exponent overflow -> inf
+        return static_cast<uint16_t>(sign | 0x7C00u);
+      }
+    }
+    return static_cast<uint16_t>(sign | hexp_field | static_cast<uint16_t>(mant_rounded));
+  }
+
 protected:
  dsp::DSPRuntimeInnerWrapper *dsp_runtime_wrapper_{nullptr};
  std::shared_ptr<Allocator> allocator_;