diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc9361b31e505bb19d6cad6ccb427e51fb6ebbb3
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.cc
@@ -0,0 +1,219 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft04/sgd.h"
+
+#include <cstring>
+#include <limits>
+#include <string>
+
+#include "src/common/utils.h"
+#include "src/litert/kernel/cpu/nnacl_c/nnacl_common.h"
+#include "src/litert/kernel/cpu/nnacl_c/fp32_grad/optimizer.h"
+#include "src/litert/kernel/cpu/nnacl_c/base/cast_base.h"
+#include "src/litert/kernel_registry.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_SGD;
+
+namespace mindspore::kernel {
+int SgdDSPKernel::Prepare() { return RET_OK; }
+
+int SgdDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != kSgdInputTensorSize) {
+    MS_LOG(WARNING) << "Input size mismatch: expected " << kSgdInputTensorSize << ", got " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != kSgdOutputTensorSize) {
+    MS_LOG(WARNING) << "Output size mismatch: expected " << kSgdOutputTensorSize << ", got " << out_tensors_.size();
+    return RET_ERROR;
+  }
+
+  auto weight_shape = in_tensors_[kSgdWeightIdx]->shape();
+  if (weight_shape != in_tensors_[kSgdAccumulateIdx]->shape() ||
+      weight_shape != in_tensors_[kSgdGradientIdx]->shape()) {
+    MS_LOG(WARNING) << "Weight, accumulate or gradient tensor shapes mismatch.";
+    return RET_ERROR;
+  }
+
+  auto data_type = in_tensors_[kSgdWeightIdx]->data_type();
+  if (data_type != kNumberTypeFloat32 && data_type != kNumberTypeFloat16) {
+    MS_LOG(WARNING) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  auto check_scalar = [&](const lite::Tensor *tensor) -> bool {
+    if (tensor == nullptr || tensor->ElementsNum() != 1) {
+      return false;
+    }
+    auto tensor_type = tensor->data_type();
+    if (data_type == kNumberTypeFloat32) {
+      return tensor_type == kNumberTypeFloat32;
+    }
+    return tensor_type == kNumberTypeFloat16;
+  };
+
+  if (!check_scalar(in_tensors_[kSgdLrIdx]) || !check_scalar(in_tensors_[kSgdMomentumIdx])) {
+    MS_LOG(WARNING) << "Optimizer scalar tensors are invalid.";
+    return RET_ERROR;
+  }
+
+  return RET_OK;
+}
+
+int SgdDSPKernel::SgdRunFp32() {
+  kernel_name_ = "fp_sgd_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int SgdDSPKernel::SgdRunFp16() {
+  kernel_name_ = "hp_sgd_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int SgdDSPKernel::Run() {
+  auto allocator = dsp_runtime_->GetAllocator();
+
+  auto *weight = in_tensors_[kSgdWeightIdx];
+
+  int64_t elements_num = weight->ElementsNum();
+
+  auto data_type = weight->data_type();
+  auto *param = reinterpret_cast<SgdParameter *>(op_parameter_);
+
+  uint64_t weight_device_ptr = allocator->GetDeviceMemPtr(weight->data());
+  uint64_t accumulate_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kSgdAccumulateIdx]->data());
+  uint64_t grad_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kSgdGradientIdx]->data());
+
+  size_t float_param_bytes = 0;
+  if (data_type == kNumberTypeFloat32) {
+    float_param_bytes = sizeof(float) * kSgdFloatParamSize;
+  } else if (data_type == kNumberTypeFloat16) {
+    float_param_bytes = sizeof(uint16_t) * kSgdFloatParamSize;
+  } else {
+    MS_LOG(ERROR) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  void *float_params_buffer = allocator->Malloc(float_param_bytes);
+
+  auto free_float_buffer = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+  };
+
+  // Pack float params: [lr, dampening, momentum, weight_decay]
+  if (data_type == kNumberTypeFloat32) {
+    float float_params[kSgdFloatParamSize] = {0.f};
+
+    // LR
+    const lite::Tensor *lr_tensor = in_tensors_[kSgdLrIdx];
+    if (lr_tensor == nullptr || lr_tensor->data() == nullptr) {
+      free_float_buffer();
+      MS_LOG(ERROR) << "LR tensor is invalid.";
+      return RET_ERROR;
+    }
+    float_params[0] = *(reinterpret_cast<float *>(lr_tensor->data()));
+
+    // Dampening
+    float_params[1] = param->dampening_;
+
+    // Momentum
+    const lite::Tensor *momentum_tensor = in_tensors_[kSgdMomentumIdx];
+
+    float_params[2] = *(reinterpret_cast<float *>(momentum_tensor->data()));
+
+    // Weight Decay
+    float_params[3] = param->weight_decay_;
+
+    std::memcpy(float_params_buffer, float_params, float_param_bytes);
+  } else {
+    uint16_t float16_params[kSgdFloatParamSize] = {0};
+
+    // LR
+    const lite::Tensor *lr_tensor = in_tensors_[kSgdLrIdx];
+
+    float16_params[0] = *(reinterpret_cast<uint16_t *>(lr_tensor->data()));
+
+    // Dampening
+    float16_params[1] = Float32ToFloat16_(param->dampening_);
+
+    // Momentum
+    const lite::Tensor *momentum_tensor = in_tensors_[kSgdMomentumIdx];
+    float16_params[2] = *(reinterpret_cast<uint16_t *>(momentum_tensor->data()));
+
+    // Weight Decay
+    float16_params[3] = Float32ToFloat16_(param->weight_decay_);
+
+    std::memcpy(float_params_buffer, float16_params, float_param_bytes);
+  }
+
+  uint64_t float_params_device_ptr = allocator->GetDeviceMemPtr(float_params_buffer);
+
+  void *int_params_buffer = allocator->Malloc(sizeof(int32_t) * kSgdIntParamSize);
+
+  auto free_all_buffers = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+    if (int_params_buffer != nullptr) {
+      allocator->Free(int_params_buffer);
+      int_params_buffer = nullptr;
+    }
+  };
+
+  auto *int_params = reinterpret_cast<int32_t *>(int_params_buffer);
+  int_params[0] = 0;
+  int_params[1] = static_cast<int32_t>(elements_num);
+
+  uint64_t int_params_device_ptr = allocator->GetDeviceMemPtr(int_params_buffer);
+
+  int use_nesterov = param->use_nesterov_ ? 1 : 0;
+
+  SetKernelArg({weight_device_ptr, accumulate_device_ptr, grad_device_ptr, float_params_device_ptr,
+                int_params_device_ptr, static_cast<uint64_t>(use_nesterov)});
+
+  int ret = RET_ERROR;
+  if (data_type == kNumberTypeFloat32) {
+    ret = SgdRunFp32();
+  } else if (data_type == kNumberTypeFloat16) {
+    ret = SgdRunFp16();
+  } else {
+    free_all_buffers();
+    MS_LOG(ERROR) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  free_all_buffers();
+
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << this->name() << " Run failed! ";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_SGD, DSPKernelCreator<SgdDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_SGD, DSPKernelCreator<SgdDSPKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.h b/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e0d9fba05a5e3ce6dbe49f52c362949afa0ff2d
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.h
@@ -0,0 +1,55 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_
+
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+constexpr size_t kSgdInputTensorSize = 6;
+constexpr size_t kSgdOutputTensorSize = 1;
+constexpr size_t kSgdFloatParamSize = 4;
+constexpr size_t kSgdIntParamSize = 2;
+
+constexpr size_t kSgdWeightIdx = 0;
+constexpr size_t kSgdGradientIdx = 1;
+constexpr size_t kSgdLrIdx = 2;
+constexpr size_t kSgdAccumulateIdx = 3;
+constexpr size_t kSgdMomentumIdx = 4;
+constexpr size_t kSgdStatIdx = 5;
+
+class SgdDSPKernel : public DSPKernel {
+ public:
+  using DSPKernel::DSPKernel;
+
+  ~SgdDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;
+  int Run() override;
+
+ private:
+  int SgdRunFp32();
+  int SgdRunFp16();
+
+  std::string kernel_name_;
+  uint64_t core_mask_{0};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c5a6e0a142c124e9ddd7cab3c999c171260d3e4
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.cc
@@ -0,0 +1,200 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft78/sgd.h"
+
+#include <cstring>
+#include <limits>
+#include <string>
+
+#include "src/common/utils.h"
+#include "src/litert/kernel/cpu/nnacl_c/nnacl_common.h"
+#include "src/litert/kernel/cpu/nnacl_c/fp32_grad/optimizer.h"
+#include "src/litert/kernel/cpu/nnacl_c/base/cast_base.h"
+#include "src/litert/kernel_registry.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_SGD;
+
+namespace mindspore::kernel {
+int SgdDSPKernel::Prepare() { return RET_OK; }
+
+int SgdDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != kSgdInputTensorSize) {
+    MS_LOG(WARNING) << "Input size mismatch: expected " << kSgdInputTensorSize << ", got " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != kSgdOutputTensorSize) {
+    MS_LOG(WARNING) << "Output size mismatch: expected " << kSgdOutputTensorSize << ", got " << out_tensors_.size();
+    return RET_ERROR;
+  }
+
+  auto weight_shape = in_tensors_[kSgdWeightIdx]->shape();
+  if (weight_shape != in_tensors_[kSgdAccumulateIdx]->shape() ||
+      weight_shape != in_tensors_[kSgdGradientIdx]->shape()) {
+    MS_LOG(WARNING) << "Weight, accumulate or gradient tensor shapes mismatch.";
+    return RET_ERROR;
+  }
+
+  auto data_type = in_tensors_[kSgdWeightIdx]->data_type();
+  if (data_type != kNumberTypeFloat32) {
+    MS_LOG(WARNING) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  auto check_scalar = [&](const lite::Tensor *tensor) -> bool {
+    return tensor != nullptr && tensor->ElementsNum() == 1 && tensor->data_type() == kNumberTypeFloat32;
+  };
+
+  if (!check_scalar(in_tensors_[kSgdLrIdx]) || !check_scalar(in_tensors_[kSgdMomentumIdx])) {
+    MS_LOG(WARNING) << "Optimizer scalar tensors are invalid.";
+    return RET_ERROR;
+  }
+
+  return RET_OK;
+}
+
+int SgdDSPKernel::SgdRunFp32() {
+  kernel_name_ = "fp_sgd_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int SgdDSPKernel::Run() {
+  auto allocator = dsp_runtime_->GetAllocator();
+
+  auto *weight = in_tensors_[kSgdWeightIdx];
+
+  int64_t elements_num = weight->ElementsNum();
+  if (elements_num <= 0) {
+    MS_LOG(ERROR) << "Invalid tensor length: " << elements_num;
+    return RET_ERROR;
+  }
+  if (elements_num > std::numeric_limits<int32_t>::max()) {
+    MS_LOG(ERROR) << "Tensor length overflow: " << elements_num;
+    return RET_ERROR;
+  }
+
+  auto *param = reinterpret_cast<SgdParameter *>(op_parameter_);
+
+  uint64_t weight_device_ptr = allocator->GetDeviceMemPtr(weight->data());
+  uint64_t accumulate_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kSgdAccumulateIdx]->data());
+  uint64_t grad_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kSgdGradientIdx]->data());
+
+  if (weight_device_ptr == 0 || accumulate_device_ptr == 0 || grad_device_ptr == 0) {
+    MS_LOG(ERROR) << "Failed to obtain device pointers for Sgd tensors.";
+    return RET_ERROR;
+  }
+
+  size_t float_param_bytes = sizeof(float) * kSgdFloatParamSize;
+
+  void *float_params_buffer = allocator->Malloc(float_param_bytes);
+  if (float_params_buffer == nullptr) {
+    MS_LOG(ERROR) << "Failed to allocate float parameter buffer.";
+    return RET_ERROR;
+  }
+  auto free_float_buffer = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+  };
+
+  // Pack float params: [lr, dampening, momentum, weight_decay]
+  float float_params[kSgdFloatParamSize] = {0.f};
+
+  // LR
+  const lite::Tensor *lr_tensor = in_tensors_[kSgdLrIdx];
+  if (lr_tensor == nullptr || lr_tensor->data() == nullptr) {
+    free_float_buffer();
+    MS_LOG(ERROR) << "LR tensor is invalid.";
+    return RET_ERROR;
+  }
+  float_params[0] = *(reinterpret_cast<float *>(lr_tensor->data()));
+
+  // Dampening
+  float_params[1] = param->dampening_;
+
+  // Momentum
+  const lite::Tensor *momentum_tensor = in_tensors_[kSgdMomentumIdx];
+  if (momentum_tensor == nullptr || momentum_tensor->data() == nullptr) {
+    free_float_buffer();
+    MS_LOG(ERROR) << "Momentum tensor is invalid.";
+    return RET_ERROR;
+  }
+  float_params[2] = *(reinterpret_cast<float *>(momentum_tensor->data()));
+
+  // Weight Decay
+  float_params[3] = param->weight_decay_;
+
+  std::memcpy(float_params_buffer, float_params, float_param_bytes);
+
+  uint64_t float_params_device_ptr = allocator->GetDeviceMemPtr(float_params_buffer);
+  if (float_params_device_ptr == 0) {
+    free_float_buffer();
+    MS_LOG(ERROR) << "Failed to obtain device pointer for float parameter buffer.";
+    return RET_ERROR;
+  }
+
+  void *int_params_buffer = allocator->Malloc(sizeof(int32_t) * kSgdIntParamSize);
+  if (int_params_buffer == nullptr) {
+    free_float_buffer();
+    MS_LOG(ERROR) << "Failed to allocate int parameter buffer.";
+    return RET_ERROR;
+  }
+  auto free_all_buffers = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+    if (int_params_buffer != nullptr) {
+      allocator->Free(int_params_buffer);
+      int_params_buffer = nullptr;
+    }
+  };
+
+  auto *int_params = reinterpret_cast<int32_t *>(int_params_buffer);
+  int_params[0] = 0;
+  int_params[1] = static_cast<int32_t>(elements_num);
+
+  uint64_t int_params_device_ptr = allocator->GetDeviceMemPtr(int_params_buffer);
+  if (int_params_device_ptr == 0) {
+    free_all_buffers();
+    MS_LOG(ERROR) << "Failed to obtain device pointer for int parameter buffer.";
+    return RET_ERROR;
+  }
+
+  int use_nesterov = param->use_nesterov_ ? 1 : 0;
+
+  SetKernelArg({weight_device_ptr, accumulate_device_ptr, grad_device_ptr, float_params_device_ptr,
+                int_params_device_ptr, static_cast<uint64_t>(use_nesterov)});
+
+  int ret = SgdRunFp32();
+
+  free_all_buffers();
+
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << this->name() << " Run failed! ";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_SGD, DSPKernelCreator<SgdDSPKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.h b/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.h
new file mode 100644
index 0000000000000000000000000000000000000000..29584fe5845e14dbcb7c6cdd3411784658dd6c18
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_ + +#include +#include "src/litert/kernel/dsp/dsp_kernel.h" + +namespace mindspore::kernel { +constexpr size_t kSgdInputTensorSize = 6; +constexpr size_t kSgdOutputTensorSize = 1; +constexpr size_t kSgdFloatParamSize = 4; +constexpr size_t kSgdIntParamSize = 2; + +constexpr size_t kSgdWeightIdx = 0; +constexpr size_t kSgdGradientIdx = 1; +constexpr size_t kSgdLrIdx = 2; +constexpr size_t kSgdAccumulateIdx = 3; +constexpr size_t kSgdMomentumIdx = 4; +constexpr size_t kSgdStatIdx = 5; + +class SgdDSPKernel : public DSPKernel { + public: + using DSPKernel::DSPKernel; + + ~SgdDSPKernel() override = default; + + int Prepare() override; + int CheckSpecs() override; + int Run() override; + + private: + int SgdRunFp32(); + + std::string kernel_name_; + uint64_t core_mask_{0}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_ diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h b/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h index 88419f42d7e853af569ac4d207993293a3f96258..450e0d6c8c5a67f6b0d9675569fd25acd1494ba7 100644 --- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h @@ -46,6 +46,98 @@ class DSPCommonTest : public CommonTest { dsp_runtime_wrapper_ = nullptr; } + // Local IEEE754 half <-> float converters to avoid any linkage/impl mismatch in tests. 
+ float fp16_to_fp32(uint16_t h) { + uint32_t sign = (static_cast(h) & 0x8000u) << 16; + uint32_t exp = (static_cast(h) & 0x7C00u) >> 10; + uint32_t mant = static_cast(h & 0x03FFu); + uint32_t f; + if (exp == 0) { + if (mant == 0) { + f = sign; // zero + } else { + // subnormal -> normalize + exp = 1; + while ((mant & 0x0400u) == 0) { + mant <<= 1; + --exp; + } + mant &= 0x03FFu; + uint32_t fexp = (exp + (127 - 15)) << 23; + f = sign | fexp | (mant << 13); + } + } else if (exp == 0x1Fu) { // Inf/NaN + f = sign | 0x7F800000u | (mant << 13); + } else { + uint32_t fexp = (exp + (127 - 15)) << 23; + f = sign | fexp | (mant << 13); + } + float out; + std::memcpy(&out, &f, sizeof(out)); + return out; + } + + uint16_t fp32_to_fp16(float val) { + uint32_t fbits; + std::memcpy(&fbits, &val, sizeof(fbits)); + uint32_t sign = (fbits >> 16) & 0x8000u; + uint32_t fexp = (fbits >> 23) & 0xFFu; + uint32_t fmant = fbits & 0x007FFFFFu; + + // NaN/Inf handling + if (fexp == 0xFFu) { + if (fmant != 0) { + // NaN: keep a quiet NaN in half + return static_cast(sign | 0x7C00u | 0x0001u); + } + // Inf + return static_cast(sign | 0x7C00u); + } + + // Rebias exponent for half + int32_t hexp = static_cast(fexp) - 127 + 15; + + if (hexp <= 0) { + // Subnormal or underflow to zero in half + if (hexp < -10) { + return static_cast(sign); // Underflow to zero + } + // Make implicit leading 1 explicit + uint32_t mant = fmant | 0x00800000u; + // Shift to align to half subnormal mantissa (10 bits) + int shift = 1 - hexp; // shift in [1..10] + // Compute mantissa with round-to-nearest-even + uint32_t mant_rounded = mant >> (shift + 13); + uint32_t round_bit = (mant >> (shift + 12)) & 1u; + uint32_t sticky = (mant & ((1u << (shift + 12)) - 1u)) != 0u; + mant_rounded += (round_bit & (sticky | (mant_rounded & 1u))); + return static_cast(sign | static_cast(mant_rounded)); + } + + if (hexp >= 0x1F) { + // Overflow to half inf + return static_cast(sign | 0x7C00u); + } + + // Normal case: build exponent 
and mantissa with round-to-nearest-even + uint16_t hexp_field = static_cast(hexp) << 10; + uint32_t mant = fmant; + uint32_t mant_rounded = mant >> 13; + uint32_t round_bit = (mant >> 12) & 1u; + uint32_t sticky = (mant & 0xFFFu) != 0u; + mant_rounded += (round_bit & (sticky | (mant_rounded & 1u))); + if (mant_rounded == 0x400u) { + // Mantissa overflow after rounding; bump exponent, zero mantissa + mant_rounded = 0; + hexp_field = static_cast(hexp_field + 0x0400u); + if (hexp_field >= 0x7C00u) { + // Exponent overflow -> inf + return static_cast(sign | 0x7C00u); + } + } + return static_cast(sign | hexp_field | static_cast(mant_rounded)); + } + protected: dsp::DSPRuntimeInnerWrapper *dsp_runtime_wrapper_{nullptr}; std::shared_ptr allocator_; diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/sgd_test.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/sgd_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f17461b08a0d491c7cb0ceaa509892fe6d420d66 --- /dev/null +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/sgd_test.cc @@ -0,0 +1,332 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include "ut/src/runtime/kernel/dsp/dsp_test.h" +#include "include/api/context.h" +#include "include/api/data_type.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel_registry.h" +#include "src/litert/kernel/cpu/nnacl_c/fp32_grad/optimizer.h" +#ifdef SUPPORT_FT78 +#include "src/litert/kernel/dsp/ft78/sgd.h" +#else +#include "src/litert/kernel/dsp/ft04/sgd.h" +#endif + +namespace mindspore::lite::dsp::test { +namespace { +constexpr int kTensorLength = 10000; +constexpr float kLearningRate = 0.00001f; +constexpr float kDampening = 0.1f; +constexpr float kMomentum = 0.9f; +constexpr float kWeightDecay = 0.0001f; + +// Reference implementation for SGD +void DoSgdRef(float *weight, float *accumulate, float *gradient, float learning_rate, float dampening, float moment, + bool nesterov, float weight_decay, int start, int end) { + if (weight_decay > 0.f) { + for (int i = start; i < end; ++i) { + gradient[i] += weight[i] * weight_decay; + } + } + if (moment > 0.f) { + if (nesterov) { + for (int i = start; i < end; ++i) { + accumulate[i] = accumulate[i] * moment + gradient[i] * (1.f - dampening); + weight[i] -= (accumulate[i] * moment + gradient[i]) * learning_rate; + } + } else { + for (int i = start; i < end; ++i) { + accumulate[i] = accumulate[i] * moment + gradient[i] * (1.f - dampening); + weight[i] -= accumulate[i] * learning_rate; + } + } + } else { + for (int i = start; i < end; ++i) { + weight[i] -= gradient[i] * learning_rate; + } + } +} + +OpParameter *CreateSgdParameter(float dampening, float weight_decay, bool use_nesterov) { + auto *param = new SgdParameter(); + param->op_parameter_.type_ = schema::PrimitiveType_SGD; + param->dampening_ = dampening; + param->weight_decay_ = weight_decay; + param->use_nesterov_ = use_nesterov; + return reinterpret_cast(param); +} + +void BuildInitialData(std::vector *weight, std::vector *accumulate, std::vector *gradient) { + 
weight->resize(kTensorLength); + accumulate->resize(kTensorLength); + gradient->resize(kTensorLength); + for (int i = 0; i < kTensorLength; ++i) { + (*weight)[i] = 0.5f + 0.001f * static_cast(i % 100); // stay near [0.5,1.5) + (*accumulate)[i] = 0.0f; // zero momentum to limit growth + int t = i % 200; + (*gradient)[i] = 0.05f - 0.0005f * static_cast(t); // wrap every 200 elems into [-0.05,0.05] + } +} + +} // namespace + +class TestDSP_Sgd : public DSPCommonTest {}; + +TEST_F(TestDSP_Sgd, Sgd_Fp32) { + InitDSPRuntime(); + + std::vector inputs; + std::vector outputs; + std::vector tensors_to_delete; + + std::vector param_shape = {kTensorLength}; + std::vector scalar_shape = {1}; + + // 0: Weight + auto weight_tensor = new lite::Tensor(kNumberTypeFloat32, param_shape, mindspore::NHWC, lite::Category::VAR); + weight_tensor->MallocData(allocator_); + inputs.push_back(weight_tensor); + outputs.push_back(weight_tensor); + tensors_to_delete.push_back(weight_tensor); + + // 1: Gradient + auto gradient_tensor = + new lite::Tensor(kNumberTypeFloat32, param_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + gradient_tensor->MallocData(allocator_); + inputs.push_back(gradient_tensor); + tensors_to_delete.push_back(gradient_tensor); + + // 2: Learning Rate + auto lr_tensor = new lite::Tensor(kNumberTypeFloat32, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + lr_tensor->MallocData(allocator_); + inputs.push_back(lr_tensor); + tensors_to_delete.push_back(lr_tensor); + + // 3: Accumulate + auto accumulate_tensor = new lite::Tensor(kNumberTypeFloat32, param_shape, mindspore::NHWC, lite::Category::VAR); + accumulate_tensor->MallocData(allocator_); + inputs.push_back(accumulate_tensor); + tensors_to_delete.push_back(accumulate_tensor); + + // 4: Momentum + auto momentum_tensor = + new lite::Tensor(kNumberTypeFloat32, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + momentum_tensor->MallocData(allocator_); + inputs.push_back(momentum_tensor); + 
tensors_to_delete.push_back(momentum_tensor); + + // 5: Stat + auto stat_tensor = new lite::Tensor(kNumberTypeFloat32, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + stat_tensor->MallocData(allocator_); + inputs.push_back(stat_tensor); + tensors_to_delete.push_back(stat_tensor); + + // Initialize data + std::vector initial_weight; + std::vector initial_accumulate; + std::vector initial_gradient; + BuildInitialData(&initial_weight, &initial_accumulate, &initial_gradient); + + std::copy(initial_weight.begin(), initial_weight.end(), reinterpret_cast(weight_tensor->MutableData())); + std::copy(initial_accumulate.begin(), initial_accumulate.end(), + reinterpret_cast(accumulate_tensor->MutableData())); + std::copy(initial_gradient.begin(), initial_gradient.end(), + reinterpret_cast(gradient_tensor->MutableData())); + + reinterpret_cast(lr_tensor->MutableData())[0] = kLearningRate; + reinterpret_cast(momentum_tensor->MutableData())[0] = kMomentum; + reinterpret_cast(stat_tensor->MutableData())[0] = 1.0f; // Assume stat > 0 for normal step + + // Run Reference + auto expected_weight = initial_weight; + auto expected_accumulate = initial_accumulate; + auto expected_gradient = initial_gradient; // Gradient is modified in place if weight_decay > 0 + + DoSgdRef(expected_weight.data(), expected_accumulate.data(), expected_gradient.data(), kLearningRate, kDampening, + kMomentum, false, kWeightDecay, 0, kTensorLength); + + // Run DSP Kernel + auto ctx = new lite::InnerContext; + ASSERT_NE(ctx, nullptr); + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + auto *param = CreateSgdParameter(kDampening, kWeightDecay, false); + ASSERT_NE(param, nullptr); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_SGD}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + + auto kernel = creator(inputs, outputs, param, ctx, key); + ASSERT_NE(kernel, nullptr); + + ASSERT_EQ(lite::RET_OK, 
kernel->Prepare()); + ASSERT_EQ(lite::RET_OK, kernel->Run()); + + auto weight_data = reinterpret_cast(weight_tensor->MutableData()); + + ASSERT_EQ(0, CompareOutputData(weight_data, expected_weight.data(), kTensorLength, 1e-5f)); + + UninitDSPRuntime(); + delete ctx; + for (auto *tensor : tensors_to_delete) delete tensor; + delete kernel; +} + +#ifndef SUPPORT_FT78 +TEST_F(TestDSP_Sgd, Sgd_Fp16) { + InitDSPRuntime(); + + std::vector inputs; + std::vector outputs; + std::vector tensors_to_delete; + + std::vector param_shape = {kTensorLength}; + std::vector scalar_shape = {1}; + + // 0: Weight + auto weight_tensor = new lite::Tensor(kNumberTypeFloat16, param_shape, mindspore::NHWC, lite::Category::VAR); + weight_tensor->MallocData(allocator_); + inputs.push_back(weight_tensor); + outputs.push_back(weight_tensor); + tensors_to_delete.push_back(weight_tensor); + + // 1: Gradient + auto gradient_tensor = + new lite::Tensor(kNumberTypeFloat16, param_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + gradient_tensor->MallocData(allocator_); + inputs.push_back(gradient_tensor); + tensors_to_delete.push_back(gradient_tensor); + + // 2: Learning Rate + auto lr_tensor = new lite::Tensor(kNumberTypeFloat16, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + lr_tensor->MallocData(allocator_); + inputs.push_back(lr_tensor); + tensors_to_delete.push_back(lr_tensor); + + // 3: Accumulate + auto accumulate_tensor = new lite::Tensor(kNumberTypeFloat16, param_shape, mindspore::NHWC, lite::Category::VAR); + accumulate_tensor->MallocData(allocator_); + inputs.push_back(accumulate_tensor); + tensors_to_delete.push_back(accumulate_tensor); + + // 4: Momentum + auto momentum_tensor = + new lite::Tensor(kNumberTypeFloat16, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + momentum_tensor->MallocData(allocator_); + inputs.push_back(momentum_tensor); + tensors_to_delete.push_back(momentum_tensor); + + // 5: Stat + auto stat_tensor = new 
lite::Tensor(kNumberTypeFloat16, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + stat_tensor->MallocData(allocator_); + inputs.push_back(stat_tensor); + tensors_to_delete.push_back(stat_tensor); + + // Initialize data (FP32 source) + std::vector initial_weight; + std::vector initial_accumulate; + std::vector initial_gradient; + BuildInitialData(&initial_weight, &initial_accumulate, &initial_gradient); + + // Convert to FP16 and fill tensors + auto *weight_ptr = reinterpret_cast(weight_tensor->MutableData()); + auto *accumulate_ptr = reinterpret_cast(accumulate_tensor->MutableData()); + auto *gradient_ptr = reinterpret_cast(gradient_tensor->MutableData()); + + for (int i = 0; i < kTensorLength; ++i) { + weight_ptr[i] = fp32_to_fp16(initial_weight[i]); + accumulate_ptr[i] = fp32_to_fp16(initial_accumulate[i]); + gradient_ptr[i] = fp32_to_fp16(initial_gradient[i]); + } + + auto lr_half = fp32_to_fp16(kLearningRate); + auto momentum_half = fp32_to_fp16(kMomentum); + auto stat_half = fp32_to_fp16(1.0f); + auto dampening_half = fp32_to_fp16(kDampening); + auto weight_decay_half = fp32_to_fp16(kWeightDecay); + reinterpret_cast(lr_tensor->MutableData())[0] = lr_half; + reinterpret_cast(momentum_tensor->MutableData())[0] = momentum_half; + reinterpret_cast(stat_tensor->MutableData())[0] = stat_half; + + // Run Reference (using FP32 for simplicity, but inputs are what we put in) + // Note: Precision loss is expected. 
+ // Use values that have been round-tripped through FP16 to match DSP input + std::vector ref_weight(kTensorLength); + std::vector ref_accumulate(kTensorLength); + std::vector ref_gradient(kTensorLength); + + for (int i = 0; i < kTensorLength; ++i) { + ref_weight[i] = fp16_to_fp32(weight_ptr[i]); + ref_accumulate[i] = fp16_to_fp32(accumulate_ptr[i]); + ref_gradient[i] = fp16_to_fp32(gradient_ptr[i]); + } + + float ref_lr = fp16_to_fp32(lr_half); + float ref_momentum = fp16_to_fp32(momentum_half); + float ref_dampening = fp16_to_fp32(dampening_half); + float ref_weight_decay = fp16_to_fp32(weight_decay_half); + + DoSgdRef(ref_weight.data(), ref_accumulate.data(), ref_gradient.data(), ref_lr, ref_dampening, ref_momentum, false, + ref_weight_decay, 0, kTensorLength); + + std::vector ref_weight_quantized(kTensorLength); + for (int i = 0; i < kTensorLength; ++i) { + ref_weight_quantized[i] = fp16_to_fp32(fp32_to_fp16(ref_weight[i])); + } + + // Run DSP Kernel + auto ctx = new lite::InnerContext; + ASSERT_NE(ctx, nullptr); + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + auto *param = CreateSgdParameter(kDampening, kWeightDecay, false); + ASSERT_NE(param, nullptr); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_SGD}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + + auto kernel = creator(inputs, outputs, param, ctx, key); + ASSERT_NE(kernel, nullptr); + + ASSERT_EQ(lite::RET_OK, kernel->Prepare()); + ASSERT_EQ(lite::RET_OK, kernel->Run()); + + auto weight_data_fp16 = reinterpret_cast(weight_tensor->MutableData()); + std::vector weight_data_fp32(kTensorLength); + for (int i = 0; i < kTensorLength; ++i) { + weight_data_fp32[i] = fp16_to_fp32(weight_data_fp16[i]); + } + + // FP16 precision is lower, so tolerance is higher + ASSERT_EQ(0, CompareOutputData(weight_data_fp32.data(), ref_weight_quantized.data(), kTensorLength, 5e-3f)); + + UninitDSPRuntime(); + delete ctx; + 
for (auto *tensor : tensors_to_delete) delete tensor; + delete kernel; +} + +#endif // not SUPPORT_FT78 + +} // namespace mindspore::lite::dsp::test