diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc9361b31e505bb19d6cad6ccb427e51fb6ebbb3
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.cc
@@ -0,0 +1,219 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft04/sgd.h"
+
+#include <cstring>
+#include <limits>
+#include <string>
+
+#include "src/common/utils.h"
+#include "src/litert/kernel/cpu/nnacl_c/nnacl_common.h"
+#include "src/litert/kernel/cpu/nnacl_c/fp32_grad/optimizer.h"
+#include "src/litert/kernel/cpu/nnacl_c/base/cast_base.h"
+#include "src/litert/kernel_registry.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_SGD;
+
+namespace mindspore::kernel {
+int SgdDSPKernel::Prepare() { return RET_OK; }
+
+int SgdDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != kSgdInputTensorSize) {
+    MS_LOG(WARNING) << "Input size mismatch: expected " << kSgdInputTensorSize << ", got " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != kSgdOutputTensorSize) {
+    MS_LOG(WARNING) << "Output size mismatch: expected " << kSgdOutputTensorSize << ", got " << out_tensors_.size();
+    return RET_ERROR;
+  }
+
+  auto weight_shape = in_tensors_[kSgdWeightIdx]->shape();
+  if (weight_shape != in_tensors_[kSgdAccumulateIdx]->shape() ||
+      weight_shape != in_tensors_[kSgdGradientIdx]->shape()) {
+    MS_LOG(WARNING) << "Weight, accumulate or gradient tensor shapes mismatch.";
+    return RET_ERROR;
+  }
+
+  auto data_type = in_tensors_[kSgdWeightIdx]->data_type();
+  if (data_type != kNumberTypeFloat32 && data_type != kNumberTypeFloat16) {
+    MS_LOG(WARNING) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  auto check_scalar = [&](const lite::Tensor *tensor) -> bool {
+    if (tensor == nullptr || tensor->ElementsNum() != 1) {
+      return false;
+    }
+    auto tensor_type = tensor->data_type();
+    if (data_type == kNumberTypeFloat32) {
+      return tensor_type == kNumberTypeFloat32;
+    }
+    return tensor_type == kNumberTypeFloat16;
+  };
+
+  if (!check_scalar(in_tensors_[kSgdLrIdx]) || !check_scalar(in_tensors_[kSgdMomentumIdx])) {
+    MS_LOG(WARNING) << "Optimizer scalar tensors are invalid.";
+    return RET_ERROR;
+  }
+
+  return RET_OK;
+}
+
+int SgdDSPKernel::SgdRunFp32() {
+  kernel_name_ = "fp_sgd_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int SgdDSPKernel::SgdRunFp16() {
+  kernel_name_ = "hp_sgd_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int SgdDSPKernel::Run() {
+  auto allocator = dsp_runtime_->GetAllocator();
+
+  auto *weight = in_tensors_[kSgdWeightIdx];
+
+  int64_t elements_num = weight->ElementsNum();
+
+  auto data_type = weight->data_type();
+  auto *param = reinterpret_cast<SgdParameter *>(op_parameter_);
+
+  uint64_t weight_device_ptr = allocator->GetDeviceMemPtr(weight->data());
+  uint64_t accumulate_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kSgdAccumulateIdx]->data());
+  uint64_t grad_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kSgdGradientIdx]->data());
+
+  size_t float_param_bytes = 0;
+  if (data_type == kNumberTypeFloat32) {
+    float_param_bytes = sizeof(float) * kSgdFloatParamSize;
+  } else if (data_type == kNumberTypeFloat16) {
+    float_param_bytes = sizeof(uint16_t) * kSgdFloatParamSize;
+  } else {
+    MS_LOG(ERROR) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  void *float_params_buffer = allocator->Malloc(float_param_bytes);
+
+  auto free_float_buffer = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+  };
+
+  // Pack float params: [lr, dampening, momentum, weight_decay]
+  if (data_type == kNumberTypeFloat32) {
+    float float_params[kSgdFloatParamSize] = {0.f};
+
+    // LR
+    const lite::Tensor *lr_tensor = in_tensors_[kSgdLrIdx];
+    if (lr_tensor == nullptr || lr_tensor->data() == nullptr) {
+      free_float_buffer();
+      MS_LOG(ERROR) << "LR tensor is invalid.";
+      return RET_ERROR;
+    }
+    float_params[0] = *(reinterpret_cast<float *>(lr_tensor->data()));
+
+    // Dampening
+    float_params[1] = param->dampening_;
+
+    // Momentum
+    const lite::Tensor *momentum_tensor = in_tensors_[kSgdMomentumIdx];
+
+    float_params[2] = *(reinterpret_cast<float *>(momentum_tensor->data()));
+
+    // Weight Decay
+    float_params[3] = param->weight_decay_;
+
+    std::memcpy(float_params_buffer, float_params, float_param_bytes);
+  } else {
+    uint16_t float16_params[kSgdFloatParamSize] = {0};
+
+    // LR
+    const lite::Tensor *lr_tensor = in_tensors_[kSgdLrIdx];
+
+    float16_params[0] = *(reinterpret_cast<uint16_t *>(lr_tensor->data()));
+
+    // Dampening
+    float16_params[1] = Float32ToFloat16_(param->dampening_);
+
+    // Momentum
+    const lite::Tensor *momentum_tensor = in_tensors_[kSgdMomentumIdx];
+    float16_params[2] = *(reinterpret_cast<uint16_t *>(momentum_tensor->data()));
+
+    // Weight Decay
+    float16_params[3] = Float32ToFloat16_(param->weight_decay_);
+
+    std::memcpy(float_params_buffer, float16_params, float_param_bytes);
+  }
+
+  uint64_t float_params_device_ptr = allocator->GetDeviceMemPtr(float_params_buffer);
+
+  void *int_params_buffer = allocator->Malloc(sizeof(int32_t) * kSgdIntParamSize);
+
+  auto free_all_buffers = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+    if (int_params_buffer != nullptr) {
+      allocator->Free(int_params_buffer);
+      int_params_buffer = nullptr;
+    }
+  };
+
+  auto *int_params = reinterpret_cast<int32_t *>(int_params_buffer);
+  int_params[0] = 0;
+  int_params[1] = static_cast<int32_t>(elements_num);
+
+  uint64_t int_params_device_ptr = allocator->GetDeviceMemPtr(int_params_buffer);
+
+  int use_nesterov = param->use_nesterov_ ? 1 : 0;
+
+  SetKernelArg({weight_device_ptr, accumulate_device_ptr, grad_device_ptr, float_params_device_ptr,
+                int_params_device_ptr, static_cast<uint64_t>(use_nesterov)});
+
+  int ret = RET_ERROR;
+  if (data_type == kNumberTypeFloat32) {
+    ret = SgdRunFp32();
+  } else if (data_type == kNumberTypeFloat16) {
+    ret = SgdRunFp16();
+  } else {
+    free_all_buffers();
+    MS_LOG(ERROR) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  free_all_buffers();
+
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << this->name() << " Run failed! ";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_SGD, DSPKernelCreator<SgdDSPKernel>)
+REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_SGD, DSPKernelCreator<SgdDSPKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.h b/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e0d9fba05a5e3ce6dbe49f52c362949afa0ff2d
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft04/sgd.h
@@ -0,0 +1,55 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_
+
+#include <string>
+#include "src/litert/kernel/dsp/dsp_kernel.h"
+
+namespace mindspore::kernel {
+constexpr size_t kSgdInputTensorSize = 6;
+constexpr size_t kSgdOutputTensorSize = 1;
+constexpr size_t kSgdFloatParamSize = 4;
+constexpr size_t kSgdIntParamSize = 2;
+
+constexpr size_t kSgdWeightIdx = 0;
+constexpr size_t kSgdGradientIdx = 1;
+constexpr size_t kSgdLrIdx = 2;
+constexpr size_t kSgdAccumulateIdx = 3;
+constexpr size_t kSgdMomentumIdx = 4;
+constexpr size_t kSgdStatIdx = 5;
+
+class SgdDSPKernel : public DSPKernel {
+ public:
+  using DSPKernel::DSPKernel;
+
+  ~SgdDSPKernel() override = default;
+
+  int Prepare() override;
+  int CheckSpecs() override;
+  int Run() override;
+
+ private:
+  int SgdRunFp32();
+  int SgdRunFp16();
+
+  std::string kernel_name_;
+  uint64_t core_mask_{0};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c5a6e0a142c124e9ddd7cab3c999c171260d3e4
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.cc
@@ -0,0 +1,200 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/litert/kernel/dsp/ft78/sgd.h"
+
+#include <cstring>
+#include <limits>
+#include <string>
+
+#include "src/common/utils.h"
+#include "src/litert/kernel/cpu/nnacl_c/nnacl_common.h"
+#include "src/litert/kernel/cpu/nnacl_c/fp32_grad/optimizer.h"
+#include "src/litert/kernel/cpu/nnacl_c/base/cast_base.h"
+#include "src/litert/kernel_registry.h"
+
+using mindspore::kernel::KERNEL_ARCH::kDSP;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_SGD;
+
+namespace mindspore::kernel {
+int SgdDSPKernel::Prepare() { return RET_OK; }
+
+int SgdDSPKernel::CheckSpecs() {
+  if (in_tensors_.size() != kSgdInputTensorSize) {
+    MS_LOG(WARNING) << "Input size mismatch: expected " << kSgdInputTensorSize << ", got " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (out_tensors_.size() != kSgdOutputTensorSize) {
+    MS_LOG(WARNING) << "Output size mismatch: expected " << kSgdOutputTensorSize << ", got " << out_tensors_.size();
+    return RET_ERROR;
+  }
+
+  auto weight_shape = in_tensors_[kSgdWeightIdx]->shape();
+  if (weight_shape != in_tensors_[kSgdAccumulateIdx]->shape() ||
+      weight_shape != in_tensors_[kSgdGradientIdx]->shape()) {
+    MS_LOG(WARNING) << "Weight, accumulate or gradient tensor shapes mismatch.";
+    return RET_ERROR;
+  }
+
+  auto data_type = in_tensors_[kSgdWeightIdx]->data_type();
+  if (data_type != kNumberTypeFloat32) {
+    MS_LOG(WARNING) << "Unsupported data type: " << static_cast<int>(data_type);
+    return RET_ERROR;
+  }
+
+  auto check_scalar = [&](const lite::Tensor *tensor) -> bool {
+    return tensor != nullptr && tensor->ElementsNum() == 1 && tensor->data_type() == kNumberTypeFloat32;
+  };
+
+  if (!check_scalar(in_tensors_[kSgdLrIdx]) || !check_scalar(in_tensors_[kSgdMomentumIdx])) {
+    MS_LOG(WARNING) << "Optimizer scalar tensors are invalid.";
+    return RET_ERROR;
+  }
+
+  return RET_OK;
+}
+
+int SgdDSPKernel::SgdRunFp32() {
+  kernel_name_ = "fp_sgd_s";
+  core_mask_ = 0xf;
+  return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_);
+}
+
+int SgdDSPKernel::Run() {
+  auto allocator = dsp_runtime_->GetAllocator();
+
+  auto *weight = in_tensors_[kSgdWeightIdx];
+
+  int64_t elements_num = weight->ElementsNum();
+  if (elements_num <= 0) {
+    MS_LOG(ERROR) << "Invalid tensor length: " << elements_num;
+    return RET_ERROR;
+  }
+  if (elements_num > std::numeric_limits<int32_t>::max()) {
+    MS_LOG(ERROR) << "Tensor length overflow: " << elements_num;
+    return RET_ERROR;
+  }
+
+  auto *param = reinterpret_cast<SgdParameter *>(op_parameter_);
+
+  uint64_t weight_device_ptr = allocator->GetDeviceMemPtr(weight->data());
+  uint64_t accumulate_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kSgdAccumulateIdx]->data());
+  uint64_t grad_device_ptr = allocator->GetDeviceMemPtr(in_tensors_[kSgdGradientIdx]->data());
+
+  if (weight_device_ptr == 0 || accumulate_device_ptr == 0 || grad_device_ptr == 0) {
+    MS_LOG(ERROR) << "Failed to obtain device pointers for Sgd tensors.";
+    return RET_ERROR;
+  }
+
+  size_t float_param_bytes = sizeof(float) * kSgdFloatParamSize;
+
+  void *float_params_buffer = allocator->Malloc(float_param_bytes);
+  if (float_params_buffer == nullptr) {
+    MS_LOG(ERROR) << "Failed to allocate float parameter buffer.";
+    return RET_ERROR;
+  }
+  auto free_float_buffer = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+  };
+
+  // Pack float params: [lr, dampening, momentum, weight_decay]
+  float float_params[kSgdFloatParamSize] = {0.f};
+
+  // LR
+  const lite::Tensor *lr_tensor = in_tensors_[kSgdLrIdx];
+  if (lr_tensor == nullptr || lr_tensor->data() == nullptr) {
+    free_float_buffer();
+    MS_LOG(ERROR) << "LR tensor is invalid.";
+    return RET_ERROR;
+  }
+  float_params[0] = *(reinterpret_cast<float *>(lr_tensor->data()));
+
+  // Dampening
+  float_params[1] = param->dampening_;
+
+  // Momentum
+  const lite::Tensor *momentum_tensor = in_tensors_[kSgdMomentumIdx];
+  if (momentum_tensor == nullptr || momentum_tensor->data() == nullptr) {
+    free_float_buffer();
+    MS_LOG(ERROR) << "Momentum tensor is invalid.";
+    return RET_ERROR;
+  }
+  float_params[2] = *(reinterpret_cast<float *>(momentum_tensor->data()));
+
+  // Weight Decay
+  float_params[3] = param->weight_decay_;
+
+  std::memcpy(float_params_buffer, float_params, float_param_bytes);
+
+  uint64_t float_params_device_ptr = allocator->GetDeviceMemPtr(float_params_buffer);
+  if (float_params_device_ptr == 0) {
+    free_float_buffer();
+    MS_LOG(ERROR) << "Failed to obtain device pointer for float parameter buffer.";
+    return RET_ERROR;
+  }
+
+  void *int_params_buffer = allocator->Malloc(sizeof(int32_t) * kSgdIntParamSize);
+  if (int_params_buffer == nullptr) {
+    free_float_buffer();
+    MS_LOG(ERROR) << "Failed to allocate int parameter buffer.";
+    return RET_ERROR;
+  }
+  auto free_all_buffers = [&]() {
+    if (float_params_buffer != nullptr) {
+      allocator->Free(float_params_buffer);
+      float_params_buffer = nullptr;
+    }
+    if (int_params_buffer != nullptr) {
+      allocator->Free(int_params_buffer);
+      int_params_buffer = nullptr;
+    }
+  };
+
+  auto *int_params = reinterpret_cast<int32_t *>(int_params_buffer);
+  int_params[0] = 0;
+  int_params[1] = static_cast<int32_t>(elements_num);
+
+  uint64_t int_params_device_ptr = allocator->GetDeviceMemPtr(int_params_buffer);
+  if (int_params_device_ptr == 0) {
+    free_all_buffers();
+    MS_LOG(ERROR) << "Failed to obtain device pointer for int parameter buffer.";
+    return RET_ERROR;
+  }
+
+  int use_nesterov = param->use_nesterov_ ? 1 : 0;
+
+  SetKernelArg({weight_device_ptr, accumulate_device_ptr, grad_device_ptr, float_params_device_ptr,
+                int_params_device_ptr, static_cast<uint64_t>(use_nesterov)});
+
+  int ret = SgdRunFp32();
+
+  free_all_buffers();
+
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << this->name() << " Run failed! ";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_SGD, DSPKernelCreator<SgdDSPKernel>)
+}  // namespace mindspore::kernel
diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.h b/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.h
new file mode 100644
index 0000000000000000000000000000000000000000..29584fe5845e14dbcb7c6cdd3411784658dd6c18
--- /dev/null
+++ b/mindspore-lite/src/litert/kernel/dsp/ft78/sgd.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_ + +#include +#include "src/litert/kernel/dsp/dsp_kernel.h" + +namespace mindspore::kernel { +constexpr size_t kSgdInputTensorSize = 6; +constexpr size_t kSgdOutputTensorSize = 1; +constexpr size_t kSgdFloatParamSize = 4; +constexpr size_t kSgdIntParamSize = 2; + +constexpr size_t kSgdWeightIdx = 0; +constexpr size_t kSgdGradientIdx = 1; +constexpr size_t kSgdLrIdx = 2; +constexpr size_t kSgdAccumulateIdx = 3; +constexpr size_t kSgdMomentumIdx = 4; +constexpr size_t kSgdStatIdx = 5; + +class SgdDSPKernel : public DSPKernel { + public: + using DSPKernel::DSPKernel; + + ~SgdDSPKernel() override = default; + + int Prepare() override; + int CheckSpecs() override; + int Run() override; + + private: + int SgdRunFp32(); + + std::string kernel_name_; + uint64_t core_mask_{0}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_SGD_H_ diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h b/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h index 88419f42d7e853af569ac4d207993293a3f96258..450e0d6c8c5a67f6b0d9675569fd25acd1494ba7 100644 --- a/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/dsp_test.h @@ -46,6 +46,98 @@ class DSPCommonTest : public CommonTest { dsp_runtime_wrapper_ = nullptr; } + // Local IEEE754 half <-> float converters to avoid any linkage/impl mismatch in tests. 
+ float fp16_to_fp32(uint16_t h) { + uint32_t sign = (static_cast(h) & 0x8000u) << 16; + uint32_t exp = (static_cast(h) & 0x7C00u) >> 10; + uint32_t mant = static_cast(h & 0x03FFu); + uint32_t f; + if (exp == 0) { + if (mant == 0) { + f = sign; // zero + } else { + // subnormal -> normalize + exp = 1; + while ((mant & 0x0400u) == 0) { + mant <<= 1; + --exp; + } + mant &= 0x03FFu; + uint32_t fexp = (exp + (127 - 15)) << 23; + f = sign | fexp | (mant << 13); + } + } else if (exp == 0x1Fu) { // Inf/NaN + f = sign | 0x7F800000u | (mant << 13); + } else { + uint32_t fexp = (exp + (127 - 15)) << 23; + f = sign | fexp | (mant << 13); + } + float out; + std::memcpy(&out, &f, sizeof(out)); + return out; + } + + uint16_t fp32_to_fp16(float val) { + uint32_t fbits; + std::memcpy(&fbits, &val, sizeof(fbits)); + uint32_t sign = (fbits >> 16) & 0x8000u; + uint32_t fexp = (fbits >> 23) & 0xFFu; + uint32_t fmant = fbits & 0x007FFFFFu; + + // NaN/Inf handling + if (fexp == 0xFFu) { + if (fmant != 0) { + // NaN: keep a quiet NaN in half + return static_cast(sign | 0x7C00u | 0x0001u); + } + // Inf + return static_cast(sign | 0x7C00u); + } + + // Rebias exponent for half + int32_t hexp = static_cast(fexp) - 127 + 15; + + if (hexp <= 0) { + // Subnormal or underflow to zero in half + if (hexp < -10) { + return static_cast(sign); // Underflow to zero + } + // Make implicit leading 1 explicit + uint32_t mant = fmant | 0x00800000u; + // Shift to align to half subnormal mantissa (10 bits) + int shift = 1 - hexp; // shift in [1..10] + // Compute mantissa with round-to-nearest-even + uint32_t mant_rounded = mant >> (shift + 13); + uint32_t round_bit = (mant >> (shift + 12)) & 1u; + uint32_t sticky = (mant & ((1u << (shift + 12)) - 1u)) != 0u; + mant_rounded += (round_bit & (sticky | (mant_rounded & 1u))); + return static_cast(sign | static_cast(mant_rounded)); + } + + if (hexp >= 0x1F) { + // Overflow to half inf + return static_cast(sign | 0x7C00u); + } + + // Normal case: build exponent 
and mantissa with round-to-nearest-even + uint16_t hexp_field = static_cast(hexp) << 10; + uint32_t mant = fmant; + uint32_t mant_rounded = mant >> 13; + uint32_t round_bit = (mant >> 12) & 1u; + uint32_t sticky = (mant & 0xFFFu) != 0u; + mant_rounded += (round_bit & (sticky | (mant_rounded & 1u))); + if (mant_rounded == 0x400u) { + // Mantissa overflow after rounding; bump exponent, zero mantissa + mant_rounded = 0; + hexp_field = static_cast(hexp_field + 0x0400u); + if (hexp_field >= 0x7C00u) { + // Exponent overflow -> inf + return static_cast(sign | 0x7C00u); + } + } + return static_cast(sign | hexp_field | static_cast(mant_rounded)); + } + protected: dsp::DSPRuntimeInnerWrapper *dsp_runtime_wrapper_{nullptr}; std::shared_ptr allocator_; diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/sgd_test.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/sgd_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f17461b08a0d491c7cb0ceaa509892fe6d420d66 --- /dev/null +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/sgd_test.cc @@ -0,0 +1,332 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include "ut/src/runtime/kernel/dsp/dsp_test.h" +#include "include/api/context.h" +#include "include/api/data_type.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel_registry.h" +#include "src/litert/kernel/cpu/nnacl_c/fp32_grad/optimizer.h" +#ifdef SUPPORT_FT78 +#include "src/litert/kernel/dsp/ft78/sgd.h" +#else +#include "src/litert/kernel/dsp/ft04/sgd.h" +#endif + +namespace mindspore::lite::dsp::test { +namespace { +constexpr int kTensorLength = 10000; +constexpr float kLearningRate = 0.00001f; +constexpr float kDampening = 0.1f; +constexpr float kMomentum = 0.9f; +constexpr float kWeightDecay = 0.0001f; + +// Reference implementation for SGD +void DoSgdRef(float *weight, float *accumulate, float *gradient, float learning_rate, float dampening, float moment, + bool nesterov, float weight_decay, int start, int end) { + if (weight_decay > 0.f) { + for (int i = start; i < end; ++i) { + gradient[i] += weight[i] * weight_decay; + } + } + if (moment > 0.f) { + if (nesterov) { + for (int i = start; i < end; ++i) { + accumulate[i] = accumulate[i] * moment + gradient[i] * (1.f - dampening); + weight[i] -= (accumulate[i] * moment + gradient[i]) * learning_rate; + } + } else { + for (int i = start; i < end; ++i) { + accumulate[i] = accumulate[i] * moment + gradient[i] * (1.f - dampening); + weight[i] -= accumulate[i] * learning_rate; + } + } + } else { + for (int i = start; i < end; ++i) { + weight[i] -= gradient[i] * learning_rate; + } + } +} + +OpParameter *CreateSgdParameter(float dampening, float weight_decay, bool use_nesterov) { + auto *param = new SgdParameter(); + param->op_parameter_.type_ = schema::PrimitiveType_SGD; + param->dampening_ = dampening; + param->weight_decay_ = weight_decay; + param->use_nesterov_ = use_nesterov; + return reinterpret_cast(param); +} + +void BuildInitialData(std::vector *weight, std::vector *accumulate, std::vector *gradient) { + 
weight->resize(kTensorLength); + accumulate->resize(kTensorLength); + gradient->resize(kTensorLength); + for (int i = 0; i < kTensorLength; ++i) { + (*weight)[i] = 0.5f + 0.001f * static_cast(i % 100); // stay near [0.5,1.5) + (*accumulate)[i] = 0.0f; // zero momentum to limit growth + int t = i % 200; + (*gradient)[i] = 0.05f - 0.0005f * static_cast(t); // wrap every 200 elems into [-0.05,0.05] + } +} + +} // namespace + +class TestDSP_Sgd : public DSPCommonTest {}; + +TEST_F(TestDSP_Sgd, Sgd_Fp32) { + InitDSPRuntime(); + + std::vector inputs; + std::vector outputs; + std::vector tensors_to_delete; + + std::vector param_shape = {kTensorLength}; + std::vector scalar_shape = {1}; + + // 0: Weight + auto weight_tensor = new lite::Tensor(kNumberTypeFloat32, param_shape, mindspore::NHWC, lite::Category::VAR); + weight_tensor->MallocData(allocator_); + inputs.push_back(weight_tensor); + outputs.push_back(weight_tensor); + tensors_to_delete.push_back(weight_tensor); + + // 1: Gradient + auto gradient_tensor = + new lite::Tensor(kNumberTypeFloat32, param_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + gradient_tensor->MallocData(allocator_); + inputs.push_back(gradient_tensor); + tensors_to_delete.push_back(gradient_tensor); + + // 2: Learning Rate + auto lr_tensor = new lite::Tensor(kNumberTypeFloat32, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + lr_tensor->MallocData(allocator_); + inputs.push_back(lr_tensor); + tensors_to_delete.push_back(lr_tensor); + + // 3: Accumulate + auto accumulate_tensor = new lite::Tensor(kNumberTypeFloat32, param_shape, mindspore::NHWC, lite::Category::VAR); + accumulate_tensor->MallocData(allocator_); + inputs.push_back(accumulate_tensor); + tensors_to_delete.push_back(accumulate_tensor); + + // 4: Momentum + auto momentum_tensor = + new lite::Tensor(kNumberTypeFloat32, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + momentum_tensor->MallocData(allocator_); + inputs.push_back(momentum_tensor); + 
tensors_to_delete.push_back(momentum_tensor); + + // 5: Stat + auto stat_tensor = new lite::Tensor(kNumberTypeFloat32, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + stat_tensor->MallocData(allocator_); + inputs.push_back(stat_tensor); + tensors_to_delete.push_back(stat_tensor); + + // Initialize data + std::vector initial_weight; + std::vector initial_accumulate; + std::vector initial_gradient; + BuildInitialData(&initial_weight, &initial_accumulate, &initial_gradient); + + std::copy(initial_weight.begin(), initial_weight.end(), reinterpret_cast(weight_tensor->MutableData())); + std::copy(initial_accumulate.begin(), initial_accumulate.end(), + reinterpret_cast(accumulate_tensor->MutableData())); + std::copy(initial_gradient.begin(), initial_gradient.end(), + reinterpret_cast(gradient_tensor->MutableData())); + + reinterpret_cast(lr_tensor->MutableData())[0] = kLearningRate; + reinterpret_cast(momentum_tensor->MutableData())[0] = kMomentum; + reinterpret_cast(stat_tensor->MutableData())[0] = 1.0f; // Assume stat > 0 for normal step + + // Run Reference + auto expected_weight = initial_weight; + auto expected_accumulate = initial_accumulate; + auto expected_gradient = initial_gradient; // Gradient is modified in place if weight_decay > 0 + + DoSgdRef(expected_weight.data(), expected_accumulate.data(), expected_gradient.data(), kLearningRate, kDampening, + kMomentum, false, kWeightDecay, 0, kTensorLength); + + // Run DSP Kernel + auto ctx = new lite::InnerContext; + ASSERT_NE(ctx, nullptr); + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + auto *param = CreateSgdParameter(kDampening, kWeightDecay, false); + ASSERT_NE(param, nullptr); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_SGD}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + + auto kernel = creator(inputs, outputs, param, ctx, key); + ASSERT_NE(kernel, nullptr); + + ASSERT_EQ(lite::RET_OK, 
kernel->Prepare()); + ASSERT_EQ(lite::RET_OK, kernel->Run()); + + auto weight_data = reinterpret_cast(weight_tensor->MutableData()); + + ASSERT_EQ(0, CompareOutputData(weight_data, expected_weight.data(), kTensorLength, 1e-5f)); + + UninitDSPRuntime(); + delete ctx; + for (auto *tensor : tensors_to_delete) delete tensor; + delete kernel; +} + +#ifndef SUPPORT_FT78 +TEST_F(TestDSP_Sgd, Sgd_Fp16) { + InitDSPRuntime(); + + std::vector inputs; + std::vector outputs; + std::vector tensors_to_delete; + + std::vector param_shape = {kTensorLength}; + std::vector scalar_shape = {1}; + + // 0: Weight + auto weight_tensor = new lite::Tensor(kNumberTypeFloat16, param_shape, mindspore::NHWC, lite::Category::VAR); + weight_tensor->MallocData(allocator_); + inputs.push_back(weight_tensor); + outputs.push_back(weight_tensor); + tensors_to_delete.push_back(weight_tensor); + + // 1: Gradient + auto gradient_tensor = + new lite::Tensor(kNumberTypeFloat16, param_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + gradient_tensor->MallocData(allocator_); + inputs.push_back(gradient_tensor); + tensors_to_delete.push_back(gradient_tensor); + + // 2: Learning Rate + auto lr_tensor = new lite::Tensor(kNumberTypeFloat16, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + lr_tensor->MallocData(allocator_); + inputs.push_back(lr_tensor); + tensors_to_delete.push_back(lr_tensor); + + // 3: Accumulate + auto accumulate_tensor = new lite::Tensor(kNumberTypeFloat16, param_shape, mindspore::NHWC, lite::Category::VAR); + accumulate_tensor->MallocData(allocator_); + inputs.push_back(accumulate_tensor); + tensors_to_delete.push_back(accumulate_tensor); + + // 4: Momentum + auto momentum_tensor = + new lite::Tensor(kNumberTypeFloat16, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + momentum_tensor->MallocData(allocator_); + inputs.push_back(momentum_tensor); + tensors_to_delete.push_back(momentum_tensor); + + // 5: Stat + auto stat_tensor = new 
lite::Tensor(kNumberTypeFloat16, scalar_shape, mindspore::NHWC, lite::Category::CONST_TENSOR); + stat_tensor->MallocData(allocator_); + inputs.push_back(stat_tensor); + tensors_to_delete.push_back(stat_tensor); + + // Initialize data (FP32 source) + std::vector initial_weight; + std::vector initial_accumulate; + std::vector initial_gradient; + BuildInitialData(&initial_weight, &initial_accumulate, &initial_gradient); + + // Convert to FP16 and fill tensors + auto *weight_ptr = reinterpret_cast(weight_tensor->MutableData()); + auto *accumulate_ptr = reinterpret_cast(accumulate_tensor->MutableData()); + auto *gradient_ptr = reinterpret_cast(gradient_tensor->MutableData()); + + for (int i = 0; i < kTensorLength; ++i) { + weight_ptr[i] = fp32_to_fp16(initial_weight[i]); + accumulate_ptr[i] = fp32_to_fp16(initial_accumulate[i]); + gradient_ptr[i] = fp32_to_fp16(initial_gradient[i]); + } + + auto lr_half = fp32_to_fp16(kLearningRate); + auto momentum_half = fp32_to_fp16(kMomentum); + auto stat_half = fp32_to_fp16(1.0f); + auto dampening_half = fp32_to_fp16(kDampening); + auto weight_decay_half = fp32_to_fp16(kWeightDecay); + reinterpret_cast(lr_tensor->MutableData())[0] = lr_half; + reinterpret_cast(momentum_tensor->MutableData())[0] = momentum_half; + reinterpret_cast(stat_tensor->MutableData())[0] = stat_half; + + // Run Reference (using FP32 for simplicity, but inputs are what we put in) + // Note: Precision loss is expected. 
+ // Use values that have been round-tripped through FP16 to match DSP input + std::vector ref_weight(kTensorLength); + std::vector ref_accumulate(kTensorLength); + std::vector ref_gradient(kTensorLength); + + for (int i = 0; i < kTensorLength; ++i) { + ref_weight[i] = fp16_to_fp32(weight_ptr[i]); + ref_accumulate[i] = fp16_to_fp32(accumulate_ptr[i]); + ref_gradient[i] = fp16_to_fp32(gradient_ptr[i]); + } + + float ref_lr = fp16_to_fp32(lr_half); + float ref_momentum = fp16_to_fp32(momentum_half); + float ref_dampening = fp16_to_fp32(dampening_half); + float ref_weight_decay = fp16_to_fp32(weight_decay_half); + + DoSgdRef(ref_weight.data(), ref_accumulate.data(), ref_gradient.data(), ref_lr, ref_dampening, ref_momentum, false, + ref_weight_decay, 0, kTensorLength); + + std::vector ref_weight_quantized(kTensorLength); + for (int i = 0; i < kTensorLength; ++i) { + ref_weight_quantized[i] = fp16_to_fp32(fp32_to_fp16(ref_weight[i])); + } + + // Run DSP Kernel + auto ctx = new lite::InnerContext; + ASSERT_NE(ctx, nullptr); + ASSERT_EQ(lite::RET_OK, ctx->Init()); + + auto *param = CreateSgdParameter(kDampening, kWeightDecay, false); + ASSERT_NE(param, nullptr); + + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_SGD}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + + auto kernel = creator(inputs, outputs, param, ctx, key); + ASSERT_NE(kernel, nullptr); + + ASSERT_EQ(lite::RET_OK, kernel->Prepare()); + ASSERT_EQ(lite::RET_OK, kernel->Run()); + + auto weight_data_fp16 = reinterpret_cast(weight_tensor->MutableData()); + std::vector weight_data_fp32(kTensorLength); + for (int i = 0; i < kTensorLength; ++i) { + weight_data_fp32[i] = fp16_to_fp32(weight_data_fp16[i]); + } + + // FP16 precision is lower, so tolerance is higher + ASSERT_EQ(0, CompareOutputData(weight_data_fp32.data(), ref_weight_quantized.data(), kTensorLength, 5e-3f)); + + UninitDSPRuntime(); + delete ctx; + 
for (auto *tensor : tensors_to_delete) delete tensor; + delete kernel; +} + +#endif // not SUPPORT_FT78 + +} // namespace mindspore::lite::dsp::test