From 5fe4ddf032357d7bc17ba3d81dbb545fd3945181 Mon Sep 17 00:00:00 2001
From: gengchao <gengchao4@huawei.com>
Date: Thu, 25 May 2023 20:16:46 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E9=99=8D=E6=A1=A3=E5=8A=A0?=
 =?UTF-8?q?=E9=80=9F=E8=AE=AD=E7=BB=83=E7=89=B9=E6=80=A7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../inc/external/ge/ge_api_types.h            |   2 +
 tf_adapter/kernels/geop_npu.cc                | 227 +++++++++++++++++-
 tf_adapter/kernels/geop_npu.h                 |  26 +-
 .../npu_bridge/estimator/npu/npu_config.py    |   5 +-
 .../npu_bridge/estimator/npu/npu_estimator.py |   3 +
 .../npu_bridge/estimator/npu/npu_hook.py      |  31 +++
 .../ut/kernels/testcase/geop_npu_test.cc      | 136 ++++++++++-
 .../tests/ut/util/testcase/npu_attrs_test.cc  |  26 ++
 tf_adapter/util/ge_plugin.cc                  |   2 +
 tf_adapter/util/npu_attrs.cc                  |  48 ++++
 tf_adapter/util/npu_attrs.h                   |   2 +
 11 files changed, 490 insertions(+), 18 deletions(-)
diff --git a/inc/graphengine/inc/external/ge/ge_api_types.h b/inc/graphengine/inc/external/ge/ge_api_types.h
index c2c9e6f87..28687361a 100644
--- a/inc/graphengine/inc/external/ge/ge_api_types.h
+++ b/inc/graphengine/inc/external/ge/ge_api_types.h
@@ -81,6 +81,8 @@ const char *const OPTION_EXEC_LOGICAL_DEVICE_CLUSTER_DEPLOY_MODE = "ge.exec.logi
 const char *const OPTION_EXEC_LOGICAL_DEVICE_ID = "ge.exec.logicalDeviceId";
 const char *const OPTION_EXEC_MODEL_DEPLOY_MODE = "ge.exec.modelDeployMode";
 const char *const OPTION_EXEC_MODEL_DEPLOY_DEVICELIST = "ge.exec.modelDeployDevicelist";
+// accelerate train flag
+const char *const OPTION_EXEC_ACCELERATE_TRAIN_MODE = "ge.exec.accelerateTrainMode";
 
 // Option key: memory init
 const char *const GRAPH_MEMORY_MAX_SIZE = "ge.graphMemoryMaxSize";
diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc
index 4e0dfc8ed..c51c80836 100644
--- a/tf_adapter/kernels/geop_npu.cc
+++ b/tf_adapter/kernels/geop_npu.cc
@@ -31,6 +31,7 @@
 #include <thread>
 #include <vector>
 #include <algorithm>
+#include <limits>
 
 #include "tf_adapter/common/adapter_logger.h"
 #include "tf_adapter/common/common.h"
@@ -90,6 +91,35 @@ namespace {
 const std::string ATTR_NAME_CONST_INPUT_NAME = "_const_input";
 const std::string kMdatTuning = "mdat";
 const std::string kAutoRecompute = "auto";
+const std::string kTotalStep = "TOTAL_STEP";
+const std::string kStepNow = "STEP_NOW";
+const std::string kTargetLoss = "TARGET_LOSS";
+const std::string kLossNow = "LOSS_NOW";
+const std::string kModeValueStep = "step";
+const std::string kModeValueLoss = "loss";
+const float kDefaultStepRatio = 0.9;
+const float kMinStepRatio = 0.2;
+const float kMaxStepRatio = 0.9;
+const float kDefaultLossRatio = 1.05;
+const float kMinLossRatio = 1.01;
+const float kMaxLossRatio = 1.5;
+
+const std::map<std::string, GeOp::FastValue> fast_value_string_2_eunm = {{"fast", GeOp::kfast},
+                                                                         {"fast1", GeOp::kfast1}};
+
+const std::map<GeOp::FastValue, std::string> fast_value_enum_2_string = {{GeOp::kfast, "fast"},
+                                                                         {GeOp::kfast1, "fast1"}};
+const std::map<GeOp::FastValue, std::string> fast_value_2_precision_mode_v1 = {
+    {GeOp::kfast, "allow_mix_precision_fp16"},
+    {GeOp::kfast1, "allow_mix_precision_bf16"},
+};
+const std::unordered_set<std::string> supported_origin_precision_mode_v1 = {"allow_fp32_to_fp16",
+                                                                            "must_keep_origin_dtype", ""};
+const std::unordered_set<std::string> valid_mode_values = {kModeValueStep, kModeValueLoss};
+const std::map<GeOp::FastValue, std::string> fast_value_2_precision_mode_v2 = {{GeOp::kfast, "mixed_float16"},
+                                                                               {GeOp::kfast1, "mixed_bfloat16"}};
+const std::unordered_set<std::string> supported_origin_precision_mode_v2 = {"origin", ""};
+
 using geDataUniquePtr = std::unique_ptr<uint8_t[], std::function<void(uint8_t *)>>;
 
 class NpuHostFixedAllocator : public tensorflow::Allocator, public tensorflow::core::RefCounted {
@@ -518,6 +548,185 @@ void GeOp::Finalize() {
   return;
 }
 
+uint32_t GetStepToChange(const uint32_t total_step, const float ratio) {
+  return static_cast<uint32_t>(static_cast<float>(total_step) * ratio);  // fractional part is dropped
+}
+
+float GetLossToChange(const float target_loss, const float ratio) {
+  return target_loss * ratio;
+}
+
+Status GeOp::TriggeredByStep(bool &is_triggered) {
+  // Step mode: fire only on the step equal to TOTAL_STEP scaled by the configured ratio.
+  uint32_t planned_steps = 0U;
+  REQUIRES_STATUS_OK(GetStepFromEnv(kTotalStep, planned_steps));
+  const uint32_t switch_step = GetStepToChange(planned_steps, accelerate_info_.fast_ratio);
+  uint32_t current_step = 0U;
+  REQUIRES_STATUS_OK(GetStepFromEnv(kStepNow, current_step));
+  // NOTE(review): assumes REQUIRES_STATUS_OK returns the failing Status, leaving is_triggered unset.
+  is_triggered = (current_step == switch_step);
+  if (is_triggered) {
+    ADP_LOG(EVENT) << "[GEOP] trigger recompile when step is " << current_step;
+  }
+  return Status::OK();
+}
+
+Status GeOp::TriggeredByLoss(bool &is_triggered) {
+  // Loss mode: fire when LOSS_NOW equals TARGET_LOSS scaled by the configured ratio.
+  float target_loss = 0.0F;
+  REQUIRES_STATUS_OK(GetLossFromEnv(kTargetLoss, target_loss));
+  const float loss_to_change = GetLossToChange(target_loss, accelerate_info_.fast_ratio);
+  float loss_now = 0.0F;
+  REQUIRES_STATUS_OK(GetLossFromEnv(kLossNow, loss_now));
+  // NOTE(review): exact float equality rarely holds for a measured loss; a one-shot threshold may be intended.
+  is_triggered = (loss_now == loss_to_change);
+  if (is_triggered) {
+    ADP_LOG(EVENT) << "[GEOP] trigger recompile when loss is " << loss_now;
+  }
+  return Status::OK();
+}
+
+Status GeOp::NeedRecompileWhenAccelerateTrainOn(bool &need_recompile) {
+  if (!IsAccelerateTrainOn()) {
+    need_recompile = false;
+    return Status::OK();
+  }
+  // The option is present and non-empty at this point; it is parsed again on every call.
+  REQUIRES_STATUS_OK(ParserAccelerateTrain(init_options_["ge.exec.accelerateTrainMode"]));
+  if (accelerate_info_.fast_mode != kModeValueStep) {
+    REQUIRES_STATUS_OK(TriggeredByLoss(need_recompile));
+    return Status::OK();
+  }
+  REQUIRES_STATUS_OK(TriggeredByStep(need_recompile));
+  return Status::OK();
+}
+
+Status GeOp::CheckAndSetAccelarateRatio(const std::string &mode_value, const std::string &ratio_value) {
+  // Validates `ratio_value`, the optional third field of accelerate_train_mode, against the range allowed
+  // for `mode_value` and stores it in accelerate_info_.fast_ratio. Returns Unavailable when the text is
+  // not a float, the mode is neither `step` nor `loss`, or the ratio is out of range.
+  float ratio = 0.0F;
+  std::stringstream ss;
+  if (!strings::safe_strtof(ratio_value, &ratio)) {
+    ss << "accelerate_train_mode third part is invalid: " << ratio_value
+       << " ,you can chose `0.9` for `step` or `1.02` for `loss`";
+    ADP_LOG(ERROR) << ss.str();
+    return errors::Unavailable(ss.str());
+  }
+
+  if ((mode_value != kModeValueStep) && (mode_value != kModeValueLoss)) {
+    ADP_LOG(ERROR) << "invalid mode value: " << mode_value;
+    return errors::Unavailable("invalid mode value");
+  }
+  const bool by_step = (mode_value == kModeValueStep);
+  const float min_ratio = by_step ? kMinStepRatio : kMinLossRatio;
+  const float max_ratio = by_step ? kMaxStepRatio : kMaxLossRatio;
+
+  // Negated in-range test: NaN fails every comparison, so `ratio < min || ratio > max` would accept it.
+  if (!((ratio >= min_ratio) && (ratio <= max_ratio))) {
+    ss << "accelerate_train_mode third part is invalid: " << ratio_value << " ,you can chose `" << min_ratio
+       << "-" << max_ratio << "`for `" << mode_value << "`";
+    ADP_LOG(ERROR) << ss.str();
+    return errors::Unavailable(ss.str());
+  }
+  accelerate_info_.fast_ratio = ratio;
+  return Status::OK();
+}
+
+Status GeOp::ParserAccelerateTrain(const std::string &accelerate_train_mode) {
+  // Expected format: "<fast|fast1>|<step|loss>[|<ratio>]", e.g. "fast|step|0.9" or "fast|step".
+  std::vector<std::string> infos = ge::StringUtils::Split(accelerate_train_mode, '|');
+  std::stringstream ss;
+  if ((infos.size() != 2U) && (infos.size() != 3U)) {
+    ss << "Format of accelerate_train_mode is invalid: " << accelerate_train_mode;
+    ADP_LOG(ERROR) << ss.str();
+    return errors::Unavailable(ss.str());
+  }
+  const auto &fast_value = infos[0U];
+  const auto iter = fast_value_string_2_eunm.find(fast_value);
+  if (iter == fast_value_string_2_eunm.end()) {
+    ss << "accelerate_train_mode first part is invalid: " << fast_value << ", you can choose `fast` or `fast1`";
+    ADP_LOG(ERROR) << ss.str();
+    return errors::Unavailable(ss.str());
+  }
+  accelerate_info_.fast_value = iter->second;
+  const auto &mode_value = infos[1U];
+  if (valid_mode_values.find(mode_value) == valid_mode_values.end()) {
+    ss << "accelerate_train_mode second part is invalid: " << mode_value << ", you can choose `step` or `loss`";
+    ADP_LOG(ERROR) << ss.str();
+    return errors::Unavailable(ss.str());
+  }
+  accelerate_info_.fast_mode = mode_value;
+  if ((infos.size() != 3U) || (infos[2U].empty())) {
+    accelerate_info_.fast_ratio = (mode_value == kModeValueStep) ? kDefaultStepRatio : kDefaultLossRatio;
+    return Status::OK();
+  }
+  return CheckAndSetAccelarateRatio(mode_value, infos[2U]);
+}
+
+bool GeOp::IsAccelerateTrainOn() {
+  // Enabled only when the init option exists and carries a non-empty value.
+  // find() is used rather than operator[] so that the lookup never inserts the key.
+  const auto option = init_options_.find("ge.exec.accelerateTrainMode");
+  const bool is_present = (option != init_options_.end());
+  return is_present && !(option->second.empty());
+}
+
+Status GeOp::CheckAndModifyPrecisionMode() {
+  // Applies the mixed-precision mode mapped from fast_value and records the configured one for later recovery.
+  std::stringstream ss;
+  const auto name_iter = fast_value_enum_2_string.find(accelerate_info_.fast_value);
+  const std::string fast_name = (name_iter == fast_value_enum_2_string.end()) ? "unknown" : name_iter->second;
+  const auto iter_v2 = init_options_.find(ge::PRECISION_MODE_V2);
+  if ((accelerate_info_.origin_precision_mode_v2.empty()) && (iter_v2 != init_options_.end())) {
+    const auto &origin_mode_v2 = iter_v2->second;
+    const auto inner_iter_v2 = fast_value_2_precision_mode_v2.find(accelerate_info_.fast_value);
+    if ((inner_iter_v2 == fast_value_2_precision_mode_v2.end()) ||
+        (supported_origin_precision_mode_v2.find(origin_mode_v2) == supported_origin_precision_mode_v2.end())) {
+      ss << "accelerate fast_value:" << fast_name << " is not support with PRECISION_MODE_V2: " << origin_mode_v2;
+      ADP_LOG(ERROR) << ss.str();
+      return errors::Unavailable(ss.str());
+    }
+    graph_options_[ge::PRECISION_MODE_V2] = inner_iter_v2->second;
+    accelerate_info_.origin_precision_mode_v2 = origin_mode_v2;
+    ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << " change PRECISION_MODE_V2 from: " << origin_mode_v2
+                  << " to: " << inner_iter_v2->second;
+    return Status::OK();
+  }
+  // Once a V2 origin has been recorded, a later call must not rewrite PRECISION_MODE as well.
+  if (accelerate_info_.origin_precision_mode_v2.empty() && accelerate_info_.origin_precision_mode_v1.empty()) {
+    // operator[] is intentional: a missing PRECISION_MODE entry is treated as an empty origin mode
+    const auto &origin_mode_v1 = init_options_[ge::PRECISION_MODE];
+    const auto inner_iter_v1 = fast_value_2_precision_mode_v1.find(accelerate_info_.fast_value);
+    if ((inner_iter_v1 == fast_value_2_precision_mode_v1.end()) ||
+        (supported_origin_precision_mode_v1.find(origin_mode_v1) == supported_origin_precision_mode_v1.end())) {
+      ss << "accelerate fast_value:" << fast_name << " is not support with PRECISION_MODE: " << origin_mode_v1;
+      ADP_LOG(ERROR) << ss.str();
+      return errors::Unavailable(ss.str());
+    }
+    graph_options_[ge::PRECISION_MODE] = inner_iter_v1->second;
+    accelerate_info_.origin_precision_mode_v1 = origin_mode_v1;
+    ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << " change PRECISION_MODE from: " << origin_mode_v1
+                  << " to: " << inner_iter_v1->second;
+  }
+  return Status::OK();
+}
+
+Status GeOp::RecoverPrecisionMode() {
+  if (!accelerate_info_.origin_precision_mode_v2.empty()) {  // a recorded V2 origin takes precedence
+    const std::string fast_value = graph_options_[ge::PRECISION_MODE_V2];  // copy: entry is overwritten below
+    graph_options_[ge::PRECISION_MODE_V2] = accelerate_info_.origin_precision_mode_v2;
+    ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << " recover PRECISION_MODE_V2 from: " << fast_value
+                  << " to: " << accelerate_info_.origin_precision_mode_v2;
+  } else {
+    const std::string fast_value = graph_options_[ge::PRECISION_MODE];  // copy: entry is overwritten below
+    graph_options_[ge::PRECISION_MODE] = accelerate_info_.origin_precision_mode_v1;
+    ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << " recover PRECISION_MODE from: " << fast_value
+                  << " to: " << accelerate_info_.origin_precision_mode_v1;
+  }
+  return Status::OK();
+}
+
 int32_t GeOp::InitRebuildFlag(uint32_t cache_graph_id) {
   if (!build_flag_) {
     ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << ", graph id: " << cache_graph_id
@@ -534,12 +743,19 @@ int32_t GeOp::InitRebuildFlag(uint32_t cache_graph_id) {
     LOG(ERROR) << "[GEOP] GE session is nullptr";
     return -1;
   }
-  if (!ge_session_->IsGraphNeedRebuild(cache_graph_id)) {
+
+  if (NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode_) != Status::OK()) {
+    ADP_LOG(ERROR) << "[GEOP] prepare to accelerate for train failed";
+    LOG(ERROR) << "[GEOP] prepare to accelerate for train failed";
+    return -1;
+  }
+  if (!ge_session_->IsGraphNeedRebuild(cache_graph_id) && !(need_recover_precision_mode_)) {
     ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << ", graph id: " << cache_graph_id << " no need to rebuild";
     return 0;
   }
 
-  ADP_LOG(INFO) << "[GEOP] The graph need rebuild, graph id " << cache_graph_id;
+  ADP_LOG(INFO) << "[GEOP] The graph need rebuild, graph id " << cache_graph_id << " ,need_change_precision_mode"
+                << need_recover_precision_mode_;
 
   // The graph need to rebuild, remove it from GE first.
   ADP_LOG(INFO) << "[GEOP] tf session: " << tf_session_ << ", graph id: " << cache_graph_id;
@@ -960,6 +1176,13 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) {
     graph_options_["ge.jit_compile"] = jit_compile_;
     graph_options_["ge.exec.overflow"] = "1";
     graph_options_["ge.graphLevelSat"] = (mix_compile_mode_ == "0") ? "1" : "0";
+    if(IsAccelerateTrainOn()) {
+      if (need_recover_precision_mode_) {
+        OP_REQUIRES_OK_ASYNC(ctx, RecoverPrecisionMode(), done);
+      } else {
+        OP_REQUIRES_OK_ASYNC(ctx, CheckAndModifyPrecisionMode(), done);
+      }
+    }
 
     // call ge session addGraph api
     auto graph_options = graph_options_;
diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h
index 0c290219d..75fac85bc 100644
--- a/tf_adapter/kernels/geop_npu.h
+++ b/tf_adapter/kernels/geop_npu.h
@@ -47,12 +47,20 @@ using AoeSetTuningGraphFunc = AoeStatus (*)(SessionId, ge::Graph &);
 using AoeTuningGraphFunc = AoeStatus (*)(SessionId, const std::map<ge::AscendString, ge::AscendString> &);
 
 class GeOp : public AsyncOpKernel {
-public:
+ public:
   explicit GeOp(OpKernelConstruction *ctx);
   ~GeOp() override;
   void ComputeAsync(OpKernelContext *ctx, DoneCallback done) override;
-
-private:
+  enum FastValue { kfast = 0, kfast1 };
+  struct AccelerateInfo {
+    FastValue fast_value{kfast};           // acceleration level; initialised so it is never read indeterminate
+    std::string fast_mode;                 // trigger mode taken from the option: `step` or `loss`
+    float fast_ratio{0.0F};                // trigger ratio; set when the option is parsed
+    std::string origin_precision_mode_v1;  // PRECISION_MODE value saved before it was overridden
+    std::string origin_precision_mode_v2;  // PRECISION_MODE_V2 value saved before it was overridden
+  };
+
+ private:
   void Initialize(OpKernelConstruction *ctx);
   void Finalize();
 
@@ -92,7 +100,14 @@ private:
   void AddNodeAttrs(Node *node, bool &is_initialize);
 
   int InitRebuildFlag(uint32_t cache_graph_id);
-
+  Status NeedRecompileWhenAccelerateTrainOn(bool &need_recompile);
+  bool IsAccelerateTrainOn();
+  Status ParserAccelerateTrain(const std::string &accelerate_train_mode);
+  Status CheckAndSetAccelarateRatio(const std::string &mode_value, const std::string &ratio_value);
+  Status CheckAndModifyPrecisionMode();
+  Status RecoverPrecisionMode();
+  Status TriggeredByStep(bool &is_triggered);
+  Status TriggeredByLoss(bool &is_triggered);
   bool IncrementGraphIdCount(uint32_t &graph_id);
 
   bool DecrementGraphIdCount(const std::string &tf_session, uint32_t &graph_id);
@@ -207,6 +222,7 @@ private:
   std::map<std::string, bool> is_getnext_dynamic_shape_;
   SessionId session_id_;
   bool is_aoe_{false};
+  bool need_recover_precision_mode_{false};
   AoeInitializeFunc aoe_initialize_;
   AoeFinalizeFunc aoe_finalize_;
   AoeCreateSessionFunc aoe_create_session_;
@@ -217,6 +233,8 @@ private:
   AoeTuningGraphFunc aoe_tuning_graph_;
   AoeSetDependGraphsInputsFunc aoe_set_depend_graphs_inputs_;
   AoeSetTuningGraphInputFunc aoe_set_tuning_graph_input_;
+  // accelerate train
+  AccelerateInfo accelerate_info_;
 };
 }  // namespace tensorflow
 #endif  // TENSORFLOW_KERNELS_GEOP_NPU_H_
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
index 2d4a31f27..7a3fea17b 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py
@@ -419,7 +419,8 @@ class ExperimentalConfig():
                  enable_graph_parallel=None,
                  graph_parallel_option_path=None,
                  graph_compiler_cache_dir=None,
-                 resource_config_path=None):
+                 resource_config_path=None,
+                 accelerate_train_mode=None):
         """
         Constructs a ExperimentalConfig.
 
@@ -441,7 +442,7 @@ class ExperimentalConfig():
         self._graph_parallel_option_path = graph_parallel_option_path
         self._graph_compiler_cache_dir = graph_compiler_cache_dir
         self._resource_config_path = resource_config_path
-
+        self._accelerate_train_mode = accelerate_train_mode
 
 class NpuExecutePlacement(Enum):
     """npu execute place option. """
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
index 5bad261e0..6ca2c1b7a 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py
@@ -617,6 +617,9 @@ class NPUEstimator(estimator_lib.Estimator):
                     config._experimental_config._graph_parallel_option_path)
             if config._experimental_config._enable_graph_parallel is not None:
                 custom_op.parameter_map["enable_graph_parallel"].b = config._experimental_config._enable_graph_parallel
+            if config._experimental_config._accelerate_train_mode is not None:
+                custom_op.parameter_map[
+                    "experimental_accelerate_train_mode"].s = config._experimental_config._accelerate_train_mode
 
     def __load_stream_max_config(self, config, custom_op):
         """Load stream_max_parallel_num config ,and add to custom_optimizers
diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py
index 5c9897d92..eac6d048c 100644
--- a/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py
+++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py
@@ -375,3 +375,34 @@ class NPUOutputTensorHook(basic_session_run_hooks.LoggingTensorHook):
     def _call_output_fn(self):
         self._output_fn.__call__(self._output_list)
         del self._output_list[:]
+
+
+class TellMeStepOrLossHook(session_run_hook.SessionRunHook):
+    """Publishes step/loss values to the STEP_NOW/TOTAL_STEP/LOSS_NOW/TARGET_LOSS environment variables."""
+
+    def __init__(self, step=None, loss=None, total_step=None, final_loss=None
+                 ):
+        """Initializes a `TellMeStepOrLossHook`.
+
+        Each non-None argument is fetched on every `session.run` call.
+        """
+        self._step = step
+        self._loss = loss
+        self._total_step = total_step
+        self._final_loss = final_loss
+
+    def before_run(self, run_context):
+        """Call before session will run"""
+        logging.info("TellMeStepOrLossHook before_run...")
+        fetches = {"STEP_NOW": self._step, "TOTAL_STEP": self._total_step,
+                   "LOSS_NOW": self._loss, "TARGET_LOSS": self._final_loss}
+        # `SessionRunArgs` takes a single `fetches` structure, and `None` is not a valid fetch.
+        fetches = {name: fetch for name, fetch in fetches.items() if fetch is not None}
+        return tf.train.SessionRunArgs(fetches) if fetches else None
+
+    def after_run(self, run_context, run_values):
+        """Call after session has run"""
+        logging.info("TellMeStepOrLossHook after_run... %s", run_values.results)
+        # `os.environ` accepts only `str` values, so every fetched result is converted.
+        for env_name, value in (run_values.results or {}).items():
+            os.environ[env_name] = str(value)
diff --git a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc
index f7a5d0ff3..ae810d554 100644
--- a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc
+++ b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc
@@ -48,22 +48,64 @@ class NpuHostGetNextAllocator : public tensorflow::Allocator {
   std::unique_ptr<NpuGetNextOutputInfo> output_;
 };
 
+class DummyDevice : public DeviceBase {
+ public:
+  DummyDevice(Env *env, bool save) : DeviceBase(env), save_(save) {}
+  bool RequiresRecordingAccessedTensors() const override {
+    return save_;
+  }
+  Allocator *GetAllocator(AllocatorAttributes /*attr*/) override {
+    return cpu_allocator();
+  }
+
+ private:
+  bool save_;
+};
+
+namespace {
+std::unique_ptr<OpKernel> g_op = nullptr;
+void CreateGeOp() {
+  Env *env = Env::Default();
+  GraphDef graph_def;
+  NodeDef node_def;
+  std::string graph_def_path = "tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt";
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  ReadTextProto(env, graph_def_path, &graph_def);
+  AsyncOpKernel *async_op;
+  for (int i = 0; i < graph_def.node_size(); i++) {
+    NodeDef *node_def = graph_def.mutable_node(i);
+    if (node_def->name() == "GeOp1_0") {
+      OpKernelContext::Params params;
+      params.record_tensor_accesses = false;
+      auto device = absl::make_unique<DummyDevice>(env, params.record_tensor_accesses);
+      params.device = device.get();
+      Status status;
+      g_op = CreateOpKernel(DEVICE_CPU, params.device, cpu_allocator(), *node_def, TF_GRAPH_DEF_VERSION, &status);
+      EXPECT_TRUE(status.ok());
+    }
+  }
+}
+void DelGeOp() {
+  g_op.reset();
+}
+void UnSetEnv() {
+  unsetenv("LOSS_NOW");
+  unsetenv("TARGET_LOSS");
+  unsetenv("STEP_NOW");
+  unsetenv("TOTAL_STEP");
+}
+}
 class GeOpTest : public testing::Test {
  protected:
   virtual void SetUp() {
     *const_cast<bool *>(&kDumpGraph) = true;
     NpuAttrs::SetNewDataTransferFlag(true);
+    CreateGeOp();
+  }
+  virtual void TearDown() {
+    DelGeOp();
+    UnSetEnv();
   }
-  virtual void TearDown() {}
-};
-class DummyDevice : public DeviceBase {
- public:
-  DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {}
-  bool RequiresRecordingAccessedTensors() const override { return save_; }
-  Allocator* GetAllocator(AllocatorAttributes /*attr*/) override { return cpu_allocator(); }
-
- private:
-  bool save_;
 };
 
 Status GeOpRunGraphAsync(std::string example_path, gtl::InlinedVector<TensorValue, 4> inputs, NodeDef& geop_node_def,
@@ -571,5 +613,79 @@ TEST_F(GeOpTest, test_SeparateGraphDef) {
   attr2->insert({"value", value_attr2});
   EXPECT_EQ(geop_node->SeparateGraphDef(graph_def, partition_graph, const_value_map).ok(), true);
 }
+
+TEST_F(GeOpTest, test_AccelerateTrain_InvalidOption) {
+  GeOp *geop_node = dynamic_cast<GeOp *>(g_op->AsAsync());
+  std::string invalid_option_value = "fastxx";
+  EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false);
+  invalid_option_value = "fastxx|step";
+  EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false);
+  invalid_option_value = "fast|stepxx";
+  EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false);
+  invalid_option_value = "fast|step|0.1";
+  EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false);
+  invalid_option_value = "fast|step|1.6";
+  EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false);
+}
+
+TEST_F(GeOpTest, test_AccelerateTrain_Loss_PrecisonV2) {
+  GeOp *geop_node = dynamic_cast<GeOp *>(g_op->AsAsync());
+  bool need_recover_precision_mode = false;
+  // not enable accelerate, skip
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true);
+  geop_node->init_options_["ge.exec.accelerateTrainMode"] = "fast1|loss|1.1";
+  // not set env, return error
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), false);
+  EXPECT_FALSE(need_recover_precision_mode);
+  setenv("LOSS_NOW", "1.21", true);
+  setenv("TARGET_LOSS", "1.1", true);
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true);
+  // when loss is 1.21, which is 1.1 * 1.1, need recover
+  EXPECT_TRUE(need_recover_precision_mode);
+  geop_node->init_options_[ge::PRECISION_MODE_V2] = "fp16";
+  // not support
+  EXPECT_EQ(geop_node->CheckAndModifyPrecisionMode().ok(), false);
+  geop_node->init_options_[ge::PRECISION_MODE_V2] = "origin";
+  EXPECT_EQ(geop_node->CheckAndModifyPrecisionMode().ok(), true);
+  // change mode to mixed successfully
+  EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE_V2], "mixed_bfloat16");
+  EXPECT_EQ(geop_node->RecoverPrecisionMode().ok(), true);
+  EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE_V2], "origin");
+}
+
+TEST_F(GeOpTest, test_AccelerateTrain_Loss_PrecisonV1) {
+  GeOp *geop_node = dynamic_cast<GeOp *>(g_op->AsAsync());
+  bool need_recover_precision_mode = false;
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true);
+  geop_node->init_options_["ge.exec.accelerateTrainMode"] = "fast1|loss|1.1";
+  setenv("LOSS_NOW", "1.22", true);
+  setenv("TARGET_LOSS", "1.1", true);
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true);
+  EXPECT_FALSE(need_recover_precision_mode);
+  setenv("LOSS_NOW", "1.21", true);
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true);
+  EXPECT_TRUE(need_recover_precision_mode);
+  EXPECT_EQ(geop_node->CheckAndModifyPrecisionMode().ok(), true);
+  EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE], "allow_mix_precision_bf16");
+  EXPECT_EQ(geop_node->RecoverPrecisionMode().ok(), true);
+  EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE], "");
+}
+
+TEST_F(GeOpTest, test_AccelerateTrain_Step) {
+  GeOp *geop_node = dynamic_cast<GeOp *>(g_op->AsAsync());
+  bool need_recover_precision_mode = false;
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true);
+  geop_node->init_options_["ge.exec.accelerateTrainMode"] = "fast|step";
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), false);
+  EXPECT_FALSE(need_recover_precision_mode);
+  setenv("STEP_NOW", "9000", true);
+  setenv("TOTAL_STEP", "10000", true);
+  EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true);
+  EXPECT_TRUE(need_recover_precision_mode);
+  EXPECT_EQ(geop_node->CheckAndModifyPrecisionMode().ok(), true);
+  EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE], "allow_mix_precision_fp16");
+  EXPECT_EQ(geop_node->RecoverPrecisionMode().ok(), true);
+  EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE], "");
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
index a33b0362c..39c52c807 100644
--- a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
+++ b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc
@@ -59,6 +59,32 @@ TEST_F(NpuAttrTest, GetEnvAscendDeviceIdFailTest) {
   Status s = GetEnvDeviceID(device_id);
   EXPECT_EQ(s.ok(), false);
 }
+
+TEST_F(NpuAttrTest, GetStepFromEnv) {
+  uint32_t step = 0;
+  Status s = GetStepFromEnv("STEP_NOW", step);
+  EXPECT_EQ(s.ok(), false);
+  setenv("STEP_NOW", "1000", true);
+  s = GetStepFromEnv("STEP_NOW", step);
+  EXPECT_EQ(s.ok(), true);
+  EXPECT_EQ(step, 1000);
+  setenv("STEP_NOW", "1.1", true);
+  s = GetStepFromEnv("STEP_NOW", step);
+  EXPECT_EQ(s.ok(), false);
+  unsetenv("STEP_NOW");
+}
+
+TEST_F(NpuAttrTest, GetLossFromEnv) {
+  float loss = 0;
+  Status s = GetLossFromEnv("LOSS_NOW", loss);
+  EXPECT_EQ(s.ok(), false);
+  setenv("LOSS_NOW", "1.1", true);
+  s = GetLossFromEnv("LOSS_NOW", loss);
+  EXPECT_EQ(s.ok(), true);
+  EXPECT_FLOAT_EQ(loss, 1.1);
+  unsetenv("LOSS_NOW");
+}
+
 TEST_F(NpuAttrTest, SplitTest) {
   std::string s = "a,b,c";
   std::vector<std::string> res;
diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc
index 846195020..7b10bb277 100644
--- a/tf_adapter/util/ge_plugin.cc
+++ b/tf_adapter/util/ge_plugin.cc
@@ -286,6 +286,8 @@ void GePlugin::Init(std::map<std::string, std::string> &init_options, const bool
 
   ADP_LOG(INFO) << "[GePlugin] optypelist_for_implmode :" << init_options[ge::OPTYPELIST_FOR_IMPLMODE];
 
+  ADP_LOG(INFO) << "[GePlugin] accelerate train mode :" << init_options["ge.exec.accelerateTrainMode"];
+
   bool tdt_uninit_env = false;
   (void) ReadBoolFromEnvVar("ASCEND_TDT_UNINIT", false, &tdt_uninit_env);
   if (!kIsHeterogeneous && !tdt_uninit_env) {
diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc
index 1b064d4db..56926ca08 100644
--- a/tf_adapter/util/npu_attrs.cc
+++ b/tf_adapter/util/npu_attrs.cc
@@ -171,6 +171,39 @@ Status GetEnvDeviceID(uint32_t &device_id) {
   }
   return Status::OK();
 }
+
+Status GetStepFromEnv(const std::string &env_name, uint32_t &step) {
+  // Reads `env_name` and parses it into `step`; an unset or empty variable is reported as an error.
+  std::string raw_value;
+  (void) ReadStringFromEnvVar(env_name, "", &raw_value);
+  std::stringstream message;
+  if (raw_value.empty()) {
+    message << env_name << " is not set, which is needed when accelarate by step";
+    return errors::InvalidArgument(message.str());
+  }
+  if (!strings::safe_strtou32(raw_value, &step)) {
+    message << env_name << " is invalid, should be int, such as 1000";
+    return errors::InvalidArgument(message.str());
+  }
+  return Status::OK();
+}
+
+Status GetLossFromEnv(const std::string &env_name, float &loss) {
+  // Reads `env_name` and parses it into `loss`; an unset or empty variable is reported as an error.
+  std::string raw_value;
+  (void) ReadStringFromEnvVar(env_name, "", &raw_value);
+  std::stringstream message;
+  if (raw_value.empty()) {
+    message << env_name << " is not set, which is needed when accelarate by loss";
+    return errors::InvalidArgument(message.str());
+  }
+  if (!strings::safe_strtof(raw_value, &loss)) {
+    message << env_name << " is invalid, should be float, such as 0.1";
+    return errors::InvalidArgument(message.str());
+  }
+  return Status::OK();
+}
+
 void Split(const std::string &s, std::vector<std::string> &result, const char *delchar) {
   if (s.empty()) {
     return;
@@ -604,6 +637,7 @@ std::map<std::string, std::string> NpuAttrs::GetInitOptions(const OpKernelConstr
   std::string logical_device_cluster_deploy_mode = "LB";
   std::string logical_device_id;
   std::string model_deploy_mode;
+  std::string accelerate_train_mode;
   std::string model_deploy_devicelist;
   std::string dump_data = "tensor";
   std::string aoe_config_file;
@@ -644,6 +678,7 @@ std::map<std::string, std::string> NpuAttrs::GetInitOptions(const OpKernelConstr
     (void) ctx->GetAttr("_logical_device_cluster_deploy_mode", &logical_device_cluster_deploy_mode);
     (void) ctx->GetAttr("_logical_device_id", &logical_device_id);
     (void) ctx->GetAttr("_model_deploy_mode", &model_deploy_mode);
+    (void) ctx->GetAttr("_accelerate_train_mode", &accelerate_train_mode);
     (void) ctx->GetAttr("_model_deploy_devicelist", &model_deploy_devicelist);
     (void) ctx->GetAttr("_dump_data", &dump_data);
     (void) ctx->GetAttr("_aoe_config_file", &aoe_config_file);
@@ -702,6 +737,7 @@ std::map<std::string, std::string> NpuAttrs::GetInitOptions(const OpKernelConstr
   init_options_["stream_sync_timeout"] = stream_sync_timeout;
   init_options_["event_sync_timeout"] = event_sync_timeout;
   init_options_["ge.esClusterConfig"] = es_cluster_config;
+  init_options_["ge.exec.accelerateTrainMode"] = accelerate_train_mode;
   return init_options_;
 }
 
@@ -1113,6 +1149,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   std::string enable_graph_parallel;
   std::string graph_compiler_cache_dir;
   std::string graph_slice_mode;
+  std::string accelerate_train_mode;
 
   auto NpuOptimizer_value = attrs.Find("_NpuOptimizer");
   auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc");
@@ -1202,6 +1239,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   auto enable_graph_parallel_val = attrs.Find("_enable_graph_parallel");
   auto jit_compile_value = attrs.Find("_jit_compile");
   auto graph_compiler_cache_dir_val = attrs.Find("_graph_compiler_cache_dir");
+  auto accelerate_train_mode_value = attrs.Find("_accelerate_train_mode");
 
   if (NpuOptimizer_value != nullptr) {
     do_npu_optimizer = "1";
@@ -1444,6 +1482,9 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
     if (model_deploy_mode_value != nullptr) {
       model_deploy_mode = model_deploy_mode_value->s();
     }
+    if (accelerate_train_mode_value != nullptr) {
+      accelerate_train_mode = accelerate_train_mode_value->s();
+    }
     if (model_deploy_devicelist_value != nullptr) {
       model_deploy_devicelist = model_deploy_devicelist_value->s();
     }
@@ -1577,6 +1618,7 @@ std::map<std::string, std::string> NpuAttrs::GetAllAttrOptions(const AttrSlice &
   all_options["logical_device_cluster_deploy_mode"] = logical_device_cluster_deploy_mode;
   all_options["logical_device_id"] = logical_device_id;
   all_options["model_deploy_mode"] = model_deploy_mode;
+  all_options["accelerate_train_mode"] = accelerate_train_mode;
   all_options["model_deploy_devicelist"] = model_deploy_devicelist;
   all_options["topo_sorting_mode"] = topo_sorting_mode;
   all_options["ge.topoSortingMode"] = topo_sorting_mode;
@@ -1711,6 +1753,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   std::string es_cluster_config;
   std::string graph_slice_mode;
   std::string jit_compile;
+  std::string accelerate_train_mode;
 
   const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options();
   for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) {
@@ -2155,6 +2198,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
       if (params.count("experimental_logical_device_id") > 0) {
         logical_device_id = params.at("experimental_logical_device_id").s();
       }
+      if (params.count("experimental_accelerate_train_mode") > 0) {
+        accelerate_train_mode = params.at("experimental_accelerate_train_mode").s();
+      }
       if (params.count("experimental_model_deploy_mode") > 0) {
         model_deploy_mode = params.at("experimental_model_deploy_mode").s();
       }
@@ -2337,6 +2383,8 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options
   init_options_["ge.exec.logicalDeviceId"] = logical_device_id;
   init_options_["model_deploy_mode"] = model_deploy_mode;
   init_options_["ge.exec.modelDeployMode"] = model_deploy_mode;
+  init_options_["accelerate_train_mode"] = accelerate_train_mode;
+  init_options_["ge.exec.accelerateTrainMode"] = accelerate_train_mode;
   init_options_["model_deploy_devicelist"] = model_deploy_devicelist;
   init_options_["ge.exec.modelDeployDevicelist"] = model_deploy_devicelist;
   init_options_["dump_data"] = dump_data;
diff --git a/tf_adapter/util/npu_attrs.h b/tf_adapter/util/npu_attrs.h
index 473d168c2..36eb9a37b 100644
--- a/tf_adapter/util/npu_attrs.h
+++ b/tf_adapter/util/npu_attrs.h
@@ -35,6 +35,8 @@
 namespace tensorflow {
 std::string GetDumpPath();
 Status GetEnvDeviceID(uint32_t &device_id);
+Status GetStepFromEnv(const std::string &env_name, uint32_t &step);
+Status GetLossFromEnv(const std::string &env_name, float &loss);
 void Split(const std::string &s, std::vector<std::string> &result, const char *delchar = " ");
 extern const bool kDumpGraph;
 extern const bool kIsHeterogeneous;
-- 
Gitee