From 5fe4ddf032357d7bc17ba3d81dbb545fd3945181 Mon Sep 17 00:00:00 2001 From: gengchao Date: Thu, 25 May 2023 20:16:46 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E9=99=8D=E6=A1=A3=E5=8A=A0?= =?UTF-8?q?=E9=80=9F=E8=AE=AD=E7=BB=83=E7=89=B9=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../inc/external/ge/ge_api_types.h | 2 + tf_adapter/kernels/geop_npu.cc | 227 +++++++++++++++++- tf_adapter/kernels/geop_npu.h | 26 +- .../npu_bridge/estimator/npu/npu_config.py | 5 +- .../npu_bridge/estimator/npu/npu_estimator.py | 3 + .../npu_bridge/estimator/npu/npu_hook.py | 31 +++ .../ut/kernels/testcase/geop_npu_test.cc | 136 ++++++++++- .../tests/ut/util/testcase/npu_attrs_test.cc | 26 ++ tf_adapter/util/ge_plugin.cc | 2 + tf_adapter/util/npu_attrs.cc | 48 ++++ tf_adapter/util/npu_attrs.h | 2 + 11 files changed, 490 insertions(+), 18 deletions(-) diff --git a/inc/graphengine/inc/external/ge/ge_api_types.h b/inc/graphengine/inc/external/ge/ge_api_types.h index c2c9e6f87..28687361a 100644 --- a/inc/graphengine/inc/external/ge/ge_api_types.h +++ b/inc/graphengine/inc/external/ge/ge_api_types.h @@ -81,6 +81,8 @@ const char *const OPTION_EXEC_LOGICAL_DEVICE_CLUSTER_DEPLOY_MODE = "ge.exec.logi const char *const OPTION_EXEC_LOGICAL_DEVICE_ID = "ge.exec.logicalDeviceId"; const char *const OPTION_EXEC_MODEL_DEPLOY_MODE = "ge.exec.modelDeployMode"; const char *const OPTION_EXEC_MODEL_DEPLOY_DEVICELIST = "ge.exec.modelDeployDevicelist"; +// accelerate train flag +const char *const OPTION_EXEC_ACCELERATE_TRAIN_MODE = "ge.exec.accelerateTrainMode"; // Option key: memory init const char *const GRAPH_MEMORY_MAX_SIZE = "ge.graphMemoryMaxSize"; diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index 4e0dfc8ed..c51c80836 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include "tf_adapter/common/adapter_logger.h" 
#include "tf_adapter/common/common.h" @@ -90,6 +91,35 @@ namespace { const std::string ATTR_NAME_CONST_INPUT_NAME = "_const_input"; const std::string kMdatTuning = "mdat"; const std::string kAutoRecompute = "auto"; +const std::string kTotalStep = "TOTAL_STEP"; +const std::string kStepNow = "STEP_NOW"; +const std::string kTargetLoss = "TARGET_LOSS"; +const std::string kLossNow = "LOSS_NOW"; +const std::string kModeValueStep = "step"; +const std::string kModeValueLoss = "loss"; +const float kDefaultStepRatio = 0.9; +const float kMinStepRatio = 0.2; +const float kMaxStepRatio = 0.9; +const float kDefaultLossRatio = 1.05; +const float kMinLossRatio = 1.01; +const float kMaxLossRatio = 1.5; + +const std::map fast_value_string_2_eunm = {{"fast", GeOp::kfast}, + {"fast1", GeOp::kfast1}}; + +const std::map fast_value_enum_2_string = {{GeOp::kfast, "fast"}, + {GeOp::kfast1, "fast1"}}; +const std::map fast_value_2_precision_mode_v1 = { + {GeOp::kfast, "allow_mix_precision_fp16"}, + {GeOp::kfast1, "allow_mix_precision_bf16"}, +}; +const std::unordered_set supported_origin_precision_mode_v1 = {"allow_fp32_to_fp16", + "must_keep_origin_dtype", ""}; +const std::unordered_set valid_mode_values = {kModeValueStep, kModeValueLoss}; +const std::map fast_value_2_precision_mode_v2 = {{GeOp::kfast, "mixed_float16"}, + {GeOp::kfast1, "mixed_bfloat16"}}; +const std::unordered_set supported_origin_precision_mode_v2 = {"origin", ""}; + using geDataUniquePtr = std::unique_ptr>; class NpuHostFixedAllocator : public tensorflow::Allocator, public tensorflow::core::RefCounted { @@ -518,6 +548,185 @@ void GeOp::Finalize() { return; } +uint32_t GetStepToChange(const uint32_t total_step, const float ratio) { + return total_step * ratio; +} + +float GetLossToChange(const float target_loss, const float ratio) { + return target_loss * ratio; +} + +Status GeOp::TriggeredByStep(bool &is_triggered) { + uint32_t total_step = 0U; + REQUIRES_STATUS_OK(GetStepFromEnv(kTotalStep, total_step)); + uint32_t 
step_to_change = GetStepToChange(total_step, accelerate_info_.fast_ratio); + uint32_t step_now = 0U; + REQUIRES_STATUS_OK(GetStepFromEnv(kStepNow, step_now)); + if (step_now == step_to_change) { + ADP_LOG(EVENT) << "[GEOP] trigger recompile when step is " << step_now; + is_triggered = true; + return Status::OK(); + } + is_triggered = false; + return Status::OK(); +} + +Status GeOp::TriggeredByLoss(bool &is_triggered) { + float target_loss = 0.0; + REQUIRES_STATUS_OK(GetLossFromEnv(kTargetLoss, target_loss)); + float loss_to_change = GetLossToChange(target_loss, accelerate_info_.fast_ratio); + float loss_now = 0U; + REQUIRES_STATUS_OK(GetLossFromEnv(kLossNow, loss_now)); + if (loss_now == loss_to_change) { + ADP_LOG(EVENT) << "[GEOP] trigger recompile when loss is " << loss_now; + is_triggered = true; + return Status::OK(); + } + is_triggered = false; + return Status::OK(); +} + +Status GeOp::NeedRecompileWhenAccelerateTrainOn(bool &need_recompile) { + if (!IsAccelerateTrainOn()) { + need_recompile = false; + return Status::OK(); + } + // init_options_[ge::OPTION_EXEC_ACCELERATE_TRAIN_MODE] must be valid if `IsAccelerateTrainOn` is true + REQUIRES_STATUS_OK(ParserAccelerateTrain(init_options_["ge.exec.accelerateTrainMode"])); + if (accelerate_info_.fast_mode == kModeValueStep) { + REQUIRES_STATUS_OK(TriggeredByStep(need_recompile)); + } else { + REQUIRES_STATUS_OK(TriggeredByLoss(need_recompile)); + } + return Status::OK(); +} + +Status GeOp::CheckAndSetAccelarateRatio(const std::string &mode_value, const std::string &ratio_value) { + float ratio = 0.0; + std::stringstream ss; + if (!strings::safe_strtof(ratio_value, &ratio)) { + ss << "accelerate_train_mode third part is invalid: " << ratio_value + << " ,you can chose `0.9` for `step` or `1.02` for `loss`"; + ADP_LOG(ERROR) << ss.str(); + return errors::Unavailable(ss.str()); + } + + if (mode_value == kModeValueStep) { + if (ratio < kMinStepRatio || ratio > kMaxStepRatio) { + ss << "accelerate_train_mode third part 
is invalid: " << ratio_value << " ,you can chose `" << kMinStepRatio + << "-" << kMaxStepRatio << "`for `" << mode_value << "`"; + ADP_LOG(ERROR) << ss.str(); + return errors::Unavailable(ss.str()); + } + } else if (mode_value == kModeValueLoss) { + if (ratio < kMinLossRatio || ratio > kMaxLossRatio) { + ss << "accelerate_train_mode third part is invalid: " << ratio_value << " ,you can chose `" << kMinLossRatio + << "-" << kMaxLossRatio << "`for `" << mode_value << "`"; + ADP_LOG(ERROR) << ss.str(); + return errors::Unavailable(ss.str()); + } + } else { + ADP_LOG(ERROR) << "invalid mode value: " << mode_value; + return errors::Unavailable("invalid mode value"); + } + accelerate_info_.fast_ratio = ratio; + return Status::OK(); +} + +Status GeOp::ParserAccelerateTrain(const std::string &accelerate_train_mode) { + // format "(fast or fast1)|(step or loss)[|ratio]", e.g. "fast|step|0.9" or "fast|step"; an omitted or empty ratio falls back to the per-mode default + std::vector infos = ge::StringUtils::Split(accelerate_train_mode, '|'); + std::stringstream ss; + if ((infos.size() != 2U) && (infos.size() != 3U)) { + ss << "Format of accelerate_train_mode is invalid: " << accelerate_train_mode; + ADP_LOG(ERROR) << ss.str(); + return errors::Unavailable(ss.str()); + } + const auto &fast_value = infos[0U]; + const auto &iter = fast_value_string_2_eunm.find(fast_value); + if (iter == fast_value_string_2_eunm.end()) { + ss << "accelerate_train_mode first part is invalid: " << fast_value << ", you can choose `fast` or `fast1`"; // name the rejected token, then every accepted value + ADP_LOG(ERROR) << ss.str(); + return errors::Unavailable(ss.str()); + } + accelerate_info_.fast_value = iter->second; + const auto &mode_value = infos[1U]; + if (valid_mode_values.find(mode_value) == valid_mode_values.end()) { + ss << "accelerate_train_mode second part is invalid: " << mode_value << ", you can choose `step` or `loss`"; // name the rejected token, then every accepted value + ADP_LOG(ERROR) << ss.str(); + return errors::Unavailable(ss.str()); + } + accelerate_info_.fast_mode = mode_value; + if ((infos.size() != 3U )|| (infos[2U].empty())) { + accelerate_info_.fast_ratio = accelerate_info_.fast_mode == 
kModeValueStep ? kDefaultStepRatio : kDefaultLossRatio; + return Status::OK(); + } + return CheckAndSetAccelarateRatio(mode_value, infos[2U]); +} + +bool GeOp::IsAccelerateTrainOn() { + const auto iter = init_options_.find("ge.exec.accelerateTrainMode"); + if (iter == init_options_.end()) { + return false; + } + return !(iter->second.empty()); +} + +Status GeOp::CheckAndModifyPrecisionMode() { + std::stringstream ss; + const auto &iter_v2 = init_options_.find(ge::PRECISION_MODE_V2); + if ((accelerate_info_.origin_precision_mode_v2.empty()) && (iter_v2 != init_options_.end())) { + const auto &origin_mode_v2 = init_options_[ge::PRECISION_MODE_V2]; + const auto &inner_iter_v2 = fast_value_2_precision_mode_v2.find(accelerate_info_.fast_value); + if ((inner_iter_v2 == fast_value_2_precision_mode_v2.end()) || + (supported_origin_precision_mode_v2.find(origin_mode_v2) == supported_origin_precision_mode_v2.end())) { + ss << "accelerate fast_value:" << fast_value_enum_2_string.at(accelerate_info_.fast_value) + << " is not support with PRECISION_MODE_V2: " << origin_mode_v2; + ADP_LOG(ERROR) << ss.str(); + return errors::Unavailable(ss.str()); + } + graph_options_[ge::PRECISION_MODE_V2] = inner_iter_v2->second; + accelerate_info_.origin_precision_mode_v2 = origin_mode_v2; + ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ + << "change PRECISION_MODE_V2 from: " << accelerate_info_.origin_precision_mode_v2 + << "to: " << inner_iter_v2->second; + return Status::OK(); + } + if ((accelerate_info_.origin_precision_mode_v1.empty())) { + // if init_options_ has no PRECISION_MODE, set empty to origin mode + const auto &origin_mode_v1 = init_options_[ge::PRECISION_MODE]; + const auto &inner_iter_v1 = fast_value_2_precision_mode_v1.find(accelerate_info_.fast_value); + if ((inner_iter_v1 == fast_value_2_precision_mode_v1.end()) || + (supported_origin_precision_mode_v1.find(origin_mode_v1) == supported_origin_precision_mode_v1.end())) { + ss << "accelerate fast_value:" << 
fast_value_enum_2_string.at(accelerate_info_.fast_value) + << " is not support with PRECISION_MODE: " << origin_mode_v1; + ADP_LOG(ERROR) << ss.str(); + return errors::Unavailable(ss.str()); + } + graph_options_[ge::PRECISION_MODE] = inner_iter_v1->second; + accelerate_info_.origin_precision_mode_v1 = origin_mode_v1; + ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ + << "change PRECISION_MODE from: " << accelerate_info_.origin_precision_mode_v1 + << "to: " << inner_iter_v1->second; + } + return Status::OK(); +} + +Status GeOp::RecoverPrecisionMode() { // writes the saved origin precision mode back into graph_options_: V2 when a non-empty V2 origin was recorded, otherwise V1; always returns OK + if (!accelerate_info_.origin_precision_mode_v2.empty()) { + const std::string fast_value = graph_options_[ge::PRECISION_MODE_V2]; // copy, not a reference: the map entry is overwritten on the next line and the old value is still needed for the log + graph_options_[ge::PRECISION_MODE_V2] = accelerate_info_.origin_precision_mode_v2; + ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << "recover PRECISION_MODE_V2 from: " << fast_value + << "to: " << accelerate_info_.origin_precision_mode_v2; + } else { + const std::string fast_value = graph_options_[ge::PRECISION_MODE]; // copy for the same reason as in the V2 branch + graph_options_[ge::PRECISION_MODE] = accelerate_info_.origin_precision_mode_v1; + ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << "recover PRECISION_MODE from: " << fast_value + << "to: " << accelerate_info_.origin_precision_mode_v1; + } + return Status::OK(); +} + int32_t GeOp::InitRebuildFlag(uint32_t cache_graph_id) { if (!build_flag_) { ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << ", graph id: " << cache_graph_id @@ -534,12 +743,19 @@ int32_t GeOp::InitRebuildFlag(uint32_t cache_graph_id) { LOG(ERROR) << "[GEOP] GE session is nullptr"; return -1; } - if (!ge_session_->IsGraphNeedRebuild(cache_graph_id)) { + + if (NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode_) != Status::OK()) { + ADP_LOG(ERROR) << "[GEOP] prepare to accelerate for train failed"; + LOG(ERROR) << "[GEOP] prepare to accelerate for train failed"; + return -1; + } + if (!ge_session_->IsGraphNeedRebuild(cache_graph_id) && !(need_recover_precision_mode_)) { 
ADP_LOG(INFO) << "[GEOP] tf session " << tf_session_ << ", graph id: " << cache_graph_id << " no need to rebuild"; return 0; } - ADP_LOG(INFO) << "[GEOP] The graph need rebuild, graph id " << cache_graph_id; + ADP_LOG(INFO) << "[GEOP] The graph need rebuild, graph id " << cache_graph_id << " ,need_change_precision_mode" + << need_recover_precision_mode_; // The graph need to rebuild, remove it from GE first. ADP_LOG(INFO) << "[GEOP] tf session: " << tf_session_ << ", graph id: " << cache_graph_id; @@ -960,6 +1176,13 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { graph_options_["ge.jit_compile"] = jit_compile_; graph_options_["ge.exec.overflow"] = "1"; graph_options_["ge.graphLevelSat"] = (mix_compile_mode_ == "0") ? "1" : "0"; + if(IsAccelerateTrainOn()) { + if (need_recover_precision_mode_) { + OP_REQUIRES_OK_ASYNC(ctx, RecoverPrecisionMode(), done); + } else { + OP_REQUIRES_OK_ASYNC(ctx, CheckAndModifyPrecisionMode(), done); + } + } // call ge session addGraph api auto graph_options = graph_options_; diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index 0c290219d..75fac85bc 100644 --- a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -47,12 +47,20 @@ using AoeSetTuningGraphFunc = AoeStatus (*)(SessionId, ge::Graph &); using AoeTuningGraphFunc = AoeStatus (*)(SessionId, const std::map &); class GeOp : public AsyncOpKernel { -public: + public: explicit GeOp(OpKernelConstruction *ctx); ~GeOp() override; void ComputeAsync(OpKernelContext *ctx, DoneCallback done) override; - -private: + enum FastValue { kfast = 0, kfast1 }; + struct AccelerateInfo { + FastValue fast_value; + std::string fast_mode; + float fast_ratio; + std::string origin_precision_mode_v1; + std::string origin_precision_mode_v2; + }; + + private: void Initialize(OpKernelConstruction *ctx); void Finalize(); @@ -92,7 +100,14 @@ private: void AddNodeAttrs(Node *node, bool &is_initialize); int InitRebuildFlag(uint32_t cache_graph_id); 
- + Status NeedRecompileWhenAccelerateTrainOn(bool &need_recompile); + bool IsAccelerateTrainOn(); + Status ParserAccelerateTrain(const std::string &accelerate_train_mode); + Status CheckAndSetAccelarateRatio(const std::string &mode_value, const std::string &ratio_value); + Status CheckAndModifyPrecisionMode(); + Status RecoverPrecisionMode(); + Status TriggeredByStep(bool &is_triggered); + Status TriggeredByLoss(bool &is_triggered); bool IncrementGraphIdCount(uint32_t &graph_id); bool DecrementGraphIdCount(const std::string &tf_session, uint32_t &graph_id); @@ -207,6 +222,7 @@ private: std::map is_getnext_dynamic_shape_; SessionId session_id_; bool is_aoe_{false}; + bool need_recover_precision_mode_{false}; AoeInitializeFunc aoe_initialize_; AoeFinalizeFunc aoe_finalize_; AoeCreateSessionFunc aoe_create_session_; @@ -217,6 +233,8 @@ private: AoeTuningGraphFunc aoe_tuning_graph_; AoeSetDependGraphsInputsFunc aoe_set_depend_graphs_inputs_; AoeSetTuningGraphInputFunc aoe_set_tuning_graph_input_; + // accelerate train + AccelerateInfo accelerate_info_; }; } // namespace tensorflow #endif // TENSORFLOW_KERNELS_GEOP_NPU_H_ diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index 2d4a31f27..7a3fea17b 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -419,7 +419,8 @@ class ExperimentalConfig(): enable_graph_parallel=None, graph_parallel_option_path=None, graph_compiler_cache_dir=None, - resource_config_path=None): + resource_config_path=None, + accelerate_train_mode=None): """ Constructs a ExperimentalConfig. 
@@ -441,7 +442,7 @@ class ExperimentalConfig(): self._graph_parallel_option_path = graph_parallel_option_path self._graph_compiler_cache_dir = graph_compiler_cache_dir self._resource_config_path = resource_config_path - + self._accelerate_train_mode = accelerate_train_mode class NpuExecutePlacement(Enum): """npu execute place option. """ diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index 5bad261e0..6ca2c1b7a 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -617,6 +617,9 @@ class NPUEstimator(estimator_lib.Estimator): config._experimental_config._graph_parallel_option_path) if config._experimental_config._enable_graph_parallel is not None: custom_op.parameter_map["enable_graph_parallel"].b = config._experimental_config._enable_graph_parallel + if config._experimental_config._accelerate_train_mode is not None: + custom_op.parameter_map[ + "experimental_accelerate_train_mode"].s = config._experimental_config._accelerate_train_mode def __load_stream_max_config(self, config, custom_op): """Load stream_max_parallel_num config ,and add to custom_optimizers diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py index 5c9897d92..eac6d048c 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_hook.py @@ -375,3 +375,34 @@ class NPUOutputTensorHook(basic_session_run_hooks.LoggingTensorHook): def _call_output_fn(self): self._output_fn.__call__(self._output_list) del self._output_list[:] + + +class TellMeStepOrLossHook(session_run_hook.SessionRunHook): + """Exports step/loss values through the STEP_NOW, TOTAL_STEP, LOSS_NOW and TARGET_LOSS env variables""" + + def __init__(self, step=None, loss=None, total_step=None, final_loss=None + ): + """Initializes a `TellMeStepOrLossHook`; arguments left as None are neither fetched nor exported. 
+ """ + self._step = step + self._loss = loss + self._total_step = total_step + self._final_loss = final_loss + + def before_run(self, run_context): + """Call before session will run""" + logging.info("TellMeStepOrLossHook before_run...") + # SessionRunArgs takes one fetches structure; key it by env name and drop None entries, which cannot be fetched + fetches = {"STEP_NOW": self._step, "TOTAL_STEP": self._total_step, + "LOSS_NOW": self._loss, "TARGET_LOSS": self._final_loss} + return tf.train.SessionRunArgs({k: v for k, v in fetches.items() if v is not None}) + + def after_run(self, run_context, run_values=None): + """Call after session has run""" + # the framework passes (run_context, run_values); the old one-argument call keeps working + run_values = run_context if run_values is None else run_values + logging.info("TellMeStepOrLossHook after_run: %s", run_values.results) + # results is keyed by env variable name, mirroring the fetches built in before_run + for env_name, value in run_values.results.items(): + # os.environ only accepts str values + os.environ[env_name] = str(value) diff --git a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc index f7a5d0ff3..ae810d554 100644 --- a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc +++ b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc @@ -48,22 +48,64 @@ class NpuHostGetNextAllocator : public tensorflow::Allocator { std::unique_ptr output_; }; +class DummyDevice : public DeviceBase { + public: + DummyDevice(Env *env, bool save) : DeviceBase(env), save_(save) {} + bool RequiresRecordingAccessedTensors() const override { + return save_; + } + Allocator *GetAllocator(AllocatorAttributes /*attr*/) override { + return cpu_allocator(); + } + + private: + bool save_; +}; + +namespace { +std::unique_ptr g_op = nullptr; +void CreateGeOp() { + Env *env = Env::Default(); + GraphDef graph_def; + NodeDef node_def; + std::string graph_def_path = "tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt"; + gtl::InlinedVector inputs; + ReadTextProto(env, graph_def_path, &graph_def); + AsyncOpKernel *async_op; + for (int i = 0; i < graph_def.node_size(); i++) { + NodeDef *node_def = graph_def.mutable_node(i); + if (node_def->name() == "GeOp1_0") { + OpKernelContext::Params params; + 
params.record_tensor_accesses = false; + auto device = absl::make_unique(env, params.record_tensor_accesses); + params.device = device.get(); + Status status; + g_op = CreateOpKernel(DEVICE_CPU, params.device, cpu_allocator(), *node_def, TF_GRAPH_DEF_VERSION, &status); + EXPECT_TRUE(status.ok()); + } + } +} +void DelGeOp() { + g_op.reset(); +} +void UnSetEnv() { + unsetenv("LOSS_NOW"); + unsetenv("TARGET_LOSS"); + unsetenv("STEP_NOW"); + unsetenv("TOTAL_STEP"); +} +} class GeOpTest : public testing::Test { protected: virtual void SetUp() { *const_cast(&kDumpGraph) = true; NpuAttrs::SetNewDataTransferFlag(true); + CreateGeOp(); + } + virtual void TearDown() { + DelGeOp(); + UnSetEnv(); } - virtual void TearDown() {} -}; -class DummyDevice : public DeviceBase { - public: - DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {} - bool RequiresRecordingAccessedTensors() const override { return save_; } - Allocator* GetAllocator(AllocatorAttributes /*attr*/) override { return cpu_allocator(); } - - private: - bool save_; }; Status GeOpRunGraphAsync(std::string example_path, gtl::InlinedVector inputs, NodeDef& geop_node_def, @@ -571,5 +613,79 @@ TEST_F(GeOpTest, test_SeparateGraphDef) { attr2->insert({"value", value_attr2}); EXPECT_EQ(geop_node->SeparateGraphDef(graph_def, partition_graph, const_value_map).ok(), true); } + +TEST_F(GeOpTest, test_AccelerateTrain_InvalidOption) { + GeOp *geop_node = dynamic_cast(g_op->AsAsync()); + std::string invalid_option_value = "fastxx"; + EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false); + invalid_option_value = "fastxx|step"; + EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false); + invalid_option_value = "fast|stepxx"; + EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false); + invalid_option_value = "fast|step|0.1"; + EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false); + invalid_option_value = "fast|step|1.6"; + 
EXPECT_EQ(geop_node->ParserAccelerateTrain(invalid_option_value).ok(), false); +} + +TEST_F(GeOpTest, test_AccelerateTrain_Loss_PrecisonV2) { + GeOp *geop_node = dynamic_cast(g_op->AsAsync()); + bool need_recover_precision_mode = false; + // not enable accelerate, skip + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true); + geop_node->init_options_["ge.exec.accelerateTrainMode"] = "fast1|loss|1.1"; + // not set env, return error + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), false); + EXPECT_FALSE(need_recover_precision_mode); + setenv("LOSS_NOW", "1.21", true); + setenv("TARGET_LOSS", "1.1", true); + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true); + // when loss is 1.21, which is 1.1 * 1.1, need recover + EXPECT_TRUE(need_recover_precision_mode); + geop_node->init_options_[ge::PRECISION_MODE_V2] = "fp16"; + // not support + EXPECT_EQ(geop_node->CheckAndModifyPrecisionMode().ok(), false); + geop_node->init_options_[ge::PRECISION_MODE_V2] = "origin"; + EXPECT_EQ(geop_node->CheckAndModifyPrecisionMode().ok(), true); + // change mode to mixed successfully + EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE_V2], "mixed_bfloat16"); + EXPECT_EQ(geop_node->RecoverPrecisionMode().ok(), true); + EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE_V2], "origin"); +} + +TEST_F(GeOpTest, test_AccelerateTrain_Loss_PrecisonV1) { + GeOp *geop_node = dynamic_cast(g_op->AsAsync()); + bool need_recover_precision_mode = false; + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true); + geop_node->init_options_["ge.exec.accelerateTrainMode"] = "fast1|loss|1.1"; + setenv("LOSS_NOW", "1.22", true); + setenv("TARGET_LOSS", "1.1", true); + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true); + EXPECT_FALSE(need_recover_precision_mode); + setenv("LOSS_NOW", 
"1.21", true); + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true); + EXPECT_TRUE(need_recover_precision_mode); + EXPECT_EQ(geop_node->CheckAndModifyPrecisionMode().ok(), true); + EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE], "allow_mix_precision_bf16"); + EXPECT_EQ(geop_node->RecoverPrecisionMode().ok(), true); + EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE], ""); +} + +TEST_F(GeOpTest, test_AccelerateTrain_Step) { + GeOp *geop_node = dynamic_cast(g_op->AsAsync()); + bool need_recover_precision_mode = false; + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true); + geop_node->init_options_["ge.exec.accelerateTrainMode"] = "fast|step"; + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), false); + EXPECT_FALSE(need_recover_precision_mode); + setenv("STEP_NOW", "9000", true); + setenv("TOTAL_STEP", "10000", true); + EXPECT_EQ(geop_node->NeedRecompileWhenAccelerateTrainOn(need_recover_precision_mode).ok(), true); + EXPECT_TRUE(need_recover_precision_mode); + EXPECT_EQ(geop_node->CheckAndModifyPrecisionMode().ok(), true); + EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE], "allow_mix_precision_fp16"); + EXPECT_EQ(geop_node->RecoverPrecisionMode().ok(), true); + EXPECT_EQ(geop_node->graph_options_[ge::PRECISION_MODE], ""); +} } // namespace } // namespace tensorflow diff --git a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc index a33b0362c..39c52c807 100644 --- a/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc +++ b/tf_adapter/tests/ut/util/testcase/npu_attrs_test.cc @@ -59,6 +59,32 @@ TEST_F(NpuAttrTest, GetEnvAscendDeviceIdFailTest) { Status s = GetEnvDeviceID(device_id); EXPECT_EQ(s.ok(), false); } + +TEST_F(NpuAttrTest, GetStepFromEnv) { + uint32_t step = 0; + Status s = GetStepFromEnv("STEP_NOW", step); + EXPECT_EQ(s.ok(), false); + 
setenv("STEP_NOW", "1000", true); + s = GetStepFromEnv("STEP_NOW", step); + EXPECT_EQ(s.ok(), true); + EXPECT_EQ(step, 1000); + setenv("STEP_NOW", "1.1", true); + s = GetStepFromEnv("STEP_NOW", step); + EXPECT_EQ(s.ok(), false); + unsetenv("STEP_NOW"); +} + +TEST_F(NpuAttrTest, GetLossFromEnv) { + float loss = 0; + Status s = GetLossFromEnv("LOSS_NOW", loss); + EXPECT_EQ(s.ok(), false); + setenv("LOSS_NOW", "1.1", true); + s = GetLossFromEnv("LOSS_NOW", loss); + EXPECT_EQ(s.ok(), true); + EXPECT_FLOAT_EQ(loss, 1.1); + unsetenv("LOSS_NOW"); +} + TEST_F(NpuAttrTest, SplitTest) { std::string s = "a,b,c"; std::vector res; diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc index 846195020..7b10bb277 100644 --- a/tf_adapter/util/ge_plugin.cc +++ b/tf_adapter/util/ge_plugin.cc @@ -286,6 +286,8 @@ void GePlugin::Init(std::map &init_options, const bool ADP_LOG(INFO) << "[GePlugin] optypelist_for_implmode :" << init_options[ge::OPTYPELIST_FOR_IMPLMODE]; + ADP_LOG(INFO) << "[GePlugin] accelerate train mode :" << init_options["ge.exec.accelerateTrainMode"]; + bool tdt_uninit_env = false; (void) ReadBoolFromEnvVar("ASCEND_TDT_UNINIT", false, &tdt_uninit_env); if (!kIsHeterogeneous && !tdt_uninit_env) { diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 1b064d4db..56926ca08 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -171,6 +171,39 @@ Status GetEnvDeviceID(uint32_t &device_id) { } return Status::OK(); } + +Status GetStepFromEnv(const std::string &env_name, uint32_t &step) { + std::string step_string; + (void) ReadStringFromEnvVar(env_name, "", &step_string); + std::stringstream ss; + if (step_string.empty()) { + ss << env_name << " is not set, which is needed when accelarate by step"; + return errors::InvalidArgument(ss.str()); + } else { + if (!strings::safe_strtou32(step_string, &step)) { + ss << env_name << " is invalid, should be int, such as 1000"; + return 
errors::InvalidArgument(ss.str()); + } + } + return Status::OK(); +} + +Status GetLossFromEnv(const std::string &env_name, float &loss) { + std::string loss_string; + (void) ReadStringFromEnvVar(env_name, "", &loss_string); + std::stringstream ss; + if (loss_string.empty()) { + ss << env_name << " is not set, which is needed when accelarate by loss"; + return errors::InvalidArgument(ss.str()); + } else { + if (!strings::safe_strtof(loss_string, &loss)) { + ss << env_name << " is invalid, should be float, such as 0.1"; + return errors::InvalidArgument(ss.str()); + } + } + return Status::OK(); +} + void Split(const std::string &s, std::vector &result, const char *delchar) { if (s.empty()) { return; @@ -604,6 +637,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr std::string logical_device_cluster_deploy_mode = "LB"; std::string logical_device_id; std::string model_deploy_mode; + std::string accelerate_train_mode; std::string model_deploy_devicelist; std::string dump_data = "tensor"; std::string aoe_config_file; @@ -644,6 +678,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr (void) ctx->GetAttr("_logical_device_cluster_deploy_mode", &logical_device_cluster_deploy_mode); (void) ctx->GetAttr("_logical_device_id", &logical_device_id); (void) ctx->GetAttr("_model_deploy_mode", &model_deploy_mode); + (void) ctx->GetAttr("_accelerate_train_mode", &accelerate_train_mode); (void) ctx->GetAttr("_model_deploy_devicelist", &model_deploy_devicelist); (void) ctx->GetAttr("_dump_data", &dump_data); (void) ctx->GetAttr("_aoe_config_file", &aoe_config_file); @@ -702,6 +737,7 @@ std::map NpuAttrs::GetInitOptions(const OpKernelConstr init_options_["stream_sync_timeout"] = stream_sync_timeout; init_options_["event_sync_timeout"] = event_sync_timeout; init_options_["ge.esClusterConfig"] = es_cluster_config; + init_options_["ge.exec.accelerateTrainMode"] = accelerate_train_mode; return init_options_; } @@ -1113,6 +1149,7 @@ std::map NpuAttrs::GetAllAttrOptions(const 
AttrSlice & std::string enable_graph_parallel; std::string graph_compiler_cache_dir; std::string graph_slice_mode; + std::string accelerate_train_mode; auto NpuOptimizer_value = attrs.Find("_NpuOptimizer"); auto enable_data_pre_proc_value = attrs.Find("_enable_data_pre_proc"); @@ -1202,6 +1239,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & auto enable_graph_parallel_val = attrs.Find("_enable_graph_parallel"); auto jit_compile_value = attrs.Find("_jit_compile"); auto graph_compiler_cache_dir_val = attrs.Find("_graph_compiler_cache_dir"); + auto accelerate_train_mode_value = attrs.Find("_accelerate_train_mode"); if (NpuOptimizer_value != nullptr) { do_npu_optimizer = "1"; @@ -1444,6 +1482,9 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & if (model_deploy_mode_value != nullptr) { model_deploy_mode = model_deploy_mode_value->s(); } + if (accelerate_train_mode_value != nullptr) { + accelerate_train_mode = accelerate_train_mode_value->s(); + } if (model_deploy_devicelist_value != nullptr) { model_deploy_devicelist = model_deploy_devicelist_value->s(); } @@ -1577,6 +1618,7 @@ std::map NpuAttrs::GetAllAttrOptions(const AttrSlice & all_options["logical_device_cluster_deploy_mode"] = logical_device_cluster_deploy_mode; all_options["logical_device_id"] = logical_device_id; all_options["model_deploy_mode"] = model_deploy_mode; + all_options["accelerate_train_mode"] = accelerate_train_mode; all_options["model_deploy_devicelist"] = model_deploy_devicelist; all_options["topo_sorting_mode"] = topo_sorting_mode; all_options["ge.topoSortingMode"] = topo_sorting_mode; @@ -1711,6 +1753,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options std::string es_cluster_config; std::string graph_slice_mode; std::string jit_compile; + std::string accelerate_train_mode; const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); for (const auto &custom_optimizer : 
rewrite_options.custom_optimizers()) { @@ -2155,6 +2198,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options if (params.count("experimental_logical_device_id") > 0) { logical_device_id = params.at("experimental_logical_device_id").s(); } + if (params.count("experimental_accelerate_train_mode") > 0) { + accelerate_train_mode = params.at("experimental_accelerate_train_mode").s(); + } if (params.count("experimental_model_deploy_mode") > 0) { model_deploy_mode = params.at("experimental_model_deploy_mode").s(); } @@ -2337,6 +2383,8 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options init_options_["ge.exec.logicalDeviceId"] = logical_device_id; init_options_["model_deploy_mode"] = model_deploy_mode; init_options_["ge.exec.modelDeployMode"] = model_deploy_mode; + init_options_["accelerate_train_mode"] = accelerate_train_mode; + init_options_["ge.exec.accelerateTrainMode"] = accelerate_train_mode; init_options_["model_deploy_devicelist"] = model_deploy_devicelist; init_options_["ge.exec.modelDeployDevicelist"] = model_deploy_devicelist; init_options_["dump_data"] = dump_data; diff --git a/tf_adapter/util/npu_attrs.h b/tf_adapter/util/npu_attrs.h index 473d168c2..36eb9a37b 100644 --- a/tf_adapter/util/npu_attrs.h +++ b/tf_adapter/util/npu_attrs.h @@ -35,6 +35,8 @@ namespace tensorflow { std::string GetDumpPath(); Status GetEnvDeviceID(uint32_t &device_id); +Status GetStepFromEnv(const std::string &env_name, uint32_t &step); +Status GetLossFromEnv(const std::string &env_name, float &loss); void Split(const std::string &s, std::vector &result, const char *delchar = " "); extern const bool kDumpGraph; extern const bool kIsHeterogeneous; -- Gitee