From 563e0994cf62d5a90f2f2c4776b983dbee4b5e64 Mon Sep 17 00:00:00 2001 From: zhou_lili Date: Mon, 26 Apr 2021 17:09:35 +0800 Subject: [PATCH] support overflow detection when infer --- ge/common/dump/dump_manager.cc | 116 +++++++++++------- ge/common/dump/dump_manager.h | 5 + ge/common/dump/dump_properties.cc | 23 +++- ge/common/dump/dump_properties.h | 15 ++- ge/graph/load/model_manager/data_dumper.cc | 2 +- .../executor/hybrid_model_async_executor.cc | 12 +- inc/framework/common/ge_types.h | 1 + tests/ut/ge/common/dump_manager_unittest.cc | 29 +++++ 8 files changed, 146 insertions(+), 57 deletions(-) diff --git a/ge/common/dump/dump_manager.cc b/ge/common/dump/dump_manager.cc index 51936260..a6944fc6 100644 --- a/ge/common/dump/dump_manager.cc +++ b/ge/common/dump/dump_manager.cc @@ -23,6 +23,7 @@ const char *const kDumpOFF = "OFF"; const char *const kDumpoff = "off"; const char *const kDumpOn = "on"; const uint64_t kInferSessionId = 0; +const uint32_t kAllOverflow = 3; } // namespace namespace ge { FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpManager &DumpManager::GetInstance() { @@ -30,78 +31,103 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpManager &DumpManager::GetIn return instance; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf(const DumpConfig &dump_config) { - DumpProperties dump_properties; - std::string dump_status; - std::string dump_path; - std::string dump_mode; - std::string dump_op_switch; - - if (dump_config.dump_status.empty()) { +bool DumpManager::NeedDoDump(const DumpConfig &dump_config, DumpProperties &dump_properties) { + if (dump_config.dump_status.empty() && dump_config.dump_debug.empty()) { dump_properties_map_.emplace(kInferSessionId, dump_properties); GELOGI("Dump does not open"); - return SUCCESS; + return false; } - - dump_status = dump_config.dump_status; - GELOGI("Dump status is %s", dump_status.c_str()); - if (dump_config.dump_status == kDumpoff || dump_config.dump_status == 
kDumpOFF) { + GELOGI("Dump status is %s, dump debug is %s.", dump_config.dump_status.c_str(), dump_config.dump_debug.c_str()); + if ((dump_config.dump_status == kDumpoff || dump_config.dump_status == kDumpOFF) && + dump_config.dump_debug == kDumpoff) { dump_properties.ClearDumpPropertyValue(); dump_properties_map_.emplace(kInferSessionId, dump_properties); - return SUCCESS; + return false; + } + if (dump_config.dump_status == kDumpOn && dump_config.dump_debug == kDumpOn) { + GELOGW("Not support coexistence of dump debug and dump status."); + return false; } - dump_properties.SetDumpStatus(dump_status); + return true; +} - dump_op_switch = dump_config.dump_op_switch; - dump_properties.SetDumpOpSwitch(dump_op_switch); - if (dump_op_switch == kDumpoff && dump_config.dump_list.empty()) { - dump_properties_map_.emplace(kInferSessionId, dump_properties); - GELOGE(PARAM_INVALID, "[Check][DumpList]Invalid, dump_op_switch is %s", - dump_op_switch.c_str()); - REPORT_INNER_ERROR("E19999", "Dump list check invalid, dump_op_switch is %s", - dump_op_switch.c_str()); - return PARAM_INVALID; +void DumpManager::SetDumpDebugConf(const DumpConfig &dump_config, DumpProperties &dump_properties) { + if (dump_config.dump_debug == kDumpOn) { + GELOGI("Only do overflow detection, dump debug is %s.", dump_config.dump_debug.c_str()); + dump_properties.InitInferOpDebug(); + dump_properties.SetOpDebugMode(kAllOverflow); } +} - if (!dump_config.dump_list.empty()) { - for (auto model_dump : dump_config.dump_list) { - std::string model_name = model_dump.model_name; - GELOGI("Dump model is %s", model_name.c_str()); - std::set dump_layers; - for (auto layer : model_dump.layers) { - GELOGI("Dump layer is %s in model", layer.c_str()); - dump_layers.insert(layer); - } - dump_properties.AddPropertyValue(model_name, dump_layers); +void DumpManager::SetDumpList(const DumpConfig &dump_config, DumpProperties &dump_properties) { + for (const auto &model_dump : dump_config.dump_list) { + std::string 
model_name = model_dump.model_name; + GELOGI("Dump model is %s", model_name.c_str()); + std::set dump_layers; + for (const auto &layer : model_dump.layers) { + GELOGI("Dump layer is %s in model", layer.c_str()); + dump_layers.insert(layer); + } + dump_properties.AddPropertyValue(model_name, dump_layers); + } +} + +Status DumpManager::SetNormalDumpConf(const DumpConfig &dump_config, DumpProperties &dump_properties) { + if (dump_config.dump_status == kDumpOn) { + GELOGI("Only do normal dump process, dump status is %s.", dump_config.dump_status.c_str()); + dump_properties.SetDumpStatus(dump_config.dump_status); + std::string dump_op_switch = dump_config.dump_op_switch; + dump_properties.SetDumpOpSwitch(dump_op_switch); + if (dump_op_switch == kDumpoff && dump_config.dump_list.empty()) { + dump_properties_map_.emplace(kInferSessionId, dump_properties); + GELOGE(PARAM_INVALID, "[Check][DumpList]Invalid, dump_op_switch is %s", dump_op_switch.c_str()); + REPORT_INNER_ERROR("E19999", "Dump list check invalid, dump_op_switch is %s", dump_op_switch.c_str()); + return PARAM_INVALID; } - if (dump_op_switch == kDumpOn) { - GELOGI("Start to dump model and single op,dump op switch is %s", dump_op_switch.c_str()); + + if (!dump_config.dump_list.empty()) { + if (dump_op_switch == kDumpOn) { + GELOGI("Start to dump model and single op, dump op switch is %s", dump_op_switch.c_str()); + } else { + GELOGI("Only dump model, dump op switch is %s", dump_op_switch.c_str()); + } + SetDumpList(dump_config, dump_properties); } else { - GELOGI("Only dump model,dump op switch is %s", dump_op_switch.c_str()); + GELOGI("Only dump single op, dump op switch is %s", dump_op_switch.c_str()); } - } else { - GELOGI("Only dump single op,dump op switch is %s", dump_op_switch.c_str()); + GELOGI("Dump mode is %s", dump_config.dump_mode.c_str()); + dump_properties.SetDumpMode(dump_config.dump_mode); } + return SUCCESS; +} - dump_path = dump_config.dump_path; +Status DumpManager::SetDumpPath(const DumpConfig 
&dump_config, DumpProperties &dump_properties) { + std::string dump_path = dump_config.dump_path; if (dump_path.empty()) { GELOGE(PARAM_INVALID, "[Check][DumpPath]It is empty"); REPORT_INNER_ERROR("E19999", "Dump path check is empty"); return PARAM_INVALID; } - if (dump_path[dump_path.size() - 1] != '/') { dump_path = dump_path + "/"; } dump_path = dump_path + CurrentTimeInStr() + "/"; GELOGI("Dump path is %s", dump_path.c_str()); dump_properties.SetDumpPath(dump_path); + return SUCCESS; +} - dump_mode = dump_config.dump_mode; - GELOGI("Dump mode is %s", dump_mode.c_str()); - dump_properties.SetDumpMode(dump_mode); +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf(const DumpConfig &dump_config) { + DumpProperties dump_properties; + if (!NeedDoDump(dump_config, dump_properties)) { + GELOGD("No need do dump process."); + return SUCCESS; + } + SetDumpDebugConf(dump_config, dump_properties); + GE_CHK_STATUS_RET(SetNormalDumpConf(dump_config, dump_properties), "[Init][DumpConf] failed when dump status is on."); + GE_CHK_STATUS_RET(SetDumpPath(dump_config, dump_properties), "[Init][DumpPath] failed."); dump_properties_map_[kInferSessionId] = dump_properties; - + return SUCCESS; } diff --git a/ge/common/dump/dump_manager.h b/ge/common/dump/dump_manager.h index 095344b7..fa96de93 100644 --- a/ge/common/dump/dump_manager.h +++ b/ge/common/dump/dump_manager.h @@ -34,6 +34,11 @@ class DumpManager { void RemoveDumpProperties(uint64_t session_id); private: + bool NeedDoDump(const DumpConfig &dump_config, DumpProperties &dump_properties); + void SetDumpDebugConf(const DumpConfig &dump_config, DumpProperties &dump_properties); + Status SetDumpPath(const DumpConfig &dump_config, DumpProperties &dump_properties); + Status SetNormalDumpConf(const DumpConfig &dump_config, DumpProperties &dump_properties); + void SetDumpList(const DumpConfig &dump_config, DumpProperties &dump_properties); std::mutex mutex_; std::map dump_properties_map_; }; diff --git 
a/ge/common/dump/dump_properties.cc b/ge/common/dump/dump_properties.cc index 65b1e89a..08bddf43 100644 --- a/ge/common/dump/dump_properties.cc +++ b/ge/common/dump/dump_properties.cc @@ -53,7 +53,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::InitByOpti dump_path_.clear(); dump_step_.clear(); dump_mode_.clear(); - is_op_debug_ = false; + is_train_op_debug_ = false; + is_infer_op_debug_ = false; op_debug_mode_ = 0; std::string enable_dump; @@ -124,7 +125,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::ClearDumpI dump_mode_.clear(); dump_op_switch_.clear(); dump_status_.clear(); - is_op_debug_ = false; + is_train_op_debug_ = false; + is_infer_op_debug_ = false; op_debug_mode_ = 0; } @@ -203,6 +205,14 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperti return dump_status_; } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::InitInferOpDebug() { + is_infer_op_debug_ = true; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetOpDebugMode(const uint32_t &op_debug_mode) { + op_debug_mode_ = op_debug_mode; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpOpSwitch( const std::string &dump_op_switch) { dump_op_switch_ = dump_op_switch; @@ -237,7 +247,8 @@ void DumpProperties::CopyFrom(const DumpProperties &other) { dump_op_switch_ = other.dump_op_switch_; model_dump_properties_map_ = other.model_dump_properties_map_; - is_op_debug_ = other.is_op_debug_; + is_train_op_debug_ = other.is_train_op_debug_; + is_infer_op_debug_ = other.is_infer_op_debug_; op_debug_mode_ = other.op_debug_mode_; } } @@ -254,15 +265,15 @@ void DumpProperties::SetDumpDebugOptions() { if (dump_debug_mode == OP_DEBUG_AICORE) { GELOGD("ge.exec.dumpDebugMode=aicore_overflow, op debug is open."); - is_op_debug_ = true; + is_train_op_debug_ = true; op_debug_mode_ = kAicoreOverflow; } else if (dump_debug_mode == OP_DEBUG_ATOMIC) { 
GELOGD("ge.exec.dumpDebugMode=atomic_overflow, op debug is open."); - is_op_debug_ = true; + is_train_op_debug_ = true; op_debug_mode_ = kAtomicOverflow; } else if (dump_debug_mode == OP_DEBUG_ALL) { GELOGD("ge.exec.dumpDebugMode=all, op debug is open."); - is_op_debug_ = true; + is_train_op_debug_ = true; op_debug_mode_ = kAllOverflow; } else { GELOGW("ge.exec.dumpDebugMode is invalid."); diff --git a/ge/common/dump/dump_properties.h b/ge/common/dump/dump_properties.h index 8c064d58..98487491 100644 --- a/ge/common/dump/dump_properties.h +++ b/ge/common/dump/dump_properties.h @@ -65,16 +65,26 @@ class DumpProperties { const std::string &GetDumpStatus() const; + void InitInferOpDebug(); + + bool IsInferOpDebug() const { + return is_infer_op_debug_; + } + void SetDumpOpSwitch(const std::string &dump_op_switch); const std::string &GetDumpOpSwitch() const; - bool IsOpDebugOpen() const { return is_op_debug_; } + bool IsOpDebugOpen() const { + return is_train_op_debug_ || is_infer_op_debug_; + } bool IsDumpOpen() const; bool IsSingleOpNeedDump() const; + void SetOpDebugMode(const uint32_t &op_debug_mode); + uint32_t GetOpDebugMode() const { return op_debug_mode_; } const std::string &GetEnableDump() const {return enable_dump_;} @@ -96,7 +106,8 @@ class DumpProperties { std::string dump_op_switch_; std::map> model_dump_properties_map_; - bool is_op_debug_ = false; + bool is_train_op_debug_ = false; + bool is_infer_op_debug_ = false; uint32_t op_debug_mode_ = 0; }; } diff --git a/ge/graph/load/model_manager/data_dumper.cc b/ge/graph/load/model_manager/data_dumper.cc index ac256526..c7463001 100644 --- a/ge/graph/load/model_manager/data_dumper.cc +++ b/ge/graph/load/model_manager/data_dumper.cc @@ -663,7 +663,7 @@ Status DataDumper::LoadDumpInfo() { SetOpDebugIdToAicpu(op_debug_task_id_, op_debug_stream_id_, op_debug_addr_, op_mapping_info); if (!op_list_.empty() || is_op_debug_ || is_end_graph_) { - auto ret = ExecuteLoadDumpInfo(op_mapping_info); + ret = 
ExecuteLoadDumpInfo(op_mapping_info); if (ret != SUCCESS) { GELOGE(ret, "Execute load dump info failed"); return ret; diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index 3294a286..dc8b496c 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -544,9 +544,15 @@ Status HybridModelAsyncExecutor::DumpOpDebug() { data_dumper_.SetModelId(model_->GetModelId()); data_dumper_.SetDeviceId(model_->GetDeviceId()); void *global_step = nullptr; - TensorValue *varible_global_step = model_->GetVariable(NODE_NAME_GLOBAL_STEP); - if (varible_global_step != nullptr) { - global_step = const_cast(varible_global_step->GetData()); + + if (dump_properties.IsInferOpDebug()) { + GELOGD("Init global step when infer with op debug."); + global_step = executor_->GetContext()->global_step; + } else { + TensorValue *varible_global_step = model_->GetVariable(NODE_NAME_GLOBAL_STEP); + if (varible_global_step != nullptr) { + global_step = const_cast(varible_global_step->GetData()); + } } void *loop_per_iter = nullptr; diff --git a/inc/framework/common/ge_types.h b/inc/framework/common/ge_types.h index b37574f7..1fc25b1d 100644 --- a/inc/framework/common/ge_types.h +++ b/inc/framework/common/ge_types.h @@ -293,6 +293,7 @@ struct DumpConfig { std::string dump_mode; std::string dump_status; std::string dump_op_switch; + std::string dump_debug; std::vector dump_list; }; } // namespace ge diff --git a/tests/ut/ge/common/dump_manager_unittest.cc b/tests/ut/ge/common/dump_manager_unittest.cc index 7f3880f2..50eabc4a 100644 --- a/tests/ut/ge/common/dump_manager_unittest.cc +++ b/tests/ut/ge/common/dump_manager_unittest.cc @@ -67,6 +67,35 @@ TEST_F(UTEST_dump_manager, is_dump_single_op_close_success) { EXPECT_EQ(ret, ge::SUCCESS); } + // dump_debug and dump_status are on + TEST_F(UTEST_dump_manager, dump_op_debug_on) { + DumpConfig dump_config; + dump_config.dump_debug = 
"on"; + dump_config.dump_status = "on"; + auto ret = DumpManager::GetInstance().SetDumpConf(dump_config); + EXPECT_EQ(ret, ge::SUCCESS); + } + + // just dump_status is on + TEST_F(UTEST_dump_manager, dump_status_without_dump_list) { + DumpConfig dump_config; + dump_config.dump_status = "on"; + auto ret = DumpManager::GetInstance().SetDumpConf(dump_config); + EXPECT_EQ(ret, ge::PARAM_INVALID); + } + + // dump_status is on with dump_list + TEST_F(UTEST_dump_manager, dump_status_with_dump_list) { + DumpConfig dump_config; + dump_config.dump_status = "on"; + ModelDumpConfig dump_list; + dump_list.model_name = "test"; + dump_list.layers.push_back("first"); + dump_config.dump_list.push_back(dump_list); + auto ret = DumpManager::GetInstance().SetDumpConf(dump_config); + EXPECT_EQ(ret, ge::PARAM_INVALID); + } + TEST_F(UTEST_dump_manager, add_dump_properties_success) { DumpProperties dump_properties; DumpManager::GetInstance().AddDumpProperties(0, dump_properties);