From 179b0e21bd521fafa690803600433437b112297a Mon Sep 17 00:00:00 2001 From: wangxiaotian22 Date: Fri, 26 Feb 2021 11:43:17 +0800 Subject: [PATCH] support error_manager used in train --- CMakeLists.txt | 10 ------- ge/client/ge_api.cc | 30 ++++++++++++++++++++ ge/graph/load/model_manager/davinci_model.cc | 2 ++ ge/graph/load/model_manager/davinci_model.h | 3 ++ ge/graph/manager/graph_manager.cc | 12 +++++--- ge/graph/manager/graph_manager.h | 3 ++ ge/offline/main.cc | 1 + inc/external/ge/ge_api.h | 4 +++ metadef | 2 +- parser | 2 +- 10 files changed, 53 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f88da24e..e67b5074 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,9 +76,7 @@ if (ENABLE_OPEN_SRC) find_module(runtime libruntime.so ${GE_LIB_PATH}) find_module(runtime_compile libruntime_compile.so ${GE_LIB_PATH}) find_module(resource libresource.so ${GE_LIB_PATH}) - find_module(error_manager liberror_manager.so ${GE_LIB_PATH}) find_module(ascend_hal_stub libascend_hal.so ${GE_LIB_PATH}) - find_module(error_manager_static liberror_manager.a ${GE_LIB_PATH}) find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${GE_LIB_PATH}) #find_module(ascendcl_static libascendcl.a ${GE_LIB_PATH}) elseif(ENABLE_GE_COV OR ENABLE_GE_UT) @@ -86,11 +84,9 @@ if (ENABLE_OPEN_SRC) else() find_module(slog libalog.so ${ASCEND_ATC_DIR}) find_module(static_mmpa libmmpa.a ${ASCEND_ATC_DIR}) - find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR}) if(PLATFORM STREQUAL "train") find_module(adump_server libadump_server.a ${ASCEND_RUNTIME_DIR}) find_module(runtime libruntime.so ${ASCEND_RUNTIME_DIR}) - find_module(error_manager liberror_manager.so ${ASCEND_RUNTIME_DIR}) find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${ASCEND_RUNTIME_DIR}) find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}/driver) if(PRODUCT STREQUAL "flr3") @@ -100,8 +96,6 @@ if (ENABLE_OPEN_SRC) find_module(adump_server libadump_server.a ${ASCEND_ACL_DIR}) 
find_module(runtime libruntime.so ${ASCEND_ACL_DIR}) find_module(runtime_compile libruntime_compile.so ${ASCEND_ATC_DIR}) - find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR}) - find_module(error_manager_static liberror_manager.a ${ASCEND_ACL_DIR}) find_module(msprofiler_ext libmsprofiler.a ${ASCEND_ACL_DIR}) if(PRODUCT STREQUAL "flr3") elseif(PRODUCT STREQUAL "flr1") @@ -114,11 +108,9 @@ if (ENABLE_OPEN_SRC) elseif(PLATFORM STREQUAL "all") find_module(adump_server libadump_server.a ${ASCEND_RUNTIME_DIR}) find_module(runtime libruntime.so ${ASCEND_RUNTIME_DIR}) - find_module(error_manager liberror_manager.so ${ASCEND_RUNTIME_DIR}) find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${ASCEND_RUNTIME_DIR}) find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}) find_module(runtime_compile libruntime_compile.so ${ASCEND_ATC_DIR}) - find_module(error_manager_static liberror_manager.a ${ASCEND_ACL_DIR}) find_module(msprofiler_ext libmsprofiler.a ${ASCEND_ACL_DIR}) else() message(STATUS "PLATFORM param is invalid, should be train or inference, you choose nothing!") @@ -144,7 +136,6 @@ elseif (ENABLE_D OR ENABLE_ACL) # common libraries find_module(slog libalog.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH}) - find_module(error_manager liberror_manager.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH}) find_module(static_mmpa libmmpa.a ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH}) if (ENABLE_D) @@ -164,7 +155,6 @@ elseif(ENABLE_MS_TESTCASES) # common libraries find_module(slog libalog.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH}) - find_module(error_manager liberror_manager.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH}) find_module(static_mmpa libmmpa.a ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH}) set(METADEF_DIR ${CMAKE_CURRENT_LIST_DIR}/metadef) diff --git a/ge/client/ge_api.cc b/ge/client/ge_api.cc index d65d7667..05671408 100644 --- a/ge/client/ge_api.cc +++ b/ge/client/ge_api.cc @@ -32,6 +32,7 @@ #include 
"graph/common/ge_call_wrapper.h" #include "register/op_registry.h" #include "common/ge/tbe_plugin_manager.h" +#include "common/util/error_manager/error_manager.h" #include "toolchain/plog.h" using domi::OpRegistry; @@ -79,6 +80,8 @@ Status CheckOptionsValid(const std::map &options) { // Initialize GE, prepare for execution, call GELib::Initialize Status GEInitializeImpl(const std::map &options) { GELOGT(TRACE_INIT, "GEInitialize start"); + + ErrorManager::GetInstance().GenWorkStreamIdDefault(); // 0.check init status if (g_ge_initialized) { GELOGW("GEInitialize is called more than once"); @@ -157,6 +160,8 @@ Status GEInitialize(const std::map &options) { // GE finalize, releasing all resources Status GEFinalize() { GELOGT(TRACE_INIT, "GEFinalize start"); + + ErrorManager::GetInstance().GenWorkStreamIdDefault(); // check init status if (!g_ge_initialized) { GELOGW("GEFinalize is called before GEInitialize"); @@ -202,9 +207,19 @@ Status GEFinalize() { return ret; } +std::string GEGetErrorMsg() { + return ErrorManager::GetInstance().GetErrorMessage(); +} + +std::string GEGetWarningMsg() { + return ErrorManager::GetInstance().GetWarningMessage(); +} + // Initialize session,which calls innerSession Session::Session(const std::map &options) { GELOGT(TRACE_INIT, "Session Constructor start"); + + ErrorManager::GetInstance().GenWorkStreamIdDefault(); // check init status sessionId_ = 0; if (!g_ge_initialized) { @@ -235,6 +250,8 @@ Session::Session(const std::map &options) { Session::Session(const std::map &options) { GELOGT(TRACE_INIT, "Session Constructor start"); + + ErrorManager::GetInstance().GenWorkStreamIdDefault(); // check init status sessionId_ = 0; if (!g_ge_initialized) { @@ -311,11 +328,13 @@ Session::~Session() { Status Session::AddGraph(uint32_t graph_id, const Graph &graph) { std::map options; + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); return AddGraph(graph_id, graph, options); } Status Session::AddGraph(uint32_t 
graph_id, const Graph &graph, const std::map &options) { GELOGT(TRACE_INIT, "Start to add graph in Session. graph_id: %u, session_id: %lu.", graph_id, sessionId_); + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session."); @@ -334,6 +353,7 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph, const std::map &options) { GELOGT(TRACE_INIT, "Start to add graph in Session. graph_id: %u, session_id: %lu.", graph_id, sessionId_); + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session."); @@ -360,6 +380,7 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph, } Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph) { + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::map options; return AddGraphWithCopy(graph_id, graph, options); } @@ -367,6 +388,7 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph) { Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph, const std::map &options) { GELOGT(TRACE_INIT, "Start to add graph in Session. 
graph_id: %u, session_id: %lu.", graph_id, sessionId_); + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session."); @@ -389,6 +411,7 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph, Status Session::RemoveGraph(uint32_t graph_id) { GELOGT(TRACE_INIT, "Session RemoveGraph start"); + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); // call RemoveGraph std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (!instance_ptr || !instance_ptr->InitFlag()) { @@ -457,6 +480,7 @@ void PrintOutputResult(std::vector &outputs) { Status Session::RunGraph(uint32_t graph_id, const std::vector &inputs, std::vector &outputs) { GELOGT(TRACE_INIT, "Session RunGraph start"); + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::vector graph_inputs = inputs; // call RunGraph std::shared_ptr instance_ptr = ge::GELib::GetInstance(); @@ -483,10 +507,12 @@ Status Session::RunGraph(uint32_t graph_id, const std::vector &inputs, s } Status Session::RegisterCallBackFunc(const std::string &key, const pCallBackFunc &callback) { + ErrorManager::GetInstance().GenWorkStreamIdDefault(); return ge::GELib::GetInstance()->SessionManagerObj().RegisterCallBackFunc(sessionId_, key, callback); } Status Session::RegisterCallBackFunc(const char *key, const session::pCallBackFunc &callback) { + ErrorManager::GetInstance().GenWorkStreamIdDefault(); std::string str_key; if (key != nullptr) { str_key = key; @@ -495,6 +521,7 @@ Status Session::RegisterCallBackFunc(const char *key, const session::pCallBackFu } Status Session::BuildGraph(uint32_t graph_id, const std::vector &inputs) { + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = 
ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed"); @@ -511,6 +538,7 @@ Status Session::BuildGraph(uint32_t graph_id, const std::vector Status Session::RunGraphAsync(uint32_t graph_id, const std::vector &inputs, RunAsyncCallback callback) { + ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed"); @@ -529,6 +557,7 @@ Status Session::RunGraphAsync(uint32_t graph_id, const std::vector &var_names, std::vector &var_values) { + ErrorManager::GetInstance().GenWorkStreamIdDefault(); auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed"); @@ -544,6 +573,7 @@ Status Session::GetVariables(const std::vector &var_names, std::vec } Status Session::GetVariables(const std::vector &var_names, std::vector &var_values) { + ErrorManager::GetInstance().GenWorkStreamIdDefault(); auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed"); diff --git a/ge/graph/load/model_manager/davinci_model.cc b/ge/graph/load/model_manager/davinci_model.cc index 3462baab..740a86f5 100755 --- a/ge/graph/load/model_manager/davinci_model.cc +++ b/ge/graph/load/model_manager/davinci_model.cc @@ -2641,6 +2641,7 @@ void *DavinciModel::Run(DavinciModel *model) { bool seq_end_flag = false; uint32_t model_id = model->Id(); uint32_t device_id = model->GetDeviceId(); + GetContext().SetWorkStreamId(model->GetWorkStreamId()); GELOGI("Model Run thread start, model_id:%u.", model_id); rtError_t rt_ret = rtSetDevice(static_cast(device_id)); @@ -2807,6 +2808,7 @@ Status DavinciModel::ModelRunStart() 
{ int64_t maxDumpOpNum = std::strtol(opt.c_str(), nullptr, kDecimal); maxDumpOpNum_ = maxDumpOpNum; + work_stream_id_ = GetContext().WorkStreamId(); CREATE_STD_THREAD(thread_id_, DavinciModel::Run, this); GELOGI("model tread create success, model id:%u.", model_id_); return SUCCESS; diff --git a/ge/graph/load/model_manager/davinci_model.h b/ge/graph/load/model_manager/davinci_model.h index 5bc3a68e..a0df910b 100755 --- a/ge/graph/load/model_manager/davinci_model.h +++ b/ge/graph/load/model_manager/davinci_model.h @@ -412,6 +412,8 @@ class DavinciModel { /// uint64_t GetSessionId() const { return session_id_; } + uint64_t GetWorkStreamId() const { return work_stream_id_; } + /// /// @ingroup ge /// @brief SetDeviceId @@ -960,6 +962,7 @@ class DavinciModel { vector output_mbuf_list_; // output mbuf created by dequeue task. uint64_t session_id_; + uint64_t work_stream_id_; uint32_t device_id_; diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc index 8cff22ae..9412ae69 100755 --- a/ge/graph/manager/graph_manager.cc +++ b/ge/graph/manager/graph_manager.cc @@ -541,7 +541,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr } std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, compute_graph->GetGraphID(), subgraph, - compute_graph->GetName(), session_id, + compute_graph->GetName(), session_id, GetContext().WorkStreamId(), GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); @@ -557,7 +557,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr } std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, compute_graph->GetGraphID(), subgraph, - compute_graph->GetName(), session_id, + compute_graph->GetName(), session_id, GetContext().WorkStreamId(), GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); @@ -2508,8 +2508,10 @@ Status 
GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager const SubGraphInfoPtr &sub_graph_info_ptr, const std::string &root_graph_name, uint64_t session_id, + uint64_t work_stream_id, const GEThreadLocalContext &ge_context) { if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) { + GetContext().SetWorkStreamId(work_stream_id); GetContext().SetSessionId(session_id); GetThreadLocalContext() = ge_context; graph_manager->UpdateLocalOmgContext(root_graph_id); @@ -2643,6 +2645,7 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { GELOGI("A new loop start."); + GetContext().SetWorkStreamId(args.work_stream_id); GetContext().SetSessionId(args.session_id); GetThreadLocalContext() = args.context; graph_manager->UpdateLocalOmgContext(args.graph_id); @@ -2724,8 +2727,8 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { ge_root_model = graph_node->GetGeRootModel(); } - graph_manager->run_args_q_.Push(RunArgs( { graph_node, args.graph_id, args.session_id, args.input_tensor, - ge_root_model, GetThreadLocalContext(), args.callback })); + graph_manager->run_args_q_.Push(RunArgs( { graph_node, args.graph_id, args.session_id, args.work_stream_id, + args.input_tensor, ge_root_model, GetThreadLocalContext(), args.callback })); GELOGI("Loop end."); } } @@ -2824,6 +2827,7 @@ void GraphManager::RunThread(GraphManager *graph_manager) { GELOGI("A new loop start."); + GetContext().SetWorkStreamId(args.work_stream_id); GetContext().SetSessionId(args.session_id); GetThreadLocalContext() = args.context; graph_manager->UpdateLocalOmgContext(args.graph_id); diff --git a/ge/graph/manager/graph_manager.h b/ge/graph/manager/graph_manager.h index 31e8799f..a13aa2ff 100644 --- a/ge/graph/manager/graph_manager.h +++ b/ge/graph/manager/graph_manager.h @@ -196,6 +196,7 @@ class GraphManager { GraphId graph_id; std::vector input_tensor; uint64_t session_id; + uint64_t work_stream_id; GEThreadLocalContext context; RunAsyncCallback callback; }; @@ 
-204,6 +205,7 @@ class GraphManager { GraphNodePtr graph_node; GraphId graph_id; uint64_t session_id; + uint64_t work_stream_id; std::vector input_tensor; GeRootModelPtr ge_root_model; GEThreadLocalContext context; @@ -221,6 +223,7 @@ class GraphManager { const SubGraphInfoPtr &sub_graph_info_ptr, const std::string &root_graph_name, uint64_t session_id, + uint64_t work_stream_id, const GEThreadLocalContext &ge_context); Status ParseInputsDims(const std::vector &input_tensor); void ParseInputsDimsForData(const std::vector &input_tensor); diff --git a/ge/offline/main.cc b/ge/offline/main.cc index c7bb46a3..069ec769 100755 --- a/ge/offline/main.cc +++ b/ge/offline/main.cc @@ -1325,6 +1325,7 @@ int init(int argc, char* argv[]) { return ret; } + ErrorManager::GetInstance().GenWorkStreamIdDefault(); return 0; } diff --git a/inc/external/ge/ge_api.h b/inc/external/ge/ge_api.h index cd4ca323..c8b5a8ec 100644 --- a/inc/external/ge/ge_api.h +++ b/inc/external/ge/ge_api.h @@ -42,6 +42,10 @@ GE_FUNC_VISIBILITY Status GEInitialize(const std::map &)) diff --git a/metadef b/metadef index b6de68fd..f982caa0 160000 --- a/metadef +++ b/metadef @@ -1 +1 @@ -Subproject commit b6de68fdf0f131fd5f8aa3a84245ad7779b348f5 +Subproject commit f982caa0981b1fdcc55a8ec27b4f4de9c58d33ba diff --git a/parser b/parser index 7a631135..d2fc9584 160000 --- a/parser +++ b/parser @@ -1 +1 @@ -Subproject commit 7a6311351f8294eb11033b10e9f7b2b993cc3c2a +Subproject commit d2fc958450f7bd243eff8432aadeb9fa95fa2f61