Browse Source

support error_manager used in train

tags/v1.2.0
wangxiaotian22 3 years ago
parent
commit
179b0e21bd
10 changed files with 53 additions and 16 deletions
  1. +0
    -10
      CMakeLists.txt
  2. +30
    -0
      ge/client/ge_api.cc
  3. +2
    -0
      ge/graph/load/model_manager/davinci_model.cc
  4. +3
    -0
      ge/graph/load/model_manager/davinci_model.h
  5. +8
    -4
      ge/graph/manager/graph_manager.cc
  6. +3
    -0
      ge/graph/manager/graph_manager.h
  7. +1
    -0
      ge/offline/main.cc
  8. +4
    -0
      inc/external/ge/ge_api.h
  9. +1
    -1
      metadef
  10. +1
    -1
      parser

+ 0
- 10
CMakeLists.txt View File

@@ -76,9 +76,7 @@ if (ENABLE_OPEN_SRC)
find_module(runtime libruntime.so ${GE_LIB_PATH})
find_module(runtime_compile libruntime_compile.so ${GE_LIB_PATH})
find_module(resource libresource.so ${GE_LIB_PATH})
find_module(error_manager liberror_manager.so ${GE_LIB_PATH})
find_module(ascend_hal_stub libascend_hal.so ${GE_LIB_PATH})
find_module(error_manager_static liberror_manager.a ${GE_LIB_PATH})
find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${GE_LIB_PATH})
#find_module(ascendcl_static libascendcl.a ${GE_LIB_PATH})
elseif(ENABLE_GE_COV OR ENABLE_GE_UT)
@@ -86,11 +84,9 @@ if (ENABLE_OPEN_SRC)
else()
find_module(slog libalog.so ${ASCEND_ATC_DIR})
find_module(static_mmpa libmmpa.a ${ASCEND_ATC_DIR})
find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR})
if(PLATFORM STREQUAL "train")
find_module(adump_server libadump_server.a ${ASCEND_RUNTIME_DIR})
find_module(runtime libruntime.so ${ASCEND_RUNTIME_DIR})
find_module(error_manager liberror_manager.so ${ASCEND_RUNTIME_DIR})
find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${ASCEND_RUNTIME_DIR})
find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}/driver)
if(PRODUCT STREQUAL "flr3")
@@ -100,8 +96,6 @@ if (ENABLE_OPEN_SRC)
find_module(adump_server libadump_server.a ${ASCEND_ACL_DIR})
find_module(runtime libruntime.so ${ASCEND_ACL_DIR})
find_module(runtime_compile libruntime_compile.so ${ASCEND_ATC_DIR})
find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR})
find_module(error_manager_static liberror_manager.a ${ASCEND_ACL_DIR})
find_module(msprofiler_ext libmsprofiler.a ${ASCEND_ACL_DIR})
if(PRODUCT STREQUAL "flr3")
elseif(PRODUCT STREQUAL "flr1")
@@ -114,11 +108,9 @@ if (ENABLE_OPEN_SRC)
elseif(PLATFORM STREQUAL "all")
find_module(adump_server libadump_server.a ${ASCEND_RUNTIME_DIR})
find_module(runtime libruntime.so ${ASCEND_RUNTIME_DIR})
find_module(error_manager liberror_manager.so ${ASCEND_RUNTIME_DIR})
find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${ASCEND_RUNTIME_DIR})
find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR})
find_module(runtime_compile libruntime_compile.so ${ASCEND_ATC_DIR})
find_module(error_manager_static liberror_manager.a ${ASCEND_ACL_DIR})
find_module(msprofiler_ext libmsprofiler.a ${ASCEND_ACL_DIR})
else()
message(STATUS "PLATFORM param is invalid, should be train or inference, you choose nothing!")
@@ -144,7 +136,6 @@ elseif (ENABLE_D OR ENABLE_ACL)

# common libraries
find_module(slog libalog.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
find_module(error_manager liberror_manager.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
find_module(static_mmpa libmmpa.a ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})

if (ENABLE_D)
@@ -164,7 +155,6 @@ elseif(ENABLE_MS_TESTCASES)

# common libraries
find_module(slog libalog.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
find_module(error_manager liberror_manager.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
find_module(static_mmpa libmmpa.a ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})

set(METADEF_DIR ${CMAKE_CURRENT_LIST_DIR}/metadef)


+ 30
- 0
ge/client/ge_api.cc View File

@@ -32,6 +32,7 @@
#include "graph/common/ge_call_wrapper.h"
#include "register/op_registry.h"
#include "common/ge/tbe_plugin_manager.h"
#include "common/util/error_manager/error_manager.h"
#include "toolchain/plog.h"

using domi::OpRegistry;
@@ -79,6 +80,8 @@ Status CheckOptionsValid(const std::map<string, string> &options) {
// Initialize GE, prepare for execution, call GELib::Initialize
Status GEInitializeImpl(const std::map<string, string> &options) {
GELOGT(TRACE_INIT, "GEInitialize start");

ErrorManager::GetInstance().GenWorkStreamIdDefault();
// 0.check init status
if (g_ge_initialized) {
GELOGW("GEInitialize is called more than once");
@@ -157,6 +160,8 @@ Status GEInitialize(const std::map<AscendString, AscendString> &options) {
// GE finalize, releasing all resources
Status GEFinalize() {
GELOGT(TRACE_INIT, "GEFinalize start");

ErrorManager::GetInstance().GenWorkStreamIdDefault();
// check init status
if (!g_ge_initialized) {
GELOGW("GEFinalize is called before GEInitialize");
@@ -202,9 +207,19 @@ Status GEFinalize() {
return ret;
}

std::string GEGetErrorMsg() {
return ErrorManager::GetInstance().GetErrorMessage();
}

std::string GEGetWarningMsg() {
return ErrorManager::GetInstance.GetWarningMessage();
}

// Initialize session,which calls innerSession
Session::Session(const std::map<string, string> &options) {
GELOGT(TRACE_INIT, "Session Constructor start");

ErrorManager::GetInstance().GenWorkStreamIdDefault();
// check init status
sessionId_ = 0;
if (!g_ge_initialized) {
@@ -235,6 +250,8 @@ Session::Session(const std::map<string, string> &options) {

Session::Session(const std::map<AscendString, AscendString> &options) {
GELOGT(TRACE_INIT, "Session Constructor start");

ErrorManager::GetInstance().GenWorkStreamIdDefault();
// check init status
sessionId_ = 0;
if (!g_ge_initialized) {
@@ -311,11 +328,13 @@ Session::~Session() {

Status Session::AddGraph(uint32_t graph_id, const Graph &graph) {
std::map<std::string, std::string> options;
ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
return AddGraph(graph_id, graph, options);
}

Status Session::AddGraph(uint32_t graph_id, const Graph &graph, const std::map<std::string, std::string> &options) {
GELOGT(TRACE_INIT, "Start to add graph in Session. graph_id: %u, session_id: %lu.", graph_id, sessionId_);
ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session.");
@@ -334,6 +353,7 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph, const std::map<s
Status Session::AddGraph(uint32_t graph_id, const Graph &graph,
const std::map<AscendString, AscendString> &options) {
GELOGT(TRACE_INIT, "Start to add graph in Session. graph_id: %u, session_id: %lu.", graph_id, sessionId_);
ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session.");
@@ -360,6 +380,7 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph,
}

Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph) {
ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
std::map<AscendString, AscendString> options;
return AddGraphWithCopy(graph_id, graph, options);
}
@@ -367,6 +388,7 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph) {
Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph,
const std::map<AscendString, AscendString> &options) {
GELOGT(TRACE_INIT, "Start to add graph in Session. graph_id: %u, session_id: %lu.", graph_id, sessionId_);
ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session.");
@@ -389,6 +411,7 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph,
Status Session::RemoveGraph(uint32_t graph_id) {
GELOGT(TRACE_INIT, "Session RemoveGraph start");

ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
// call RemoveGraph
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (!instance_ptr || !instance_ptr->InitFlag()) {
@@ -457,6 +480,7 @@ void PrintOutputResult(std::vector<Tensor> &outputs) {
Status Session::RunGraph(uint32_t graph_id, const std::vector<Tensor> &inputs, std::vector<Tensor> &outputs) {
GELOGT(TRACE_INIT, "Session RunGraph start");

ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
std::vector<Tensor> graph_inputs = inputs;
// call RunGraph
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
@@ -483,10 +507,12 @@ Status Session::RunGraph(uint32_t graph_id, const std::vector<Tensor> &inputs, s
}

Status Session::RegisterCallBackFunc(const std::string &key, const pCallBackFunc &callback) {
ErrorManager::GetInstance().GenWorkStreamIdDefault();
return ge::GELib::GetInstance()->SessionManagerObj().RegisterCallBackFunc(sessionId_, key, callback);
}

Status Session::RegisterCallBackFunc(const char *key, const session::pCallBackFunc &callback) {
ErrorManager::GetInstance().GenWorkStreamIdDefault();
std::string str_key;
if (key != nullptr) {
str_key = key;
@@ -495,6 +521,7 @@ Status Session::RegisterCallBackFunc(const char *key, const session::pCallBackFu
}

Status Session::BuildGraph(uint32_t graph_id, const std::vector<InputTensorInfo> &inputs) {
ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed");
@@ -511,6 +538,7 @@ Status Session::BuildGraph(uint32_t graph_id, const std::vector<InputTensorInfo>

Status Session::RunGraphAsync(uint32_t graph_id, const std::vector<InputTensorInfo> &inputs,
RunAsyncCallback callback) {
ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id);
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed");
@@ -529,6 +557,7 @@ Status Session::RunGraphAsync(uint32_t graph_id, const std::vector<InputTensorIn
}

Status Session::GetVariables(const std::vector<std::string> &var_names, std::vector<Tensor> &var_values) {
ErrorManager::GetInstance().GenWorkStreamIdDefault();
auto instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed");
@@ -544,6 +573,7 @@ Status Session::GetVariables(const std::vector<std::string> &var_names, std::vec
}

Status Session::GetVariables(const std::vector<AscendString> &var_names, std::vector<Tensor> &var_values) {
ErrorManager::GetInstance().GenWorkStreamIdDefault();
auto instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed");


+ 2
- 0
ge/graph/load/model_manager/davinci_model.cc View File

@@ -2641,6 +2641,7 @@ void *DavinciModel::Run(DavinciModel *model) {
bool seq_end_flag = false;
uint32_t model_id = model->Id();
uint32_t device_id = model->GetDeviceId();
GetContext().SetWorkStreamId(model->GetWorkStreamId());

GELOGI("Model Run thread start, model_id:%u.", model_id);
rtError_t rt_ret = rtSetDevice(static_cast<int32_t>(device_id));
@@ -2807,6 +2808,7 @@ Status DavinciModel::ModelRunStart() {
int64_t maxDumpOpNum = std::strtol(opt.c_str(), nullptr, kDecimal);
maxDumpOpNum_ = maxDumpOpNum;

work_stream_id_ = GetContext().WorkStreamId();
CREATE_STD_THREAD(thread_id_, DavinciModel::Run, this);
GELOGI("model tread create success, model id:%u.", model_id_);
return SUCCESS;


+ 3
- 0
ge/graph/load/model_manager/davinci_model.h View File

@@ -412,6 +412,8 @@ class DavinciModel {
///
uint64_t GetSessionId() const { return session_id_; }

uint64_t GetWorkStreamId() const { return work_stream_id_; }

///
/// @ingroup ge
/// @brief SetDeviceId
@@ -960,6 +962,7 @@ class DavinciModel {
vector<uintptr_t> output_mbuf_list_; // output mbuf created by dequeue task.

uint64_t session_id_;
uint64_t work_stream_id_;

uint32_t device_id_;



+ 8
- 4
ge/graph/manager/graph_manager.cc View File

@@ -541,7 +541,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr
}
std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this,
compute_graph->GetGraphID(), subgraph,
compute_graph->GetName(), session_id,
compute_graph->GetName(), session_id, GetContext().WorkStreamId(),
GetThreadLocalContext());
if (!f.valid()) {
GELOGE(FAILED, "Future is invalid");
@@ -557,7 +557,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr
}
std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this,
compute_graph->GetGraphID(), subgraph,
compute_graph->GetName(), session_id,
compute_graph->GetName(), session_id, GetContext().WorkStreamId(),
GetThreadLocalContext());
if (!f.valid()) {
GELOGE(FAILED, "Future is invalid");
@@ -2508,8 +2508,10 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager
const SubGraphInfoPtr &sub_graph_info_ptr,
const std::string &root_graph_name,
uint64_t session_id,
uint64_t work_stream_id,
const GEThreadLocalContext &ge_context) {
if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) {
GetContext().SetWorkStreamId(work_stream_id);
GetContext().SetSessionId(session_id);
GetThreadLocalContext() = ge_context;
graph_manager->UpdateLocalOmgContext(root_graph_id);
@@ -2643,6 +2645,7 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) {

GELOGI("A new loop start.");

GetContext().SetWorkStreamId(args.work_stream_id);
GetContext().SetSessionId(args.session_id);
GetThreadLocalContext() = args.context;
graph_manager->UpdateLocalOmgContext(args.graph_id);
@@ -2724,8 +2727,8 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) {
ge_root_model = graph_node->GetGeRootModel();
}

graph_manager->run_args_q_.Push(RunArgs( { graph_node, args.graph_id, args.session_id, args.input_tensor,
ge_root_model, GetThreadLocalContext(), args.callback }));
graph_manager->run_args_q_.Push(RunArgs( { graph_node, args.graph_id, args.session_id, args.work_stream_id,
args.input_tensor, ge_root_model, GetThreadLocalContext(), args.callback }));
GELOGI("Loop end.");
}
}
@@ -2824,6 +2827,7 @@ void GraphManager::RunThread(GraphManager *graph_manager) {

GELOGI("A new loop start.");

GetContext().SetWorkStreamId(args.work_stream_id);
GetContext().SetSessionId(args.session_id);
GetThreadLocalContext() = args.context;
graph_manager->UpdateLocalOmgContext(args.graph_id);


+ 3
- 0
ge/graph/manager/graph_manager.h View File

@@ -196,6 +196,7 @@ class GraphManager {
GraphId graph_id;
std::vector<ge::InputTensorInfo> input_tensor;
uint64_t session_id;
uint64_t work_stream_id;
GEThreadLocalContext context;
RunAsyncCallback callback;
};
@@ -204,6 +205,7 @@ class GraphManager {
GraphNodePtr graph_node;
GraphId graph_id;
uint64_t session_id;
uint64_t work_stream_id;
std::vector<ge::InputTensorInfo> input_tensor;
GeRootModelPtr ge_root_model;
GEThreadLocalContext context;
@@ -221,6 +223,7 @@ class GraphManager {
const SubGraphInfoPtr &sub_graph_info_ptr,
const std::string &root_graph_name,
uint64_t session_id,
uint64_t work_stream_id;
const GEThreadLocalContext &ge_context);
Status ParseInputsDims(const std::vector<InputTensorInfo> &input_tensor);
void ParseInputsDimsForData(const std::vector<InputTensorInfo> &input_tensor);


+ 1
- 0
ge/offline/main.cc View File

@@ -1325,6 +1325,7 @@ int init(int argc, char* argv[]) {
return ret;
}

ErrorManager::GetInstance().GenWorkStreamIdDefault();
return 0;
}



+ 4
- 0
inc/external/ge/ge_api.h View File

@@ -42,6 +42,10 @@ GE_FUNC_VISIBILITY Status GEInitialize(const std::map<AscendString, AscendString
// Finalize GE, release all resources
GE_FUNC_VISIBILITY Status GEFinalize();

GE_FUNC_VISIBILITY std::string GEGetErrorMsg();

GE_FUNC_VISIBILITY std::string GEGetWarningMsg();

class GE_FUNC_VISIBILITY Session {
public:
ATTRIBUTED_DEPRECATED(Session(const std::map<AscendString, AscendString> &))


+ 1
- 1
metadef

@@ -1 +1 @@
Subproject commit b6de68fdf0f131fd5f8aa3a84245ad7779b348f5
Subproject commit f982caa0981b1fdcc55a8ec27b4f4de9c58d33ba

+ 1
- 1
parser

@@ -1 +1 @@
Subproject commit 7a6311351f8294eb11033b10e9f7b2b993cc3c2a
Subproject commit d2fc958450f7bd243eff8432aadeb9fa95fa2f61

Loading…
Cancel
Save