diff --git a/build.sh b/build.sh index 3e2dcdec..7b1c0792 100644 --- a/build.sh +++ b/build.sh @@ -229,7 +229,7 @@ if [[ "X$ENABLE_GE_UT" = "Xon" || "X$ENABLE_GE_COV" = "Xon" ]]; then rm -rf ${BASEPATH}/cov mkdir ${BASEPATH}/cov lcov -c -d build/tests/ut/ge -d build/tests/ut/common/graph/ -o cov/tmp.info - lcov -r cov/tmp.info '*/output/*' '*/build/opensrc/*' '*/build/proto/*' '*/third_party/*' '*/tests/*' '/usr/local/*' -o cov/coverage.info + lcov -r cov/tmp.info '*/output/*' '*/build/opensrc/*' '*/build/proto/*' '*/third_party/*' '*/tests/*' '/usr/local/*' '/usr/include/*' '*/metadef/*' '*/parser/*' -o cov/coverage.info cd ${BASEPATH}/cov genhtml coverage.info fi diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index 8977ad85..89745019 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -31,6 +31,7 @@ set(PROTO_HEADER_LIST protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) protobuf_generate(ge PROTO_CLIENT_SRCS PROTO_CLIENT_HDRS ${PROTO_CLIENT_LIST}) protobuf_generate(ge PROTO_HEADER_SRCS PROTO_HEADER_HDRS ${PROTO_HEADER_LIST}) +protobuf_generate(ge_client PROTO_CLIENT_HEADER_SRCS PROTO_CLIENT_HEADER_HDRS ${PROTO_HEADER_LIST}) if (NOT ENABLE_D AND NOT ENABLE_ACL AND NOT ENABLE_MS_TESTCASES) ############ libge_proto_common.a ############ @@ -56,7 +57,7 @@ target_link_libraries(ge_proto_common PRIVATE ############ libge_proto_client.a ############ add_library(ge_proto_client STATIC - ${PROTO_HEADER_HDRS} + ${PROTO_CLIENT_HEADER_HDRS} ${PROTO_CLIENT_SRCS} ) @@ -65,6 +66,11 @@ target_compile_definitions(ge_proto_client PRIVATE google=ascend_private ) +target_include_directories(ge_proto_client PRIVATE + ${CMAKE_BINARY_DIR}/proto/ge_client + ${CMAKE_BINARY_DIR}/proto/ge_client/proto +) + target_compile_options(ge_proto_client PRIVATE -O2 -fno-common @@ -102,6 +108,7 @@ set(TRAIN_SRC_LIST "common/helper/model_cache_helper.cc" "common/profiling/profiling_manager.cc" "common/dump/dump_manager.cc" + "common/dump/exception_dumper.cc" 
"common/dump/dump_properties.cc" "common/dump/opdebug_register.cc" "common/dump/dump_op.cc" @@ -189,6 +196,7 @@ set(TRAIN_SRC_LIST "graph/passes/atomic_addr_clean_pass.cc" "graph/passes/mark_same_addr_pass.cc" "graph/passes/mark_graph_unknown_status_pass.cc" + "graph/passes/mark_node_unknown_shape_pass.cc" "graph/passes/mark_agnostic_pass.cc" "graph/partition/dynamic_shape_partition.cc" "graph/partition/stage_partition.cc" @@ -209,6 +217,7 @@ set(TRAIN_SRC_LIST "graph/passes/dimension_compute_pass.cc" "graph/passes/dropout_pass.cc" "graph/passes/hccl_group_pass.cc" + "graph/passes/hccl_tailing_optimization_pass.cc" "graph/passes/enter_pass.cc" "graph/passes/assign_remove_pass.cc" "graph/passes/inplace_support_check_pass.cc" @@ -320,7 +329,9 @@ set(TRAIN_SRC_LIST "graph/passes/variable_ref_useless_control_out_delete_pass.cc" "graph/passes/end_of_sequence_add_control_pass.cc" "graph/passes/memcpy_addr_async_pass.cc" + "graph/passes/parallel_group_pass.cc" "graph/passes/set_input_output_offset_pass.cc" + "graph/passes/buffer_pool_memory_pass.cc" "graph/preprocess/graph_preprocess.cc" "graph/preprocess/insert_op/ge_aipp_op.cc" "graph/preprocess/insert_op/util_insert_aipp_op.cc" @@ -399,6 +410,7 @@ set(TRAIN_SRC_LIST "graph/build/memory/hybrid_mem_assigner.cc" "graph/build/memory/max_block_mem_assigner.cc" "graph/build/memory/var_mem_assign_util.cc" + "graph/build/memory/buffer_pool_mem_assigner.cc" ) set(INFER_SRC_LIST @@ -426,6 +438,7 @@ set(INFER_SRC_LIST "common/formats/formats.cc" "common/profiling/profiling_manager.cc" "common/dump/dump_properties.cc" + "common/dump/exception_dumper.cc" "common/dump/dump_manager.cc" "common/dump/dump_op.cc" "common/dump/opdebug_register.cc" @@ -499,6 +512,7 @@ set(INFER_SRC_LIST "graph/passes/atomic_addr_clean_pass.cc" "graph/passes/mark_same_addr_pass.cc" "graph/passes/mark_graph_unknown_status_pass.cc" + "graph/passes/mark_node_unknown_shape_pass.cc" "graph/passes/mark_agnostic_pass.cc" "graph/common/omg_util.cc" 
"graph/common/bcast.cc" @@ -605,8 +619,11 @@ set(INFER_SRC_LIST "graph/passes/link_gen_mask_nodes_pass.cc" "graph/passes/replace_with_empty_const_pass.cc" "graph/passes/hccl_group_pass.cc" + "graph/passes/hccl_tailing_optimization_pass.cc" "graph/passes/memcpy_addr_async_pass.cc" "graph/passes/set_input_output_offset_pass.cc" + "graph/passes/parallel_group_pass.cc" + "graph/passes/buffer_pool_memory_pass.cc" "graph/manager/model_manager/event_manager.cc" "graph/manager/util/rt_context_util.cc" "graph/manager/util/variable_accelerate_ctrl.cc" @@ -670,6 +687,7 @@ set(INFER_SRC_LIST "graph/build/memory/hybrid_mem_assigner.cc" "graph/build/memory/max_block_mem_assigner.cc" "graph/build/memory/var_mem_assign_util.cc" + "graph/build/memory/buffer_pool_mem_assigner.cc" ) if (NOT ENABLE_D AND NOT ENABLE_ACL AND NOT ENABLE_MS_TESTCASES) @@ -700,6 +718,7 @@ target_compile_definitions(ge_runner PRIVATE DAVINCI_CLOUD google=ascend_private FUNC_VISIBILITY + $<$:ONLY_COMPILE_OPEN_SRC> ) target_compile_options(ge_runner PRIVATE @@ -775,6 +794,7 @@ target_compile_definitions(ge_compiler PRIVATE COMPILE_OMG_PACKAGE google=ascend_private FUNC_VISIBILITY + $<$:ONLY_COMPILE_OPEN_SRC> ) target_compile_options(ge_compiler PRIVATE @@ -937,6 +957,10 @@ add_library(atc_stub_ge_compiler SHARED add_dependencies(atc_stub_ge_compiler ge_stub) +target_compile_options(atc_stub_ge_compiler PRIVATE + -fno-common +) + target_link_libraries(atc_stub_ge_compiler PRIVATE $ ) @@ -973,6 +997,10 @@ add_library(fwk_stub_ge_runner SHARED add_dependencies(fwk_stub_ge_runner ge_stub) +target_compile_options(fwk_stub_ge_runner PRIVATE + -fno-common +) + target_link_libraries(fwk_stub_ge_runner PRIVATE $ ) diff --git a/ge/analyzer/analyzer.cc b/ge/analyzer/analyzer.cc index 1f733f28..528a0265 100755 --- a/ge/analyzer/analyzer.cc +++ b/ge/analyzer/analyzer.cc @@ -155,12 +155,12 @@ std::shared_ptr Analyzer::GetJsonObject(uint64_t session_id, uint64_t std::lock_guard lg(mutex_); auto iter = 
graph_infos_.find(session_id); if (iter == graph_infos_.end()) { - GELOGE(PARAM_INVALID, "[Check][Session_id]session_id:%lu does not exist! graph_id:%lu.", session_id, graph_id); + GELOGE(PARAM_INVALID, "[Check][SessionId]session_id:%lu does not exist! graph_id:%lu", session_id, graph_id); return nullptr; } else { auto iter1 = (iter->second).find(graph_id); if (iter1 == (iter->second).end()) { - GELOGE(PARAM_INVALID, "[Check][Graph_id]graph_id:%lu does not exist! session_id:%lu.", graph_id, session_id); + GELOGE(PARAM_INVALID, "[Check][GraphId]graph_id:%lu does not exist! session_id:%lu.", graph_id, session_id); return nullptr; } GELOGI("GetJsonObject Success!session_id:%lu graph_id:%lu", session_id, graph_id); @@ -200,7 +200,7 @@ ge::Status Analyzer::CreateAnalyzerFile() { } ge::Status Analyzer::SaveAnalyzerDataToFile(uint64_t session_id, uint64_t graph_id) { - GELOGD("start to save analyze file."); + GELOGD("start to save analyze file"); auto graph_info = GetJsonObject(session_id, graph_id); GE_CHECK_NOTNULL(graph_info); @@ -221,7 +221,10 @@ ge::Status Analyzer::SaveAnalyzerDataToFile(uint64_t session_id, uint64_t graph_ try { json_file_ << jsn.dump(kJsonDumpLevel) << std::endl; } catch (nlohmann::detail::type_error &e) { - GELOGE(FAILED, "[Json.dump][GraphInfo]json.dump to analyze file [%s] failed because [%s], session_id:%lu, graph_id:%lu", json_file_name_.c_str(), e.what(), session_id, graph_id); + GELOGE(FAILED, + "[Json.dump][GraphInfo]json.dump to analyze file [%s] failed because [%s]," + "session_id:%lu, graph_id:%lu", + json_file_name_.c_str(), e.what(), session_id, graph_id); ret_failed = true; } json_file_.close(); @@ -229,7 +232,7 @@ ge::Status Analyzer::SaveAnalyzerDataToFile(uint64_t session_id, uint64_t graph_ } ge::Status Analyzer::DoAnalyze(DataInfo &data_info) { - GELOGD("start to do analyzer process!"); + GELOGD("start to do analyzer process"); auto pnode = data_info.node_ptr; GE_CHECK_NOTNULL(pnode); @@ -241,7 +244,9 @@ ge::Status 
Analyzer::DoAnalyze(DataInfo &data_info) { GE_CHECK_NOTNULL(graph_info); auto status = SaveOpInfo(desc, data_info, graph_info); if (status != SUCCESS) { - GELOGE(status, "[Check][SaveOpInfo]save op info: desc_name [%s] desc_type [%s] failed!", desc->GetName().c_str(), desc->GetType().c_str()); + GELOGE(status, + "[Check][SaveOpInfo]save op info: desc_name [%s] desc_type [%s] failed!", + desc->GetName().c_str(), desc->GetType().c_str()); return FAILED; } // create json file diff --git a/ge/client/ge_api.cc b/ge/client/ge_api.cc index f0cf9e03..0c63c6e3 100644 --- a/ge/client/ge_api.cc +++ b/ge/client/ge_api.cc @@ -69,7 +69,11 @@ Status CheckOptionsValid(const std::map &options) { auto job_id_iter = options.find(OPTION_EXEC_JOB_ID); if (job_id_iter != options.end()) { if (job_id_iter->second.length() > kMaxStrLen) { - GELOGE(PARAM_INVALID, "CheckOptionsValid job_id failed, string len > %d", kMaxStrLen); + GELOGE(PARAM_INVALID,"[Check][JobId]Failed," + "the job_id [%s] string length: %zu > max string length: %d", + job_id_iter->second.c_str(), job_id_iter->second.length(), kMaxStrLen); + REPORT_INPUT_ERROR("E10051", std::vector({"id","length"}), + std::vector({job_id_iter->second, std::to_string(kMaxStrLen)})); return FAILED; } } @@ -84,7 +88,8 @@ Status GEInitializeImpl(const std::map &options) { std::string path_base = ge::GELib::GetPath(); auto ret = ErrorManager::GetInstance().Init(path_base); if (ret != SUCCESS) { - GELOGE(GE_CLI_INIT_FAILED, "ErrorManager init fail"); + GELOGE(GE_CLI_INIT_FAILED, + "[Init][PathBase]Init failed when pass param path_base:%s", path_base.c_str()); return ret; } @@ -104,7 +109,9 @@ Status GEInitializeImpl(const std::map &options) { bool is_proto_init = manager->Initialize(option_tmp); GE_TIMESTAMP_END(GEInitialize, "GEInitialize::ManagerInitialize"); if (!is_proto_init) { - GELOGE(GE_CLI_INIT_FAILED, "geInitialize failed, ops proto path is invalid."); + GELOGE(GE_CLI_INIT_FAILED, + "[Init][OpsProtoPath]Loading OpsProto lib plugin 
failed, OpsProtoPath:%s invalid.", + opsproto_path.c_str()); return FAILED; } @@ -127,7 +134,7 @@ Status GEInitializeImpl(const std::map &options) { ret = ge::GELib::Initialize(options); GE_TIMESTAMP_END(GELibInitialize, "GEInitialize::GELibInitialize"); if (ret != SUCCESS) { - GELOGE(GE_CLI_INIT_FAILED, "geInitialize failed, error code = %u", ret); + GELOGE(GE_CLI_INIT_FAILED, "[Init][GELib]Failed, error code = %u", ret); return FAILED; } @@ -155,7 +162,9 @@ Status GEInitialize(const std::map &options) { std::map str_options; for (auto &option : options) { if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) { - GELOGE(FAILED, "GEInitialize options is nullptr."); + GELOGE(FAILED, "[Check][Param]Options invalid, first or second option is nullptr."); + REPORT_INNER_ERROR("E19999", "Check parameter's options invalid," + "the first or second option is nullptr."); return FAILED; } std::string key = option.first.GetString(); @@ -171,17 +180,17 @@ Status GEInitialize(const std::map &options) { // GE finalize, releasing all resources Status GEFinalize() { - ErrorManager::GetInstance().SetStage(ErrorMessage::kFinalize, ErrorMessage::kFinalize); - GELOGT(TRACE_INIT, "GEFinalize start"); - - ErrorManager::GetInstance().GenWorkStreamIdDefault(); + std::lock_guard lock(g_ge_release_mutex); // check init status if (!g_ge_initialized) { - GELOGW("GEFinalize is called before GEInitialize"); + GELOGW("[FINAL][FINAL]GEFinalize is called before GEInitialize"); return SUCCESS; } - std::lock_guard lock(g_ge_release_mutex); + ErrorManager::GetInstance().SetStage(ErrorMessage::kFinalize, ErrorMessage::kFinalize); + ErrorManager::GetInstance().GenWorkStreamIdDefault(); + GELOGT(TRACE_INIT, "GEFinalize start"); + // call Finalize Status ret = SUCCESS; Status middle_ret; @@ -237,13 +246,17 @@ Session::Session(const std::map &options) { // check init status sessionId_ = 0; if (!g_ge_initialized) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized."); + 
GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Construct][Session]Failed because lack GEInitialize call before."); + REPORT_INNER_ERROR("E19999", + "Creating session failed because lack GEInitialize call before."); return; } // call Initialize std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Session Constructor failed"); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Construct][Session]Failed, GELib instance is nullptr or it is not InitFlag"); return; } @@ -256,7 +269,7 @@ Session::Session(const std::map &options) { if (ret == SUCCESS) { sessionId_ = session_id; } else { - GELOGE(ret, "Session constructor failed, session Id not initialized"); + GELOGE(ret, "[Construct][Session]Failed, error code:%u.", ret); return; } GELOGT(TRACE_STOP, "Session Constructor finished"); @@ -270,13 +283,17 @@ Session::Session(const std::map &options) { // check init status sessionId_ = 0; if (!g_ge_initialized) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized."); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Construct][Session]Failed because lack GEInitialize call before."); + REPORT_INNER_ERROR("E19999", + "Creating session failed because lack GEInitialize call before."); return; } // call Initialize std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Session Constructor failed"); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Construct][Session]Failed, the GELib instance is nullptr or is not InitFlag"); return; } @@ -284,7 +301,9 @@ Session::Session(const std::map &options) { std::map str_options; for (auto &option : options) { if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) { - GELOGE(FAILED, "Session options is nullptr."); + GELOGE(FAILED, "[Construct][Session]Failed, the first or second option is nullptr."); + REPORT_INNER_ERROR("E19999", "Creating session's 
options invalid," + "the first or second option is nullptr."); return; } std::string key = option.first.GetString(); @@ -299,7 +318,7 @@ Session::Session(const std::map &options) { if (ret == SUCCESS) { sessionId_ = session_id; } else { - GELOGE(ret, "Session constructor failed, session Id not initialized"); + GELOGE(ret, "[Construct][Session]Failed, error code:%u.", ret); return; } GELOGT(TRACE_STOP, "Session Constructor finished"); @@ -331,17 +350,18 @@ Session::~Session() { ret = instance_ptr->SessionManagerObj().DestroySession(session_id); } catch (google::protobuf::FatalException &e) { - GELOGE(GE_CLI_SESS_DESTROY_FAILED, "SessionDestructor throws FatalException"); + GELOGE(GE_CLI_SESS_DESTROY_FAILED, "[Destruct][Session]Failed because get fatalException."); } // check return status, return, update session id if success if (ret != SUCCESS) { - GELOGE(ret, "Session Destructor failed"); + GELOGE(ret, "[Destruct][Session]Failed, error code:%u.", ret); } GELOGT(TRACE_STOP, "Session Destructor finished"); } +// Add Graph Status Session::AddGraph(uint32_t graph_id, const Graph &graph) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); std::map options; @@ -349,25 +369,32 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph) { return AddGraph(graph_id, graph, options); } +// Add Graph Status Session::AddGraph(uint32_t graph_id, const Graph &graph, const std::map &options) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); GELOGT(TRACE_INIT, "Start to add graph in Session. 
graph_id: %u, session_id: %lu.", graph_id, sessionId_); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session."); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Add][Graph]Failed because GELib instance is nullptr or it is not InitFlag."); + REPORT_INNER_ERROR("E19999", + "AddGraph Failed, GELib instance is nullptr or it is not InitFlag."); return FAILED; } GELOGD("Adding graph to session"); Status ret = instance_ptr->SessionManagerObj().AddGraph(sessionId_, graph_id, graph, options); if (ret != SUCCESS) { - GELOGE(ret, "AddGraph failed in Session."); + GELOGE(ret, + "[Add][Graph]Failed, error code:%u, session_id:%lu, graph_id:%u.", + ret, sessionId_, graph_id); return FAILED; } GELOGD("AddGraph finished in Session."); return ret; } +//Add Graph Status Session::AddGraph(uint32_t graph_id, const Graph &graph, const std::map &options) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); @@ -375,14 +402,19 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph, ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session."); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Add][Graph]Failed, the GELib instance is nullptr or is not InitFlag."); + REPORT_INNER_ERROR("E19999", + "AddGraph Failed, GELib instance is nullptr or it is not InitFlag."); return FAILED; } GELOGD("Adding graph to session"); std::map str_options; for (auto &option : options) { if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) { - GELOGE(FAILED, "AddGraph options is nullptr."); + GELOGE(FAILED, "[Add][Graph]Failed, the first or second 
option is nullptr."); + REPORT_INNER_ERROR("E19999", + "Add Graph Failed, the first or second option is nullptr."); return FAILED; } std::string key = option.first.GetString(); @@ -391,7 +423,9 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph, } Status ret = instance_ptr->SessionManagerObj().AddGraph(sessionId_, graph_id, graph, str_options); if (ret != SUCCESS) { - GELOGE(ret, "AddGraph failed in Session."); + GELOGE(ret, + "[Add][Graph]Failed, error code:%u, session_id:%lu, graph_id:%u.", + ret, sessionId_, graph_id); return FAILED; } GELOGD("AddGraph finished in Session."); @@ -405,6 +439,7 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph) { return AddGraphWithCopy(graph_id, graph, options); } +// Add Graph With Copy Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph, const std::map &options) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); @@ -412,7 +447,10 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph, ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "AddGraph failed in Session."); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Add][Graph]Failed, the GELib instance is nullptr or is not InitFlag."); + REPORT_INNER_ERROR("E19999", + "AddGraph Failed, GELib instance is nullptr or is not InitFlag."); return FAILED; } std::map str_options; @@ -422,13 +460,16 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph, GELOGD("Adding graph to session"); Status ret = instance_ptr->SessionManagerObj().AddGraphWithCopy(sessionId_, graph_id, graph, str_options); if (ret != SUCCESS) { - GELOGE(ret, "AddGraph failed in Session."); + GELOGE(ret, + "[Add][Graph]Failed, error code:%u, session_id:%lu, graph_id:%u.", + ret, sessionId_, graph_id); return 
FAILED; } GELOGD("AddGraph finished in Session."); return ret; } +// Remove Graph Status Session::RemoveGraph(uint32_t graph_id) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); GELOGT(TRACE_INIT, "Session RemoveGraph start"); @@ -437,7 +478,10 @@ Status Session::RemoveGraph(uint32_t graph_id) { // call RemoveGraph std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (!instance_ptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Session RemoveGraph failed"); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Remove][Graph]Failed, GELib instance is nullptr or is not InitFlag "); + REPORT_INNER_ERROR("E19999", + "RemoveGraph Failed, GELib instance is nullptr or is not InitFlag."); return FAILED; } @@ -445,13 +489,16 @@ Status Session::RemoveGraph(uint32_t graph_id) { Status ret = instance_ptr->SessionManagerObj().RemoveGraph(sessionId_, graph_id); // check return status, return if (ret != SUCCESS) { - GELOGE(ret, "session RemoveGraph failed"); + GELOGE(ret, + "[Remove][Graph]Failed, error code:%u, session_id:%lu, graph_id:%u.", + ret, sessionId_, graph_id); return FAILED; } GELOGT(TRACE_STOP, "Session RemoveGraph finished"); return ret; } +// Print Output Result void PrintOutputResult(std::vector &outputs) { if (outputs.empty() || outputs[0].GetData() == nullptr) { GELOGW("outputs is empty or data is nullptr."); @@ -499,6 +546,7 @@ void PrintOutputResult(std::vector &outputs) { } } +// Run Graph Status Session::RunGraph(uint32_t graph_id, const std::vector &inputs, std::vector &outputs) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); GELOGT(TRACE_INIT, "Session RunGraph start"); @@ -508,14 +556,19 @@ Status Session::RunGraph(uint32_t graph_id, const std::vector &inputs, s // call RunGraph std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Session RunGraph failed"); 
+ GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Run][Graph]Failed, GELib instance is nullptr or is not InitFlag."); + REPORT_INNER_ERROR("E19999", + "RunGraph Failed, GELib instance is nullptr or is not InitFlag."); return FAILED; } GELOGT(TRACE_RUNNING, "Running Graph"); Status ret = instance_ptr->SessionManagerObj().RunGraph(sessionId_, graph_id, graph_inputs, outputs); // check return status if (ret != SUCCESS) { - GELOGE(ret, "Session RunGraph failed"); + GELOGE(ret, + "[Run][Graph]Failed, error code:%u, session_id:%lu, graph_id:%u.", + ret, sessionId_, graph_id); return FAILED; } @@ -529,6 +582,7 @@ Status Session::RunGraph(uint32_t graph_id, const std::vector &inputs, s return ret; } +// Register Call Back Status Session::RegisterCallBackFunc(const std::string &key, const pCallBackFunc &callback) { ErrorManager::GetInstance().GenWorkStreamIdDefault(); return ge::GELib::GetInstance()->SessionManagerObj().RegisterCallBackFunc(sessionId_, key, callback); @@ -543,30 +597,40 @@ Status Session::RegisterCallBackFunc(const char *key, const session::pCallBackFu return ge::GELib::GetInstance()->SessionManagerObj().RegisterCallBackFunc(sessionId_, str_key, callback); } +// Build Graph Status Session::BuildGraph(uint32_t graph_id, const std::vector &inputs) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed"); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Build][Graph]Failed, the GELib instance is nullptr or is not InitFlag."); + REPORT_INNER_ERROR("E19999", + "Build graph failed, the GELib instance is nullptr or is not InitFlag."); return FAILED; } GELOGT(TRACE_RUNNING, "Building Graph"); Status ret = instance_ptr->SessionManagerObj().BuildGraph(sessionId_, graph_id, inputs); if (ret != 
SUCCESS) { - GELOGE(ret, "Session BuildGraph failed"); + GELOGE(ret, + "[Build][Graph]Failed, error code:%u, session_id:%lu, graph_id:%u.", + ret, sessionId_, graph_id); return FAILED; } return SUCCESS; } +// Run Graph Asynchronously Status Session::RunGraphAsync(uint32_t graph_id, const std::vector &inputs, RunAsyncCallback callback) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed"); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Run][Graph]RunGraphAsyncFailed, the GELib instance is nullptr or is not InitFlag."); + REPORT_INNER_ERROR("E19999", + "RunGraphAsync Failed, the GELib instance is nullptr or is not InitFlag."); return FAILED; } GELOGT(TRACE_RUNNING, "Run Graph Asynchronously"); @@ -575,49 +639,59 @@ Status Session::RunGraphAsync(uint32_t graph_id, const std::vectorSessionManagerObj().RunGraphAsync(sessionId_, graph_id, inputs, callback); if (ret != SUCCESS) { - GELOGE(ret, "SessionManager RunGraphAsync failed"); + GELOGE(ret, "[Run][Graph]RunGraphAsync Failed, error code:%u, session_id:%lu, graph_id:%u.", + ret, sessionId_, graph_id); return FAILED; } return SUCCESS; } +// Get Variables Status Session::GetVariables(const std::vector &var_names, std::vector &var_values) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); ErrorManager::GetInstance().GenWorkStreamIdDefault(); auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed"); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Get][Variables]Failed, the GELib instance is nullptr or is not InitFlag."); + REPORT_INNER_ERROR("E19999", + "GetVariables failed, 
the GELib instance is nullptr or is not InitFlag."); return FAILED; } GELOGT(TRACE_RUNNING, "Get Variables"); Status ret = ge::GELib::GetInstance()->SessionManagerObj().GetVariables(sessionId_, var_names, var_values); if (ret != SUCCESS) { - GELOGE(ret, "SessionManager RunGraphAsync failed"); + GELOGE(ret, "[Get][Variables]Failed, error code:%u, session_id:%lu.", ret, sessionId_); return FAILED; } return SUCCESS; } +// Get Variables Status Session::GetVariables(const std::vector &var_names, std::vector &var_values) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); ErrorManager::GetInstance().GenWorkStreamIdDefault(); auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "SessionConstructor failed"); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, + "[Get][Variables]Failed, the GELib instance is nullptr or is not InitFlag."); + REPORT_INNER_ERROR("E19999", + "GetVariables failed, the GELib instance is nullptr or is not InitFlag."); return FAILED; } GELOGT(TRACE_RUNNING, "Get Variables"); std::vector str_var_names; for (auto &var_name : var_names) { if (var_name.GetString() == nullptr) { - GELOGE(FAILED, "GetVariables name is nullptr."); + GELOGE(FAILED, "[Get][Variable]Failed, variables' names are nullptr."); + REPORT_INNER_ERROR("E19999", "GetVariables failed, variables' names are nullptr."); return FAILED; } str_var_names.emplace_back(var_name.GetString()); } Status ret = ge::GELib::GetInstance()->SessionManagerObj().GetVariables(sessionId_, str_var_names, var_values); if (ret != SUCCESS) { - GELOGE(ret, "SessionManager RunGraphAsync failed"); + GELOGE(ret, "[Get][Variables]Failed, error code:%u, session_id:%lu.", ret, sessionId_); return FAILED; } return SUCCESS; diff --git a/ge/client/proto/insert_op.proto b/ge/client/proto/insert_op.proto index bf918b20..7d708865 100644 --- a/ge/client/proto/insert_op.proto +++ 
b/ge/client/proto/insert_op.proto @@ -88,6 +88,7 @@ message AippOpParams { int32 right_padding_size = 69; int32 top_padding_size = 70; int32 bottom_padding_size = 71; + float padding_value = 72; int32 mean_chn_0 = 10; int32 mean_chn_1 = 11; diff --git a/ge/common/CMakeLists.txt b/ge/common/CMakeLists.txt index a6f8e57c..75cb8ad1 100755 --- a/ge/common/CMakeLists.txt +++ b/ge/common/CMakeLists.txt @@ -16,6 +16,7 @@ set(PROTO_LIST ) protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) +protobuf_generate(ge_static PROTO_STATIC_SRCS PROTO_STATIC_HDRS ${PROTO_LIST}) set(SRC_LIST "context/ctx.cc" @@ -127,7 +128,7 @@ target_link_libraries(ge_common PRIVATE ) ############ libge_common.a ############ -add_library(ge_common_static STATIC ${SRC_LIST} ${PROTO_HDRS}) +add_library(ge_common_static STATIC ${SRC_LIST} ${PROTO_STATIC_HDRS}) target_compile_definitions(ge_common_static PRIVATE PROTOBUF_INLINE_NOT_IN_HEADERS=0 HOST_VISIBILITY @@ -158,7 +159,7 @@ target_include_directories(ge_common_static PRIVATE ${METADEF_DIR}/inc/external/graph ${METADEF_DIR}/inc/graph ${CMAKE_BINARY_DIR} - ${CMAKE_BINARY_DIR}/proto/ge + ${CMAKE_BINARY_DIR}/proto/ge_static #### yellow zone #### ${GE_DEPEND_DIR}/inc ${GE_DEPEND_DIR}/inc/cce diff --git a/ge/common/auth/file_saver.cc b/ge/common/auth/file_saver.cc index 12999e54..3c3b6197 100755 --- a/ge/common/auth/file_saver.cc +++ b/ge/common/auth/file_saver.cc @@ -33,7 +33,8 @@ const int kFileOpSuccess = 0; namespace ge { Status FileSaver::OpenFile(int32_t &fd, const std::string &file_path) { if (CheckPath(file_path) != SUCCESS) { - GELOGE(FAILED, "Check output file failed."); + GELOGE(FAILED, "[Check][FilePath]Check output file failed, file_path:%s.", file_path.c_str()); + REPORT_CALL_ERROR("E19999", "Check output file failed, file_path:%s.", file_path.c_str()); return FAILED; } @@ -45,7 +46,8 @@ Status FileSaver::OpenFile(int32_t &fd, const std::string &file_path) { fd = mmOpen2(real_path, M_RDWR | M_CREAT | O_TRUNC, mode); if (fd == 
EN_INVALID_PARAM || fd == EN_ERROR) { // -1: Failed to open file; - 2: Illegal parameter - GELOGE(FAILED, "Open file failed. mmpa_errno = %d, %s", fd, strerror(errno)); + GELOGE(FAILED, "[Open][File]Failed. mmpa_errno = %d, %s", fd, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Open file failed, mmpa_errno = %d, error:%s.", fd, strerror(errno)); return FAILED; } return SUCCESS; @@ -62,7 +64,9 @@ Status FileSaver::WriteData(const void *data, uint32_t size, int32_t fd) { while (size > size_1g) { write_count = mmWrite(fd, reinterpret_cast(seek), size_1g); if (write_count == EN_INVALID_PARAM || write_count == EN_ERROR) { - GELOGE(FAILED, "Write data failed. mmpa_errorno = %ld, %s", write_count, strerror(errno)); + GELOGE(FAILED, "[Write][Data]Failed, mmpa_errorno = %ld, error:%s", write_count, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Write data failed, mmpa_errorno = %ld, error:%s.", + write_count, strerror(errno)); return FAILED; } size -= size_1g; @@ -75,7 +79,9 @@ Status FileSaver::WriteData(const void *data, uint32_t size, int32_t fd) { // -1: Failed to write to file; - 2: Illegal parameter if (write_count == EN_INVALID_PARAM || write_count == EN_ERROR) { - GELOGE(FAILED, "Write data failed. mmpa_errorno = %ld, %s", write_count, strerror(errno)); + GELOGE(FAILED, "[Write][Data]Failed. 
mmpa_errorno = %ld, error:%s", write_count, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Write data failed, mmpa_errorno = %ld, error:%s.", + write_count, strerror(errno)); return FAILED; } @@ -85,7 +91,8 @@ Status FileSaver::WriteData(const void *data, uint32_t size, int32_t fd) { Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header, const void *data, int len) { if (data == nullptr || len <= 0) { - GELOGE(FAILED, "Model_data is null or the length[%d] less than 1.", len); + GELOGE(FAILED, "[Check][Param]Failed, model_data is null or the length[%d] is less than 1.", len); + REPORT_INNER_ERROR("E19999", "Save file failed, model_data is null or the length:%d is less than 1.", len); return FAILED; } @@ -104,7 +111,8 @@ Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFi } while (0); // Close file if (mmClose(fd) != 0) { // mmClose 0: success - GELOGE(FAILED, "Close file failed."); + GELOGE(FAILED, "[Close][File]Failed, error_code:%u errmsg:%s", ret, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Close file failed, error_code:%u errmsg:%s", ret, strerror(errno)); ret = FAILED; } return ret; @@ -140,60 +148,95 @@ Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFi } } while (0); // Close file - GE_CHK_BOOL_RET_STATUS(mmClose(fd) == EN_OK, FAILED, "Close file failed."); + if (mmClose(fd) != EN_OK) { + GELOGE(FAILED, "[Close][File]Failed, error_code:%u errmsg:%s", ret, strerror(errno)); + REPORT_CALL_ERROR("E19999", "Close file failed, error_code:%u errmsg:%s", ret, strerror(errno)); + ret = FAILED; + } return ret; } Status FileSaver::SaveToBuffWithFileHeader(const ModelFileHeader &file_header, ModelPartitionTable &model_partition_table, - const std::vector &partitionDatas, + const std::vector &partition_datas, + ge::ModelBufferData &model) { + const vector model_partition_tables = { &model_partition_table }; + const std::vector> all_partition_datas = { 
partition_datas }; + return SaveToBuffWithFileHeader(file_header, model_partition_tables, all_partition_datas, model); +} + +Status FileSaver::SaveToBuffWithFileHeader(const ModelFileHeader &file_header, + const vector &model_partition_tables, + const std::vector> &all_partition_datas, ge::ModelBufferData &model) { - GE_CHK_BOOL_RET_STATUS( - !partitionDatas.empty() && model_partition_table.num != 0 && model_partition_table.num == partitionDatas.size(), - FAILED, "Invalid param:partition data size is (%u), model_partition_table.num is (%zu).", - model_partition_table.num, partitionDatas.size()); - uint32_t model_header_size = sizeof(ModelFileHeader); - uint32_t table_size = static_cast(SIZE_OF_MODEL_PARTITION_TABLE(model_partition_table)); - uint32_t total_size = model_header_size + table_size; - - for (const auto &partitionData : partitionDatas) { - auto ret = ge::CheckUint32AddOverflow(total_size, partitionData.size); - GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, FAILED, "add uint32 overflow!"); - total_size = total_size + partitionData.size; + GE_CHK_BOOL_RET_STATUS(model_partition_tables.size() == all_partition_datas.size(), PARAM_INVALID, + "Model table size %zu does not match partition size %zu.", + model_partition_tables.size(), all_partition_datas.size()); + for (size_t index = 0; index < model_partition_tables.size(); ++index) { + auto &cur_partiton_data = all_partition_datas[index]; + auto &cur_model_partition_table = *model_partition_tables[index]; + GE_CHK_BOOL_RET_STATUS(!cur_partiton_data.empty() && cur_model_partition_table.num != 0 + && cur_model_partition_table.num == cur_partiton_data.size(), FAILED, + "Invalid param: partition data size is (%zu), model_partition_table.num is (%u).", + cur_partiton_data.size(), cur_model_partition_table.num); } + + uint64_t model_header_size = sizeof(ModelFileHeader); + uint64_t total_size = model_header_size; + for (size_t index = 0; index < model_partition_tables.size(); ++index) { + auto &cur_model_partition_table = 
*model_partition_tables[index]; + total_size += static_cast(SIZE_OF_MODEL_PARTITION_TABLE(cur_model_partition_table)); + auto &cur_partition_data = all_partition_datas[index]; + for (const auto &partition_data : cur_partition_data) { + auto ret = ge::CheckUint64AddOverflow(total_size, partition_data.size); + GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, FAILED, "Add uint64 overflow!"); + total_size += partition_data.size; + } + } + // save to buff auto buff = reinterpret_cast(malloc(total_size)); - GE_CHK_BOOL_RET_STATUS(buff != nullptr, FAILED, "malloc failed!"); - GE_PRINT_DYNAMIC_MEMORY(malloc, "file buffer.", total_size) + GE_CHK_BOOL_RET_STATUS(buff != nullptr, FAILED, "Malloc failed!"); + GE_PRINT_DYNAMIC_MEMORY(malloc, "File buffer.", total_size) model.data.reset(buff, [](uint8_t *buff) { GELOGD("Free online model memory."); free(buff); buff = nullptr; }); model.length = total_size; - uint32_t left_space = total_size; - auto ret_mem1 = memcpy_s(buff, left_space, reinterpret_cast(const_cast(&file_header)), - model_header_size); - GE_CHK_BOOL_RET_STATUS(ret_mem1 == 0, FAILED, "memcpy_s failed!"); + uint64_t left_space = total_size; + auto ret_mem = memcpy_s(buff, left_space, reinterpret_cast(const_cast(&file_header)), + model_header_size); + GE_CHK_BOOL_RET_STATUS(ret_mem == EOK, FAILED, "Memcpy_s failed!"); buff += model_header_size; left_space -= model_header_size; - auto ret_mem2 = memcpy_s(buff, left_space, reinterpret_cast(&model_partition_table), table_size); - GE_CHK_BOOL_RET_STATUS(ret_mem2 == 0, FAILED, "memcpy_s failed!"); - buff += table_size; - left_space -= table_size; - for (const auto &partitionData : partitionDatas) { - auto ret_mem3 = memcpy_s(buff, left_space, reinterpret_cast(const_cast(partitionData.data)), - partitionData.size); - GE_CHK_BOOL_RET_STATUS(ret_mem3 == 0, FAILED, "memcpy failed!"); - buff += partitionData.size; - left_space -= partitionData.size; + + for (size_t index = 0; index < model_partition_tables.size(); ++index) { + auto 
&cur_tabel = *model_partition_tables[index]; + uint64_t table_size = static_cast(SIZE_OF_MODEL_PARTITION_TABLE(cur_tabel)); + ret_mem = memcpy_s(buff, left_space, reinterpret_cast(&cur_tabel), table_size); + GE_CHK_BOOL_RET_STATUS(ret_mem == EOK, FAILED, "Memcpy_s failed!"); + buff += table_size; + left_space -= table_size; + auto &cur_partition_data = all_partition_datas[index]; + for (const auto &partition_data : cur_partition_data) { + ret_mem = memcpy_s(buff, left_space, reinterpret_cast(const_cast(partition_data.data)), + partition_data.size); + GE_CHK_BOOL_RET_STATUS(ret_mem == EOK, FAILED, "Memcpy_s failed!"); + buff += partition_data.size; + left_space -= partition_data.size; + } } + return SUCCESS; } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::CheckPath(const std::string &file_path) { // Determine file path length if (file_path.size() >= MMPA_MAX_PATH) { - GELOGE(FAILED, "Path is too long:%zu", file_path.size()); + GELOGE(FAILED, "[Check][FilePath]Failed, file path's length:%zu > mmpa_max_path:%d", + file_path.size(), MMPA_MAX_PATH); + REPORT_INNER_ERROR("E19999", "Check file path failed, file path's length:%zu > mmpa_max_path:%d", + file_path.size(), MMPA_MAX_PATH); return FAILED; } @@ -212,7 +255,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::CheckPath(con // If there is a path before the file name, create the path if (path_split_pos != -1) { if (CreateDirectory(std::string(file_path).substr(0, static_cast(path_split_pos))) != kFileOpSuccess) { - GELOGE(FAILED, "CreateDirectory failed, file path:%s.", file_path.c_str()); + GELOGE(FAILED, "[Create][Directory]Failed, file path:%s.", file_path.c_str()); return FAILED; } } @@ -223,7 +266,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::CheckPath(con FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::SaveToFile(const string &file_path, const ge::ModelData &model, const ModelFileHeader *model_file_header) { if 
(file_path.empty() || model.model_data == nullptr || model.model_len == 0) { - GELOGE(FAILED, "Incorrected input param. file_path.empty() || model.model_data == nullptr || model.model_len == 0"); + GELOGE(FAILED, "[Save][File]Incorrect input param, file_path is empty or model_data is nullptr or model_len is 0"); + REPORT_INNER_ERROR("E19999", "Save file failed, at least one of the input parameters(file_path, model_data, model_len) is incorrect"); return FAILED; } @@ -240,7 +284,8 @@ FileSaver::SaveToFile(const string &file_path, const ge::ModelData &model, const const Status ret = SaveWithFileHeader(file_path, file_header, model.model_data, file_header.length); if (ret != SUCCESS) { - GELOGE(FAILED, "Save file failed, file_path:%s, file header len:%u.", file_path.c_str(), file_header.length); + GELOGE(FAILED, "[Save][File]Failed, file_path:%s, file_header_len:%u, error_code:%u.", + file_path.c_str(), file_header.length, ret); return FAILED; } @@ -305,7 +350,7 @@ Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFi // Write partition data auto &cur_partition_datas = all_partition_datas[index]; for (const auto &partition_data : cur_partition_datas) { - GELOGI("GC:size[%u]", partition_data.size); + GELOGI("part_size[%u]", partition_data.size); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( WriteData(static_cast(partition_data.data), partition_data.size, fd) != SUCCESS, ret = FAILED; break); @@ -313,14 +358,19 @@ Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFi } } while (0); // Close file - GE_CHK_BOOL_RET_STATUS(mmClose(fd) == EN_OK, FAILED, "Close file failed."); + if (mmClose(fd) != 0) { // mmClose 0: success + GELOGE(FAILED, "[Close][File]Failed, error_code:%u errmsg:%s", ret, strerror(errno)); + REPORT_CALL_ERROR("E19999", "Close file failed, error_code:%u errmsg:%s", ret, strerror(errno)); + ret = FAILED; + } return ret; } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::SaveToFile(const string 
&file_path, const void *data, int len) { if (data == nullptr || len <= 0) { - GELOGE(FAILED, "Model_data is null or the length[%d] less than 1.", len); + GELOGE(FAILED, "[Check][Param]Failed, model_data is null or the length[%d] is less than 1.", len); + REPORT_INNER_ERROR("E19999", "Save file failed, the model_data is null or its length:%d is less than 1.", len); return FAILED; } @@ -335,7 +385,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::SaveToFile(co // Close file if (mmClose(fd) != 0) { // mmClose 0: success - GELOGE(FAILED, "Close file failed."); + GELOGE(FAILED, "[Close][File]Failed, error_code:%u errmsg:%s", ret, strerror(errno)); + REPORT_CALL_ERROR("E19999", "Close file failed, error_code:%u errmsg:%s", ret, strerror(errno)); ret = FAILED; } return ret; diff --git a/ge/common/auth/file_saver.h b/ge/common/auth/file_saver.h index 97fbaae5..d98184d6 100644 --- a/ge/common/auth/file_saver.h +++ b/ge/common/auth/file_saver.h @@ -80,9 +80,14 @@ class FileSaver { static Status SaveToBuffWithFileHeader(const ModelFileHeader &file_header, ModelPartitionTable &model_partition_table, - const std::vector &partitionDatas, + const std::vector &partition_datas, ge::ModelBufferData& model); + static Status SaveToBuffWithFileHeader(const ModelFileHeader &file_header, + const std::vector &model_partition_tables, + const std::vector> &all_partition_datas, + ge::ModelBufferData &model); + static Status SaveToFile(const string &file_path, const void *data, int len); protected: @@ -113,8 +118,8 @@ class FileSaver { ModelPartitionTable &model_partition_table, const std::vector &partition_datas); static Status SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header, - vector &model_partition_tables, - const vector> &all_partition_datas); + std::vector &model_partition_tables, + const std::vector> &all_partition_datas); }; } // namespace ge #endif // GE_COMMON_AUTH_FILE_SAVER_H_ diff --git a/ge/common/debug/memory_dumper.cc 
b/ge/common/debug/memory_dumper.cc index 527f0bb2..668cf2ae 100644 --- a/ge/common/debug/memory_dumper.cc +++ b/ge/common/debug/memory_dumper.cc @@ -41,14 +41,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::DumpToFile GE_CHECK_NOTNULL(filename); GE_CHECK_NOTNULL(data); if (len == 0) { - GELOGE(FAILED, "len is 0."); + GELOGE(FAILED, "[Check][Param]Failed, data length is 0."); + REPORT_INNER_ERROR("E19999", "Check param failed, data length is 0."); return PARAM_INVALID; } // Open the file int fd = OpenFile(filename); if (fd == kInvalidFd) { - GELOGE(FAILED, "Open file failed."); + GELOGE(FAILED, "[Open][File]Failed, filename:%s.", filename); + REPORT_INNER_ERROR("E19999", "Open file failed, filename:%s.", filename); return FAILED; } @@ -57,13 +59,15 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::DumpToFile int32_t mmpa_ret = mmWrite(fd, data, len); // mmWrite return -1:Failed to write data to file;return -2:Invalid parameter if (mmpa_ret == EN_ERROR || mmpa_ret == EN_INVALID_PARAM) { - GELOGE(FAILED, "Write to file failed. 
errno = %d, %s", mmpa_ret, strerror(errno)); + GELOGE(FAILED, "[Write][Data]Failed, errno = %d, error:%s", mmpa_ret, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Write data failed, errno = %d, error:%s.", mmpa_ret, strerror(errno)); ret = FAILED; } // Close the file if (mmClose(fd) != EN_OK) { // mmClose return 0: success - GELOGE(FAILED, "Close file failed."); + GELOGE(FAILED, "[Close][File]Failed, error_code:%u, filename:%s.", ret, filename); + REPORT_INNER_ERROR("E19999", "Close file failed, error_code:%u, filename:%s.", ret, filename); ret = FAILED; } @@ -89,7 +93,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::Open(const fd_ = OpenFile(filename); if (fd_ == kInvalidFd) { - GELOGE(FAILED, "Open %s failed.", filename); + GELOGE(FAILED, "[Open][File]Failed, filename:%s.", filename); + REPORT_INNER_ERROR("E19999", "Open file:%s failed.", filename); return FAILED; } @@ -104,7 +109,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::Dump(void int32_t mmpa_ret = mmWrite(fd_, data, len); // mmWrite return -1:failed to write data to file;return -2:invalid parameter if (mmpa_ret == EN_ERROR || mmpa_ret == EN_INVALID_PARAM) { - GELOGE(FAILED, "Write to file failed. errno = %d, %s", mmpa_ret, strerror(errno)); + GELOGE(FAILED, "[Write][Data]Failed, errno = %d, error:%s", mmpa_ret, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Write data to file failed, errno = %d, error:%s.", mmpa_ret, strerror(errno)); return FAILED; } @@ -155,9 +161,10 @@ int MemoryDumper::OpenFile(const char *filename) { // Using the O_EXCL, if the file already exists,return failed to avoid privilege escalation vulnerability. mmMode_t mode = M_IRUSR | M_IWUSR; - int32_t fd = mmOpen2(real_path.c_str(), M_RDWR | M_CREAT | O_TRUNC, mode); + int32_t fd = mmOpen2(real_path.c_str(), M_RDWR | M_CREAT | M_APPEND, mode); if (fd == EN_ERROR || fd == EN_INVALID_PARAM) { - GELOGE(kInvalidFd, "open file failed. 
errno = %d, %s", fd, strerror(errno)); + GELOGE(kInvalidFd, "[Open][File]Failed. errno = %d, error:%s, filename:%s.", + fd, strerror(errno), filename); return kInvalidFd; } return fd; diff --git a/ge/common/dump/dump_manager.cc b/ge/common/dump/dump_manager.cc index a659d9c6..61a60afd 100644 --- a/ge/common/dump/dump_manager.cc +++ b/ge/common/dump/dump_manager.cc @@ -96,7 +96,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf dump_mode = dump_config.dump_mode; GELOGI("Dump mode is %s", dump_mode.c_str()); dump_properties.SetDumpMode(dump_mode); - dump_properties_map_.emplace(kInferSessionId, dump_properties); + dump_properties_map_[kInferSessionId] = dump_properties; return SUCCESS; } diff --git a/ge/common/dump/dump_op.cc b/ge/common/dump/dump_op.cc index 0becbdc8..1ce37b02 100755 --- a/ge/common/dump/dump_op.cc +++ b/ge/common/dump/dump_op.cc @@ -20,6 +20,7 @@ #include "common/ge/datatype_util.h" #include "framework/common/debug/ge_log.h" #include "framework/common/util.h" +#include "framework/common/types.h" #include "graph/anchor.h" #include "graph/ge_tensor.h" #include "graph/op_desc.h" @@ -55,8 +56,10 @@ void DumpOp::SetLoopAddr(void *global_step, void *loop_per_iter, void *loop_cond loop_cond_ = reinterpret_cast(loop_cond); } -void DumpOp::SetDynamicModelInfo(const string &dynamic_model_name, uint32_t dynamic_model_id) { +void DumpOp::SetDynamicModelInfo(const string &dynamic_model_name, const string &dynamic_om_name, + uint32_t dynamic_model_id) { dynamic_model_name_ = dynamic_model_name; + dynamic_om_name_ = dynamic_om_name; dynamic_model_id_ = dynamic_model_id; } @@ -200,6 +203,32 @@ Status DumpOp::ExecutorDumpOp(aicpu::dump::OpMappingInfo &op_mapping_info) { return SUCCESS; } +Status DumpOp::SetDumpModelName(aicpu::dump::OpMappingInfo &op_mapping_info) { + if (dynamic_model_name_.empty() && dynamic_om_name_.empty()) { + GELOGI("Single op dump, no need set model name"); + return SUCCESS; + } + std::set model_list = 
dump_properties_.GetAllDumpModel(); + bool not_find_by_omname = model_list.find(dynamic_om_name_) == model_list.end(); + bool not_find_by_modelname = model_list.find(dynamic_model_name_) == model_list.end(); + std::string dump_model_name = not_find_by_omname ? dynamic_model_name_ : dynamic_om_name_; + if (model_list.find(DUMP_ALL_MODEL) == model_list.end()) { + if (not_find_by_omname && not_find_by_modelname) { + std::string model_list_str; + for (auto &model : model_list) { + model_list_str += "[" + model + "]."; + } + GELOGW("Model %s will not be set to dump, dump list: %s", dump_model_name.c_str(), model_list_str.c_str()); + return FAILED; + } + } + if (!dump_model_name.empty() && dump_properties_.IsDumpOpen()) { + GELOGI("Dump model name is %s", dump_model_name.c_str()); + op_mapping_info.set_model_name(dump_model_name); + } + return SUCCESS; +} + Status DumpOp::LaunchDumpOp() { GELOGI("Start to launch dump op %s", op_desc_->GetName().c_str()); int32_t device_id = 0; @@ -209,8 +238,7 @@ Status DumpOp::LaunchDumpOp() { return RT_ERROR_TO_GE_STATUS(rt_ret); } if (device_id < 0) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, - "Check device_id failed, device_id = %d, which should be not less than 0.", + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Check device_id failed, device_id = %d, which should be not less than 0.", device_id); return ACL_ERROR_GE_INTERNAL_ERROR; } @@ -220,11 +248,12 @@ Status DumpOp::LaunchDumpOp() { op_mapping_info.set_flag(kAicpuLoadFlag); op_mapping_info.set_dump_step(dump_properties_.GetDumpStep()); op_mapping_info.set_model_id(dynamic_model_id_); - if (!dynamic_model_name_.empty() && dump_properties_.IsDumpOpen()) { - op_mapping_info.set_model_name(dynamic_model_name_); + + if (SetDumpModelName(op_mapping_info) != SUCCESS) { + return SUCCESS; } SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info); - GELOGI("Dump step is %s ,dump path is %s ,in Launch dump op", dump_properties_.GetDumpStep().c_str(), + GELOGI("Dump step is %s 
,dump path is %s in Launch dump op", dump_properties_.GetDumpStep().c_str(), dump_path.c_str()); uint32_t task_id = 0; uint32_t stream_id = 0; @@ -273,4 +302,4 @@ Status DumpOp::LaunchDumpOp() { } return SUCCESS; } -} // namesapce ge +} // namespace ge diff --git a/ge/common/dump/dump_op.h b/ge/common/dump/dump_op.h index d59962e6..4d322bee 100755 --- a/ge/common/dump/dump_op.h +++ b/ge/common/dump/dump_op.h @@ -34,12 +34,13 @@ class DumpOp { vector output_addrs, rtStream_t stream); Status LaunchDumpOp(); void SetLoopAddr(void *global_step, void *loop_per_iter, void *loop_cond); - void SetDynamicModelInfo(const string &dynamic_model_name, uint32_t dynamic_model_id); + void SetDynamicModelInfo(const string &dynamic_model_name, const string &dynamic_om_name, uint32_t dynamic_model_id); private: Status ExecutorDumpOp(aicpu::dump::OpMappingInfo &op_mapping_info); Status DumpOutput(aicpu::dump::Task &task); Status DumpInput(aicpu::dump::Task &task); + Status SetDumpModelName(aicpu::dump::OpMappingInfo &op_mapping_info); DumpProperties dump_properties_; OpDescPtr op_desc_; @@ -54,6 +55,7 @@ class DumpOp { uintptr_t loop_cond_; std::string dynamic_model_name_; + std::string dynamic_om_name_; std::uint32_t dynamic_model_id_; }; } // namespace ge diff --git a/ge/common/dump/dump_properties.cc b/ge/common/dump/dump_properties.cc index 3fbfd16b..65b1e89a 100644 --- a/ge/common/dump/dump_properties.cc +++ b/ge/common/dump/dump_properties.cc @@ -35,14 +35,14 @@ const std::string kDumpStatusOpen = "on"; const uint32_t kAicoreOverflow = (0x1 << 0); const uint32_t kAtomicOverflow = (0x1 << 1); const uint32_t kAllOverflow = (kAicoreOverflow | kAtomicOverflow); -} +} // namespace namespace ge { FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpProperties::DumpProperties(const DumpProperties &other) { CopyFrom(other); } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY DumpProperties &DumpProperties::operator=( - const DumpProperties &other) { + const DumpProperties &other) { 
CopyFrom(other); return *this; } @@ -97,7 +97,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::InitByOpti // The following is the new dump scenario of the fusion operator FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::AddPropertyValue( - const std::string &model, const std::set &layers) { + const std::string &model, const std::set &layers) { for (const std::string &layer : layers) { GELOGI("This model %s config to dump layer %s", model.c_str(), layer.c_str()); } @@ -138,7 +138,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::set DumpPrope } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::set DumpProperties::GetPropertyValue( - const std::string &model) const { + const std::string &model) const { auto iter = model_dump_properties_map_.find(model); if (iter != model_dump_properties_map_.end()) { return iter->second; @@ -147,8 +147,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::set DumpPrope } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool DumpProperties::IsLayerNeedDump( - const std::string &model, const std::string &om_name, const std::string &op_name) const { + const std::string &model, const std::string &om_name, const std::string &op_name) const { // if dump all + GELOGD("model name is %s om name is %s op is %s in layer need dump", model.c_str(), om_name.c_str(), op_name.c_str()); if (model_dump_properties_map_.find(DUMP_ALL_MODEL) != model_dump_properties_map_.end()) { return true; } @@ -203,7 +204,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string &DumpProperti } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpProperties::SetDumpOpSwitch( - const std::string &dump_op_switch) { + const std::string &dump_op_switch) { dump_op_switch_ = dump_op_switch; } @@ -270,4 +271,4 @@ void DumpProperties::SetDumpDebugOptions() { GELOGI("ge.exec.enableDumpDebug is false or is not set."); } } -} // namespace +} // namespace ge diff --git 
a/ge/common/dump/exception_dumper.cc b/ge/common/dump/exception_dumper.cc new file mode 100644 index 00000000..bed389a7 --- /dev/null +++ b/ge/common/dump/exception_dumper.cc @@ -0,0 +1,241 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common/dump/exception_dumper.h" + +#include "common/ge/datatype_util.h" +#include "common/debug/memory_dumper.h" +#include "framework/common/debug/log.h" +#include "graph/manager/util/debug.h" +#include "graph/utils/tensor_utils.h" +#include "graph/load/model_manager/model_utils.h" +#include "proto/dump_task.pb.h" + +namespace { +static uint64_t GetNowTime() { + uint64_t ret = 0; + mmTimeval tv; + if (mmGetTimeOfDay(&tv, nullptr) == 0) { + ret = tv.tv_sec * 1000000ULL + tv.tv_usec; + } + + return ret; +} + +static void ReplaceStringElem(std::string &str) { + for_each(str.begin(), str.end(), [](char &ch) { + if ((ch == ' ') || (ch == '.') || (ch == '/') || (ch == '\\')) { + ch = '_'; + } + }); +} + +static void SetDumpData(const ge::OpDescInfo &op_desc_info, toolkit::dumpdata::DumpData &dump_data) { + dump_data.set_version("2.0"); + dump_data.set_dump_time(GetNowTime()); + dump_data.set_op_name(op_desc_info.op_name); + for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) { + toolkit::dumpdata::OpInput input; + input.set_data_type(toolkit::dumpdata::OutputDataType( + ge::DataTypeUtil::GetIrDataType(op_desc_info.input_data_type[i]))); + 
input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i])); + for (auto dim : op_desc_info.input_shape[i]) { + input.mutable_shape()->add_dim(dim); + } + input.set_size(op_desc_info.input_size[i]); + GELOGI("[Set][DumpData] The input size int exception is %ld", op_desc_info.input_size[i]); + dump_data.mutable_input()->Add(std::move(input)); + } + + for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) { + toolkit::dumpdata::OpOutput output; + output.set_data_type(toolkit::dumpdata::OutputDataType( + ge::DataTypeUtil::GetIrDataType(op_desc_info.output_data_type[j]))); + output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j])); + for (auto dim : op_desc_info.output_shape[j]) { + output.mutable_shape()->add_dim(dim); + } + output.set_size(op_desc_info.output_size[j]); + GELOGI("[Set][DumpData] The output size int exception is %ld", op_desc_info.output_size[j]); + dump_data.mutable_output()->Add(std::move(output)); + } +} +} // namespace + +namespace ge { +ExceptionDumper::~ExceptionDumper() {} + +void ExceptionDumper::SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + vector &input_addrs, vector &output_addrs) { + OpDescInfo op_desc_info; + SaveOpDescInfo(op, task_id, stream_id, op_desc_info); + op_desc_info.input_addrs = input_addrs; + op_desc_info.output_addrs = output_addrs; + op_desc_info_.emplace_back(std::move(op_desc_info)); +} + +void ExceptionDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, + uint32_t task_id, uint32_t stream_id) { + OpDescInfo op_desc_info; + SaveOpDescInfo(op, task_id, stream_id, op_desc_info); + op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op); + op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op); + op_desc_info_.emplace_back(std::move(op_desc_info)); +} + +void ExceptionDumper::SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + OpDescInfo &op_desc_info) { + 
if (op == nullptr) { + GELOGW("[Save][OpExceptionInfo] op desc ptr is null."); + return; + } + GELOGD("[Save][OpExceptionInfo] Start to save dump op [%s] info of task_id: %u, stream_id: %u", + op->GetName().c_str(), task_id, stream_id); + op_desc_info.op_name = op->GetName(); + op_desc_info.op_type = op->GetType(); + op_desc_info.task_id = task_id; + op_desc_info.stream_id = stream_id; + for (size_t i = 0; i < op->GetAllInputsSize(); ++i) { + GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i); + if (input_tensor_desc == nullptr) { + continue; + } + op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat()); + op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims()); + op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType()); + int64_t input_size = 0; + + if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) { + GELOGW("[Save][OpExceptionInfo] Op [%s] get input size failed.", op->GetName().c_str()); + return; + } + GELOGD("[Save][OpExceptionInfo] Save dump op info, the input size is %ld", input_size); + op_desc_info.input_size.emplace_back(input_size); + } + for (size_t j = 0; j < op->GetOutputsSize(); ++j) { + GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j); + if (output_tensor_desc == nullptr) { + continue; + } + op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat()); + op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims()); + op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType()); + int64_t output_size = 0; + if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) { + GELOGW("[Save][OpExceptionInfo] Op [%s] get output size failed.", op->GetName().c_str()); + return; + } + GELOGD("[Save][OpExceptionInfo] Save dump op info, the output size is %ld.", output_size); + op_desc_info.output_size.emplace_back(output_size); + } +} + +Status 
ExceptionDumper::DumpExceptionInfo(const std::vector &exception_infos) const { + GELOGI("[Dump][Exception] Start to dump exception info"); + for (const rtExceptionInfo &iter : exception_infos) { + OpDescInfo op_desc_info; + if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) { + toolkit::dumpdata::DumpData dump_data; + SetDumpData(op_desc_info, dump_data); + uint64_t now_time = GetNowTime(); + std::string op_name = op_desc_info.op_name; + std::string op_type = op_desc_info.op_type; + ReplaceStringElem(op_name); + ReplaceStringElem(op_type); + string dump_file_path = + "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time); + GELOGI("[Dump][Exception] The exception dump file path is %s", dump_file_path.c_str()); + + uint64_t proto_size = dump_data.ByteSizeLong(); + std::unique_ptr proto_msg(new (std::nothrow) char[proto_size]); + bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); + if (!ret || proto_size == 0) { + REPORT_INNER_ERROR("E19999", "Serialize proto to string fail"); + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump data proto serialize failed"); + return PARAM_INVALID; + } + + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)), + "Failed to dump proto size"); + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size), + "Failed to dump proto msg"); + if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump exception input failed"); + return PARAM_INVALID; + } + + if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump exception output failed"); + return PARAM_INVALID; + } + GELOGI("[Dump][Exception] Dump exception info SUCCESS"); + } else { + GELOGE(PARAM_INVALID, "[Dump][Exception] Get op desc info failed,task id:%u,stream id:%u", + iter.taskid, iter.streamid); + return 
PARAM_INVALID; + } + } + return SUCCESS; +} + +bool ExceptionDumper::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + GELOGI("[Get][OpDescInfo] There are %zu op need to dump.", op_desc_info_.size()); + for (size_t index = 0; index < op_desc_info_.size(); ++index) { + OpDescInfo dump_op_info = op_desc_info_.at(index); + if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) { + GELOGI("[Get][OpDescInfo] Find exception op [%s] of task_id: %u, stream_id: %u.", + dump_op_info.op_name.c_str(), task_id, stream_id); + op_desc_info = dump_op_info; + return true; + } + } + return false; +} + +Status ExceptionDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) const { + GELOGI("[Dump][ExceptionInput] Start to dump exception input"); + for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed", + i, op_desc_info.op_name.c_str()); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +Status ExceptionDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) const { + GELOGI("[Dump][ExceptionOutput] Start to dump exception output"); + for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) != + SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed", + i, op_desc_info.op_name.c_str()); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +OpDescInfo *ExceptionDumper::MutableOpDescInfo(uint32_t task_id, uint32_t stream_id) { + for (OpDescInfo &op_desc_info : op_desc_info_) { + if (op_desc_info.task_id == task_id && op_desc_info.stream_id == stream_id) { + return &op_desc_info; + } + } + 
return nullptr; +} +} // namespace ge \ No newline at end of file diff --git a/ge/common/dump/exception_dumper.h b/ge/common/dump/exception_dumper.h new file mode 100644 index 00000000..38a3f26e --- /dev/null +++ b/ge/common/dump/exception_dumper.h @@ -0,0 +1,48 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ +#define GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ + +#include + +#include "graph/op_desc.h" +#include "framework/common/ge_types.h" +#include "graph/load/model_manager/task_info/task_info.h" + +namespace ge { +class ExceptionDumper { + public: + ExceptionDumper() = default; + ~ExceptionDumper(); + + void SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + std::vector &input_addrs, std::vector &output_addrs); + void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id); + Status DumpExceptionInfo(const std::vector &exception_infos) const; + bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; + OpDescInfo *MutableOpDescInfo(uint32_t task_id, uint32_t stream_id); + + private: + void SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, OpDescInfo &op_desc_info); + Status DumpExceptionInput(const OpDescInfo &op_desc_info, const std::string &dump_file) const; + Status DumpExceptionOutput(const OpDescInfo &op_desc_info, 
const std::string &dump_file) const; + + std::vector op_desc_info_; +}; +} // namespace ge + +#endif // GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ diff --git a/ge/common/dump/opdebug_register.cc b/ge/common/dump/opdebug_register.cc index 340b89e5..aae80cb0 100644 --- a/ge/common/dump/opdebug_register.cc +++ b/ge/common/dump/opdebug_register.cc @@ -80,13 +80,11 @@ Status OpdebugRegister::RegisterDebugForStream(rtStream_t stream, uint32_t op_de uint32_t debug_stream_id = 0; uint32_t debug_task_id = 0; -#ifdef ONLY_COMPILE_OPEN_SRC auto rt_ret = rtDebugRegisterForStream(stream, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "rtDebugRegisterForStream error, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } -#endif GELOGD("debug_task_id:%u, debug_stream_id:%u in stream overflow.", debug_task_id, debug_stream_id); data_dumper.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, true); return SUCCESS; @@ -94,7 +92,6 @@ Status OpdebugRegister::RegisterDebugForStream(rtStream_t stream, uint32_t op_de void OpdebugRegister::UnregisterDebugForStream(rtStream_t stream) { rtError_t rt_ret = RT_ERROR_NONE; -#ifdef ONLY_COMPILE_OPEN_SRC if (stream != nullptr) { GELOGD("start call rtDebugUnRegisterForStream in unknown shape over flow."); rt_ret = rtDebugUnRegisterForStream(stream); @@ -102,7 +99,6 @@ void OpdebugRegister::UnregisterDebugForStream(rtStream_t stream) { GELOGW("rtDebugUnRegisterForStream failed, ret: 0x%X", rt_ret); } } -#endif if (op_debug_addr_ != nullptr) { rt_ret = rtFree(op_debug_addr_); @@ -145,4 +141,4 @@ Status OpdebugRegister::MallocMemForOpdebug() { return SUCCESS; } -} // namespace ge \ No newline at end of file +} // namespace ge diff --git a/ge/common/formats/format_transfers/datatype_transfer.cc b/ge/common/formats/format_transfers/datatype_transfer.cc index 4ef866f5..5aaa8fd5 100644 --- a/ge/common/formats/format_transfers/datatype_transfer.cc +++ 
b/ge/common/formats/format_transfers/datatype_transfer.cc @@ -154,7 +154,11 @@ Status DataTypeTransfer::TransDataType(const CastArgs &args, TransResult &result std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to alloc the memory for dst buf %zu, data size %zu", total_size, args.src_data_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, + "[Allocate][DSTMemory]Failed, memory for dst buf %zu, data size %zu", + total_size, args.src_data_size); + REPORT_CALL_ERROR("E19999", "Failed to allocate memory for dst buf %zu, data size %zu", + total_size, args.src_data_size); return ACL_ERROR_GE_MEMORY_ALLOCATION; } diff --git a/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc b/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc index 706f401e..ce271c6d 100644 --- a/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc +++ b/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc @@ -49,11 +49,15 @@ Status CheckArgsForC1hwncoc0ToHwcn(const TransArgs &args) { return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShapeValid(src_shape, kC1hwncoc0DimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check src shape %s", ShapeToString(src_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][SrcShape]Failed, src shape %s", + ShapeToString(src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to check src shape %s", ShapeToString(src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } if (!CheckShapeValid(dst_shape, kHwcnDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s.", ShapeToString(dst_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][DSTShape]Failed, dst shape %s.", + ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } auto cube_size = 
GetCubeSizeByDataType(args.src_data_type); @@ -73,9 +77,17 @@ Status CheckArgsForC1hwncoc0ToHwcn(const TransArgs &args) { Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, int size, int64_t total_size) { std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld, shape %s", + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, + "[Allocate][DSTMemory]Failed to allcoate memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -115,10 +127,16 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, int size static_cast(size)); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to copy data from C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] offset %ld to " + "[Operate][Memory]Failed to copy data from " + "C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] offset %ld to " "HWCN[%ld, %ld, %ld, %ld] offset %ld, err-code %d", - c1_idx, h_idx, w_idx, n_idx, co_idx, c0_idx, src_offset, h_idx, w_idx, c_idx, n_idx, dst_offset, - ret); + c1_idx, h_idx, w_idx, n_idx, co_idx, c0_idx, src_offset, + h_idx, w_idx, c_idx, n_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to copy data from " + "C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] offset %ld to " 
+ "HWCN[%ld, %ld, %ld, %ld] offset %ld, err-code %d", + c1_idx, h_idx, w_idx, n_idx, co_idx, c0_idx, src_offset, + h_idx, w_idx, c_idx, n_idx, dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -144,8 +162,13 @@ Status FormatTransferC1hwncoc0Hwcn::TransFormat(const TransArgs &args, TransResu result.length = static_cast(total_size); return SUCCESS; } - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Get %ld total size from dst shape %s, src shape %s.", total_size, - ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Get][Shape]Failed, total size %ld from dst shape %s, " + "src shape %s.", + total_size, ShapeToString(args.dst_shape).c_str(), + ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Get shape faield, total size %ld from dst shape %s, src shape %s.", + total_size, ShapeToString(args.dst_shape).c_str(), + ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } GELOGD("Begin to trans format from C1HWNCoC0 to HWCN, src shape %s, data type %s, dst shape %s, memory size %ld.", @@ -153,9 +176,16 @@ Status FormatTransferC1hwncoc0Hwcn::TransFormat(const TransArgs &args, TransResu ShapeToString(args.dst_shape).c_str(), total_size); ret = GetDstDataAfterTrans(args, result, size, total_size); if (ret != SUCCESS) { - GELOGE(ret, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), - ShapeToString(args.dst_shape).c_str(), total_size); + GELOGE(ret, "[Get][Data]Failed when after trans, src shape %s, data type %s, dst shape %s, " + "memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); + REPORT_CALL_ERROR("E19999", "Failed to get data after trans, src shape %s, data type %s, " + 
"dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); return ret; } return SUCCESS; diff --git a/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc b/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc index 57574856..4854fdd2 100644 --- a/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc +++ b/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc @@ -94,9 +94,14 @@ Status TransFormatDhwckToFz3D(const TransArgs &args, TransResult &result) { std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allcoate memory " + "for dst buf %ld when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -122,8 +127,10 @@ Status TransFormatDhwckToFz3D(const TransArgs &args, TransResult &result) { args.data + src_idx * data_size, static_cast(data_size)); } if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d, pad mode %d", - dst_offset, ret, pad_zero); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at " + 
"offset %ld, error-code %d, pad mode %d", dst_offset, ret, pad_zero); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, " + "error-code %d, pad mode %d", dst_offset, ret, pad_zero); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } diff --git a/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc b/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc index 6e1e47ed..04ce299a 100644 --- a/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc +++ b/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc @@ -95,9 +95,14 @@ Status TransFormatDhwncToFz3DTranspose(const TransArgs &args, TransResult &resul std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allcoate memory " + "for dst buf %ld when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -123,8 +128,10 @@ Status TransFormatDhwncToFz3DTranspose(const TransArgs &args, TransResult &resul args.data + src_idx * data_size, static_cast(data_size)); } if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d, pad mode %d", - dst_offset, ret, pad_zero); + 
GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at " + "offset %ld, error-code %d, pad mode %d", dst_offset, ret, pad_zero); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, " + "error-code %d, pad mode %d", dst_offset, ret, pad_zero); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } diff --git a/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc b/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc index bb9b71de..24be6023 100755 --- a/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc +++ b/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc @@ -60,7 +60,7 @@ bool CheckShape(Format format, const ShapeVector &shape) { default: std::string error = "Trans format between " + FmtToStr(TypeUtils::FormatToSerialString(format)) + " and FORMAT_FRACTAL_NZ is not supported."; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error.c_str()); + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_FORMAT_INVALID, error.c_str()); return false; } } @@ -87,7 +87,10 @@ Status TransShapeToFracNz(const ShapeVector &src_shape, DataType data_type, Shap hw_shape.push_back(DIM_DEFAULT_VALUE); hw_shape.push_back(src_shape[kNdDimIndexN]); if (!IsShapeValid(dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][DSTShape]Failed, dst shape %s", + ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to check dst shape %s", + ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -106,7 +109,10 @@ Status TransShapeToFracNz(const ShapeVector &src_shape, DataType data_type, Shap hw_shape.push_back(src_shape[size - kNdDimCountBackwardsWH]); hw_shape.push_back(src_shape[size - kNdDimCountBackwardsW]); if (!IsShapeValid(dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); + 
GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][DSTShape]Failed, dst shape %s", + ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to check dst shape %s", + ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -117,10 +123,12 @@ Status CheckShapeRelation(const TransArgs &args, ShapeVector &hw_shape) { ShapeVector expect_src_shape; auto ret = TransShapeToFracNz(args.dst_shape, args.src_data_type, expect_src_shape, hw_shape); if (ret != SUCCESS) { - GELOGE(ret, "Trans shape from %s to %s, shape %s to %s, data type %s failed", - TypeUtils::FormatToSerialString(args.dst_format).c_str(), - TypeUtils::FormatToSerialString(args.src_format).c_str(), ShapeToString(args.dst_shape).c_str(), - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + GELOGE(ret, "[Transfer][ShapeToFracNz]Failed, shape from %s to %s, shape %s to %s, " + "data type %s, error_code:%u", TypeUtils::FormatToSerialString(args.dst_format).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + ShapeToString(args.dst_shape).c_str(), + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ret); return ret; } if (!IsTransShapeSrcCorrect(args, expect_src_shape)) { @@ -139,9 +147,14 @@ Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, con std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size](), std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allocate memory " + "for dst buf %ld when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + 
TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allocate memory for dst buf %ld " + "trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -175,7 +188,10 @@ Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, con auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size * w0)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d", dst_offset, ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED,"[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d", dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, error-code %d", + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -189,7 +205,10 @@ Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, con auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d", dst_offset, ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED,"[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d", dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, error-code %d", + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -210,9 +229,14 @@ Status TransFormatFromFracNzToNd(const TransArgs &args, TransResult &result, con std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", + 
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to trans format " + "from %s to %s, memory for dst buf %ld", TypeUtils::FormatToSerialString(args.src_format).c_str(), TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + REPORT_CALL_ERROR("E19999", "Failed to trans format from %s to %s and allocate memory " + "for dst buf %ld", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -246,7 +270,11 @@ Status TransFormatFromFracNzToNd(const TransArgs &args, TransResult &result, con ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size * w0)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d", dst_offset, ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d", + dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, error-code %d", + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -260,7 +288,11 @@ Status TransFormatFromFracNzToNd(const TransArgs &args, TransResult &result, con ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d", dst_offset, ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d", + dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, error-code %d", + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -274,17 +306,39 @@ Status TransFormatFromFracNzToNd(const TransArgs &args, TransResult &result, con Status FormatTransferFractalNz::TransFormat(const 
TransArgs &args, TransResult &result) { if (!IsDataTypeSupport(args.src_data_type)) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Trans format from %s to %s, src shape %s, dst shape %s, data type %s is not supported", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, + "[Check][Datatype]Failed, trans format from %s to %s, src shape %s, dst shape %s, " + "data type %s is not supported", TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), - ShapeToString(args.dst_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check datatype failed, trans format from %s to %s, src shape %s, " + "dst shape %s, data type %s is not supported", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShape(args.src_format, args.src_shape) || !IsShapeValid(args.dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Trans format from %s to %s, src shape %s, dst shape %s, data type %s is not supported", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, + "[Check][Shape]Failed, trans format from %s to %s, " + "src shape %s, dst shape %s, data type %s is not supported", TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), - ShapeToString(args.dst_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + 
ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check shape failed, trans format from %s to %s, " + "src shape %s, dst shape %s, data type %s is not supported", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } GELOGD("Begin to trans format from %s to %s, src shape %s, dst shape %s, data type %s", @@ -307,16 +361,34 @@ Status FormatTransferFractalNz::TransShape(Format src_format, const ShapeVector Format dst_format, ShapeVector &dst_shape) { if (!IsDataTypeSupport(data_type)) { GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, - "Trans format from %s to %s, src shape %s, data type %s is not supported", - TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), - ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); + "[Check][Datatype]Failed, trans format from %s to %s, src shape %s, " + "data type %s is not supported", + TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(dst_format).c_str(), + ShapeToString(src_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check datatype failed, trans format from %s to %s, src shape %s, " + "data type %s is not supported", + TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(dst_format).c_str(), + ShapeToString(src_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShape(src_format, src_shape)) { GELOGE(ACL_ERROR_GE_SHAPE_INVALID, - "Trans format from %s to %s, src shape %s, data 
type %s is not supported", - TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), - ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); + "[Check][Shape]Failed, trans format from %s to %s, src shape %s, " + "data type %s is not supported", + TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(dst_format).c_str(), + ShapeToString(src_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check shape failed, trans format from %s to %s, src shape %s, " + "data type %s is not supported", + TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(dst_format).c_str(), + ShapeToString(src_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } ShapeVector hw_shape; @@ -325,18 +397,40 @@ Status FormatTransferFractalNz::TransShape(Format src_format, const ShapeVector Status FormatTransferFractalNzND::TransFormat(const TransArgs &args, TransResult &result) { if (!IsDataTypeSupport(args.src_data_type)) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Trans format from %s to %s, src shape %s, dst shape %s, data type %s is not supported", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, + "[Check][Datatype]Failed, trans format from %s to %s, src shape %s, dst shape %s, " + "data type %s is not supported", TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), - ShapeToString(args.dst_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check datatype failed, trans format from 
%s to %s, src shape %s, " + "dst shape %s, data type %s is not supported", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!IsShapeValid(args.src_shape) || !CheckShape(args.dst_format, args.dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Trans format from %s to %s, src shape %s, dst shape %s, data type %s is not supported", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, + "[Check][Shape]Failed, trans format from %s to %s, src shape %s, dst shape %s, " + "data type %s is not supported", TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), - ShapeToString(args.dst_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check shape failed, trans format from %s to %s, src shape %s, " + "dst shape %s, data type %s is not supported", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } GELOGD("Begin to trans format from %s to %s, src shape %s, dst shape %s, data type %s", diff --git a/ge/common/formats/format_transfers/format_transfer_fractal_z.cc b/ge/common/formats/format_transfers/format_transfer_fractal_z.cc index 712f7c61..ddce348b 100644 --- 
a/ge/common/formats/format_transfers/format_transfer_fractal_z.cc +++ b/ge/common/formats/format_transfers/format_transfer_fractal_z.cc @@ -29,6 +29,25 @@ namespace ge { namespace formats { namespace { +constexpr int64_t kDim = 1; +static int64_t Measure(int64_t x, int64_t y) { + int64_t z = y; + while (x % y != 0) { + z = x % y; + x = y; + y = z; + } + return z; +} +// least common multiple +static int64_t Lcm(int64_t a, int64_t b) { + if (b == 0) { + return -1; + } + int64_t temp = (a * b) / (Measure(a, b)); + return temp; +} + Status CheckDataTypeSupport(DataType data_type) { return GetSizeByDataType(data_type) > 0 ? SUCCESS : UNSUPPORTED; } /** @@ -54,8 +73,39 @@ Status TransShapeToFz(int64_t n, int64_t c, int64_t h, int64_t w, DataType data_ dst_shape.push_back(kNiSize); dst_shape.push_back(c0); if (!IsShapeValid(dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Failed, dst shape %s", + ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); + return ACL_ERROR_GE_SHAPE_INVALID; + } + return SUCCESS; +} + +Status TransShapeToFzWithGroups(int64_t n, int64_t c, int64_t h, int64_t w, DataType data_type, std::vector &dst_shape, + int64_t groups) { + auto c0 = GetCubeSizeByDataType(data_type); + if (c0 < 0) { + return ACL_ERROR_GE_DATATYPE_INVALID; + } + int64_t cin_ori = c; + int64_t cout_ori = n / groups; + int64_t cube_k = GetCubeSizeByDataType(data_type); + int64_t e_mult = std::min( + Lcm(Lcm(cin_ori, cube_k) / (cin_ori), Lcm(cout_ori, static_cast(kCubeSize)) / (cout_ori)), + groups); + int64_t cin_opt = Ceil(e_mult * cin_ori, cube_k) * cube_k; + int64_t c1_dim = cin_opt / cube_k; + int64_t g_dim = Ceil(groups, e_mult); + auto n1 = Ceil(cout_ori * e_mult, static_cast(kCubeSize)); + dst_shape.clear(); + dst_shape.push_back(g_dim * c1_dim * h * w); + dst_shape.push_back(n1); + dst_shape.push_back(16); + 
dst_shape.push_back(cube_k); + if (!IsShapeValid(dst_shape)) { + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Failed, dst shape %s", ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -86,6 +136,21 @@ Status TransShapeHwcnToFz(const std::vector &src_shape, DataType data_t return TransShapeToFz(n, c, h, w, data_type, dst_shape); } +Status TransShapeHwcnToFzWithGroups(const std::vector &src_shape, DataType data_type, std::vector &dst_shape +, int64_t groups){ + if (!CheckShapeValid(src_shape, kHwcnDimsNum)) { + return ACL_ERROR_GE_SHAPE_INVALID; + } + + auto h = src_shape.at(kHwcnH); + auto w = src_shape.at(kHwcnW); + auto c = src_shape.at(kHwcnC); + auto n = src_shape.at(kHwcnN); + + return TransShapeToFzWithGroups(n, c, h, w, data_type, dst_shape, groups); +} + + Status TransShapeNhwcToFz(const std::vector &src_shape, DataType data_type, std::vector &dst_shape) { if (!CheckShapeValid(src_shape, kNhwcDimsNum)) { return ACL_ERROR_GE_SHAPE_INVALID; @@ -127,9 +192,14 @@ Status TransFormatFromNchwToFz(const TransArgs &args, TransResult &result) { std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size], std::default_delete()); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( dst == nullptr, - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allcoate memory " + "for dst buf %ld when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, 
TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION;); for (int64_t vfi = 0; vfi < vf_cnt; vfi++) { @@ -173,8 +243,12 @@ Status TransFormatFromNchwToFz(const TransArgs &args, TransResult &result) { } } if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d pad mode %d", offset, - ret, need_pad_zero); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED,"[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d pad mode %d", + offset, ret, need_pad_zero); + REPORT_CALL_ERROR("E19999","Failed to operate dst memory at offset %ld, " + "error-code %d pad mode %d", + offset, ret, need_pad_zero); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -187,6 +261,89 @@ Status TransFormatFromNchwToFz(const TransArgs &args, TransResult &result) { return SUCCESS; } +Status TransFormatHwcnToFzWithGroups(const TransArgs &args, TransResult &result, int64_t groups){ + int64_t h_dim = args.src_shape[kHwcnH]; + int64_t w_dim = args.src_shape[kHwcnW]; + int64_t c_dim = args.src_shape[kHwcnC]; + int64_t n_dim = args.src_shape[kHwcnN]; + int64_t cin_ori = c_dim; + int64_t cout_ori = n_dim / groups; + if (cin_ori == 0 || cout_ori == 0) { + GELOGE(GRAPH_FAILED, "[Check][Param]Failed, cin_ori, cout_ori must not be equal 0, " + "and current cin_ori, cout_ori, groups are %ld %ld %ld", cin_ori, cout_ori, groups); + REPORT_CALL_ERROR("E19999", "Check graph param failed, cin_ori, cout_ori must not be equal 0," + "and current cin_ori, cout_ori, groups are %ld %ld %ld", + cin_ori, cout_ori, groups); + return GRAPH_FAILED; + } + const int64_t cube_k = GetCubeSizeByDataType(args.src_data_type); + int64_t e_mult = std::min( + Lcm(Lcm(cin_ori, cube_k) / (cin_ori), Lcm(cout_ori, static_cast(kCubeSize)) / (cout_ori)), + groups); + int64_t cin_opt = Ceil(e_mult * cin_ori, cube_k) * cube_k; + int64_t cout_opt = Ceil(e_mult * cout_ori, 
static_cast(kCubeSize)) * static_cast(kCubeSize); + int64_t c1_dim = cin_opt / cube_k; + int64_t g_dim = Ceil(groups, e_mult); + int64_t dim_cin = cin_opt / cube_k; + int64_t data_size = GetSizeByDataType(args.src_data_type); + int64_t size_output_data = g_dim * kDim * dim_cin * h_dim * w_dim * cout_opt * cube_k * data_size; + if (size_output_data == 0) { + result.length = static_cast(size_output_data); + return SUCCESS; + } + errno_t ret = EOK; + std::shared_ptr dst(new (std::nothrow) uint8_t[size_output_data], std::default_delete()); + if (dst == nullptr) { + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allcoate memory " + "for dst buf %ld when trans format from %s to %s", + size_output_data, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld " + "when trans format from %s to %s", + size_output_data, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + return ACL_ERROR_GE_MEMORY_ALLOCATION; + } + ret = memset_s(dst.get(), static_cast(size_output_data), 0, static_cast(size_output_data)); + if (ret != EOK) { + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed, ret is %d", ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory, ret is %d", ret); + return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; + } + for (int64_t g = 0; g < groups; g++) { + for (int64_t d = 0; d < kDim; d++) { + for (int64_t c = 0; c < c_dim; c++) { + for (int64_t h = 0; h < h_dim; h++) { + for (int64_t w = 0; w < w_dim; w++) { + for (int64_t n = 0; n < cout_ori; n++) { + int64_t e_val = g % e_mult; + int64_t dst_ci = e_val * cin_ori + c; + int64_t dst_co = e_val * cout_ori + n; + int64_t src_co = g * cout_ori + n; + int64_t tempory = dst_ci % cube_k; + int64_t srx_inx = 0; + int64_t dst_inx = (g / e_mult) * kDim * c1_dim * h_dim * w_dim * 
cout_opt * cube_k + + d * c1_dim * h_dim * w_dim * cout_opt * cube_k + + (dst_ci / cube_k) * h_dim * w_dim * cout_opt * cube_k + + h * w_dim * cout_opt * cube_k + w * cout_opt * cube_k + + dst_co * cube_k + tempory; + srx_inx = d * h_dim * w_dim * c_dim * n_dim + h * w_dim * c_dim * n_dim + + w * c_dim * n_dim + c * n_dim + src_co; + char *dst_data = reinterpret_cast(dst.get() + dst_inx * data_size); + const char *src_data = reinterpret_cast(args.data + srx_inx * data_size); + for (int64_t index = 0; index < data_size; index++) { + *dst_data++ = *src_data++; + } + } + } + } + } + } + } + result.data = dst; + result.length = static_cast(size_output_data); + return SUCCESS; +} Status TransFormatHwcnToFz(const TransArgs &args, TransResult &result) { int64_t h = args.src_shape[kHwcnH]; int64_t w = args.src_shape[kHwcnW]; @@ -213,9 +370,14 @@ Status TransFormatHwcnToFz(const TransArgs &args, TransResult &result) { std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size], std::default_delete()); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( dst == nullptr, - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allcoate memory " + "for dst buf %ld when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION;); for (int64_t c1i = 0; c1i < c1; c1i++) { @@ -235,7 +397,8 @@ Status TransFormatHwcnToFz(const TransArgs &args, TransResult &result) { 
static_cast(data_size)); } else { if (protected_size < data_size) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Failed to operate the dst memory, protected_size is %ld and size is %ld", + GELOGE(ACL_ERROR_GE_PARAM_INVALID,"[Operate][DSTMemory]Failed, protected_size " + "is %ld and size is %ld", protected_size, data_size); return ACL_ERROR_GE_PARAM_INVALID; } @@ -247,8 +410,11 @@ Status TransFormatHwcnToFz(const TransArgs &args, TransResult &result) { } } if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d, pad mode %d", - dst_offset, ret, pad_zero); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed, " + "at offset %ld, error-code %d, pad mode %d", dst_offset, ret, pad_zero); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memoery at offset %ld, " + "error-code %d, pad mode %d", + dst_offset, ret, pad_zero); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -288,9 +454,14 @@ Status TransFormatNhwcToFz(const TransArgs &args, TransResult &result) { std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size], std::default_delete()); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( dst == nullptr, - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allcoate memory " + "for dst buf %ld when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION;); 
for (int64_t c1i = 0; c1i < c1; c1i++) { @@ -310,7 +481,8 @@ Status TransFormatNhwcToFz(const TransArgs &args, TransResult &result) { static_cast(data_size)); } else { if (protected_size < data_size) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Failed to operate the dst memory, protected_size is %ld and size is %ld", + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Operate][DSTMemory]Failed, protected_size " + "is %ld and size is %ld", protected_size, data_size); return ACL_ERROR_GE_PARAM_INVALID; } @@ -322,8 +494,11 @@ Status TransFormatNhwcToFz(const TransArgs &args, TransResult &result) { } } if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d, pad mode %d", - dst_offset, ret, pad_zero); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at offset %ld," + " error-code %d, pad mode %d", dst_offset, ret, pad_zero); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, " + "error-code %d, pad mode %d", + dst_offset, ret, pad_zero); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -355,15 +530,16 @@ Status FormatTransferFractalZ::TransFormat(const TransArgs &args, TransResult &r if (args.src_format == FORMAT_NHWC && args.dst_format == FORMAT_FRACTAL_Z) { return TransFormatNhwcToFz(args, result); } - - if (args.src_format == FORMAT_HWCN && args.dst_format == FORMAT_FRACTAL_Z) { + if ((args.src_format == FORMAT_HWCN) && (GetPrimaryFormat(args.dst_format) == FORMAT_FRACTAL_Z)) { + if (GetSubFormat(args.dst_format) > 1) { + return TransFormatHwcnToFzWithGroups(args, result, GetSubFormat(args.dst_format)); + } return TransFormatHwcnToFz(args, result); } if (args.src_format == FORMAT_NCHW && args.dst_format == FORMAT_FRACTAL_Z) { return TransFormatFromNchwToFz(args, result); } - return ACL_ERROR_GE_FORMAT_INVALID; } @@ -376,7 +552,10 @@ Status FormatTransferFractalZ::TransShape(Format src_format, const std::vector 1) { + return TransShapeHwcnToFzWithGroups(src_shape, 
data_type, dst_shape, GetSubFormat(dst_format)); + } return TransShapeHwcnToFz(src_shape, data_type, dst_shape); } if (src_format == FORMAT_NCHW && dst_format == FORMAT_FRACTAL_Z) { diff --git a/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc b/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc index 7093aff2..1cb142b3 100755 --- a/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc +++ b/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc @@ -59,7 +59,7 @@ bool CheckShape(Format format, const ShapeVector &shape) { default: std::string error = "Trans format between " + FmtToStr(TypeUtils::FormatToSerialString(format)) + " and FORMAT_FRACTAL_ZZ is not supported."; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error.c_str()); + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_FORMAT_INVALID, error.c_str()); return false; } } @@ -86,8 +86,10 @@ Status TransShapeToFracZz(const ShapeVector &src_shape, DataType data_type, Shap hw_shape.push_back(DIM_DEFAULT_VALUE); hw_shape.push_back(src_shape[kNdDimIndexN]); if (!IsShapeValid(dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][DSTShape]Failed, dst shape %s", ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to check dst shape %s", + ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -106,8 +108,10 @@ Status TransShapeToFracZz(const ShapeVector &src_shape, DataType data_type, Shap hw_shape.push_back(src_shape[size - kNdDimCountBackwardsWH]); hw_shape.push_back(src_shape[size - kNdDimCountBackwardsW]); if (!IsShapeValid(dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][DSTShape]Failed, dst shape %s", ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to check dst shape %s", + ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; 
} return SUCCESS; @@ -118,10 +122,18 @@ Status CheckShapeRelation(const TransArgs &args, ShapeVector &hw_shape) { ShapeVector expect_src_shape; auto ret = TransShapeToFracZz(args.dst_shape, args.src_data_type, expect_src_shape, hw_shape); if (ret != SUCCESS) { - GELOGE(ret, "Trans shape from %s to %s, shape %s to %s, data type %s failed", + GELOGE(ret, "[Trans][ShapeToFracZz] Failed from %s to %s, shape %s to %s, data type %s", TypeUtils::FormatToSerialString(args.dst_format).c_str(), - TypeUtils::FormatToSerialString(args.src_format).c_str(), ShapeToString(args.dst_shape).c_str(), - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.src_format).c_str(), + ShapeToString(args.dst_shape).c_str(), + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to trans shape from %s to %s, shape %s to %s, data type %s", + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + ShapeToString(args.dst_shape).c_str(), + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ret; } if (!IsTransShapeSrcCorrect(args, expect_src_shape)) { @@ -140,9 +152,14 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size](), std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allcoate memory " + "for dst buf %ld when trans format from %s to %s", + dst_size, 
TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } // The src&dst_shape can be written as times*H*W & times*H1*W1*H0*W0, respectively. dst_shape_size >= kDimNum4D @@ -179,7 +196,11 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size * w0)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d", dst_offset, ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d", + dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, error-code %d", + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -195,7 +216,11 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d", dst_offset, ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d", + dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, error-code %d", + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -217,9 +242,14 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size](), 
std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed to allcoate memory " + "for dst buf %ld when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to allcoate memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -257,7 +287,11 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size * w0)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d", dst_offset, ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d", + dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, error-code %d", + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -273,7 +307,11 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "Failed to operate the dst memory at offset %ld, error-code %d", dst_offset, ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Operate][DSTMemory]Failed at offset %ld, " + "error-code %d", + 
dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to operate dst memory at offset %ld, error-code %d", + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -288,17 +326,39 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con Status FormatTransferFractalZz::TransFormat(const TransArgs &args, TransResult &result) { if (!IsDataTypeSupport(args.src_data_type)) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Not support trans format from %s to %s, src shape %s, dst shape %s, data type %s", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, + "[Check][Datatype]Failed, not support trans format from %s to %s, " + "src shape %s, dst shape %s, data type %s", TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), - ShapeToString(args.dst_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check datatype failed, not support trans format " + "from %s to %s, src shape %s, dst shape %s, data type %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShape(args.src_format, args.src_shape) || !IsShapeValid(args.dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Not support trans format from %s to %s, src shape %s, dst shape %s, data type %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, + "[Check][Shape]Failed, not support trans format from %s to %s, " + "src shape %s, dst shape %s, data type %s", 
TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), - ShapeToString(args.dst_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Check shape failed, not support trans format from %s to %s, " + "src shape %s, dst shape %s, data type %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } GELOGD("Begin to trans format from %s to %s, src shape %s, dst shape %s, data type %s", @@ -321,16 +381,34 @@ Status FormatTransferFractalZz::TransShape(Format src_format, const ShapeVector Format dst_format, ShapeVector &dst_shape) { if (!IsDataTypeSupport(data_type)) { GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, - "Not support trans format from %s to %s, src shape %s, data type %s", - TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), - ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); + "[Check][Datatype]Failed, not support trans format from %s to %s, " + "src shape %s, data type %s", + TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(dst_format).c_str(), + ShapeToString(src_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check datatype failed, not support trans format from %s to %s, " + "src shape %s, data type %s", + TypeUtils::FormatToSerialString(src_format).c_str(), + 
TypeUtils::FormatToSerialString(dst_format).c_str(), + ShapeToString(src_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShape(src_format, src_shape)) { GELOGE(ACL_ERROR_GE_SHAPE_INVALID, - "Not support trans format from %s to %s, src shape %s, data type %s", - TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), - ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); + "[Check][Shape]Failed, not support trans format from %s to %s, " + "src shape %s, data type %s", + TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(dst_format).c_str(), + ShapeToString(src_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Check shape failed, not support trans format from %s to %s, " + "src shape %s, data type %s", + TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(dst_format).c_str(), + ShapeToString(src_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } ShapeVector hw_shape; @@ -339,18 +417,39 @@ Status FormatTransferFractalZz::TransShape(Format src_format, const ShapeVector Status FormatTransferFractalZzND::TransFormat(const TransArgs &args, TransResult &result) { if (!IsDataTypeSupport(args.src_data_type)) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Not support trans format from %s to %s, src shape %s, dst shape %s, data type %s", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, + "[Check][Datatype]Failed, not support trans format from %s to %s, " + "src shape %s, dst shape %s, data type %s", TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), - ShapeToString(args.dst_shape).c_str(), 
TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Check datatype Failed, not support trans format from %s to %s, " + "src shape %s, dst shape %s, data type %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!IsShapeValid(args.src_shape) || !CheckShape(args.dst_format, args.dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Not support trans format from %s to %s, src shape %s, dst shape %s, data type %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Failed, not support trans format " + "from %s to %s, src shape %s, dst shape %s, data type %s", TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), - ShapeToString(args.dst_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Check shape failed, not support trans format from %s to %s, " + "src shape %s, dst shape %s, data type %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } 
GELOGD("Begin to trans format from %s to %s, src shape %s, dst shape %s, data type %s", diff --git a/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc b/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc index e84033ed..f6af7534 100755 --- a/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc +++ b/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc @@ -41,16 +41,26 @@ Status CheckArgsForFracZToHwcn(const TransArgs &args) { return ACL_ERROR_GE_FORMAT_INVALID; } if (!CheckDataTypeSupported(args.src_data_type)) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Failed to trans shape from FORMAT_FRACTAL_Z to HWCN, invalid data type %s", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Check][DataType]Failed, " + "shape from FORMAT_FRACTAL_Z to HWCN, invalid data type %s", TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Failed to trans shape from FORMAT_FRACTAL_Z to HWCN, " + "invalid data type %s", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShapeValid(src_shape, kFracZDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check src shape %s", ShapeToString(src_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", + ShapeToString(src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } if (!CheckShapeValid(dst_shape, kHwcnDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", + ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", + ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } int64_t c0 = GetCubeSizeByDataType(args.src_data_type); @@ -66,7 +76,7 @@ 
Status CheckArgsForFracZToHwcn(const TransArgs &args) { FmtToStr(ShapeToString(dst_shape)); GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_SHAPE_INVALID, error.c_str()); return ACL_ERROR_GE_SHAPE_INVALID; - } + } return SUCCESS; } @@ -74,9 +84,17 @@ Status CheckArgsForFracZToHwcn(const TransArgs &args) { Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) { std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld, shape %s", + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, + "[Allocate][DSTMemory]Failed, memory for dst buf %ld, shape %s " + "when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld, shape %s " + "when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -114,9 +132,12 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in static_cast(size)); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to copy data from FracZ offset %ld to HWCN[%ld, %ld, %ld, %ld] " - "offset %ld, err-code %d", + "[Operate][Memory]Failed to copy data from FracZ offset %ld to " + "HWCN[%ld, %ld, %ld, %ld] offset %ld, err-code %d", src_offset, h_idx, w_idx, c_idx, n_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to copy data from FracZ offset %ld to " + "HWCN[%ld, %ld, %ld, %ld], offset 
%ld, err-code %d", + src_offset, h_idx, w_idx, c_idx, n_idx, dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -142,9 +163,12 @@ Status FormatTransferFracZHwcn::TransFormat(const TransArgs &args, TransResult & result.length = static_cast(total_size); return SUCCESS; } - - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Get %ld total size from dst shape %s, src shape %s", total_size, - ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Get][ShapeSize]Failed, " + "total size %ld from dst shape %s, src shape %s", total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get total size %ld from " + "dst shape %s, src shape %s", total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } GELOGD("Begin to trans format from FracZ to HWCN, src shape %s, data type %s, dst shape %s, memory size %ld", @@ -152,9 +176,16 @@ Status FormatTransferFracZHwcn::TransFormat(const TransArgs &args, TransResult & ShapeToString(args.dst_shape).c_str(), total_size); ret = GetDstDataAfterTrans(args, result, size, total_size); if (ret != SUCCESS) { - GELOGE(ret, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), - ShapeToString(args.dst_shape).c_str(), total_size); + GELOGE(ret, "[Get][Data]Failed after trans, src shape %s, " + "data type %s, dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); + REPORT_CALL_ERROR("E19999", "Failed to get data after trans, src shape %s, " + "data type %s, dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + 
TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); return ret; } return SUCCESS; diff --git a/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc b/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc index 3795208d..c112aa79 100755 --- a/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc +++ b/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc @@ -37,20 +37,30 @@ Status CheckArgsForFracZToNchw(const TransArgs &args) { std::string error = "Dose not support trans format from " + FmtToStr(TypeUtils::FormatToSerialString(args.src_format)) + " to " + FmtToStr(TypeUtils::FormatToSerialString(args.dst_format)); - GE_ERRORLOG_AND_ERRORMSG(UNSUPPORTED, error.c_str()); + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_FORMAT_INVALID, error.c_str()); return ACL_ERROR_GE_FORMAT_INVALID; } if (!CheckDataTypeSupported(args.src_data_type)) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Failed to trans shape from FORMAT_FRACTAL_Z to NCHW, invalid data type %s", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Check][DataType]Failed, " + "shape from FORMAT_FRACTAL_Z to NCHW, invalid data type %s", TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Failed to trans shape from FORMAT_FRACTAL_Z to NCHW, " + "invalid data type %s", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShapeValid(src_shape, kFracZDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check src shape %s", ShapeToString(src_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", + ShapeToString(src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } if (!CheckShapeValid(dst_shape, kNchwDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check 
dst shape %s", ShapeToString(dst_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", + ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", + ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } int64_t c0 = GetCubeSizeByDataType(args.src_data_type); @@ -59,10 +69,15 @@ Status CheckArgsForFracZToNchw(const TransArgs &args) { } int64_t c1 = Ceil(dst_shape.at(kNchwC), c0); int64_t n0 = Ceil(dst_shape.at(kNchwN), static_cast(kNiSize)); - if (src_shape.at(kFracZHWC1) != dst_shape.at(kNchwH) * dst_shape.at(kNchwW) * c1 || src_shape.at(kFracZC0) != c0 || - src_shape.at(kFracZNi) != kNiSize || src_shape.at(kFracZN0) != n0) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst shape %s", + if (src_shape.at(kFracZHWC1) != dst_shape.at(kNchwH) * dst_shape.at(kNchwW) * c1 || + src_shape.at(kFracZC0) != c0 || src_shape.at(kFracZNi) != kNiSize || src_shape.at(kFracZN0) != n0) { + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, + "[Check][Shape]Failed to check relationship between src and dst shape, " + "src shape %s, dst shape %s", ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); + REPORT_INNER_ERROR("E19999", "Failed to check relationship between src and dst shape, " + "src shape %s, dst shape %s", + ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } @@ -72,9 +87,17 @@ Status CheckArgsForFracZToNchw(const TransArgs &args) { Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) { std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld, shape %s", + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, + "[Allocate][DSTMemory]Failed, 
memory for dst buf %ld, shape %s " + "when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld, shape %s " + "when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -112,9 +135,12 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in static_cast(size)); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to copy data from FracZ offset %ld to NCHW[%ld, %ld, %ld, %ld] offset %ld, " - "err-code %d", + "[Operate][Memory]Failed to copy data from FracZ offset %ld to " + "NCHW[%ld, %ld, %ld, %ld] offset %ld, err-code %d", src_offset, n_idx, c_idx, h_idx, w_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999","Failed to copy data from FracZ offset %ld to " + "NCHW[%ld, %ld, %ld, %ld] offset %ld, err-code %d", + src_offset, n_idx, c_idx, h_idx, w_idx, dst_offset, ret ); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -141,8 +167,12 @@ Status FormatTransferFracZNchw::TransFormat(const TransArgs &args, TransResult & return SUCCESS; } - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Get %ld total size from dst shape %s, src shape %s", total_size, - ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Get][ShapeSize]Failed, total size %ld from dst shape %s, " + "src shape %s", total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get total size %ld from dst shape %s, src 
shape %s", + total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } GELOGD("Begin to trans format from FracZ to NCHW, src shape %s, data type %s, dst shape %s, memory size %ld", @@ -151,9 +181,16 @@ Status FormatTransferFracZNchw::TransFormat(const TransArgs &args, TransResult & ret = GetDstDataAfterTrans(args, result, size, total_size); if (ret != SUCCESS) { - GELOGE(ret, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + GELOGE(ret, "[Get][Data]Failed, after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str(), total_size); + REPORT_CALL_ERROR("E19999", "Failed to get data after trans, src shape %s, " + "data type %s, dst shape %s, memory size %ld", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size); return ret; } return SUCCESS; diff --git a/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc b/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc index a2c86300..eb0d3801 100755 --- a/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc +++ b/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc @@ -37,33 +37,48 @@ Status CheckArgsForFracZToNhwc(const TransArgs &args) { std::string error = "Dose not support trans format from " + FmtToStr(TypeUtils::FormatToSerialString(args.src_format)) + " to " + FmtToStr(TypeUtils::FormatToSerialString(args.dst_format)); - GE_ERRORLOG_AND_ERRORMSG(UNSUPPORTED, error.c_str()); - return UNSUPPORTED; + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_FORMAT_INVALID, error.c_str()); + return 
ACL_ERROR_GE_FORMAT_INVALID; } if (!CheckDataTypeSupported(args.src_data_type)) { - GELOGE(UNSUPPORTED, "Failed to trans shape from FORMAT_FRACTAL_Z to NHWC, invalid data type %s", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Check][DataType]Failed, " + "shape from FORMAT_FRACTAL_Z to NCHW, invalid data type %s", TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); - return UNSUPPORTED; + REPORT_INNER_ERROR("E19999", "Failed to trans shape from FORMAT_FRACTAL_Z to NCHW, " + "invalid data type %s", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShapeValid(src_shape, kFracZDimsNum)) { - GELOGE(PARAM_INVALID, "Failed to check src shape %s", ShapeToString(src_shape).c_str()); - return PARAM_INVALID; + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", + ShapeToString(src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(src_shape).c_str()); + return ACL_ERROR_GE_SHAPE_INVALID; } if (!CheckShapeValid(dst_shape, kNhwcDimsNum)) { - GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); - return PARAM_INVALID; + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", + ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", + ShapeToString(dst_shape).c_str()); + return ACL_ERROR_GE_SHAPE_INVALID; } int64_t c0 = GetCubeSizeByDataType(args.src_data_type); if (c0 < 0) { - return PARAM_INVALID; + return ACL_ERROR_GE_DATATYPE_INVALID; } int64_t c1 = Ceil(dst_shape.at(kNhwcC), c0); int64_t n0 = Ceil(dst_shape.at(kNhwcN), static_cast(kNiSize)); - if (src_shape.at(kFracZHWC1) != dst_shape.at(kNhwcH) * dst_shape.at(kNhwcW) * c1 || src_shape.at(kFracZC0) != c0 || - src_shape.at(kFracZNi) != kNiSize || src_shape.at(kFracZN0) != n0) { - GELOGE(PARAM_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst 
shape %s", + if (src_shape.at(kFracZHWC1) != dst_shape.at(kNhwcH) * dst_shape.at(kNhwcW) * c1 || + src_shape.at(kFracZC0) != c0 || src_shape.at(kFracZNi) != kNiSize || src_shape.at(kFracZN0) != n0) { + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, + "[Check][Shape]Failed to check relationship between src and dst shape, " + "src shape %s, dst shape %s", ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); - return PARAM_INVALID; + REPORT_INNER_ERROR("E19999", "Failed to check relationship between src and dst shape, " + "src shape %s, dst shape %s", + ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); + return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -72,10 +87,18 @@ Status CheckArgsForFracZToNhwc(const TransArgs &args) { Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, int size, int64_t total_size) { std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld, shape %s", + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, + "[Allocate][DSTMemory]Failed, memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); - return OUT_OF_MEMORY; + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + return ACL_ERROR_GE_MEMORY_ALLOCATION; } auto n0 = args.src_shape.at(kFracZN0); @@ -111,10 +134,14 @@ Status GetDstDataAfterTrans(const TransArgs &args, 
TransResult &result, int size auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { - GELOGE(INTERNAL_ERROR, - "Failed to copy data from FracZ offset %ld to HHWC[%ld, %ld, %ld, %ld] offset %ld, err-code %d", - src_offset, n_idx, h_idx, w_idx, c_idx, dst_offset, ret); - return INTERNAL_ERROR; + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, + "[Operate][Memory]Failed to copy data from FracZ offset %ld to " + "NCHW[%ld, %ld, %ld, %ld] offset %ld, err-code %d", + src_offset, n_idx, c_idx, h_idx, w_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999","Failed to copy data from FracZ offset %ld to " + "NCHW[%ld, %ld, %ld, %ld] offset %ld, err-code %d", + src_offset, n_idx, c_idx, h_idx, w_idx, dst_offset, ret); + return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } } @@ -127,8 +154,9 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, int size } // namespace Status FormatTransferFracZNhwc::TransFormat(const TransArgs &args, TransResult &result) { - if (CheckArgsForFracZToNhwc(args) != SUCCESS) { - return PARAM_INVALID; + Status ret = CheckArgsForFracZToNhwc(args); + if (ret != SUCCESS) { + return ret; } int size = GetSizeByDataType(args.src_data_type); auto total_size = GetItemNumByShape(args.dst_shape) * size; @@ -139,18 +167,30 @@ Status FormatTransferFracZNhwc::TransFormat(const TransArgs &args, TransResult & return SUCCESS; } - GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size, - ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); - return PARAM_INVALID; + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Get][ShapeSize]Failed, total size %ld from dst shape %s, " + "src shape %s", total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get total size %ld from dst shape %s, src shape %s", + total_size, + ShapeToString(args.dst_shape).c_str(), 
ShapeToString(args.src_shape).c_str()); + return ACL_ERROR_GE_PARAM_INVALID; } GELOGD("Begin to trans format from FracZ to NHWC, src shape %s, data type %s, dst shape %s, memory size %ld", ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str(), total_size); - if (GetDstDataAfterTrans(args, result, size, total_size) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), - ShapeToString(args.dst_shape).c_str(), total_size); - return INTERNAL_ERROR; + ret = GetDstDataAfterTrans(args, result, size, total_size); + if (ret != SUCCESS) { + GELOGE(ret, "[Get][Data]Failed, after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); + REPORT_CALL_ERROR("E19999","Failed to get data after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); + return ret; } return SUCCESS; } @@ -158,7 +198,7 @@ Status FormatTransferFracZNhwc::TransFormat(const TransArgs &args, TransResult & Status FormatTransferFracZNhwc::TransShape(Format src_format, const std::vector &src_shape, DataType data_type, Format dst_format, std::vector &dst_shape) { GELOGD("The shape derivation from FracZ to NHWC is not unique. 
Trans shape in this direction is not supported"); - return UNSUPPORTED; + return ACL_ERROR_GE_FORMAT_INVALID; } REGISTER_FORMAT_TRANSFER(FormatTransferFracZNhwc, FORMAT_FRACTAL_Z, FORMAT_NHWC) diff --git a/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc b/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc index 16aa26f8..3f2b72c5 100755 --- a/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc +++ b/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc @@ -43,8 +43,10 @@ Status TransShapeHwcnToC1hwncoc0(const DataType &data_type, const std::vector expect_dst_shape; @@ -78,10 +90,14 @@ Status CheckArgsForHwcnToC1hwncoc0(const TransArgs &args) { } if (args.dst_shape != expect_dst_shape) { GELOGE(ACL_ERROR_GE_SHAPE_INVALID, - "Failed to trans format, src and dst shape are not compatible. src shape %s, dst shape %s, " + "[Trans][Shape]Failed, src shape %s and dst shape %s are not compatible. " "expect dst shape %s", ShapeToString(args.src_shape).c_str(), ShapeToString(args.dst_shape).c_str(), ShapeToString(expect_dst_shape).c_str()); + REPORT_INNER_ERROR("E19999", "Failed to trans format, src shape %s and dst shape %s " + "are not compatible. 
expect dst shape %s", + ShapeToString(args.src_shape).c_str(), ShapeToString(args.dst_shape).c_str(), + ShapeToString(expect_dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } @@ -91,9 +107,16 @@ Status CheckArgsForHwcnToC1hwncoc0(const TransArgs &args) { Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) { std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld, shape %s", + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed, " + "memory for dst buf %ld, shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -135,11 +158,17 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to copy data from HWCN[%ld, %ld, %ld, %ld] offset %ld to " + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Copy][Data]Failed, " + "data from HWCN[%ld, %ld, %ld, %ld] offset %ld to " "C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] offset %ld, err-code %d", - h_idx, w_idx, c_idx, n_idx, src_offset, c1_idx, h_idx, w_idx, n_idx, 
co_idx, c0_idx, - dst_offset, ret); + h_idx, w_idx, c_idx, n_idx, src_offset, c1_idx, h_idx, w_idx, + n_idx, co_idx, c0_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to copy data from " + "HWCN[%ld, %ld, %ld, %ld] offset %ld " + "to, C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] " + "offset %ld, err-code %d", + h_idx, w_idx, c_idx, n_idx, src_offset, c1_idx, h_idx, w_idx, + n_idx, co_idx, c0_idx, dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } else { @@ -147,14 +176,18 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in memset_s(dst.get() + dst_offset, static_cast(protected_size), 0, static_cast(size)); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to set to 0 to C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] offset %ld, " - "err-code %d", + "[Operate][Memory]Failed to set to 0 to " + "C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] offset %ld, err-code %d", c1_idx, h_idx, w_idx, n_idx, co_idx, c0_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to set to 0 to " + "C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] offset %ld, " + "err-code %d", + c1_idx, h_idx, w_idx, n_idx, co_idx, c0_idx, dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } } - } + } } } } @@ -179,8 +212,12 @@ Status FormatTransferHwcnC1hwncoc0::TransFormat(const TransArgs &args, TransResu return SUCCESS; } - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Get %ld total size from dst shape %s, src shape %s", total_size, + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Get][ShapeSize]Failed, total size %ld from dst shape %s, " + "src shape %s", total_size, ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get total size %ld from dst shape %s, src shape %s", + total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } GELOGD("Begin to trans format from HWCN to C1HWNCoC0, src shape %s, data type %s, 
dst shape %s, memory size %ld", @@ -189,9 +226,16 @@ Status FormatTransferHwcnC1hwncoc0::TransFormat(const TransArgs &args, TransResu ret = GetDstDataAfterTrans(args, result, size, total_size); if (ret != SUCCESS) { - GELOGE(ret, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), - ShapeToString(args.dst_shape).c_str(), total_size); + GELOGE(ret, "[Get][Data]Failed, after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); + REPORT_CALL_ERROR("E19999", "Failed to get data after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); return ret; } return SUCCESS; @@ -201,8 +245,10 @@ Status FormatTransferHwcnC1hwncoc0::TransShape(Format src_format, const std::vec DataType data_type, Format dst_format, std::vector &dst_shape) { if (src_format == FORMAT_HWCN && CheckDataTypeSupported(data_type)) { if (!CheckShapeValid(src_shape, kHwcnDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check src shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", ShapeToString(src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return TransShapeHwcnToC1hwncoc0(data_type, src_shape, dst_shape); diff --git a/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc b/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc index df8e5a29..09ff45d9 100755 --- 
a/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc +++ b/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc @@ -37,33 +37,49 @@ Status CheckArgsForNc1hwc0ToNchw(const TransArgs &args) { std::string error = "Dose not support trans format from " + FmtToStr(TypeUtils::FormatToSerialString(args.src_format)) + " to " + FmtToStr(TypeUtils::FormatToSerialString(args.dst_format)); - GE_ERRORLOG_AND_ERRORMSG(UNSUPPORTED, error.c_str()); - return UNSUPPORTED; + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_FORMAT_INVALID, error.c_str()); + return ACL_ERROR_GE_FORMAT_INVALID; } if (!CheckDataTypeSupported(args.src_data_type)) { - GELOGE(UNSUPPORTED, "Failed to trans shape from NC1HWC0 to NCHW, invalid data type %s", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Check][DataType]Failed, shape from NC1HWC0 to NCHW, " + "invalid data type %s", TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); - return UNSUPPORTED; + REPORT_INNER_ERROR("E19999", "Failed to trans shape from NC1HWC0 to NCHW, invalid data type %s", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShapeValid(args.src_shape, kNc1hwc0DimsNum)) { - GELOGE(PARAM_INVALID, "Failed to check src shape %s", ShapeToString(args.src_shape).c_str()); - return PARAM_INVALID; + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", + ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(args.src_shape).c_str()); + return ACL_ERROR_GE_SHAPE_INVALID; } if (!CheckShapeValid(args.dst_shape, kNchwDimsNum)) { - GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(args.dst_shape).c_str()); - return PARAM_INVALID; + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", + ShapeToString(args.dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", + 
ShapeToString(args.dst_shape).c_str()); + return ACL_ERROR_GE_SHAPE_INVALID; } int64_t c0 = GetCubeSizeByDataType(args.src_data_type); if (c0 <= 0) { - GELOGE(PARAM_INVALID, "Failed to get cube size, the data type is invalid"); - return PARAM_INVALID; + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Get][Cube]Failed, the data type %s is invalid", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get cube size, the data tyep %s is invalid", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + return ACL_ERROR_GE_DATATYPE_INVALID; } if (src_shape.at(kNc1hwc0H) != dst_shape.at(kNchwH) || src_shape.at(kNc1hwc0W) != dst_shape.at(kNchwW) || src_shape.at(kNc1hwc0N) != dst_shape.at(kNchwN) || src_shape.at(kNc1hwc0C0) != c0 || src_shape.at(kNc1hwc0C1) != (Ceil(dst_shape.at(kNchwC), c0))) { - GELOGE(PARAM_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Failed to check relationship between " + "src shape %s and dst shape %s", ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); - return PARAM_INVALID; + REPORT_INNER_ERROR("E19999", "Failed to check relationship between src shape %s " + "and dst shape %s", + ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); + return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -72,10 +88,17 @@ Status CheckArgsForNc1hwc0ToNchw(const TransArgs &args) { Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) { std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld, shape %s", + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed, " + "memory for dst buf %ld, shape %s when trans format from %s to %s", + total_size, 
ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); - return OUT_OF_MEMORY; + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + return ACL_ERROR_GE_MEMORY_ALLOCATION; } auto h = args.src_shape.at(kNc1hwc0H); @@ -109,11 +132,17 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { - GELOGE(INTERNAL_ERROR, - "Failed to copy data from NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld to NCHW[%ld, %ld, %ld, %ld]" - " offset %ld, err-code %d", - n_idx, c1_idx, h_idx, w_idx, c0_idx, src_offset, n_idx, c_idx, h_idx, w_idx, dst_offset, ret); - return INTERNAL_ERROR; + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Copy][Data]Failed, data from " + "NC1HWC0[%ld, %ld, %ld, %ld, %ld] " + "src offset %ld to NCHW[%ld, %ld, %ld, %ld], dst offset %ld, err-code %d", + n_idx, c1_idx, h_idx, w_idx, c0_idx, src_offset, n_idx, + c_idx, h_idx, w_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to copy data from NC1HWC0[%ld, %ld, %ld, %ld, %ld] " + "src offset %ld to NCHW[%ld, %ld, %ld, %ld], dst offset %ld, " + "err-code %d", + n_idx, c1_idx, h_idx, w_idx, c0_idx, src_offset, n_idx, + c_idx, h_idx, w_idx, dst_offset, ret); + return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } } @@ -126,8 +155,9 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in } // namespace Status FormatTransferNc1hwc0Nchw::TransFormat(const TransArgs &args, 
TransResult &result) { - if (CheckArgsForNc1hwc0ToNchw(args) != SUCCESS) { - return PARAM_INVALID; + Status ret = CheckArgsForNc1hwc0ToNchw(args); + if (ret != SUCCESS) { + return ret; } int size = GetSizeByDataType(args.src_data_type); auto total_size = GetItemNumByShape(args.dst_shape) * size; @@ -138,18 +168,30 @@ Status FormatTransferNc1hwc0Nchw::TransFormat(const TransArgs &args, TransResult return SUCCESS; } - GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size, + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Get][ShapeSize]Failed, total size %ld from dst shape %s, " + "src shape %s", total_size, ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); - return PARAM_INVALID; + REPORT_CALL_ERROR("E19999", "Failed to get total size %ld from dst shape %s, src shape %s", + total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + return ACL_ERROR_GE_PARAM_INVALID; } GELOGD("Begin to trans format from NC1HWC0 to NCHW, src shape %s, data type %s, dst shape %s, memory size %ld", ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str(), total_size); - if (GetDstDataAfterTrans(args, result, size, total_size) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ret = GetDstDataAfterTrans(args, result, size, total_size); + if (ret != SUCCESS) { + GELOGE(ret, "[Get][Data]Failed, after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str(), total_size); - return INTERNAL_ERROR; + REPORT_CALL_ERROR("E19999", "Failed to get data after trans, src shape %s, data type 
%s, " + "dst shape %s, memory size %ld", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size); + return ret; } return SUCCESS; } @@ -157,7 +199,7 @@ Status FormatTransferNc1hwc0Nchw::TransFormat(const TransArgs &args, TransResult Status FormatTransferNc1hwc0Nchw::TransShape(Format src_format, const std::vector &src_shape, DataType data_type, Format dst_format, std::vector &dst_shape) { GELOGD("The shape derivation from NC1HWC0 to NCHW is not unique. Trans shape in this direction is not supported"); - return UNSUPPORTED; + return ACL_ERROR_GE_FORMAT_INVALID; } REGISTER_FORMAT_TRANSFER(FormatTransferNc1hwc0Nchw, FORMAT_NC1HWC0, FORMAT_NCHW) diff --git a/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc b/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc index 2234bf05..e9e41cd1 100755 --- a/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc +++ b/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc @@ -41,28 +41,44 @@ Status CheckArgsForNc1hwc0ToNhwc(const TransArgs &args) { return ACL_ERROR_GE_FORMAT_INVALID; } if (!CheckDataTypeSupported(args.src_data_type)) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Failed to trans shape from NC1HWC0 to NHWC, invalid data type %s", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Check][DataType]Failed, shape from NC1HWC0 to NHWC, " + "invalid data type %s", TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Failed to trans shape from NC1HWC0 to NHWC, invalid data type %s", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShapeValid(args.src_shape, kNc1hwc0DimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check src shape %s", ShapeToString(args.src_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", + 
ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } if (!CheckShapeValid(args.dst_shape, kNhwcDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", ShapeToString(args.dst_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", + ShapeToString(args.dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", + ShapeToString(args.dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } int64_t c0 = GetCubeSizeByDataType(args.src_data_type); if (c0 <= 0) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Failed to get cube size, the data type is invalid"); + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Get][Cube]Failed, the data type %s is invalid", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get cube size, the data type %s is invalid", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (src_shape.at(kNc1hwc0H) != dst_shape.at(kNhwcH) || src_shape.at(kNc1hwc0W) != dst_shape.at(kNhwcW) || src_shape.at(kNc1hwc0N) != dst_shape.at(kNhwcN) || src_shape.at(kNc1hwc0C0) != c0 || src_shape.at(kNc1hwc0C1) != (Ceil(dst_shape.at(kNhwcC), c0))) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Failed to check relationship between " + "src shape %s and dst shape %s", ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); + REPORT_INNER_ERROR("E19999", "Failed to check relationship between src shape %s " + "and dst shape %s", + ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } @@ -72,9 +88,16 @@ Status CheckArgsForNc1hwc0ToNhwc(const TransArgs &args) { Status 
GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) { std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld, shape %s", + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][DSTMemory]Failed, memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -110,9 +133,14 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in static_cast(size)); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to copy data from NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld to NHWC[%ld, %ld, %ld, %ld]" - " offset %ld, err-code %d", - n_idx, c1_idx, h_idx, w_idx, c0_idx, src_offset, n_idx, c_idx, h_idx, w_idx, dst_offset, ret); + "[Copy][Data]Failed, data from NC1HWC0[%ld, %ld, %ld, %ld, %ld] " + "offset %ld to NHWC[%ld, %ld, %ld, %ld] offset %ld, err-code %d", + n_idx, c1_idx, h_idx, w_idx, c0_idx, src_offset, n_idx, c_idx, + h_idx, w_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to copy data from NC1HWC0[%ld, %ld, %ld, %ld, %ld] " + "offset %ld to NHWC[%ld, %ld, %ld, %ld] offset %ld, err-code %d", + n_idx, c1_idx, h_idx, w_idx, c0_idx, src_offset, n_idx, c_idx, + h_idx, w_idx, 
dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -139,19 +167,31 @@ Status FormatTransferNc1hwc0Nhwc::TransFormat(const TransArgs &args, TransResult return SUCCESS; } - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Get %ld total size from dst shape %s, src shape %s", total_size, + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Get][ShapeSize]Failed, total size %ld from dst shape %s, " + "src shape %s", total_size, ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get total size %ld from dst shape %s, src shape %s", + total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } - GELOGD("Begin to trans format from NC1HWC0 to NCHW, src shape %s, data type %s, dst shape %s, memory size %ld", + GELOGD("[Trans][Format]Begin to trans format from NC1HWC0 to NCHW, " + "src shape %s, data type %s, dst shape %s, memory size %ld", ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str(), total_size); ret = GetDstDataAfterTrans(args, result, size, total_size); if (ret != SUCCESS) { - GELOGE(ret, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), - ShapeToString(args.dst_shape).c_str(), total_size); + GELOGE(ret, "[Get][Data]Failed, after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); + REPORT_CALL_ERROR("E19999", "[Get][Data]Failed, after trans, src shape %s, " + "data type %s, dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + 
TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); return ret; } return SUCCESS; diff --git a/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc b/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc index 795f8ff5..5efe486c 100644 --- a/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc +++ b/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc @@ -59,8 +59,9 @@ Status TransShape(int64_t n, int64_t c, int64_t h, int64_t w, DataType data_type dst_shape.push_back(c0); if (!IsShapeValid(dst_shape)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -94,7 +95,13 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) { std::vector expect_shape = {n, h, w, c}; auto ret = ge::formats::Transpose(data, args.src_shape, args.src_data_type, perm_arg_1, trans_result_1); if (ret != SUCCESS) { - GELOGE(ret, "Failed to Transpose from NCHW to HWCN"); + GELOGE(ret, "[Trans][Formats]Failed from NCHW to HWCN, src_shape %s, src_data_type %s", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to trans formats from NCHW to HWCN, src_shape %s, " + "src_data_type %s", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ret; } @@ -104,7 +111,9 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) { // check size it should be same with original size_t expect_size = n * c * h * w * size; // before has do check about mul if (trans_result_1.length != expect_size) { -
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "size is not match after transpose!"); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Shape]size %zu is not match expect size %zu " + "after transpose", + trans_result_1.length, expect_size); return ACL_ERROR_GE_PARAM_INVALID; } @@ -118,19 +127,32 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) { // data overflow check totally GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(h_o, w_o), - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "int64 mul overflow.A[%ld], B[%ld]", h_o, w_o); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Shape]Failed, " + "int64 mul overflow.A[%ld], B[%ld]", h_o, w_o); + REPORT_CALL_ERROR("E19999", "Check shape failed, int64 mul overflow.A[%ld], " + "B[%ld]", h_o, w_o); return ACL_ERROR_GE_INTERNAL_ERROR); GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(n_o, c_o), - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "int64 mul overflow.A[%ld], B[%ld]", n_o, c_o); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Shape]Failed, " + "int64 mul overflow.A[%ld], B[%ld]", n_o, c_o); + REPORT_CALL_ERROR("E19999", "Check shape failed, int64 mul overflow.A[%ld], " + "B[%ld]", n_o, c_o); return ACL_ERROR_GE_INTERNAL_ERROR); auto t1 = h_o * w_o; auto t2 = n_o * c_o; - GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(t1, t2), GELOGE(INTERNAL_ERROR, "int64 mul overflow.A[%ld], B[%ld]", t1, t2); + GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(t1, t2), + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Shape]Failed, " + "int64 mul overflow.A[%ld], B[%ld]", t1, t2); + REPORT_CALL_ERROR("E19999", "Check shape failed, " + "int64 mul overflow.A[%ld], B[%ld]", t1, t2); return ACL_ERROR_GE_INTERNAL_ERROR); int64_t total_ele_cnt = n_o * c_o * h_o * w_o; GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(total_ele_cnt, size), - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "int64 mul overflow.A[%ld], B[%d]", total_ele_cnt, size); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Shape]Failed, " + "int64 mul overflow.A[%ld], B[%d]", total_ele_cnt, size); + REPORT_CALL_ERROR("E19999", "Check 
shape failed, int64 mul overflow.A[%ld], " + "B[%d]", total_ele_cnt, size); return ACL_ERROR_GE_INTERNAL_ERROR); int64_t dst_size = total_ele_cnt * size; if (dst_size == 0) { @@ -140,14 +162,21 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) { std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to alloc the memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld " + "when trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } auto retMem = memset_s(dst.get(), dst_size, 0, dst_size); if (retMem != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "memst failed!"); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Set][Memory]Failed, dst buf %ld, error_code %d", + dst_size, retMem); + REPORT_CALL_ERROR("E19999", "Set memory failed, dst buf %ld, error_code %d", dst_size, retMem); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } // copy data @@ -159,7 +188,10 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) { for (auto k = 0; k < n; k++) { ret = memcpy_s(p_d + k * stride, protectSize, p_s + k * block, block); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "memcpy_s failed!"); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Set][Memcpy]Failed, block %zu, stride %zu, " + "protect_size %ld, 
error_code %d", block, stride, protectSize, ret); + REPORT_CALL_ERROR("E19999", "[Set][Memcpy]Failed, block %zu, stride %zu, " + "protect_size %ld, error_code %d", block, stride, protectSize, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } protectSize = protectSize - block; @@ -169,7 +201,8 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) { std::vector perm_arg_2 = {2, 0, 1, 3}; ret = ge::formats::Transpose(dst.get(), shape_o, args.src_data_type, perm_arg_2, result); if (ret != SUCCESS) { - GELOGE(ret, "Failed to Transpose from NCHW to HWCN"); + GELOGE(ret, "[Trans][Formats]Failed from NCHW to HWCN, error_code %u", ret); + REPORT_CALL_ERROR("E19999", "Failed to trans formats from NCHW to HWCN, error_code %u", ret); return ret; } @@ -190,7 +223,8 @@ Status PaddingNC(const TransArgs &args, TransArgs &args_tmp, std::shared_ptr kMaxDimsNumC) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Invalie dim c num[%lu].It should be in (0,4]", c); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Invalid dim c num[%lu]. 
" + "It should be in (0,4]", c); return ACL_ERROR_GE_SHAPE_INVALID; } @@ -205,20 +239,33 @@ Status PaddingNC(const TransArgs &args, TransArgs &args_tmp, std::shared_ptr()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", - TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to alloc the memory for dst buf %ld when " + "trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld when " + "trans format from %s to %s", + dst_size, TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } auto ret = memset_s(dst.get(), dst_size, 0, dst_size); if (ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "memst failed!"); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Set][Memory]Failed, dst buf %ld, error_code %d", + dst_size, ret); + REPORT_CALL_ERROR("E19999", "Set memory failed, dst buf %ld, error_code %d", dst_size, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } @@ -249,7 +303,10 @@ Status PaddingNC(const TransArgs &args, TransArgs &args_tmp, std::shared_ptr dst = nullptr; auto ret = PaddingNC(args, args_tmp, dst); if (ret != SUCCESS) { - GELOGE(ret, "Padding in NC axis failed!"); + GELOGE(ret, "[Padding][NCAxis]Failed, error_code %u", ret); + REPORT_CALL_ERROR("E19999", "Padding in NC axis failed, error_code %u", ret); return ret; } std::vector expect_shape; - ret = TransShape(args_tmp.src_format, args_tmp.src_shape, args_tmp.src_data_type, args_tmp.dst_format, expect_shape); + ret = TransShape(args_tmp.src_format, args_tmp.src_shape, args_tmp.src_data_type, + 
args_tmp.dst_format, expect_shape); if (ret != SUCCESS) { return ret; } diff --git a/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc b/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc index d0579353..ea2b1d7f 100755 --- a/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc +++ b/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc @@ -32,12 +32,17 @@ Status TransShapeNchwToNc1hwc0(const std::vector &src_shape, DataType d std::vector &dst_shape) { int64_t c0 = GetCubeSizeByDataType(data_type); if (c0 <= 0) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Failed to get cube size, the data type is invalid"); + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Get][Cube]Failed, the data type %s is invalid", + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get cube size, the data type %s is invalid", + TypeUtils::DataTypeToSerialString(data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShapeValid(src_shape, kNchwDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check src shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", ShapeToString(src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } dst_shape.clear(); @@ -47,8 +52,10 @@ Status TransShapeNchwToNc1hwc0(const std::vector &src_shape, DataType d dst_shape.push_back(src_shape.at(kNchwW)); dst_shape.push_back(c0); if (!CheckShapeValid(dst_shape, kNc1hwc0DimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", + ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -69,10 +76,17 @@ Status 
CheckArgsForNchwToNc1hwc0(const TransArgs &args) { } if (expect_5d_shape != args.dst_shape) { GELOGE(ACL_ERROR_GE_SHAPE_INVALID, - "Failed to trans format, the src and dst shape are not compatible. data" - " type %s, src shape %s, dst shape %s, expect dst shape %s", - TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.src_shape).c_str(), + "[Trans][Format]Failed, the src and dst shape are not compatible. " + "data type %s, src shape %s, dst shape %s, expect dst shape %s", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.src_shape).c_str(), ShapeToString(args.dst_shape).c_str(), ShapeToString(expect_5d_shape).c_str()); + REPORT_INNER_ERROR("E19999", "Failed to trans formats, the src and dst shape are not " + "compatible. data type %s, src shape %s, dst shape %s, expect dst shape %s", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.src_shape).c_str(), + ShapeToString(args.dst_shape).c_str(), + ShapeToString(expect_5d_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } @@ -83,10 +97,16 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, - "Failed to trans format from %s to %s, can not alloc the memory for" - " dst buf %ld, shape %s", + "[Allocate][Memory]Failed to alloc the memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size,
ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -97,7 +117,10 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in int64_t c0 = GetCubeSizeByDataType(args.src_data_type); if (c0 <= 0) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "The c0 is invalid %ld", c0); + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Check][Shape]The c0 is invalid %ld, data_type %s", + c0, TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Check shape failed, the c0 is invalid %ld, data_type %s", + c0, TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } int64_t c1 = (c - 1) / c0 + 1; @@ -130,9 +153,13 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in static_cast(size)); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to copy data from NCHW[%ld] offset %ld to " - "NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld, err-code %d", + "[Operate][Memory]Failed to copy data from NCHW[%ld] offset %ld " + "to NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld, err-code %d", srcIdx, src_offset, n_idx, c1_idx, h_idx, w_idx, c0_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to copy data from NCHW[%ld] offset %ld " + "to NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld, err-code %d", + srcIdx, src_offset, n_idx, c1_idx, h_idx, w_idx, c0_idx, + dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } else { @@ -140,9 +167,12 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in memset_s(dst.get() + dst_offset, static_cast(protected_size), 0, static_cast(size)); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to set to 0 to " - "NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld, err-code %d", + "[Operate][Memory]Failed to set to 0 to 
NC1HWC0[%ld, %ld, %ld, %ld, %ld] " + "offset %ld, err-code %d", n_idx, c1_idx, h_idx, w_idx, c0_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to set to 0 to " + "NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld, err-code %d", + n_idx, c1_idx, h_idx, w_idx, c0_idx, dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -173,8 +203,12 @@ Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult return SUCCESS; } - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Get %ld total size from dst shape %s, src shape %s", total_size, + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Get][Shape]Failed, total size %ld from dst shape %s, " + "src shape %s", total_size, ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get total size %ld from dst shape %s, src shape %s", + total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } GELOGD( @@ -184,9 +218,16 @@ Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult ShapeToString(args.dst_shape).c_str(), total_size); ret = GetDstDataAfterTrans(args, result, size, total_size); if (ret != SUCCESS) { - GELOGE(ret, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), - ShapeToString(args.dst_shape).c_str(), total_size); + GELOGE(ret, "[Get][Data]Failed, after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); + REPORT_CALL_ERROR("E19999", "Failed to get data after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + 
TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); return ret; } return SUCCESS; diff --git a/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc b/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc index b09fd168..518790b6 100755 --- a/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc +++ b/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc @@ -34,7 +34,10 @@ Status TransShapeNhwcToNc1hwc0(const std::vector &src_shape, DataType d std::vector &dst_shape) { int64_t c0 = GetCubeSizeByDataType(data_type); if (c0 <= 0) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Failed to get cube size, the data type is invalid"); + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Get][Cube]Failed, the data type %s is invalid", + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to get cube size, the data type %s is invalid", + TypeUtils::DataTypeToSerialString(data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } dst_shape.clear(); @@ -44,8 +47,10 @@ Status TransShapeNhwcToNc1hwc0(const std::vector &src_shape, DataType d dst_shape.push_back(src_shape.at(kNhwcW)); dst_shape.push_back(c0); if (!CheckShapeValid(dst_shape, kNc1hwc0DimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", ShapeToString(dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", + ShapeToString(dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return SUCCESS; @@ -60,16 +65,25 @@ Status CheckArgsForNhwcToNc1hwc0(const TransArgs &args) { return ACL_ERROR_GE_FORMAT_INVALID; } if (!CheckDataTypeSupported(args.src_data_type)) { - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Failed to trans shape from NHWC to NC1HWC0, invalid data type %s", + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Check][DataType]Failed from 
NHWC to NC1HWC0, " + "invalid data type %s", TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); + REPORT_INNER_ERROR("E19999", "Failed to trans shape from NHWC to NC1HWC0, invalid data type %s", + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } if (!CheckShapeValid(args.src_shape, kNhwcDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check src shape %s", ShapeToString(args.src_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", + ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } if (!CheckShapeValid(args.dst_shape, kNc1hwc0DimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check dst shape %s", ShapeToString(args.dst_shape).c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, dst shape %s", + ShapeToString(args.dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Dst shape %s check invalid", + ShapeToString(args.dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } std::vector expect_dst_shape; @@ -79,10 +93,14 @@ Status CheckArgsForNhwcToNc1hwc0(const TransArgs &args) { } if (args.dst_shape != expect_dst_shape) { GELOGE(ACL_ERROR_GE_SHAPE_INVALID, - "Failed to trans format, the src and dst shape are not compatible. src shape %s, dst shape %s, " + "[Trans][Format]Failed, the src shape %s and dst shape %s are not compatible. " "expect dst shape %s", ShapeToString(args.src_shape).c_str(), ShapeToString(args.dst_shape).c_str(), ShapeToString(expect_dst_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to trans format, the src shape %s and " + "dst shape %s are not compatible.
expect dst shape %s", + ShapeToString(args.src_shape).c_str(), ShapeToString(args.dst_shape).c_str(), + ShapeToString(expect_dst_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } @@ -92,9 +110,16 @@ Status CheckArgsForNhwcToNc1hwc0(const TransArgs &args) { Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) { std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld, shape %s", + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Allocate][Memory]Failed, memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), TypeUtils::FormatToSerialString(args.src_format).c_str(), - TypeUtils::FormatToSerialString(args.dst_format).c_str(), total_size, ShapeToString(args.dst_shape).c_str()); + TypeUtils::FormatToSerialString(args.dst_format).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to alloc the memory for dst buf %ld, " + "shape %s when trans format from %s to %s", + total_size, ShapeToString(args.dst_shape).c_str(), + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -132,17 +157,27 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in auto ret = memcpy_s(dst.get() + dst_offset, protected_size, args.data + src_offset, size); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to copy data from NHWC[%ld, %ld, %ld, %ld] offset %ld to " - "NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld err-code %d", - n_idx, h_idx, w_idx, c_idx, src_offset, n_idx, c1_idx, h_idx, w_idx, c0_idx, dst_offset, ret); + "[Operate][Memory]Failed to copy data from NHWC[%ld, %ld, %ld, %ld] " + "offset %ld to NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld
err-code %d", + n_idx, h_idx, w_idx, c_idx, src_offset, + n_idx, c1_idx, h_idx, w_idx, c0_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to copy data from NHWC[%ld, %ld, %ld, %ld] " + "offset %ld to " + "NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld err-code %d", + n_idx, h_idx, w_idx, c_idx, src_offset, + n_idx, c1_idx, h_idx, w_idx, c0_idx, dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } else { auto ret = memset_s(dst.get() + dst_offset, protected_size, 0, size); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to set 0 to NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld base err-code %d", n_idx, c1_idx, - h_idx, w_idx, c0_idx, dst_offset, ret); + "[Operate][Memory]Failed to set 0 to " + "NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld base err-code %d", + n_idx, c1_idx, h_idx, w_idx, c0_idx, dst_offset, ret); + REPORT_CALL_ERROR("E19999", "Failed to set 0 to " + "NC1HWC0[%ld, %ld, %ld, %ld, %ld] offset %ld base err-code %d", + n_idx, c1_idx, h_idx, w_idx, c0_idx, dst_offset, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } } @@ -171,8 +206,12 @@ Status FormatTransferNhwcNc1hwc0::TransFormat(const TransArgs &args, TransResult return SUCCESS; } - GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "Get %ld total size from dst shape %s, src shape %s", total_size, + GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Get][ShapeSize]Failed, " + "total size %ld from dst shape %s, src shape %s", total_size, ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "[Get][Shape]Failed, total size %ld from " + "dst shape %s, src shape %s", total_size, + ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } GELOGD("Begin to trans format from NHWC to NC1HWC0, src shape %s, data type %s, dst shape %s, memory size %ld", @@ -181,9 +220,16 @@ Status FormatTransferNhwcNc1hwc0::TransFormat(const TransArgs &args, TransResult ret = 
GetDstDataAfterTrans(args, result, size, total_size); if (ret != SUCCESS) { - GELOGE(ret, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld", - ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), - ShapeToString(args.dst_shape).c_str(), total_size); + GELOGE(ret, "[Get][Data]Failed, after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); + REPORT_CALL_ERROR("E19999", "Failed to get data after trans, src shape %s, data type %s, " + "dst shape %s, memory size %ld, error_code %u", + ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), + ShapeToString(args.dst_shape).c_str(), total_size, ret); return ret; } return SUCCESS; @@ -193,8 +239,10 @@ Status FormatTransferNhwcNc1hwc0::TransShape(Format src_format, const std::vecto DataType data_type, Format dst_format, std::vector &dst_shape) { if (src_format == FORMAT_NHWC && CheckDataTypeSupported(data_type)) { if (!CheckShapeValid(src_shape, kNhwcDimsNum)) { - GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "Failed to check src shape %s", + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Check][Shape]Value is invalid, src shape %s", ShapeToString(src_shape).c_str()); + REPORT_CALL_ERROR("E19999", "Src shape %s check invalid", + ShapeToString(src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } return TransShapeNhwcToNc1hwc0(src_shape, data_type, dst_shape); diff --git a/ge/common/formats/format_transfers/format_transfer_transpose.cc b/ge/common/formats/format_transfers/format_transfer_transpose.cc index 694777f3..54c5444b 100755 --- a/ge/common/formats/format_transfers/format_transfer_transpose.cc +++ b/ge/common/formats/format_transfers/format_transfer_transpose.cc @@ -50,21 +50,21 @@ std::map>> 
perm_args{ bool IsShapeArgValid(const std::vector &src_shape, const std::vector &perm_arg) { if (src_shape.empty()) { std::string error = "Failed to transpose, empty src shape"; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error.c_str()); - GELOGE(PARAM_INVALID, "Failed to transpose, empty src shape"); + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_SHAPE_INVALID, error.c_str()); + GELOGE(ACL_ERROR_GE_SHAPE_INVALID, "[Trans][Shape]Failed, empty src shape"); return false; } for (auto dim : src_shape) { if (dim < 0) { std::string error = "Failed to transpose, negative dim in src shape " + FmtToStr(ShapeToString(src_shape)); - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error.c_str()); + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_SHAPE_INVALID, error.c_str()); return false; } } if (perm_arg.size() != src_shape.size()) { std::string error = "Failed to transpose, the size of src shape" + FmtToStr(src_shape.size()) + " and perm arg" + FmtToStr(perm_arg.size()) + " are different"; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error.c_str()); + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_SHAPE_INVALID, error.c_str()); return false; } @@ -73,7 +73,7 @@ bool IsShapeArgValid(const std::vector &src_shape, const std::vector(perm) >= perm_arg.size() || ++exists[perm] > 1) { std::string error = "Failed to transpose, duplicated perm arg " + FmtToStr(perm) + ", perm arg " + FmtToStr(JoinToString(perm_arg)); - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error.c_str()); + GE_ERRORLOG_AND_ERRORMSG(ACL_ERROR_GE_PARAM_INVALID, error.c_str()); return false; } } @@ -82,12 +82,14 @@ bool IsShapeArgValid(const std::vector &src_shape, const std::vector &src_shape, DataType src_data_type, const std::vector &perm_arg) { if (src == nullptr) { - GELOGE(PARAM_INVALID, "Failed to transpose, the src is null"); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Trans][Param]Failed, the src is null"); return false; } if (GetSizeByDataType(src_data_type) < 0) { - GELOGE(UNSUPPORTED, "Failed to transpose, the data type %s is not support", + 
GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Trans][Param]Failed, the data type %s is not support", TypeUtils::DataTypeToSerialString(src_data_type).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to transpose, the data type %s is not support", + TypeUtils::DataTypeToSerialString(src_data_type).c_str()); return false; } return IsShapeArgValid(src_shape, perm_arg); @@ -173,10 +175,15 @@ Status Transpose(const uint8_t *src, const std::vector &src_shape, Data static_cast(data_size)); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, - "Failed to transpose, src shape %s, perm arg %s, dst shape %s, " + "[Operate][Memory]Failed to transpose, src shape %s, perm arg %s, dst shape %s, " "failed to write to dst offset %ld, current dim offset %s", ShapeToString(src_shape).c_str(), ShapeToString(perm_arg).c_str(), ShapeToString(dst_shape).c_str(), dst_offset_bytes, ShapeToString(dst_indexes).c_str()); + REPORT_CALL_ERROR("E19999", "Failed to transpose, src shape %s, perm arg %s, dst shape %s, " + "failed to write to dst offset %ld, current dim offset %s", + ShapeToString(src_shape).c_str(), ShapeToString(perm_arg).c_str(), + ShapeToString(dst_shape).c_str(), + dst_offset_bytes, ShapeToString(dst_indexes).c_str()); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } AddOne(dst_shape, dst_indexes); diff --git a/ge/common/formats/formats.cc b/ge/common/formats/formats.cc index 353606d2..0c72a898 100755 --- a/ge/common/formats/formats.cc +++ b/ge/common/formats/formats.cc @@ -44,7 +44,12 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Status TransFormat(const TransArg auto src_shape_size = GetItemNumByShape(args.src_shape); if (args.data == nullptr && src_shape_size != 0) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Invalid input null data"); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Shape]Failed, input data is null " + "or shape size not euqal to 0, src_shape %s", + ShapeToString(args.src_shape).c_str()); + REPORT_CALL_ERROR("E19999","Failed to check shape, input data is null " + 
"or shape size not equal to 0, src_shape %s", + ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_PARAM_INVALID; } @@ -82,7 +87,8 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Status TransDataType(const CastAr } if (args.data == nullptr && args.src_data_size != 0) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Invalid input null data"); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param]Failed, input data is null " + "or data size not equal to 0, src_data_size %ld", args.src_data_size); return ACL_ERROR_GE_PARAM_INVALID; } diff --git a/ge/common/helper/model_helper.cc b/ge/common/helper/model_helper.cc index 02c0a8f0..e95c3429 100644 --- a/ge/common/helper/model_helper.cc +++ b/ge/common/helper/model_helper.cc @@ -87,12 +87,13 @@ Status ModelHelper::SaveSizeToModelDef(const GeModelPtr &ge_model) { std::shared_ptr model_task_def = ge_model->GetModelTaskDefPtr(); if (model_task_def == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Create model task def ptr failed"); - return ACL_ERROR_GE_MEMORY_ALLOCATION; + GELOGD("SaveSizeToModelDef task_info_size is 0."); + om_info.push_back(0); + } else { + size_t partition_task_size = model_task_def->ByteSizeLong(); + GELOGD("SaveSizeToModelDef task_info_size is %zu", partition_task_size); + om_info.push_back(partition_task_size); } - size_t partition_task_size = model_task_def->ByteSizeLong(); - GELOGD("SaveSizeToModelDef task_info_size is %zu", partition_task_size); - om_info.push_back(partition_task_size); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(*(ge_model.get()), "om_info_list", om_info), GELOGE(FAILED, "SetListInt of om_info_list failed."); @@ -598,6 +599,7 @@ Status ModelHelper::GenerateGeRootModel(OmFileLoadHelper &om_load_helper) { is_first_model = false; root_model_->SetRootGraph(GraphUtils::GetComputeGraph(cur_model->GetGraph())); root_model_->SetModelId(cur_model->GetModelId()); + root_model_->SetModelName(cur_model->GetName()); model_ = cur_model; continue; } diff --git 
a/ge/common/helper/om_file_helper.cc b/ge/common/helper/om_file_helper.cc index 3702e8f8..cd13c5d8 100644 --- a/ge/common/helper/om_file_helper.cc +++ b/ge/common/helper/om_file_helper.cc @@ -416,8 +416,7 @@ Status OmFileSaveHelper::SaveRootModel(const SaveParam &save_param, const char * if (is_offline) { ret = FileSaver::SaveToFile(output_file, model_header_, model_partition_tabels, all_model_partitions); } else { - GELOGW("do not support save ge root model to buff now"); - return FAILED; + ret = FileSaver::SaveToBuffWithFileHeader(model_header_, model_partition_tabels, all_model_partitions, model); } if (ret == SUCCESS) { GELOGD("Save model success without encrypt."); diff --git a/ge/common/profiling/profiling_manager.cc b/ge/common/profiling/profiling_manager.cc index 0cf74b1f..fbbf1f04 100644 --- a/ge/common/profiling/profiling_manager.cc +++ b/ge/common/profiling/profiling_manager.cc @@ -24,6 +24,7 @@ #include "graph/types.h" #include "runtime/base.h" #include "graph/load/model_manager/davinci_model.h" +#include "mmpa/mmpa_api.h" namespace { const char *const kTrainingTrace = "training_trace"; @@ -31,7 +32,6 @@ const char *const kFpPoint = "fp_point"; const char *const kBpPoint = "bp_point"; #ifdef DAVINCI_SUPPORT_PROFILING -const size_t kReportMaxLen = 2048; const int32_t kMaxDeviceNum = 256; const uint32_t kInteval = 2; const std::string kConfigNumsdev = "devNums"; @@ -47,6 +47,10 @@ const std::string kOptype = "op_type"; const std::string kBlockDim = "block_dims"; const std::string kTaskId = "task_id"; const std::string kStreamId = "stream_id"; +const std::string kThreadId = "thread_id"; +const std::string kIndexId = "index_id"; +const std::string kTimeStamp = "time_stamp"; +const std::string kTagId = "tag_id"; const std::string kShapeType = "shape_type"; const std::string kCurIterNum = "cur_iter_num"; const std::string kTaskType = "task_type"; @@ -287,27 +291,80 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin #endif } 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfileStepInfo( + uint64_t index_id, uint64_t model_id, uint16_t tag_id, rtStream_t stream, int32_t device_id) { +#ifdef DAVINCI_SUPPORT_PROFILING + rtError_t rt_ret = RT_ERROR_NONE; +#ifndef ONLY_COMPILE_OPEN_SRC + GELOGD("Profiling Step Info TraceTask execute async start, index_id = %lu, model_id = %lu, tag_id = %u", + index_id, model_id, tag_id); + rt_ret = rtProfilerTraceEx(index_id, model_id, tag_id, stream); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "[Call][rtProfilerTraceEx] failed, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + GELOGD("Profiling Step Info TraceTask execute async success, index_id = %lu, model_id = %lu, tag_id = %u", + index_id, model_id, tag_id); +#endif + + mmTimespec timespec = mmGetTickCount(); + // 1000 ^ 3 converts second to nanosecond + int64_t time = timespec.tv_sec * 1000 * 1000 * 1000 + timespec.tv_nsec; + uint32_t task_id = 0; + uint32_t stream_id = 0; + rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "[Get][RtsInfo] task_id and stream_id failed, ret: 0x%X.", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + GELOGD("Get profiling args, task_id[%u], stream_id[%u]", task_id, stream_id); + + Json step_info; + step_info[kIndexId] = index_id; + step_info[kModelId] = model_id; + step_info[kTimeStamp] = time; + step_info[kTagId] = tag_id; + step_info[kTaskId] = task_id; + step_info[kStreamId] = stream_id; + step_info[kThreadId] = mmGetTid(); + + std::string reported_data; + try { + reported_data = step_info.dump(kInteval, ' ', false, Json::error_handler_t::ignore); + } catch (std::exception &e) { + GELOGE(FAILED, "Failed to convert JSON to string, reason: %s.", e.what()); + } catch (...) 
{ + GELOGE(FAILED, "Failed to convert JSON to string."); + } + reported_data.append(",") + .append("\n"); + ReportData(device_id, reported_data, "step_info"); +#endif + return SUCCESS; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportData( const int32_t &device_id, const string &data, const string &tag_name) { #ifdef DAVINCI_SUPPORT_PROFILING ReporterData reporter_data{}; int ret = -1; int32_t cb_ret = -1; - size_t index = data.size() / kReportMaxLen; + size_t report_max_len = reporter_max_len_; + size_t index = data.size() / report_max_len; if (index >= 1) { reporter_data.deviceId = device_id; ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, tag_name.c_str(), tag_name.size()); GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag [%s] memcpy error!", tag_name.c_str()); return;); for (size_t i = 0; i < index; ++i) { - reporter_data.data = (unsigned char *)data.c_str() + kReportMaxLen * i; - reporter_data.dataLen = kReportMaxLen; + reporter_data.data = (unsigned char *)data.c_str() + report_max_len * i; + reporter_data.dataLen = report_max_len; cb_ret = CallMsprofReport(reporter_data); GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data [%s] failed, ret:%d", tag_name.c_str(), cb_ret); return;); } - reporter_data.dataLen = data.size() - kReportMaxLen * index; + reporter_data.dataLen = data.size() - report_max_len * index; if (reporter_data.dataLen != 0) { - reporter_data.data = (unsigned char *)data.c_str() + kReportMaxLen * index; + reporter_data.data = (unsigned char *)data.c_str() + report_max_len * index; cb_ret = CallMsprofReport(reporter_data); GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data [%s] failed, ret:%d", tag_name.c_str(), cb_ret); return;); @@ -745,15 +802,32 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::Profilin return execute_model_prof_on; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::PluginInit() const { 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::PluginInit() { if (prof_cb_.msprofReporterCallback == nullptr) { GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr."); return ge::PARAM_INVALID; } - return prof_cb_.msprofReporterCallback( + int32_t cb_ret = prof_cb_.msprofReporterCallback( static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_INIT), nullptr, 0); + if (cb_ret != MSPROF_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Profiling reporter init failed, ret = %d.", cb_ret); + GELOGE(INTERNAL_ERROR, "[Init][ProfilingReporter] profiling init failed, ret = %d.", cb_ret); + return INTERNAL_ERROR; + } + + cb_ret = prof_cb_.msprofReporterCallback( + static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), + static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_DATA_MAX_LEN), + &reporter_max_len_, sizeof(uint32_t)); + if (cb_ret != MSPROF_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Get profiling reporter data max len failed, ret = %d.", cb_ret); + GELOGE(INTERNAL_ERROR, "[Init][ProfilingReporter] Get profiling reporter data max len failed, ret = %d.", cb_ret); + return INTERNAL_ERROR; + } + + return SUCCESS; } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUnInit() const { diff --git a/ge/common/profiling/profiling_manager.h b/ge/common/profiling/profiling_manager.h index 34acee0e..ab344204 100755 --- a/ge/common/profiling/profiling_manager.h +++ b/ge/common/profiling/profiling_manager.h @@ -27,6 +27,7 @@ #include "framework/common/ge_types.h" #include "external/register/register_types.h" #include "toolchain/prof_callback.h" +#include "runtime/stream.h" using std::map; using std::string; @@ -88,7 +89,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { void ProfilingTaskDescInfo(uint32_t model_id, const std::vector &task_desc_info, const int32_t &device_id); void ProfilingOpInputOutInfo(const 
TaskDescInfo &task, Json &task_json); - Status PluginInit() const; + Status PluginInit(); void PluginUnInit() const; Status CallMsprofReport(ReporterData &reporter_data) const; struct MsprofCallback &GetMsprofCallback() { return prof_cb_; } @@ -97,6 +98,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { void GetFpBpPoint(std::string &fp_point, std::string &bp_point); void GetOpInputOutputInfo(const OpDescPtr &op, TaskDescInfo &task_desc_info) const; void ReportData(const int32_t &device_id, const std::string &data, const std::string &tag_name); + Status ProfileStepInfo(uint64_t index_id, uint64_t model_id, uint16_t tag_id, rtStream_t stream, int32_t device_id); private: Status InitFromOptions(const Options &options, MsprofGeOptions &prof_conf); Status ParseOptions(const std::string &options); @@ -119,6 +121,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { MsprofCallback prof_cb_; std::string fp_point_; std::string bp_point_; + uint32_t reporter_max_len_ = 0; }; } // namespace ge #endif // GE_COMMON_PROFILING_PROFILING_MANAGER_H_ diff --git a/ge/common/proto/insert_op.proto b/ge/common/proto/insert_op.proto index bf918b20..7d708865 100644 --- a/ge/common/proto/insert_op.proto +++ b/ge/common/proto/insert_op.proto @@ -88,6 +88,7 @@ message AippOpParams { int32 right_padding_size = 69; int32 top_padding_size = 70; int32 bottom_padding_size = 71; + float padding_value = 72; int32 mean_chn_0 = 10; int32 mean_chn_1 = 11; diff --git a/ge/common/tbe_kernel_store.cc b/ge/common/tbe_kernel_store.cc index 2fb9a04a..efbb46ae 100755 --- a/ge/common/tbe_kernel_store.cc +++ b/ge/common/tbe_kernel_store.cc @@ -15,6 +15,8 @@ */ #include "common/tbe_kernel_store.h" +#include "graph/utils/attr_utils.h" +#include "graph/debug/ge_attr_define.h" namespace ge { @@ -31,6 +33,15 @@ void TBEKernelStore::LoadTBEKernelBinToOpDesc(const std::shared_ptr GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, kernel_bin), 
GELOGW("LoadKernelTBEBinToOpDesc: SetExtAttr for kernel_bin failed");) GELOGI("Load tbe kernel:%s, %zu", kernel_bin->GetName().c_str(), kernel_bin->GetBinDataSize()); + + std::string atomic_kernel_name; + (void) AttrUtils::GetStr(op_desc, ATOMIC_ATTR_TBE_KERNEL_NAME, atomic_kernel_name); + if (!atomic_kernel_name.empty()) { + GELOGI("Get atomic kernel name is %s.", atomic_kernel_name.c_str()); + auto atomic_kernel_bin = FindKernel(atomic_kernel_name); + GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(EXT_ATTR_ATOMIC_TBE_KERNEL, atomic_kernel_bin), + GELOGW("LoadKernelTBEBinToOpDesc: SetExtAttr for atomic kernel_bin failed");) + } } } } diff --git a/ge/common/types.cc b/ge/common/types.cc index 90ff9fe4..33b7f437 100644 --- a/ge/common/types.cc +++ b/ge/common/types.cc @@ -90,6 +90,8 @@ REGISTER_OPTYPE_DEFINE(DEPCONVOLUTION, "ConvolutionDepthwise"); REGISTER_OPTYPE_DEFINE(DROPOUT, "Dropout"); REGISTER_OPTYPE_DEFINE(DROPOUTGENMASK, "DropOutGenMask"); REGISTER_OPTYPE_DEFINE(DROPOUTDOMASK, "DropOutDoMask"); +REGISTER_OPTYPE_DEFINE(DROPOUTDOMASKV3, "DropOutDoMaskV3"); +REGISTER_OPTYPE_DEFINE(DROPOUTDOMASKV3D, "DropOutDoMaskV3D"); REGISTER_OPTYPE_DEFINE(CONCAT, "Concat"); REGISTER_OPTYPE_DEFINE(ROIPOOLING, "ROIPooling"); REGISTER_OPTYPE_DEFINE(PROPOSAL, "Proposal"); diff --git a/ge/executor/CMakeLists.txt b/ge/executor/CMakeLists.txt index 04654f99..2fca1aa6 100644 --- a/ge/executor/CMakeLists.txt +++ b/ge/executor/CMakeLists.txt @@ -8,6 +8,7 @@ set(PROTO_LIST ) protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) +protobuf_generate(ge_static PROTO_STATIC_SRCS PROTO_STATIC_HDRS ${PROTO_LIST}) set(SRC_LIST "ge_executor.cc" @@ -15,6 +16,7 @@ set(SRC_LIST "../common/ge/plugin_manager.cc" "../common/ge/op_tiling_manager.cc" "../common/dump/dump_properties.cc" + "../common/dump/exception_dumper.cc" "../common/dump/dump_manager.cc" "../common/dump/dump_op.cc" "../common/dump/opdebug_register.cc" @@ -162,7 +164,7 @@ set(SRC_LIST ) ######## libge_executor.a ######## 
-add_library(ge_executor STATIC ${SRC_LIST} ${PROTO_HDRS}) +add_library(ge_executor STATIC ${SRC_LIST} ${PROTO_STATIC_HDRS}) target_compile_options(ge_executor PRIVATE $<$,$>:-fvisibility=hidden -O2 -Werror -Wno-deprecated-declarations -fno-common> @@ -178,6 +180,7 @@ target_compile_definitions(ge_executor PRIVATE google=ascend_private $,OS_TYPE=WIN,OS_TYPE=0> $<$:SECUREC_USING_STD_SECURE_LIB=0 NOMINMAX> + $<$:ONLY_COMPILE_OPEN_SRC> LOG_CPP ) @@ -191,7 +194,7 @@ target_include_directories(ge_executor SYSTEM PRIVATE ${METADEF_DIR}/inc/external/graph ${METADEF_DIR}/inc/graph ${CMAKE_BINARY_DIR} - ${CMAKE_BINARY_DIR}/proto/ge + ${CMAKE_BINARY_DIR}/proto/ge_static #### yellow zone #### ${GE_CODE_DIR}/../inc ${GE_CODE_DIR}/../inc/cce @@ -212,6 +215,7 @@ target_link_libraries(ge_executor PRIVATE add_library(ge_executor_shared SHARED ${SRC_LIST} ${PROTO_HDRS}) target_compile_options(ge_executor_shared PRIVATE + -fno-common -Werror -O2 -Wno-deprecated-declarations @@ -223,6 +227,7 @@ target_compile_definitions(ge_executor_shared PRIVATE DAVINCI_SUPPORT_PROFILING google=ascend_private FUNC_VISIBILITY + $<$:ONLY_COMPILE_OPEN_SRC> ) target_include_directories(ge_executor_shared PRIVATE diff --git a/ge/executor/ge_executor.cc b/ge/executor/ge_executor.cc index 44b2dbfa..4081bdf2 100755 --- a/ge/executor/ge_executor.cc +++ b/ge/executor/ge_executor.cc @@ -30,6 +30,8 @@ #include "single_op/single_op_manager.h" #include "graph/load/model_manager/davinci_model.h" #include "opskernel_manager/ops_kernel_builder_manager.h" +#include "graph/opsproto_manager.h" +#include "ge_local_engine/engine/host_cpu_engine.h" using std::string; using std::vector; @@ -199,6 +201,33 @@ bool IsDynmaicDimsSizeMatchModel(const vector cur_dynamic_dims, namespace ge { bool GeExecutor::isInit_ = false; +static void InitOpsProtoManager() { + string opsproto_path; + const char *path_env = std::getenv("ASCEND_OPP_PATH"); + if (path_env != nullptr) { + string path = path_env; + string file_path = 
RealPath(path.c_str()); + if (file_path.empty()) { + GELOGE(FAILED, "[Check][EnvPath]ASCEND_OPP_PATH path [%s] is invalid.", path.c_str()); + REPORT_INPUT_ERROR("E68016", {"ASCEND_OPP_PATH", path}); + return; + } + opsproto_path = (path + "/op_proto/custom/" + ":") + (path + "/op_proto/built-in/"); + GELOGI("Get opsproto so path from env : %s", path.c_str()); + } else { + string path_base = PluginManager::GetPath(); + GELOGI("path_base is %s", path_base.c_str()); + path_base = path_base.substr(0, path_base.rfind('/')); + path_base = path_base.substr(0, path_base.rfind('/') + 1); + opsproto_path = (path_base + "ops/op_proto/custom/" + ":") + (path_base + "ops/op_proto/built-in/"); + } + GELOGI("Get opsproto path is %s", opsproto_path.c_str()); + OpsProtoManager *manager = OpsProtoManager::Instance(); + map option_tmp; + option_tmp.emplace(std::pair(string("ge.opsProtoLibPath"), opsproto_path)); + (void)manager->Initialize(option_tmp); +} + GeExecutor::GeExecutor() {} Status GeExecutor::Initialize() { @@ -208,6 +237,16 @@ Status GeExecutor::Initialize() { return ge::SUCCESS; } + OpTilingManager::GetInstance().LoadSo(); + + Status init_hostcpu_engine_status = HostCpuEngine::GetInstance().Initialize(); + if (init_hostcpu_engine_status != SUCCESS) { + GELOGE(init_hostcpu_engine_status, "Failed to initialize HostCpuEngine"); + return init_hostcpu_engine_status; + } + + InitOpsProtoManager(); + std::vector mem_type(1, RT_MEMORY_HBM); mem_type.push_back(RT_MEMORY_P2P_DDR); auto ret = MemManager::Instance().Initialize(mem_type); diff --git a/ge/executor/proto/insert_op.proto b/ge/executor/proto/insert_op.proto index bf918b20..7d708865 100644 --- a/ge/executor/proto/insert_op.proto +++ b/ge/executor/proto/insert_op.proto @@ -88,6 +88,7 @@ message AippOpParams { int32 right_padding_size = 69; int32 top_padding_size = 70; int32 bottom_padding_size = 71; + float padding_value = 72; int32 mean_chn_0 = 10; int32 mean_chn_1 = 11; diff --git a/ge/ge_inference.mk 
b/ge/ge_inference.mk index 5d5e734c..32fc206d 100755 --- a/ge/ge_inference.mk +++ b/ge/ge_inference.mk @@ -114,6 +114,7 @@ OMG_HOST_SRC_FILES := \ graph/passes/atomic_addr_clean_pass.cc \ graph/passes/mark_same_addr_pass.cc \ graph/passes/mark_graph_unknown_status_pass.cc \ + graph/passes/mark_node_unknown_shape_pass.cc \ graph/passes/mark_agnostic_pass.cc \ graph/common/omg_util.cc \ graph/common/bcast.cc \ @@ -222,6 +223,7 @@ OMG_HOST_SRC_FILES := \ graph/passes/hccl_group_pass.cc \ graph/passes/memcpy_addr_async_pass.cc \ graph/passes/set_input_output_offset_pass.cc \ + graph/passes/buffer_pool_memory_pass.cc \ OMG_DEVICE_SRC_FILES := $(OMG_HOST_SRC_FILES) diff --git a/ge/ge_local_engine/CMakeLists.txt b/ge/ge_local_engine/CMakeLists.txt index 00142cfe..ab767ccb 100755 --- a/ge/ge_local_engine/CMakeLists.txt +++ b/ge/ge_local_engine/CMakeLists.txt @@ -20,6 +20,8 @@ set(OPS_KERNEL_SRC_LIST ) protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) +protobuf_generate(ge_ops_shared PROTO_OPS_SHARED_SRCS PROTO_OPS_SHARED_HDRS ${PROTO_LIST}) +protobuf_generate(ge_ops_static PROTO_OPS_STATIC_SRCS PROTO_OPS_STATIC_HDRS ${PROTO_LIST}) ############ libge_local_engine.so ############ add_library(ge_local_engine SHARED ${SRC_LIST} ${PROTO_HDRS}) @@ -119,7 +121,7 @@ set_target_properties(atc_ge_local_engine PROPERTIES ) ############ libge_local_opskernel_builder.so ############ -add_library(ge_local_opskernel_builder SHARED ${OPS_KERNEL_SRC_LIST} ${PROTO_HDRS}) +add_library(ge_local_opskernel_builder SHARED ${OPS_KERNEL_SRC_LIST} ${PROTO_OPS_SHARED_HDRS}) target_compile_options(ge_local_opskernel_builder PRIVATE -Werror @@ -143,7 +145,7 @@ target_include_directories(ge_local_opskernel_builder PRIVATE ${METADEF_DIR}/inc/external/graph ${METADEF_DIR}/inc/graph ${CMAKE_BINARY_DIR} - ${CMAKE_BINARY_DIR}/proto/ge + ${CMAKE_BINARY_DIR}/proto/ge_ops_shared #### yellow zone #### ${GE_CODE_DIR}/../inc #### blue zone #### @@ -166,7 +168,7 @@ 
target_link_libraries(ge_local_opskernel_builder PRIVATE ) ############ atclib/libge_local_opskernel_builder.so ############ -add_library(atc_ge_local_opskernel_builder SHARED ${OPS_KERNEL_SRC_LIST} ${PROTO_HDRS}) +add_library(atc_ge_local_opskernel_builder SHARED ${OPS_KERNEL_SRC_LIST} ${PROTO_OPS_SHARED_HDRS}) target_compile_options(atc_ge_local_opskernel_builder PRIVATE -Werror @@ -190,7 +192,7 @@ target_include_directories(atc_ge_local_opskernel_builder PRIVATE ${METADEF_DIR}/inc/external/graph ${METADEF_DIR}/inc/graph ${CMAKE_BINARY_DIR} - ${CMAKE_BINARY_DIR}/proto/ge + ${CMAKE_BINARY_DIR}/proto/ge_ops_shared #### yellow zone #### ${GE_CODE_DIR}/../inc #### blue zone #### @@ -218,7 +220,7 @@ set_target_properties(atc_ge_local_opskernel_builder PROPERTIES ) ############ libge_local_opskernel_builder.a ############ -add_library(ge_local_opskernel_builder_static STATIC ${OPS_KERNEL_SRC_LIST} ${PROTO_HDRS}) +add_library(ge_local_opskernel_builder_static STATIC ${OPS_KERNEL_SRC_LIST} ${PROTO_OPS_STATIC_HDRS}) target_compile_options(ge_local_opskernel_builder_static PRIVATE -Werror @@ -243,7 +245,7 @@ target_include_directories(ge_local_opskernel_builder_static PRIVATE ${METADEF_DIR}/inc/external/graph ${METADEF_DIR}/inc/graph ${CMAKE_BINARY_DIR} - ${CMAKE_BINARY_DIR}/proto/ge + ${CMAKE_BINARY_DIR}/proto/ge_ops_static #### yellow zone #### ${GE_CODE_DIR}/../inc #### blue zone #### diff --git a/ge/ge_local_engine/ops_kernel_store/op/ge_deleted_op.cc b/ge/ge_local_engine/ops_kernel_store/op/ge_deleted_op.cc index b2f3d095..90d95217 100755 --- a/ge/ge_local_engine/ops_kernel_store/op/ge_deleted_op.cc +++ b/ge/ge_local_engine/ops_kernel_store/op/ge_deleted_op.cc @@ -38,6 +38,7 @@ REGISTER_OP_CREATOR(ExpandDims, GeDeletedOp); REGISTER_OP_CREATOR(Reshape, GeDeletedOp); REGISTER_OP_CREATOR(ReFormat, GeDeletedOp); REGISTER_OP_CREATOR(Squeeze, GeDeletedOp); +REGISTER_OP_CREATOR(Unsqueeze, GeDeletedOp); REGISTER_OP_CREATOR(Size, GeDeletedOp); REGISTER_OP_CREATOR(Shape, 
GeDeletedOp); REGISTER_OP_CREATOR(ShapeN, GeDeletedOp); diff --git a/ge/ge_runner.mk b/ge/ge_runner.mk index 421d41e8..49515fe4 100644 --- a/ge/ge_runner.mk +++ b/ge/ge_runner.mk @@ -114,6 +114,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/passes/atomic_addr_clean_pass.cc \ graph/passes/mark_same_addr_pass.cc \ graph/passes/mark_graph_unknown_status_pass.cc \ + graph/passes/mark_node_unknown_shape_pass.cc \ graph/passes/mark_agnostic_pass.cc \ graph/partition/dynamic_shape_partition.cc \ graph/partition/stage_partition.cc \ @@ -246,6 +247,7 @@ LIBGE_LOCAL_SRC_FILES := \ graph/passes/end_of_sequence_add_control_pass.cc \ graph/passes/memcpy_addr_async_pass.cc \ graph/passes/set_input_output_offset_pass.cc \ + graph/passes/buffer_pool_memory_pass.cc \ graph/preprocess/graph_preprocess.cc \ graph/preprocess/insert_op/ge_aipp_op.cc \ graph/preprocess/insert_op/util_insert_aipp_op.cc \ diff --git a/ge/ge_runtime/task/label_goto_task.cc b/ge/ge_runtime/task/label_goto_task.cc index d357accb..ad93a98f 100644 --- a/ge/ge_runtime/task/label_goto_task.cc +++ b/ge/ge_runtime/task/label_goto_task.cc @@ -16,14 +16,12 @@ #include "ge_runtime/task/label_goto_task.h" #include "ge_runtime/task/task_factory.h" +#include "framework/common/util.h" namespace ge { namespace model_runner { LabelGotoTask::LabelGotoTask(const ModelContext &model_context, const std::shared_ptr &task_info) - : TaskRepeater(model_context, task_info), - task_info_(task_info), - stream_(nullptr), - label_(nullptr) { + : TaskRepeater(model_context, task_info), task_info_(task_info) { if (task_info_ == nullptr) { GELOGW("task_info_ is null!"); return; @@ -42,29 +40,78 @@ LabelGotoTask::LabelGotoTask(const ModelContext &model_context, const std::share label_ = label_list[label_id]; } -LabelGotoTask::~LabelGotoTask() {} +LabelGotoTask::~LabelGotoTask() { + GE_FREE_RT_LOG(label_info_); + GE_FREE_RT_LOG(index_value_); +} bool LabelGotoTask::Distribute() { GELOGI("LabelGotoTask Distribute start."); + if (!CheckParamValid()) { 
+ return false; + } + + const std::vector label_list = { label_ }; + rtError_t rt_ret = rtMalloc(&index_value_, sizeof(uint64_t), RT_MEMORY_HBM); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: %#x", rt_ret); + return false; + } + + uint64_t branch_index = 0; + rt_ret = rtMemcpy(index_value_, sizeof(uint64_t), &branch_index, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: %#x", rt_ret); + return false; + } + + uint32_t label_info_size = sizeof(rtLabelDevInfo) * label_list.size(); + rt_ret = rtMalloc(&label_info_, label_info_size, RT_MEMORY_HBM); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: %#x", rt_ret); + return false; + } + + rt_ret = rtLabelListCpy(label_list.data(), label_list.size(), label_info_, label_info_size); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: %#x", rt_ret); + return false; + } + + rt_ret = rtLabelSwitchByIndex(index_value_, label_list.size(), label_info_, stream_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: %#x", rt_ret); + return false; + } + + GELOGI("DistributeTask end."); + return true; +} + +bool LabelGotoTask::CheckParamValid() { if (stream_ == nullptr) { GELOGE(PARAM_INVALID, "stream is null!"); return false; } + if (label_ == nullptr) { GELOGE(PARAM_INVALID, "label is null!"); return false; } - rtError_t rt_ret = rtLabelGotoEx(label_, stream_); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + + if (label_info_ != nullptr) { + GELOGE(PARAM_INVALID, "label_info_ has dirty data."); + return false; + } + + if (index_value_ != nullptr) { + GELOGE(PARAM_INVALID, "index_value_ has dirty data."); return false; } - GELOGI("DistributeTask end."); return true; } REGISTER_TASK(TaskInfoType::LABEL_GOTO, LabelGotoTask, LabelGotoTaskInfo); - } // namespace model_runner } // namespace ge diff --git 
a/ge/ge_runtime/task/label_goto_task.h b/ge/ge_runtime/task/label_goto_task.h index 4fd6d1bc..addbb700 100644 --- a/ge/ge_runtime/task/label_goto_task.h +++ b/ge/ge_runtime/task/label_goto_task.h @@ -31,9 +31,13 @@ class LabelGotoTask : public TaskRepeater { bool Distribute() override; private: + bool CheckParamValid(); + std::shared_ptr task_info_; - void *stream_; - void *label_; + void *stream_{nullptr}; + void *label_{nullptr}; + void *label_info_{nullptr}; + void *index_value_{nullptr}; }; } // namespace model_runner } // namespace ge diff --git a/ge/generator/ge_generator.cc b/ge/generator/ge_generator.cc index fd39552d..a800c415 100644 --- a/ge/generator/ge_generator.cc +++ b/ge/generator/ge_generator.cc @@ -36,6 +36,7 @@ #include "graph/utils/type_utils.h" #include "init/gelib.h" #include "model/ge_model.h" +#include "analyzer/analyzer.h" using std::map; using std::string; @@ -50,9 +51,14 @@ const char *const kFileNameSuffix = "online"; const char *const kAicpuAllshape = "_AllShape"; constexpr char const *kAttrSupportDynamicShape = "support_dynamicshape"; const int64_t kDynamicDimValue = -2; +const int kDefaultDeviceId = 0; +const int kDefaultJobId = 0; +const int32_t kFuzzBuildPattern = 1; std::map engine_type_map{ - {ge::ENGINE_SYS, kEngineNameDefault}, {ge::ENGINE_AICORE, kAIcoreEngine}, {ge::ENGINE_VECTOR, kVectorEngine}}; + {ge::ENGINE_SYS, kEngineNameDefault}, + {ge::ENGINE_AICORE, kAIcoreEngine}, + {ge::ENGINE_VECTOR, kVectorEngine}}; bool ContainsDynamicInpus(const ge::OpDesc &op_desc) { for (auto &tensor_desc : op_desc.GetAllInputsDescPtr()) { @@ -63,6 +69,10 @@ bool ContainsDynamicInpus(const ge::OpDesc &op_desc) { } return false; } +// if optional in/out, format is format_reserved and dtype is dt_undefined +bool IsOptional(const ge::GeTensorDesc &tensor_desc) { + return tensor_desc.GetFormat() == ge::FORMAT_RESERVED && tensor_desc.GetDataType() == ge::DT_UNDEFINED; +} } // namespace namespace ge { @@ -83,8 +93,9 @@ static Status 
CheckEngineTypeSupport(const NodePtr &node, OpEngineType engine_ty } else { ErrorManager::GetInstance().ATCReportErrMessage("E14001", {"opname", "optype", "value", "reason"}, {op_desc->GetName(), op_desc->GetType(), "engine type", - "it only support kEngineNameDefault/kAIcoreEngine/kVectorEngine"}); - GELOGE(FAILED, "CheckEngineType: engine type: %d not support", static_cast(engine_type)); + "it only support default/AIcoreEngine/VectorEngine"}); + GELOGE(FAILED, "[Check][EngineType]value:%d not support, " + "only support default/AIcoreEngine/VectorEngine now", static_cast(engine_type)); return FAILED; } @@ -149,7 +160,7 @@ static Status CheckEngineTypeSupport(const NodePtr &node, OpEngineType engine_ty } static Status AddInputs(const ComputeGraphPtr &graph, const NodePtr &node, const GeTensorDesc &tensor, int32_t index, - bool attr) { + bool attr, int32_t &data_index) { GE_CHECK_NOTNULL_EXEC(graph, return PARAM_INVALID); GE_CHECK_NOTNULL_EXEC(node, return PARAM_INVALID); @@ -188,17 +199,21 @@ static Status AddInputs(const ComputeGraphPtr &graph, const NodePtr &node, const (void)AttrUtils::SetBool(data_op, "_is_single_op", true); - GE_CHK_BOOL_EXEC(data_op->AddInputDesc(tensor) == GRAPH_SUCCESS, return FAILED, "Add input desc fail."); - GE_CHK_BOOL_EXEC(data_op->AddOutputDesc(tensor) == GRAPH_SUCCESS, return FAILED, "Add output desc fail."); - if (attr) { - GE_CHK_BOOL_EXEC(AttrUtils::SetInt(data_op, ATTR_NAME_INDEX, index), return FAILED, "Set index fail."); + GE_CHK_BOOL_EXEC(data_op->AddInputDesc(tensor) == GRAPH_SUCCESS, return FAILED, + "[Add][InputDesc]fail for node:%s", data_op->GetName().c_str()); + GE_CHK_BOOL_EXEC(data_op->AddOutputDesc(tensor) == GRAPH_SUCCESS, return FAILED, + "[Add][OutputDesc]fail for node:%s", data_op->GetName().c_str()); + if (attr && !is_const) { + GE_CHK_BOOL_EXEC(AttrUtils::SetInt(data_op, ATTR_NAME_INDEX, data_index), return FAILED, + "[Set][Attr:%s]fail for node:%s", ATTR_NAME_INDEX.c_str(), data_op->GetName().c_str()); + 
++data_index; } ge::NodePtr arg_node = graph->AddNode(data_op); - GE_CHK_BOOL_EXEC(arg_node != nullptr, return FAILED, "Insert Data node fail."); + GE_CHK_BOOL_EXEC(arg_node != nullptr, return FAILED, "Insert Data node fail"); GE_CHK_STATUS(GraphUtils::AddEdge(arg_node->GetOutDataAnchor(0), node->GetInDataAnchor(index)), - "Add edge[%s->%s] fail.", data_op->GetName().c_str(), node->GetName().c_str()); + "[Add][Edge]fail from node:%s to node:%s", data_op->GetName().c_str(), node->GetName().c_str()); return SUCCESS; } @@ -213,20 +228,23 @@ static Status AddOutputs(const ComputeGraphPtr &graph, const NodePtr &node, cons for (const auto &out_desc : outputs) { GeTensorDesc tensor = out_desc.GetTensorDesc(); TensorUtils::SetInputTensor(tensor, true); - GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(tensor) == GRAPH_SUCCESS, return FAILED, "Add input desc fail"); + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(tensor) == GRAPH_SUCCESS, return FAILED, + "[Add][InputDesc]fail for node:%s", op_desc->GetName().c_str()); TensorUtils::SetInputTensor(tensor, false); TensorUtils::SetOutputTensor(tensor, true); - GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(tensor) == GRAPH_SUCCESS, return FAILED, "Add output desc fail"); + GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(tensor) == GRAPH_SUCCESS, return FAILED, + "[Add][OutputDesc]fail for node:%s", op_desc->GetName().c_str()); count++; } GE_CHECK_NOTNULL_EXEC(graph, return PARAM_INVALID); ge::NodePtr out_node = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(out_node != nullptr, return FAILED, "Insert Output node fail."); + GE_CHK_BOOL_EXEC(out_node != nullptr, return FAILED, + "[Add][Node:%s]fail in graph:%u", op_desc->GetName().c_str(), graph->GetGraphID()); GE_CHECK_NOTNULL_EXEC(node, return PARAM_INVALID); for (int32_t i = 0; i < count; ++i) { GE_CHK_STATUS(GraphUtils::AddEdge(node->GetOutDataAnchor(i), out_node->GetInDataAnchor(i)), - "Add edge[%s->%s] fail.", node->GetName().c_str(), out_node->GetName().c_str()); + "[Add][Edge]fail from node:%s to 
node:%s", node->GetName().c_str(), out_node->GetName().c_str()); } return SUCCESS; @@ -283,13 +301,44 @@ static Status ResetTensorVecShape(const vector &inputs, vectorGetName().c_str()); + GE_CHECK_NOTNULL(ge_root_model->GetRootGraph()); + for (const auto &node : ge_root_model->GetRootGraph()->GetAllNodes()) { + GE_CHECK_NOTNULL(node); + GE_CHECK_NOTNULL(node->GetOpDesc()); + GELOGD("Delete fuzz build attr of %s after build.", node->GetName().c_str()); + node->GetOpDesc()->DelAttr(ATTR_NAME_FUZZ_BUILD); + } + (void)AttrUtils::GetListNamedAttrs(op_desc, ATTR_NAME_FUZZ_BUILD_RES_ATTRS, fuzz_build_attrs); + if (!fuzz_build_attrs.empty()) { + GELOGD("%s has split, get ATTR_NAME_FUZZ_BUILD_RES_ATTRS directly.", op_desc->GetName().c_str()); + return SUCCESS; + } else { + GELOGW("%s build with fuzz build pattern, but not set ATTR_NAME_FUZZ_BUILD_RES_ATTRS.", op_desc->GetName().c_str()); + } + return SUCCESS; +} + +static bool HasShapeRange(const vector &inputs) { + for (const auto &input : inputs) { + vector> shape_range; + (void)input.GetTensorDesc().GetShapeRange(shape_range); + if (!shape_range.empty()) { + GELOGD("Has set shape range."); + return true; + } + } + return false; +} + class GeGenerator::Impl { public: Impl(OmgContext &omg_context) : omg_context_(omg_context) {} ~Impl() = default; Status BuildModel(const Graph &graph, const vector &inputs, GeRootModelPtr &ge_models); - Status SaveModel(const string &file_name_prefix, GeModelPtr &models, ModelBufferData &model); Status SaveRootModel(const string &file_name_prefix, GeRootModelPtr &model, ModelBufferData &model_buff); @@ -554,6 +603,42 @@ bool GeGenerator::Impl::SetOmSystemInfo(AttrHolder &obj) { return true; } +Status GeGenerator::SetModelNameForDump(const GeRootModelPtr &ge_root_model) { + bool is_unknown_shape = false; + Status ret = ge_root_model->CheckIsUnknownShape(is_unknown_shape); + if (ret != SUCCESS) { + GELOGE(FAILED, "[Check][IsUnknownShape]Check root model is unknown shape failed, model id:%u", 
+ ge_root_model->GetModelId()); + REPORT_CALL_ERROR("E19999", "Check root model is unknown shape failed, model id:%u", + ge_root_model->GetModelId()); + return FAILED; + } + GeModelPtr model_root = nullptr; + if (is_unknown_shape) { + model_root = MakeShared(); + GE_CHECK_NOTNULL(model_root); + model_root->SetGraph(GraphUtils::CreateGraphFromComputeGraph(ge_root_model->GetRootGraph())); + ge_root_model->SetSubgraphInstanceNameToModel(ge_root_model->GetRootGraph()->GetName(), model_root); + } + + ModelHelper model_helper; + string model_name; + GE_CHECK_NOTNULL(ge_root_model->GetRootGraph()); + Status name_ret = model_helper.GetModelNameFromMergedGraphName(ge_root_model->GetRootGraph()->GetName(), + model_name); + if (name_ret != SUCCESS) { + ErrorManager::GetInstance().ATCReportErrMessage("E10000", {"parameter"}, {"output"}); + GELOGE(FAILED, "[Check][GetModelNameStep]Get model_name failed. Param --output is invalid, root graph name: %s", + ge_root_model->GetRootGraph()->GetName().c_str()); + return PARAM_INVALID; + } + map name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel(); + GeModelPtr &ge_model = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()]; + GE_CHECK_NOTNULL(ge_model); + ge_model->SetName(model_name); + return SUCCESS; +} + Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_prefix, const vector &inputs, ModelBufferData &model, bool is_offline) { rtContext_t ctx = nullptr; @@ -588,20 +673,10 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr } GE_CHECK_NOTNULL(ge_root_model); - GE_CHECK_NOTNULL(ge_root_model->GetRootGraph()); - ModelHelper model_helper; - string model_name = ""; - Status name_ret = model_helper.GetModelNameFromMergedGraphName(ge_root_model->GetRootGraph()->GetName(), - model_name); - if (name_ret != SUCCESS) { - ErrorManager::GetInstance().ATCReportErrMessage("E10000", {"parameter"}, {"output"}); - GELOGE(FAILED, "Get model_name failed. 
Param --output is invalid."); - return PARAM_INVALID; + ret = SetModelNameForDump(ge_root_model); + if (ret != SUCCESS) { + return ret; } - map name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel(); - GeModelPtr &ge_model = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()]; - GE_RETURN_WITH_LOG_IF_FALSE(ge_model != nullptr, "ge_model cannot be null"); - ge_model->SetName(model_name); ret = impl_->SaveRootModel(file_name_prefix, ge_root_model, model); if (ret != SUCCESS) { GELOGE(ret, "Save model failed"); @@ -654,6 +729,34 @@ namespace { } } +bool GeGenerator::CheckNoAicore(const ComputeGraphPtr &graph) { + for (const auto &node : graph->GetDirectNode()) { + if (node == nullptr) { + continue; + } + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + if (op_desc->GetOpEngineName() == kAIcoreEngine) { + return false; + } + } + return true; +} + +void GeGenerator::RemoveConst(const vector &inputs, vector &outputs) { + for (auto &input : inputs) { + GeTensorDesc input_desc = input.GetTensorDesc(); + bool is_const = false; + (void)AttrUtils::GetBool(input_desc, CONST_ATTR_NAME_INPUT, is_const); + bool is_optional = IsOptional(input_desc); + if (!is_optional && !is_const) { + outputs.emplace_back(input); + } + } +} + Status GeGenerator::CheckForSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs) { GE_CHECK_NOTNULL_EXEC(op_desc, return PARAM_INVALID); @@ -676,7 +779,8 @@ Status GeGenerator::CheckForSingleOp(OpDescPtr &op_desc, const vector Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff, - bool is_offline) { + bool is_offline, int32_t compile_flag) { + GELOGD("Inputs size is %zu, outputs size is %zu.", inputs.size(), outputs.size()); GE_CHECK_NOTNULL_EXEC(impl_, return PARAM_INVALID); impl_->is_offline_ = is_offline; if (!is_offline) { @@ -698,6 +802,16 @@ Status 
GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in OpDescPtr op_desc_tmp = AttrUtils::CloneOpDesc(op_desc); GE_CHECK_NOTNULL(op_desc_tmp); + bool fuzz_compile_flag = false; + if (!HasShapeRange(inputs) && compile_flag == kFuzzBuildPattern) { + fuzz_compile_flag = true; + } + if (!AttrUtils::SetBool(op_desc, ATTR_NAME_FUZZ_BUILD, fuzz_compile_flag)) { + GELOGE(FAILED, "[Set][ATTR_NAME_FUZZ_BUILD] Failed to set attr for %s.", op_desc->GetName().c_str()); + return FAILED; + } + impl_->omg_context_.fuzz_compile_flag = fuzz_compile_flag; + // 1. Create ComputeGraph. string name = ge::CurrentTimeInStr() + "_" + model_file_name; Graph graph; @@ -710,7 +824,7 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in auto node = comp_graph->FindNode(op_desc->GetName()); Status ret = CheckEngineTypeSupport(node, engine_type); if (ret != SUCCESS) { - GELOGE(ret, "check engine type failed."); + GELOGE(ret, "[Check][EngineType]value:%d for node:%s not support", engine_type, node->GetName().c_str()); return ret; } } @@ -718,7 +832,9 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in GELOGI("ATC parser success in single op build."); GeRootModelPtr ge_root_model = nullptr; - GE_CHK_STATUS_RET_NOLOG(impl_->BuildModel(graph, inputs, ge_root_model)); + vector data_inputs; + RemoveConst(inputs, data_inputs); + GE_CHK_STATUS_RET_NOLOG(impl_->BuildModel(graph, data_inputs, ge_root_model)); map op_attrs = op_desc_tmp->GetAllAttrs(); GE_CHECK_NOTNULL(ge_root_model); GE_CHECK_NOTNULL(ge_root_model->GetRootGraph()); @@ -734,7 +850,7 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in bool all_shape = false; (void)AttrUtils::GetBool(op_desc, kAicpuAllshape, all_shape); - if (all_shape) { + if (all_shape && CheckNoAicore(root_graph)) { GELOGD("Get aicpu all_shape kernel!"); vector inputs_dynamic; vector outputs_dynamic; @@ -742,6 +858,19 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in 
GE_CHK_STATUS_RET_NOLOG(ResetTensorVecShape(outputs, outputs_dynamic)); GE_CHK_STATUS_RET_NOLOG( impl_->SaveParams(ge_model, op_desc_tmp->GetType(), op_attrs, inputs_dynamic, outputs_dynamic)); + } else if (fuzz_compile_flag) { + GELOGD("Get fuzz build result of %s.", op_desc->GetName().c_str()); + (void)AttrUtils::SetInt(ge_model, ATTR_NAME_BUILD_MODE, fuzz_compile_flag); + GeAttrValue::LIST_NAMED_ATTRS fuzz_build_attrs; + if (GetFuzzBuildAttrs(op_desc, ge_root_model, fuzz_build_attrs) != SUCCESS) { + GELOGE(FAILED, "[Get][FuzzRet]Failed to get fuzz build result of %s.", op_desc->GetName().c_str()); + return FAILED; + } + if (!fuzz_build_attrs.empty()) { + GE_CHK_BOOL_EXEC(AttrUtils::SetListNamedAttrs(ge_model, ATTR_NAME_FUZZ_BUILD_RES_ATTRS, fuzz_build_attrs), + return FAILED, "Set ATTR_NAME_FUZZ_BUILD_RES_ATTRS failed."); + } + GE_CHK_STATUS_RET_NOLOG(impl_->SaveParams(ge_model, op_desc_tmp->GetType(), op_attrs, inputs, outputs)); } else { GE_CHK_STATUS_RET_NOLOG(impl_->SaveParams(ge_model, op_desc_tmp->GetType(), op_attrs, inputs, outputs)); } @@ -757,15 +886,17 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in * @param [in] vector &inputs: Operator input data description information. * @param [in] vector &outputs: Operator output data description information. * @param [in] const string &model_file_name: Offline model filename. 
+ * @param [in] compile_flag: op build flag from atc * @return SUCCESS handle successfully / others handle failed */ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, - const vector &outputs, const string &model_file_name) { + const vector &outputs, const string &model_file_name, + int32_t compile_flag) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); GELOGI("Start to build single op offline model, input size: %zu, output size: %zu", inputs.size(), outputs.size()); ModelBufferData model_buff; OpEngineType engine_type = ENGINE_SYS; - Status status = BuildSingleOp(op_desc, inputs, outputs, model_file_name, engine_type, model_buff, true); + Status status = BuildSingleOp(op_desc, inputs, outputs, model_file_name, engine_type, model_buff, true, compile_flag); GELOGI("Finish build single offline model, status: %u", status); return status; } @@ -777,9 +908,11 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs: Operator input data description information. * @param [in] vector &outputs: Operator output data description information. * @param [in] engine_type: specific engine. + * @param [in] compile_flag: op build flag, compile flag by acl * @param [out] ModelBufferData &Model_buff: Model_buff: model buffer of the op. 
* @return SUCCESS handle successfully / others handle failed */ + Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, OpEngineType engine_type, ModelBufferData &model_buff) { @@ -790,6 +923,17 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, + const vector &outputs, OpEngineType engine_type, int32_t compile_flag, + ModelBufferData &model_buff) { + ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + GELOGI("Start to build single op online, input size: %zu, output size: %zu", inputs.size(), outputs.size()); + Status status = BuildSingleOp(op_desc, inputs, outputs, kFileNameSuffix, engine_type, model_buff, false, + compile_flag); + GELOGI("Finish build single online model, status: %u", status); + return status; +} + Status GeGenerator::BuildSingleOpGraph(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, std::string graph_name, Graph &graph) { ge::ComputeGraphPtr compute_graph = MakeShared(graph_name); @@ -801,18 +945,19 @@ Status GeGenerator::BuildSingleOpGraph(OpDescPtr &op_desc, const vectorGetAllInputsDescPtr()) { GE_CHECK_NOTNULL_EXEC(input_desc, return INTERNAL_ERROR); if (!IsNeedConnectInputOpForSingleOp(*input_desc)) { continue; } - GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, *input_desc, arg_index, false)); + GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, *input_desc, arg_index, false, data_index)); arg_index++; } } else { for (const auto &in_desc : inputs) { - GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, in_desc.GetTensorDesc(), arg_index, true)); + GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, in_desc.GetTensorDesc(), arg_index, true, data_index)); arg_index++; } } @@ -871,13 +1016,12 @@ Status GeGenerator::Impl::SaveRootModel(const string &file_name_prefix, GeRootMo "ge root model has no sub model") GeModelPtr model_root = nullptr; if (is_unknown_shape) { - model_root = 
make_shared(); - model_root->SetGraph(GraphUtils::CreateGraphFromComputeGraph(ge_root_model->GetRootGraph())); - ge_root_model->SetSubgraphInstanceNameToModel(ge_root_model->GetRootGraph()->GetName(), model_root); - model_root->SetName(ge_root_model->GetRootGraph()->GetName()); + auto name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel(); + model_root = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()]; } else { model_root = ge_root_model->GetSubgraphInstanceNameToModel().begin()->second; } + GE_CHECK_NOTNULL(model_root); // set atc version if (!SetAtcVersionInfo(*(model_root.get()))) { GELOGW("SetPackageVersionInfo of atc failed!"); @@ -915,6 +1059,13 @@ Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector static std::atomic atomic_session_id(0); auto session_id = atomic_session_id.fetch_add(1); + // This is a temporary add for graph with variable + auto version = static_cast(SessionVersion::ClOUD_VERSION); + ret = VarManager::Instance(session_id)->Init(version, session_id, kDefaultDeviceId, kDefaultJobId); + GELOGI("Start init var instance, session_id %lu", session_id); + if (ret != SUCCESS) { + GELOGW("Failed init var instance, session_id %lu", session_id); + } if (is_singleop_unregistered_) { ret = graph_manager_.BuildGraphForUnregisteredOp(graph_id, inputs, ge_root_model, session_id); } else { @@ -924,13 +1075,13 @@ Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); if (ret != SUCCESS) { GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager build graph fail, graph id: %u", graph_id); - VarManagerPool::Instance().RemoveVarManager(session_id); - return GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED; + ret = GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED; } + RtContextUtil::GetInstance().DestroyRtContexts(session_id); + Analyzer::GetInstance()->DestroySessionJsonObject(session_id); 
VarManagerPool::Instance().RemoveVarManager(session_id); - - return SUCCESS; + return ret; } Status GeGenerator::Impl::GenerateInfershapeGraph(const Graph &graph) { diff --git a/ge/graph/build/graph_builder.cc b/ge/graph/build/graph_builder.cc index 0883d895..591c8d02 100644 --- a/ge/graph/build/graph_builder.cc +++ b/ge/graph/build/graph_builder.cc @@ -77,6 +77,8 @@ Status HandleSubgraphNode(NodePtr &src_node, OutDataAnchorPtr &src_out_anchor) { Status HandleSubgraphDataNode(NodePtr &src_node, OutDataAnchorPtr &src_out_anchor) { uint32_t index = 0; if (!AttrUtils::GetInt(src_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, index)) { + REPORT_INNER_ERROR("E19999", "get attr:%s failed from node:%s", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), src_node->GetName().c_str()); GELOGE(FAILED, "Get attr ATTR_NAME_PARENT_NODE_INDEX failed, node:%s.", src_node->GetName().c_str()); return FAILED; } @@ -109,6 +111,8 @@ Status GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) { GE_CHECK_NOTNULL(graph); auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "check gelib instance null, graph:%s", + graph->GetName().c_str()); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GraphBuilder: GE is not initialized"); return GE_CLI_GE_NOT_INITIALIZED; } @@ -121,6 +125,8 @@ Status GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) { (void)instance_ptr->DNNEngineManagerObj().GetDNNEngineName(node_ptr); kernel_lib_name = node_ptr->GetOpDesc()->GetOpKernelLibName(); if (kernel_lib_name.empty()) { + REPORT_INNER_ERROR("E19999", "op kernel lib is empty in node:%s(%s)", + node_ptr->GetName().c_str(), node_ptr->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get node:%s(%s) kernel lib failed.", node_ptr->GetName().c_str(), node_ptr->GetType().c_str()); return INTERNAL_ERROR; @@ -129,12 +135,16 @@ Status GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) { auto ret = SetInputSize(node_ptr); if (ret != SUCCESS) 
{ + REPORT_CALL_ERROR("E19999", "Set node:%s(%s) inputDesc size failed", + node_ptr->GetName().c_str(), node_ptr->GetType().c_str()); GELOGE(ret, "Set node inputDesc size failed, node name is %s", node_ptr->GetName().c_str()); return ret; } ret = OpsKernelBuilderManager::Instance().CalcOpRunningParam(*node_ptr); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call Calculate op:%s(%s) running param failed", + node_ptr->GetName().c_str(), node_ptr->GetType().c_str()); GELOGE(ret, "Calculate op running param failed, node name is %s", node_ptr->GetName().c_str()); return ret; } @@ -191,6 +201,7 @@ Status GraphBuilder::UpdateParentNodeOutputSize(const ge::ComputeGraphPtr &graph Status GraphBuilder::Build(ComputeGraphPtr &comp_graph, GeRootModelPtr &ge_root_model_ptr, uint64_t session_id) { if (comp_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "check compute_graph nullptr, session_id:%lu", session_id); GELOGE(GE_GRAPH_PARAM_NULLPTR, "Graph build comp_graph is null."); return GE_GRAPH_PARAM_NULLPTR; } @@ -302,6 +313,8 @@ Status GraphBuilder::SetConstantInputOffset(ComputeGraphPtr &comp_graph) { std::vector weights = OpDescUtils::MutableWeights(peer_node); if (weights.empty()) { + REPORT_INNER_ERROR("E19999", "check weights size of node %s(%s) is empty", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "weights size of node %s is empty", node->GetName().c_str()); return FAILED; } @@ -382,54 +395,6 @@ Status GraphBuilder::BuildForHostCpuGraph(ComputeGraphPtr &comp_graph, GeModelPt return BuildForUnknownShapeGraph(comp_graph, ge_model_ptr, session_id); } -static Status InsertMemcpyNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr &out_anchor, - const std::vector &in_anchors, const std::string &name) { - GE_CHECK_NOTNULL(out_anchor); - NodePtr in_node = out_anchor->GetOwnerNode(); - GE_CHECK_NOTNULL(in_node); - OpDescBuilder op_desc_builder(name, MEMCPYADDRASYNC); - OpDescPtr op_desc = op_desc_builder.AddInput("x", 
in_node->GetOpDesc()->GetOutputDesc(0)) - .AddOutput("y", in_node->GetOpDesc()->GetOutputDesc(0)) - .Build(); - (void)AttrUtils::SetBool(op_desc, ATTR_NO_NEED_CONSTANT_FOLDING, false); - if (GraphUtils::InsertNodeAfter(out_anchor, in_anchors, graph->AddNode(op_desc)) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Insert IDENTITY node %s after %s failed.", name.c_str(), in_node->GetName().c_str()); - return FAILED; - } - return SUCCESS; -} - -static Status GenerateTaskForConstant(const std::shared_ptr &graph) { - for (auto &node : graph->GetDirectNode()) { - // CONSTANT not generate task, so insert IDENTITY between CONSTANT and NETOUTPUT - auto op_desc = node->GetOpDesc(); - if (op_desc == nullptr) { - continue; - } - auto op_type = op_desc->GetType(); - if (op_type == NETOUTPUT) { - for (InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { - const OutDataAnchorPtr &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); - GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); - NodePtr in_node = peer_out_anchor->GetOwnerNode(); - GE_CHECK_NOTNULL(in_node); - - std::string in_node_op_type = in_node->GetType(); - if (in_node_op_type == CONSTANT) { - GELOGD("Insert MemcpyAsync node between %s and %s.", in_node->GetName().c_str(), node->GetName().c_str()); - std::string name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()) + "_Memcpy"; - if (InsertMemcpyNode(graph, peer_out_anchor, {in_data_anchor}, name) != SUCCESS) { - GELOGE(FAILED, "Insert memcpy between %s and %s failed.", - in_node->GetName().c_str(), node->GetName().c_str()); - return FAILED; - } - } - } - } - } - return SUCCESS; -} - Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) { bool original_unknown_shape_flag = com_graph->GetGraphUnknownFlag(); com_graph->SetGraphUnknownFlag(false); @@ -466,6 +431,8 @@ Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) { GELOGI("The all reduce node of dynamic graph is %s, idx %u", 
op_desc->GetName().c_str(), node_index); (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true); GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep), + REPORT_INNER_ERROR("E19999", "Multiply result is out of range when calc profiling ar log id " + "for node:%s(%s)", op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Multiply result is out of range."); return FAILED); int64_t log_id = i * kProfilingArStep + kProfilingArStartLogid; @@ -512,9 +479,6 @@ Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, !sub_graph->GetParentGraph()->GetGraphUnknownFlag()) { continue; } - - GE_CHK_STATUS_RET(GenerateTaskForConstant(sub_graph), "Generate task For constant node in subgraph failed."); - if (sub_graph->GetGraphUnknownFlag()) { // unknown shape build flow GE_CHK_STATUS_RET(BuildForUnknownShapeGraph(sub_graph, ge_model_ptr, session_id), @@ -545,16 +509,19 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr int64_t memory_size = 0; if (!AttrUtils::GetInt(model_ptr, ATTR_MODEL_MEMORY_SIZE, memory_size)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s fail in model", ATTR_MODEL_MEMORY_SIZE.c_str()); GELOGE(INTERNAL_ERROR, "Get memory size fail."); return INTERNAL_ERROR; } int64_t p2p_memory_size = 0; if (!AttrUtils::GetInt(model_ptr, ATTR_MODEL_P2P_MEMORY_SIZE, p2p_memory_size)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s fail in model", ATTR_MODEL_P2P_MEMORY_SIZE.c_str()); GELOGE(INTERNAL_ERROR, "Get p2p memory size fail."); return INTERNAL_ERROR; } int64_t weight_size = 0; if (!AttrUtils::GetInt(model_ptr, ATTR_MODEL_WEIGHT_SIZE, weight_size)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s fail in model", ATTR_MODEL_WEIGHT_SIZE.c_str()); GELOGE(INTERNAL_ERROR, "Get weight memory size fail."); return INTERNAL_ERROR; } @@ -664,6 +631,7 @@ Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) { Status GraphBuilder::UpdateDataInputSize(const 
ge::NodePtr &node_ptr) { const auto &op_desc = node_ptr->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "check op_desc is nullptr"); GELOGE(FAILED, "Op desc is nullptr."); return FAILED; } @@ -681,6 +649,8 @@ Status GraphBuilder::UpdateDataInputSize(const ge::NodePtr &node_ptr) { int64_t real_dim_size = 0; ge::graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(output_desc, real_dim_size); if (graph_status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get tensor size in bytes failed for op:%s(%s) index:0", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Get tensor size in bytes failed."); return FAILED; } @@ -688,6 +658,8 @@ Status GraphBuilder::UpdateDataInputSize(const ge::NodePtr &node_ptr) { ge::GeTensorDesc input_desc = op_desc->GetInputDesc(0); ge::TensorUtils::SetSize(input_desc, real_dim_size); if (op_desc->UpdateInputDesc(0, input_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input desc size failed for op:%s(%s) index:0", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Update input desc size failed."); return FAILED; } @@ -716,6 +688,8 @@ Status GraphBuilder::CalcDynShapeRootGraphDataSize(const ge::OpDescPtr &op_desc) int64_t real_dim_size = 0; ge::graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(output_desc, real_dim_size); if (graph_status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get tensor size in bytes failed for op:%s(%s) index:0 ", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Get tensor size in bytes failed."); return FAILED; } @@ -723,6 +697,8 @@ Status GraphBuilder::CalcDynShapeRootGraphDataSize(const ge::OpDescPtr &op_desc) ge::TensorUtils::SetSize(output_desc, real_dim_size); GELOGI("Update dynamic shape graph data output size to [%ld].", real_dim_size); if (op_desc->UpdateOutputDesc(0, output_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update output desc size failed for 
op:%s(%s) index:0 ", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Update dynamic shape graph data output desc size failed."); return FAILED; } @@ -740,6 +716,8 @@ Status GraphBuilder::SecondPartition(ge::ComputeGraphPtr &comp_graph) { GE_CHK_STATUS_RET(ret, "Graph partition Failed."); const auto &graph_2_subgraphlist = graph_partitioner_.GetSubGraphMap(); if (graph_2_subgraphlist.find(comp_graph) == graph_2_subgraphlist.end()) { + REPORT_INNER_ERROR("E19999", "find subgraphlis in graph:%s failed", + comp_graph->GetName().c_str()); GELOGE(FAILED, "Find subgraph failed."); return FAILED; } @@ -768,6 +746,9 @@ Status GraphBuilder::AddOutputMemTypeForNode(const NodePtr &node) { mem_type); if (!AttrUtils::SetInt(src_desc->MutableOutputDesc(src_out_anchor->GetIdx()), ATTR_OUTPUT_MEMORY_TYPE, mem_type)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s for node:%s(%s) out_index:%u failed", + ATTR_OUTPUT_MEMORY_TYPE.c_str(), src_desc->GetName().c_str(), src_desc->GetType().c_str(), + src_out_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Set out_memory_type attr for [%s:%d] failed.", src_desc->GetName().c_str(), src_out_anchor->GetIdx()); return INTERNAL_ERROR; diff --git a/ge/graph/build/label_allocator.cc b/ge/graph/build/label_allocator.cc index 28d0e084..b6ef8dc9 100644 --- a/ge/graph/build/label_allocator.cc +++ b/ge/graph/build/label_allocator.cc @@ -28,6 +28,7 @@ LabelAllocator::LabelAllocator(const ComputeGraphPtr &graph) : compute_graph_(gr Status LabelAllocator::AssignFunctionalLabels() { if (compute_graph_ == nullptr) { + REPORT_INNER_ERROR("E19999", "check param compute_graph nullptr"); GELOGE(INTERNAL_ERROR, "ComputeGraph not set, Assign labels failed."); return INTERNAL_ERROR; } @@ -46,11 +47,15 @@ Status LabelAllocator::AssignFunctionalLabels() { for (auto node : functional_nodes) { LabelMakerPtr maker = LabelMakerFactory::Instance().Create(node->GetType(), compute_graph_, node); if (maker == nullptr) { + REPORT_CALL_ERROR("E19999", 
"Check Node:%s(%s) label maker not registed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Node: %s label maker not registed.", node->GetType().c_str()); return INTERNAL_ERROR; } if (maker->Run(label_index) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Node:%s(%s) run label maker failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Node: %s run label maker failed.", node->GetType().c_str()); return INTERNAL_ERROR; } @@ -63,6 +68,7 @@ Status LabelAllocator::AssignFunctionalLabels() { bool LabelAllocator::CollectFunctionalNode(ComputeGraphPtr &graph, std::set &functional_nodes) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "check param compute_graph nullptr"); GELOGE(INTERNAL_ERROR, "Sub ComputeGraph is null."); return false; } @@ -74,12 +80,16 @@ bool LabelAllocator::CollectFunctionalNode(ComputeGraphPtr &graph, std::setGetParentNode(); if (func_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Parent node not set in node:%s(%s), graph:%s", + func_node->GetName().c_str(), func_node->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Parent functional node not set: %s.", graph->GetName().c_str()); return false; } ComputeGraphPtr owner_graph = func_node->GetOwnerComputeGraph(); if (owner_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "ComputeGraph owner not set in node:%s(%s), graph:%s", + func_node->GetName().c_str(), func_node->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "ComputeGraph owner not set: %s.", func_node->GetName().c_str()); return false; } diff --git a/ge/graph/build/logical_stream_allocator.cc b/ge/graph/build/logical_stream_allocator.cc index 8ea7fe71..88b4a97f 100644 --- a/ge/graph/build/logical_stream_allocator.cc +++ b/ge/graph/build/logical_stream_allocator.cc @@ -33,13 +33,21 @@ using std::queue; namespace ge { LogicalStreamPass::LogicalStreamPass(const string &name) : name_(name) {} -const string 
&LogicalStreamPass::GetName() const { return name_; } +const string &LogicalStreamPass::GetName() const { + return name_; +} -bool LogicalStreamPass::IsEngineSkip(const Subgraph &subgraph) const { return subgraph.engine_conf.skip_assign_stream; } +bool LogicalStreamPass::IsEngineSkip(const Subgraph &subgraph) const { + return subgraph.engine_conf.skip_assign_stream; +} -bool LogicalStreamPass::IsEngineAttach(const Subgraph &subgraph) const { return subgraph.engine_conf.attach; } +bool LogicalStreamPass::IsEngineAttach(const Subgraph &subgraph) const { + return subgraph.engine_conf.attach; +} -bool LogicalStreamPass::IsEngineIndependent(const Subgraph &subgraph) const { return subgraph.engine_conf.independent; } +bool LogicalStreamPass::IsEngineIndependent(const Subgraph &subgraph) const { + return subgraph.engine_conf.independent; +} bool LogicalStreamPass::HasStreamLabel(const Subgraph &subgraph) const { return !subgraph.subgraph_info.GetStreamLabel().empty(); @@ -60,14 +68,14 @@ Status AssignByLabelPass::Run(ComputeGraphPtr graph, const vector & // Subgraphs of the same stream_label are assigned to the same stream, // and different stream_labels are assigned new streams. 
auto iter = label_streams.find(stream_label); - if (iter != label_streams.end()) { - subgraph->stream_id = iter->second; - } else { + if (iter == label_streams.end()) { subgraph->stream_id = next_stream; - GELOGI("Assign new stream %ld for label %s.", next_stream, stream_label.c_str()); + GELOGI("[Assign][NewStreamId] %ld for label %s.", next_stream, stream_label.c_str()); label_streams.emplace(stream_label, next_stream); - ++next_stream; + next_stream++; + } else { + subgraph->stream_id = iter->second; } changed = true; } @@ -92,15 +100,15 @@ Status IndependentStreamPass::Run(ComputeGraphPtr graph, const vectorsubgraph_info.GetStreamLabel(); auto &label_streams = engine_streams[engine]; auto iter = label_streams.find(stream_label); - if (iter != label_streams.end()) { - subgraph->stream_id = iter->second; - } else { + if (iter == label_streams.end()) { subgraph->stream_id = next_stream; - GELOGI("Assign new independent stream %ld for engine %s (label: %s).", next_stream, engine.c_str(), + GELOGI("[Assign][NewStreamId:independent] %ld for engine %s (label: %s).", next_stream, engine.c_str(), stream_label.c_str()); label_streams.emplace(stream_label, next_stream); - ++next_stream; + next_stream++; + } else { + subgraph->stream_id = iter->second; } changed = true; } @@ -121,14 +129,16 @@ Status AssignByDependencyPass::Run(ComputeGraphPtr graph, const vectorstream_id = reusable_subgraph->stream_id; } else { int64_t stream_id = AssignNewStream(reusable_subgraph); subgraph->stream_id = stream_id; - GELOGI("Reusable subgraph %s has not been assigned a stream, now assign new stream %ld.", - reusable_subgraph->name.c_str(), stream_id); + GELOGI("[Assign][NewStreamId] %ld for Reusable subgraph %s cause has not been assigned before.", + stream_id, reusable_subgraph->name.c_str()); } if (reusable_subgraph->reused_subgraph != nullptr) { @@ -137,11 +147,10 @@ Status AssignByDependencyPass::Run(ComputeGraphPtr graph, const vectorreused_subgraph = reusable_subgraph; 
reused_subgraphs_.emplace_back(subgraph, reusable_subgraph); - GELOGI("Subgraph %s of engine %s reuses stream of subgraph %s of engine %s.", subgraph->name.c_str(), + GELOGI("[Reuse][Stream]Subgraph %s of engine %s reuses stream of subgraph %s of engine %s.", + subgraph->name.c_str(), subgraph->engine_conf.id.c_str(), reusable_subgraph->name.c_str(), reusable_subgraph->engine_conf.id.c_str()); - } else { - (void)AssignNewStream(subgraph); } changed = true; } @@ -191,13 +200,15 @@ bool AssignByDependencyPass::CouldReuse(const SubgraphPtr &subgraph, const Subgr auto iter = pld_subgraph_map.find(end_pld_pair.second); if (iter != pld_subgraph_map.end()) { const SubgraphPtr &pred_subgraph_succ = iter->second; - if (pred_subgraph_succ != subgraph && pred_subgraph_succ->engine_conf.id == pred_subgraph->engine_conf.id) { + if ((pred_subgraph_succ != subgraph) && + (pred_subgraph_succ->engine_conf.id == pred_subgraph->engine_conf.id)) { return false; } } } - if ((subgraph->engine_conf.id == pred_subgraph->engine_conf.id) || IsEngineAttach(*subgraph)) { + if ((subgraph->engine_conf.id == pred_subgraph->engine_conf.id) || + IsEngineAttach(*subgraph)) { return true; } @@ -249,7 +260,7 @@ int64_t AssignByDependencyPass::AssignNewStream(SubgraphPtr subgraph) { engine_stream_num_[engine_name] = stream_id + 1; } - GELOGI("Subgraph %s assigns new temp stream %ld (engine: %s).", subgraph->name.c_str(), stream_id, + GELOGI("[Assign][NewStreamId:temp]id:%ld for Subgraph %s (engine: %s).", stream_id, subgraph->name.c_str(), engine_name.c_str()); return stream_id; @@ -282,7 +293,7 @@ void AssignByDependencyPass::UpdateAssignedSubgraphs(Context &context) { GELOGI("Subgraph %s of engine %s reuses default stream %ld.", subgraph->name.c_str(), subgraph->engine_conf.id.c_str(), context.default_stream); } else { - GELOGI("Stream of subgraph %s has been updated to %ld.", subgraph->name.c_str(), subgraph->stream_id); + GELOGI("[Update][StreamId]id:%ld for subgraph %s.", subgraph->stream_id, 
subgraph->name.c_str()); } } } @@ -293,7 +304,7 @@ void AssignByDependencyPass::UpdateReusedSubgraphs() { auto &cur_subgraph = item.first; auto &reused_graph = item.second; cur_subgraph->stream_id = reused_graph->stream_id; - GELOGI("Stream of subgraph %s has been updated to %ld.", cur_subgraph->name.c_str(), cur_subgraph->stream_id); + GELOGI("[Update][StreamId]id:%ld for subgraph %s.", cur_subgraph->stream_id, cur_subgraph->name.c_str()); } } @@ -309,6 +320,8 @@ Status SingleStreamPass::Run(ComputeGraphPtr graph, const vector &s if (!HasAssignedStream(*subgraph)) { const string &stream_label = subgraph->subgraph_info.GetStreamLabel(); if (!stream_label.empty()) { + REPORT_INNER_ERROR("E19999", "Stream labels are not supported in SingleStream mode " + "(subgraph: %s, stream label: %s)", subgraph->name.c_str(), stream_label.c_str()); GELOGE(INTERNAL_ERROR, "Stream labels are not supported (subgraph: %s, stream label: %s).", subgraph->name.c_str(), stream_label.c_str()); return INTERNAL_ERROR; @@ -326,11 +339,13 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr graph, const vectorengine_conf.id; if (!IsEngineSkip(*subgraph) && !HasAssignedStream(*subgraph)) { + REPORT_INNER_ERROR("E19999", "Subgraph %s has not yet been assigned a stream (engine: %s)", + subgraph->name.c_str(), engine_name.c_str()); GELOGE(INTERNAL_ERROR, "Subgraph %s has not yet been assigned a stream (engine: %s).", subgraph->name.c_str(), engine_name.c_str()); return INTERNAL_ERROR; } else { - GELOGI("Subgraph %s is assigned stream %ld (engine: %s).", subgraph->name.c_str(), subgraph->stream_id, + GELOGI("[Assign][StreamId] %ld for Subgraph %s (engine: %s).", subgraph->stream_id, subgraph->name.c_str(), engine_name.c_str()); } } @@ -353,12 +368,12 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr graph, const vectorGetName().c_str(), node->GetType().c_str(), subgraph->name.c_str(), context.default_stream, engine_name.c_str()); } else if (IsEngineSkip(*subgraph) && node->GetInNodes().empty()) { 
- GELOGD("Node %s of type %s in subgraph %s doesn't need to assign a stream (engine: %s).", + GELOGD("[Skip][StreamIdAssign]Node %s of type %s in subgraph %s doesn't need (engine: %s).", node->GetName().c_str(), node->GetType().c_str(), subgraph->name.c_str(), engine_name.c_str()); } else { node->GetOpDesc()->SetStreamId(stream_id); - GELOGD("Node %s of type %s in subgraph %s is assigned stream %ld (engine: %s).", node->GetName().c_str(), - node->GetType().c_str(), subgraph->name.c_str(), stream_id, engine_name.c_str()); + GELOGD("[Assign][StreamId]id:%ld for Node %s of type %s in subgraph %s (engine: %s).", stream_id, + node->GetName().c_str(), node->GetType().c_str(), subgraph->name.c_str(), engine_name.c_str()); } } } @@ -366,6 +381,48 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { + std::map> stream_op_map; + for (const SubgraphPtr &subgraph : subgraphs) { + auto compute_graph = subgraph->subgraph_info.GetSubGraph(); + for (const NodePtr &node : compute_graph->GetDirectNode()) { + OpDescPtr op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + if (op_desc->HasAttr(ATTR_NAME_PARALLEL_GROUP)) { + int64_t op_desc_stream_id = op_desc->GetStreamId(); + stream_op_map[op_desc_stream_id].push_back(op_desc); + } + } + } + for (const auto &itr : stream_op_map) { + if (itr.first == kInvalidStream) { + continue; + } + std::map group_2_stream_id; + for (const auto &op_desc : itr.second) { + std::string group_name; + if (!AttrUtils::GetStr(op_desc, ATTR_NAME_PARALLEL_GROUP, group_name)) { + GELOGE(FAILED, "[GetAttr][OpDesc]Get node %s ATTR_NAME_PARALLEL_GROUP failed.", op_desc->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Get node %s ATTR_NAME_PARALLEL_GROUP failed.", op_desc->GetName().c_str()); + return FAILED; + } + const auto &itr = group_2_stream_id.find(group_name); + int64_t new_stream_id = kInvalidStream; + int64_t old_stream_id = op_desc->GetStreamId(); + if (itr != group_2_stream_id.end()) { + 
new_stream_id = itr->second; + } else { + new_stream_id = context.next_stream++; + group_2_stream_id[group_name] = new_stream_id; + } + op_desc->SetStreamId(new_stream_id); + GELOGD("Node %s assigned stream %ld from stream %ld.", + op_desc->GetName().c_str(), new_stream_id, old_stream_id); + } + } + return SUCCESS; +} + int64_t UpdateForSkippedEnginePass::GetSingleInoutStream(const NodePtr &node) const { set stream_ids; @@ -387,8 +444,8 @@ int64_t UpdateForSkippedEnginePass::GetSingleInoutStream(const NodePtr &node) co if (stream_ids.size() == 1) { int64_t stream_id = *(stream_ids.begin()); - GELOGI("The stream of all input and output nodes of node %s (type: %s) is %ld.", node->GetName().c_str(), - node->GetType().c_str(), stream_id); + GELOGI("[Get][SingleStreamId]The stream of all input and output nodes of node %s (type: %s) is %ld.", + node->GetName().c_str(), node->GetType().c_str(), stream_id); return stream_id; } @@ -406,7 +463,7 @@ Status UpdateForSkippedEnginePass::Run(ComputeGraphPtr graph, const vectorGetOpDesc(); GE_CHECK_NOTNULL(op_desc); auto stream_id = op_desc->GetStreamId(); - if (stream_id != kInvalidStream && !HasStreamLabel(*subgraph)) { + if ((stream_id != kInvalidStream) && !HasStreamLabel(*subgraph)) { ops_without_label.emplace(op_desc); } } @@ -427,8 +484,8 @@ Status UpdateForSkippedEnginePass::Run(ComputeGraphPtr graph, const vectorSetStreamId(inout_stream); - GELOGI("Node %s of type %s reassign to stream %ld from stream %ld.", node->GetName().c_str(), - node->GetType().c_str(), inout_stream, stream_id); + GELOGI("[Reassign][StreamId]%ld for Node %s of type %s from stream %ld.", + inout_stream, node->GetName().c_str(), node->GetType().c_str(), stream_id); } } } @@ -455,7 +512,7 @@ Status AllReduceParallelPass::Run(ComputeGraphPtr graph, const vectorGetDirectNode()) { if (!IsHcomNode(node->GetType()) || - node->GetInDataNodes().size() <= 1) { + (node->GetInDataNodes().size() <= 1)) { continue; } @@ -565,7 +622,7 @@ Status 
LogicalStreamAllocator::Assign(const ComputeGraphPtr &root_graph, const G RefreshContinuousStreams(root_graph); stream_num = context_.next_stream; - GELOGI("Assigned logical stream num: %ld.", stream_num); + GELOGI("[Assign][LogicalStream] At last, stream num: %ld.", stream_num); return SUCCESS; } @@ -575,7 +632,7 @@ Status LogicalStreamAllocator::DoAssign(const ComputeGraphPtr &graph, const Grap GE_CHECK_NOTNULL(graph); NodePtr parent_node = graph->GetParentNode(); - if (parent_node == nullptr || parent_node->GetOpDesc() == nullptr) { + if ((parent_node == nullptr) || (parent_node->GetOpDesc() == nullptr)) { context_.default_stream = kInvalidStream; } else { context_.default_stream = parent_node->GetOpDesc()->GetStreamId(); @@ -583,6 +640,8 @@ Status LogicalStreamAllocator::DoAssign(const ComputeGraphPtr &graph, const Grap auto iter = subgraph_map.find(graph); if (iter == subgraph_map.end()) { + REPORT_INNER_ERROR("E19999", "Graph %s not found in subgraph_map when do logical stream assign ", + graph->GetName().c_str()); GELOGE(FAILED, "Graph %s not found.", graph->GetName().c_str()); return FAILED; } @@ -597,7 +656,7 @@ Status LogicalStreamAllocator::DoAssign(const ComputeGraphPtr &graph, const Grap return status; } - GELOGD("Subgraphs of graph %s:", graph->GetName().c_str()); + GELOGD("[Show][Subgraphs] in graph %s", graph->GetName().c_str()); for (const auto &subgraph : subgraphs) { if (subgraph != nullptr) { GELOGD("subgraph: %s", subgraph->name.c_str()); @@ -622,6 +681,8 @@ Status LogicalStreamAllocator::ConvertSubgraphs(const vector &s const string &engine_name = subgraph_info->GetEngineName(); auto engine_conf_iter = engine_confs.find(engine_name); if ((engine_conf_iter == engine_confs.end()) || (engine_conf_iter->second == nullptr)) { + REPORT_INNER_ERROR("E19999", "Engine conf of subgraph %s not found (engine name: %s)", + subgraph_name.c_str(), engine_name.c_str()); GELOGE(INTERNAL_ERROR, "Engine conf of subgraph %s not found (engine name: %s).", 
subgraph_name.c_str(), engine_name.c_str()); @@ -655,6 +716,7 @@ Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &graph, const vec passes.emplace_back(MakeShared()); passes.emplace_back(MakeShared()); passes.emplace_back(MakeShared()); + passes.emplace_back(MakeShared()); passes.emplace_back(MakeShared()); passes.emplace_back(MakeShared()); } @@ -664,10 +726,11 @@ Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &graph, const vec Status status = pass->Run(graph, subgraphs, context_); if (status == SUCCESS) { - GELOGD("Stream pass %s return SUCCESS.", pass->GetName().c_str()); + GELOGD("[Show][Status]Stream pass %s return SUCCESS.", pass->GetName().c_str()); } else if (status == NOT_CHANGED) { - GELOGD("Stream pass %s return NOT_CHANGED.", pass->GetName().c_str()); + GELOGD("[Show][Status]Stream pass %s return NOT_CHANGED.", pass->GetName().c_str()); } else { + REPORT_CALL_ERROR("E19999", "Stream pass %s run failed.", pass->GetName().c_str()); GELOGE(status, "Stream pass %s failed.", pass->GetName().c_str()); return status; } @@ -686,7 +749,7 @@ void LogicalStreamAllocator::RefreshContinuousStreams(const ComputeGraphPtr &gra auto op_desc = node->GetOpDesc(); if (op_desc != nullptr) { int64_t stream_id = op_desc->GetStreamId(); - if (stream_id != kInvalidStream && stream_id < stream_num) { + if ((stream_id != kInvalidStream) && (stream_id < stream_num)) { stream_has_node[stream_id] = true; } } @@ -695,10 +758,10 @@ void LogicalStreamAllocator::RefreshContinuousStreams(const ComputeGraphPtr &gra context_.next_stream = 0; vector old_to_new_streams(stream_num, kInvalidStream); - for (size_t old_stream = 0; old_stream < stream_has_node.size(); ++old_stream) { + for (size_t old_stream = 0; old_stream < stream_has_node.size(); old_stream++) { if (stream_has_node[old_stream]) { old_to_new_streams[old_stream] = context_.next_stream; - ++context_.next_stream; + context_.next_stream++; } } @@ -706,7 +769,7 @@ void 
LogicalStreamAllocator::RefreshContinuousStreams(const ComputeGraphPtr &gra auto op_desc = node->GetOpDesc(); if (op_desc != nullptr) { int64_t stream_id = op_desc->GetStreamId(); - if (stream_id != kInvalidStream && stream_id < stream_num) { + if ((stream_id != kInvalidStream) && (stream_id < stream_num)) { op_desc->SetStreamId(old_to_new_streams[stream_id]); } } diff --git a/ge/graph/build/logical_stream_allocator.h b/ge/graph/build/logical_stream_allocator.h index b9aec611..2a94c254 100644 --- a/ge/graph/build/logical_stream_allocator.h +++ b/ge/graph/build/logical_stream_allocator.h @@ -149,6 +149,13 @@ class NodeStreamUpdatePass : public LogicalStreamPass { Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; }; +// assign stream by parallel group +class UpdateForParallelGroupPass : public LogicalStreamPass { + public: + STREAM_PASS_DEFAULT_FUNC(UpdateForParallelGroupPass); + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; +}; + // Update the stream of subgraphs to nodes. 
class UpdateForSkippedEnginePass : public LogicalStreamPass { public: diff --git a/ge/graph/build/memory/binary_block_mem_assigner.cc b/ge/graph/build/memory/binary_block_mem_assigner.cc index 97a0aed6..72cd5b9a 100644 --- a/ge/graph/build/memory/binary_block_mem_assigner.cc +++ b/ge/graph/build/memory/binary_block_mem_assigner.cc @@ -70,7 +70,10 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector &range_ceils) { return SUCCESS; } if ((all_memory_size.front() <= 0) || (log(kLogBase) == 0)) { - GELOGE(FAILED, "Memory size:%ld is invalid.", all_memory_size.front()); + GELOGE(FAILED, "[Check][MemRangeStep]first mem_range_step:%ld less than 0,invalid," + "maybe has dynamic shape in graph", all_memory_size.front()); + REPORT_INNER_ERROR("E19999", "first mem_range_step:%ld less than 0,invalid," + "maybe has dynamic shape in graph", all_memory_size.front()); return FAILED; } // Memory size is 512 aligned, so it is not necessary to take less than 512 @@ -81,12 +84,18 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector &range_ceils) { GELOGD("Range number: %zu", range_number); vector> ranges(range_number); - GE_CHK_BOOL_EXEC((range_number != 0), return PARAM_INVALID, "range_number can't be 0."); + GE_CHK_BOOL_EXEC((range_number != 0), + REPORT_INNER_ERROR("E19999", "inner data[range_number] is 0, judge invalid"); + return PARAM_INVALID, + "[Check][RangeNumber]inner data is 0, judge invalid."); size_t range_number_limit = all_memory_size.size() / range_number; int64_t range_ceil = min_memory_size; for (size_t i = 1; i <= range_number; i++) { GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(static_cast(range_ceil), kRangeCeilInterval), - GELOGE(FAILED, "Multiply result is out of range."); + GELOGE(FAILED, "[Check][MemRangeCeil]Multiply result is out of range," + "range_ceil:%ld, interval:%u", range_ceil, kRangeCeilInterval); + REPORT_INNER_ERROR("E19999", "process mem_range_ceil,multiply result out of range," + "range_ceil:%ld, interval:%u", range_ceil, 
kRangeCeilInterval); return FAILED); range_ceil *= kRangeCeilInterval; // The block size of each interval is doubled every time. for (auto iter = all_memory_size.begin(); iter != all_memory_size.end();) { diff --git a/ge/graph/build/memory/block_mem_assigner.cc b/ge/graph/build/memory/block_mem_assigner.cc index 41f24b94..9825d1ed 100755 --- a/ge/graph/build/memory/block_mem_assigner.cc +++ b/ge/graph/build/memory/block_mem_assigner.cc @@ -30,6 +30,7 @@ #include "graph/utils/node_utils.h" #include "graph/utils/op_desc_utils.h" #include "graph/utils/tensor_utils.h" +#include "graph/utils/type_utils.h" #include "graph/debug/ge_attr_define.h" @@ -429,17 +430,14 @@ void SetLastUsedInputMemAttr(NodePtr &node, int input_index) { } auto node_op_desc = node->GetOpDesc(); if (node_op_desc != nullptr) { - auto input_desc = node_op_desc->GetInputDesc(input_index); - if (!ge::AttrUtils::SetInt(input_desc, ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE, true)) { + auto input_desc = node_op_desc->MutableInputDesc(input_index); + if (!ge::AttrUtils::SetInt(*input_desc, ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE, true)) { GELOGW("Set %s input[%d] ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE to true failed.", node_op_desc->GetName().c_str(), input_index); return; } GELOGD("Set %s input[%d] ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE to true success.", node_op_desc->GetName().c_str(), input_index); - if (node_op_desc->UpdateInputDesc(input_index, input_desc) != GRAPH_SUCCESS) { - GELOGW("Update %s input[%d] desc failed.", node_op_desc->GetName().c_str(), input_index); - } } } @@ -457,7 +455,16 @@ Status GetNoAlignSize(const ge::OpDesc &desc, uint32_t index, size_t &size) { DataType data_type = output_op_desc->GetDataType(); graphStatus graph_status = TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size); if (graph_status != GRAPH_SUCCESS) { - GELOGE(graph_status, "CalcTensorMemSize failed!"); + GELOGE(graph_status, "[Calculate][TensorSize]shape:%s, format:%s, data_type:%s, op:%s, 
out_index:%u", + shape.ToString().c_str(), + TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str(), + desc.GetName().c_str(), index); + REPORT_CALL_ERROR("E19999", "CalcTensorMemSize fail, shape:%s, format:%s, data_type:%s, op:%s, out_index:%u", + shape.ToString().c_str(), + TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str(), + desc.GetName().c_str(), index); return FAILED; } size = static_cast(tensor_size); @@ -498,7 +505,7 @@ BlockMemAssigner::BlockMemAssigner(ComputeGraphPtr compute_graph, const map &all_memory_size) { } for (auto &out_anchor : n->GetAllOutDataAnchors()) { - GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); + auto output_desc = node_op_desc->GetOutputDescPtr(out_anchor->GetIdx()); int64_t size = 0; - GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed")); - GE_IF_BOOL_EXEC(size < 0, GELOGE(FAILED, "Node:%s size:%ld is invalid, maybe it is unknown shape node.", - node_op_desc->GetName().c_str(), size); + GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_desc, size) != SUCCESS, GELOGI("Get size failed")); + GE_IF_BOOL_EXEC(size < 0, + GELOGE(FAILED, "[Check][TensorSize]tensor_size:%ld is invalid, " + "maybe it is unknown shape node, Node_name:%s", + size, node_op_desc->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "tensor_size:%ld is invalid, " + "maybe it is unknown shape node, Node_name:%s", + size, node_op_desc->GetName().c_str()); return;); batch_all_memory_size[batch_label].emplace_back(size); if (batch_total_size.find(batch_label) == batch_total_size.end()) { @@ -678,22 +690,34 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou if (static_cast(out_index) < n->GetAllOutDataAnchors().size()) { auto out_anchor = n->GetOutDataAnchor(out_index); GE_IF_BOOL_EXEC(out_anchor == nullptr, - GELOGE(FAILED, "Node[%s] output[%u] anchor is null.", 
n->GetName().c_str(), out_index); + GELOGE(FAILED, "[Check][Anchor]Node[%s] output[%u] anchor is null.", + n->GetName().c_str(), out_index); + REPORT_INNER_ERROR("E19999", "output anchor is null, node_name: %s output_index: %u.", + n->GetName().c_str(), out_index); return false;); for (auto const &peer_in_anchor : out_anchor->GetPeerInDataAnchors()) { GE_IF_BOOL_EXEC(peer_in_anchor == nullptr, - GELOGE(FAILED, "Node[%s] output[%u] peer_in_anchor 0 is null.", n->GetName().c_str(), out_index); + GELOGE(FAILED, "[Check][Anchor]Node[%s] output[%u] peer_in_anchor 0 is null.", + n->GetName().c_str(), out_index); + REPORT_INNER_ERROR("E19999", "output anchor peer is null, node_name: %s output_index: %u.", + n->GetName().c_str(), out_index); return false;); auto peer_node = peer_in_anchor->GetOwnerNode(); GE_IF_BOOL_EXEC(peer_node == nullptr, - GELOGE(FAILED, "Node[%s] output[%u] node is null.", n->GetName().c_str(), out_index); + GELOGE(FAILED, "[Check][Node]Node[%s] output[%u] peer node is null.", + n->GetName().c_str(), out_index); + REPORT_INNER_ERROR("E19999", "output anchor peer node is null, node_name: %s output_index: %u.", + n->GetName().c_str(), out_index); return false;); // Get the continuous input type of the node, default is false bool is_input_continuous = false; auto peer_in_node_desc = peer_node->GetOpDesc(); GE_IF_BOOL_EXEC(peer_in_node_desc == nullptr, - GELOGE(FAILED, "Node[%s] output[%u] nodedesc is null.", n->GetName().c_str(), out_index); + GELOGE(FAILED, "[Check][OpDesc]Node[%s] output[%u] nodedesc is null.", + n->GetName().c_str(), out_index); + REPORT_INNER_ERROR("E19999", "output anchor peer op_desc is null, node_name:%s output_index:%u.", + n->GetName().c_str(), out_index); return false;); // If GetBool fail, is_input_continuous is false. 
@@ -793,7 +817,10 @@ bool BlockMemAssigner::IsContinuousMemoryReuse(const NodePtr &n, const NodePtr & if ((in_anchor == nullptr) || (in_anchor->GetPeerOutAnchor() == nullptr) || (in_anchor->GetPeerOutAnchor()->GetOwnerNode() == nullptr) || (in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc() == nullptr)) { - GELOGE(FAILED, "Node[%s] output[%u] peer input node desc is null.", n->GetName().c_str(), out_index); + GELOGE(FAILED, "[Check][OpDesc]Node[%s] output[%u] peer input node desc is null.", + n->GetName().c_str(), out_index); + REPORT_INNER_ERROR("E19999", "get output anchor peer op_desc fail, node_name: %s output_index: %u.", + n->GetName().c_str(), out_index); return false; } auto peer_out_node_desc = in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc(); @@ -1077,7 +1104,10 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, OpMemoryType mem_type, const NodePtr &n, uint32_t out_index, const vector &workspace_reuse_flag, const bool is_op_reuse_mem, const bool continuous, int64_t memory_type) { - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "Input parameter n is null."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + n == nullptr, + REPORT_INNER_ERROR("E19999", "Input parameter n(type:node_ptr) is null, apply memory failed"); + return nullptr, "[Check][Param]Input parameter n(type:node_ptr) is null."); auto node_op_desc = n->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, return nullptr); std::string batch_label; @@ -1129,7 +1159,12 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, } auto block = new (std::nothrow) MemoryBlock(block_size, node_op_desc->GetStreamId(), is_reuse_memory, memory_type); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(block == nullptr, return nullptr, "new an object failed."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + block == nullptr, + REPORT_INNER_ERROR("E19999", "new a memoryblock object failed. 
node_name:%s out_index:%u", + n->GetName().c_str(), out_index); + return nullptr, + "[New][Object]new MemoryBlock failed, node_name:%s out_index:%u", n->GetName().c_str(), out_index); // Data and netoutput need zero copy block block->is_zero_copy_ = IsZeroCopyBlock(n, continuous); @@ -1188,9 +1223,15 @@ void BlockMemAssigner::ContinuousOutRefCheck(bool &isAllOutputRef, bool &isOutpu Status BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vector &ranges, const bool is_op_reuse_mem) { - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return INTERNAL_ERROR, "input node is null."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + n == nullptr, + REPORT_INNER_ERROR("E19999", "Input parameter n(type:node_ptr) is null"); + return INTERNAL_ERROR, "[check][param]Input parameter n(type:NodePtr) is null."); auto node_op_desc = n->GetOpDesc(); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return INTERNAL_ERROR, "node_op_desc is null."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + node_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Input parameter n(type:OpDescPtr) is null"); + return INTERNAL_ERROR, "[Check][Param]Input parameter n(type:OpDescPtr) is null"); // continuous output support ref only when all output ref input bool isAllOutputRef = true; @@ -1204,7 +1245,9 @@ Status BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vectorGetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Check][OutRefStatus]continuous output node ref part input, not support, node_name:%s", n->GetName().c_str()); return INTERNAL_ERROR; } @@ -1215,7 +1258,9 @@ Status BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vector(node_op_desc->GetOutputsSize()); index++) { auto output_op_desc = node_op_desc->GetOutputDescPtr(index); if (output_op_desc == nullptr) { - GELOGE(INTERNAL_ERROR, "Get output desc failed, node_name:%s, output_index:%u", n->GetName().c_str(), index); + REPORT_INNER_ERROR("E19999", "get output_desc failed, node_name:%s, output_index:%u", + 
n->GetName().c_str(), index); + GELOGE(INTERNAL_ERROR, "[Get][OutputDesc]node_name:%s, output_index:%u", n->GetName().c_str(), index); return INTERNAL_ERROR; } @@ -1226,7 +1271,9 @@ Status BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vectorGetName().c_str(), index); + REPORT_CALL_ERROR("E19999", "get tensor_size failed, node_name:%s, output_index:%u", + n->GetName().c_str(), index); + GELOGE(INTERNAL_ERROR, "[Get][TensorSize]node_name:%s, output_index:%u", n->GetName().c_str(), index); return INTERNAL_ERROR; } size_t align_size = static_cast(size); @@ -1266,7 +1313,9 @@ Status BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vectorlast_continuous_block_ = true; ++(block->ref_count_); } else { - GELOGE(INTERNAL_ERROR, "node apply continuous output memory failed. node_name:%s", n->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "apply continuousMemory failed, node_name:%s, total_size:%ld", + n->GetName().c_str(), total_size); + GELOGE(INTERNAL_ERROR, "[Apply][ContinuousMemory]node_name:%s, total_size:%ld", n->GetName().c_str(), total_size); return INTERNAL_ERROR; } return SUCCESS; @@ -1274,25 +1323,44 @@ Status BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vector &ranges, const bool is_op_reuse_mem, const bool continuous) { - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + n == nullptr, + REPORT_INNER_ERROR("E19999", "Input parameter n(type:NodePtr) is null"); + return nullptr, "[Check][Param]Input parameter n(type:NodePtr) is null"); auto node_op_desc = n->GetOpDesc(); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + node_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Input parameter n(type:OpDescPtr) is null"); + return nullptr, "[Check][Param]Input parameter n(type:OpDescPtr) is null"); MemoryBlock *block = nullptr; NodeIndexIO node_index_io(n, 
index, kOut); int64_t size = 0; auto output_op_desc = node_op_desc->GetOutputDescPtr(index); - GE_IF_BOOL_EXEC(output_op_desc == nullptr, return nullptr); + GE_IF_BOOL_EXEC( + output_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "get output_desc failed, node_name:%s, output_index:%u", + n->GetName().c_str(), index); + GELOGE(FAILED, "[Get][OutputDesc]node_name:%s, output_index:%u", n->GetName().c_str(), index); + return nullptr); GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS, GELOGI("Get size failed")); size_t no_align_size = 0; - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetNoAlignSize(*node_op_desc, index, no_align_size) != SUCCESS, - return nullptr, "Get no align size failed"); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + GetNoAlignSize(*node_op_desc, index, no_align_size) != SUCCESS, + REPORT_CALL_ERROR("E19999", "Get no align size failed, node_name:%s, output_index:%u", + n->GetName().c_str(), index); + return nullptr, + "[Get][TensorSize]Get no align size, node_name:%s, output_index:%u", n->GetName().c_str(), index); std::string symbol; bool reuse_input = false; if (IsSymbolExist(node_index_io, symbol)) { block = symbol_blocks_[symbol]; - GE_IF_BOOL_EXEC(block == nullptr, GELOGE(FAILED, "Node %s ref block is nullptr.", node_op_desc->GetName().c_str()); - return nullptr); + GE_IF_BOOL_EXEC(block == nullptr, + REPORT_INNER_ERROR("E19999", "get ref block failed, node_name:%s, symbol:%s", + node_op_desc->GetName().c_str(), node_index_io.ToString().c_str()); + GELOGE(FAILED, "[Get][RefBlock]node_name:%s, symbol:%s", + node_op_desc->GetName().c_str(), node_index_io.ToString().c_str()); + return nullptr); // reduce old size size_t align_size = block->Size(); AlignMemOffset(align_size); @@ -1335,12 +1403,28 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index, vector workspace_reuse_flag; block = ApplyMemory(block_size, size, no_align_size, kOutput, n, index, workspace_reuse_flag, is_op_reuse_mem, continuous, memory_type); + 
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + block == nullptr, + REPORT_CALL_ERROR("E19999", "apply out Memory failed, node_name:%s, block_size:%ld, out_index:%u", + n->GetName().c_str(), block_size, index); + return nullptr, + "[Apply][Memory]node_name:%s, block_size:%ld, out_index:%u", + n->GetName().c_str(), block_size, index); } - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(block == nullptr, return nullptr, "Block is nullptr."); int out_count = 0; - GE_IF_BOOL_EXEC(index >= n->GetAllOutDataAnchors().size(), GELOGE(FAILED, "index is out of range."); return nullptr); + GE_IF_BOOL_EXEC( + index >= n->GetAllOutDataAnchors().size(), + REPORT_INNER_ERROR("E19999", "out index:%u exceed out_size:%lu, node_name:%s", + index, n->GetAllOutDataAnchors().size(), n->GetName().c_str()); + GELOGE(FAILED, "[Check][OutIndex]index:%u exceed out_size:%lu, node_name:%s", + index, n->GetAllOutDataAnchors().size(), n->GetName().c_str()); + return nullptr); auto out_data_anchor = n->GetOutDataAnchor(index); - GE_IF_BOOL_EXEC(out_data_anchor == nullptr, GELOGE(FAILED, "Out data anchor is nullptr."); return nullptr); + GE_IF_BOOL_EXEC( + out_data_anchor == nullptr, + REPORT_INNER_ERROR("E19999", "out anchor is null, index:%u, node_name:%s", index, n->GetName().c_str()); + GELOGE(FAILED, "[Check][OutAnchor]is null, index:%u, node_name:%s", index, n->GetName().c_str()); + return nullptr); for (const auto &in_anchor : out_data_anchor->GetPeerInDataAnchors()) { auto owner_node = in_anchor->GetOwnerNode(); auto op_desc = owner_node->GetOpDesc(); @@ -1546,8 +1630,14 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector GELOGD("Assign memory node[%s], output size[%zu], output memory type size[%zu]", op_desc->GetName().c_str(), op_desc->GetOutputsSize(), memorys_type.size()); if (has_mem_type_attr && (memorys_type.size() != op_desc->GetOutputsSize())) { - GELOGE(INTERNAL_ERROR, "fusion: node[%s], output memory size err[outputsize:%zu, memorysize:%zu]", - op_desc->GetName().c_str(), 
op_desc->GetOutputsSize(), memorys_type.size()); + REPORT_INNER_ERROR("E19999", "Attr[%s] size:%zu not equal to node output size:%zu, node_name:%s", + ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), memorys_type.size(), + op_desc->GetOutputsSize(), op_desc->GetName().c_str()); + GELOGE( + INTERNAL_ERROR, + "[Check][MemTypeAttr]Attr %s size:%zu not equal to node output size:%zu, node_name:%s", + ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), memorys_type.size(), + op_desc->GetOutputsSize(), op_desc->GetName().c_str()); return INTERNAL_ERROR; } @@ -1565,12 +1655,15 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector bool is_atomic = false; // If GetBool fail, is_atomic is false. (void)ge::AttrUtils::GetBool(op_desc, ATOMIC_ATTR_IS_ATOMIC_NODE, is_atomic); + bool is_buffer_pool_mem_supported = (op_desc->HasAttr(ATTR_NAME_BUFFER_POOL_ID)) && + (op_desc->HasAttr(ATTR_NAME_BUFFER_POOL_SIZE)) && (!root_unknown_shape_flag_); // Allocate memory for the current node and release node memory of the same size in the workspace GE_IF_BOOL_EXEC(ge_disable_reuse_mem_env_ != "1", for (auto iter = stream_workspace_blocks_.begin(); iter != stream_workspace_blocks_.end(); ++iter) { ReleaseMemorys(iter->second[stream_id], reusable_blocks_[iter->first][stream_id]); iter->second[stream_id].clear();}); - if (IsContinuousOutput(node)) { + bool need_apply_continuous_memory = IsContinuousOutput(node) && (!is_buffer_pool_mem_supported); + if (need_apply_continuous_memory) { return ApplyContinuousMemory(node, ranges, is_op_reuse_mem_); } for (uint32_t i = 0; i < static_cast(op_desc->GetOutputsSize()); i++) { @@ -1604,7 +1697,7 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector GE_IF_BOOL_EXEC(!no_need_assign_memory, no_need_assign_memory = IsAtomicOutputMemory(node, i, is_atomic, out_node_set_continuous_input);); } - no_need_assign_memory = (no_need_assign_memory || IsKnownSubgraphData(node)); + no_need_assign_memory = (no_need_assign_memory || 
IsKnownSubgraphData(node) || is_buffer_pool_mem_supported); if (no_need_assign_memory) { zero_memory_list_.emplace_back(node, kOutput, i, false); continue; @@ -1645,11 +1738,18 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector /// void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { (void)ge::GetContext().GetOption(OPTION_EXEC_DISABLE_REUSED_MEMORY, ge_disable_reuse_mem_env_); - GELOGD("Reuse memory %s", ge_disable_reuse_mem_env_ == "1" ? "close" : "open"); + GEEVENT("Reuse memory %s", ge_disable_reuse_mem_env_ == "1" ? "close" : "open"); string op_no_reuse_mem_str; const char *op_no_reuse_mem = std::getenv(OP_NO_REUSE_MEM); GE_IF_BOOL_EXEC(op_no_reuse_mem != nullptr, op_no_reuse_mem_str = string(op_no_reuse_mem); CheckAndGetOpReuseEnv(op_no_reuse_mem_str, op_no_reuse_mem_vec_, op_reuse_env_valid_);); + auto root_graph = GraphUtils::FindRootGraph(compute_graph_); + if (root_graph == nullptr) { + GELOGE(INTERNAL_ERROR, "[Check][RootGraph]Root graph is nullptr, graph:%s.", compute_graph_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Root graph is nullptr, graph:%s.", compute_graph_->GetName().c_str()); + return; + } + root_unknown_shape_flag_ = root_graph->GetGraphUnknownFlag(); for (NodePtr &n : compute_graph_->GetAllNodes()) { auto node_op_desc = n->GetOpDesc(); @@ -1673,8 +1773,12 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { temp.size(), tvm_workspace_memory_type.size()); if (has_tvm_workspace_mem_type_attr && (temp.size() != tvm_workspace_memory_type.size())) { - GELOGE(INTERNAL_ERROR, "fusion: node[%s], tvm workspace memory size error![v_temp:%zu, workspace:%zu]", - n->GetName().c_str(), temp.size(), tvm_workspace_memory_type.size()); + REPORT_INNER_ERROR("E19999", "Attr[%s]size:%zu is not equal to workspace size:%zu, node_name:%s", + TVM_ATTR_NAME_WORKSPACE_TYPE.c_str(), tvm_workspace_memory_type.size(), + temp.size(), n->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Check][Attr]Attr %s 
size:%zu is not equal to workspace size:%zu, node_name:%s", + TVM_ATTR_NAME_WORKSPACE_TYPE.c_str(), tvm_workspace_memory_type.size(), + temp.size(), n->GetName().c_str()); return; } for (size_t i = 0; i < temp.size(); i++) { @@ -2031,7 +2135,7 @@ void SetBlockOpMemOffset(MemoryBlock *block, int32_t child_block_level) { child_block_level++; for (MemoryBlock *child_block : block->ChildBlockList()) { - SetBlockOpMemOffset(child_block, child_block_level); + SetBlockOpMemOffset(child_block, child_block_level); } } @@ -2059,7 +2163,7 @@ void BlockMemAssigner::SetOpMemOffset(bool is_zero_copy) { Status BlockMemAssigner::Assign() { vector ranges; if (GetMemoryRanges(ranges) != SUCCESS) { - GELOGE(FAILED, "GetMemoryRanges Fail!"); + GELOGE(FAILED, "[Get][MemoryRanges] Fail!"); return FAILED; } GE_IF_BOOL_EXEC(ranges.empty(), return SUCCESS); @@ -2083,8 +2187,12 @@ bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, bool has_workspace_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, workspace_memory_type); if (has_workspace_mem_type_attr && (workspace_memory_type.size() <= index)) { - GELOGE(INTERNAL_ERROR, "node[%s], workspace_memory size error![index:%zu, workspace:%zu]", - node->GetName().c_str(), index, workspace_memory_type.size()); + REPORT_INNER_ERROR("E19999", "get workspace mem_type failed, " + "index %zu invalid, bigger than attr %s size:%zu, node_name:%s", + index, TVM_ATTR_NAME_WORKSPACE_TYPE.c_str(), + workspace_memory_type.size(), node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][WorkspaceMemType]index %zu invalid, bigger than attr %s size:%zu, node_name:%s", + index, TVM_ATTR_NAME_WORKSPACE_TYPE.c_str(), workspace_memory_type.size(), node->GetName().c_str()); return false; } memory_type = has_workspace_mem_type_attr ? 
workspace_memory_type[index] : RT_MEMORY_HBM; diff --git a/ge/graph/build/memory/block_mem_assigner.h b/ge/graph/build/memory/block_mem_assigner.h index 199a84f9..474db17c 100755 --- a/ge/graph/build/memory/block_mem_assigner.h +++ b/ge/graph/build/memory/block_mem_assigner.h @@ -494,6 +494,8 @@ class BlockMemAssigner : public MemAssigner { /// @ [stream2][nodeid] /// DependStreamLife total_node_depend_stream_life_; + + bool root_unknown_shape_flag_ = false; }; } // namespace ge #endif // GE_GRAPH_BUILD_MEMORY_BLOCK_MEM_ASSIGNER_H_ diff --git a/ge/graph/build/memory/buffer_pool_mem_assigner.cc b/ge/graph/build/memory/buffer_pool_mem_assigner.cc new file mode 100644 index 00000000..d66fe038 --- /dev/null +++ b/ge/graph/build/memory/buffer_pool_mem_assigner.cc @@ -0,0 +1,234 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/build/memory/buffer_pool_mem_assigner.h" +#include "graph/common/omg_util.h" +#include "graph/utils/tensor_utils.h" +#include "framework/common/util.h" +#include "graph/compute_graph.h" +#include "graph/debug/ge_attr_define.h" +#include "common/math/math_util.h" +#include "common/util/error_manager/error_manager.h" + +namespace ge { +namespace { +const size_t kBufferPoolNodeMemInfoLength = 2; +const uint32_t kBufferPoolNodeOutputSizeIndex = 0; +const uint32_t kBufferPoolNodeOutputOffsetIndex = 1; +} // namespace + +Status BufferPoolMemAssigner::Assign() { + if (compute_graph_ == nullptr) { + GELOGE(PARAM_INVALID, "[Check][Graph]Graph is nullptr"); + REPORT_INNER_ERROR("E19999", "Input graph is nullptr"); + return PARAM_INVALID; + } + Status ret = InitAssigner(compute_graph_); + if (ret != SUCCESS) { + GELOGE(FAILED, "[Init][Assigner]Graph:%s.", compute_graph_->GetName().c_str()); + return FAILED; + } + ret = AssignOutput(); + if (ret != SUCCESS) { + GELOGE(FAILED, "[Assign][Output]Graph:%s.", compute_graph_->GetName().c_str()); + return FAILED; + } + return SUCCESS; +} + +Status BufferPoolMemAssigner::GetOutputMemoryType(const NodePtr &node, size_t idx, int64_t &memory_type) { + GE_CHECK_NOTNULL(node->GetOpDesc()); + memory_type = RT_MEMORY_HBM; + std::vector type_list; + bool has_mem_type = ge::AttrUtils::GetListInt(node->GetOpDesc(), ATTR_NAME_OUTPUT_MEM_TYPE_LIST, type_list); + if (has_mem_type && (type_list.size() != node->GetOpDesc()->GetOutputsSize() || idx >= type_list.size())) { + GELOGE(PARAM_INVALID, "[Check][OutputParam]Output param invalid, output size:%zu, mem type size:%zu, index:%zu.", + node->GetOpDesc()->GetOutputsSize(), type_list.size(), idx); + REPORT_INNER_ERROR("E19999", "Output param invalid, output size:%zu, mem type size:%zu, index:%zu.", + node->GetOpDesc()->GetOutputsSize(), type_list.size(), idx); + return PARAM_INVALID; + } + memory_type = has_mem_type ? 
type_list[idx] : RT_MEMORY_HBM; + return SUCCESS; +} + +Status BufferPoolMemAssigner::InitAssigner(const ComputeGraphPtr &graph) { + for (const NodePtr &node : graph->GetAllNodes()) { + int64_t buffer_pool_id = 0; + int64_t buffer_pool_size = 0; + bool get_attr = AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_BUFFER_POOL_ID, buffer_pool_id); + get_attr = get_attr && (AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_BUFFER_POOL_SIZE, buffer_pool_size)); + if (get_attr) { + std::string batch_label; + (void) AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label); + buffer_pool_nodes_[batch_label][buffer_pool_id].emplace_back(node); + auto iter = buffer_pool_size_[batch_label].find(buffer_pool_id); + if (iter == buffer_pool_size_[batch_label].end()) { + buffer_pool_size_[batch_label][buffer_pool_id] = buffer_pool_size; + } + Status ret = InitMemOffsetBase(node); + if (ret != SUCCESS) { + GELOGE(ret, "[Init][MemOffsetBase]Batch label:%s.", batch_label.c_str()); + REPORT_INNER_ERROR("E19999", "Failed to init offset base, batch label:%s.", batch_label.c_str()); + return ret; + } + } + } + + int64_t max_size = 0; + for (const auto &iter : buffer_pool_size_) { + std::string batch_label = iter.first; + int64_t batch_offset = mem_offset_base_; + for (const auto &buffer_pool : iter.second) { + int64_t buffer_pool_id = buffer_pool.first; + int64_t buffer_pool_size = buffer_pool.second; + buffer_pool_offset_base_[batch_label][buffer_pool_id] = batch_offset; + FMK_INT64_ADDCHECK(buffer_pool_size, kBufferPoolMemAlignSize); + AlignMemSize(buffer_pool_size, kBufferPoolMemAlignSize); + FMK_INT64_ADDCHECK(batch_offset, (buffer_pool_size + kBufferPoolMemAlignSize)); + batch_offset += (buffer_pool_size + kBufferPoolMemAlignSize); + } + int64_t batch_mem_size = batch_offset - mem_offset_base_; + GELOGI("[Init][Assigner]Get batch mem size, batch label:%s, mem size:%ld.", batch_label.c_str(), batch_mem_size); + if (max_size < batch_mem_size) { + max_size = batch_mem_size; + 
} + } + FMK_INT64_ADDCHECK(mem_offset_base_, max_size); + mem_offset_ = static_cast(mem_offset_base_ + max_size); + GELOGI("[Init][Assigner]Init buffer pool mem assigner successfully, " + "mem type:%ld, mem offset base:%ld, mem offset:%zu.", mem_type_, mem_offset_base_, mem_offset_); + return SUCCESS; +} + +Status BufferPoolMemAssigner::InitMemOffsetBase(const NodePtr &node) { + int64_t mem_type; + Status ret = GetOutputMemoryType(node, static_cast(kBufferPoolNodeOutIndex), mem_type); + if (ret != SUCCESS) { + GELOGE(ret, "[Get][MemType]Node:%s, index:%u.", node->GetName().c_str(), kBufferPoolNodeOutIndex); + REPORT_INNER_ERROR("E19999", "Failed to get output memory type, node:%s, index:%u.", + node->GetName().c_str(), kBufferPoolNodeOutIndex); + return ret; + } + if (mem_type_ != mem_type && init_offset_base_) { + GELOGE(PARAM_INVALID, "[Check][MemType]The memory type of all buffer pool nodes must be the same, node:%s, " + "required:%ld, actually: %ld", node->GetName().c_str(), mem_type_, mem_type); + REPORT_INNER_ERROR("E19999", "The memory type of all buffer pool nodes must be the same, node:%s, " + "required:%ld, actually: %ld", node->GetName().c_str(), mem_type_, mem_type); + return PARAM_INVALID; + } + if (!init_offset_base_) { + auto iter = mem_type_to_offset_.find(mem_type); + if (iter == mem_type_to_offset_.end()) { + GELOGE(PARAM_INVALID, "[Check][MemType]Memory type is not supported, node:%s, mem type:%ld.", + node->GetName().c_str(), mem_type); + REPORT_INNER_ERROR("E19999", "Memory type is not supported, node:%s, mem type:%ld.", + node->GetName().c_str(), mem_type); + return PARAM_INVALID; + } + mem_offset_base_ = static_cast(iter->second); + FMK_INT64_ADDCHECK(mem_offset_base_, (kBufferPoolMemAlignSize + kBufferPoolMemAlignSize)); + AlignMemSize(mem_offset_base_, kBufferPoolMemAlignSize); + // The HCOM nodes may access the previous 512 bytes. 
+ mem_offset_base_ += kBufferPoolMemAlignSize; + mem_type_ = mem_type; + init_offset_base_ = true; + GELOGI("[Init][MemOffsetBase]Init offset base:%ld, memory type:%ld", mem_offset_base_, mem_type); + } + return SUCCESS; +} + +Status BufferPoolMemAssigner::AssignOutput() { + for (auto &batch_pool_nodes_map : buffer_pool_nodes_) { + std::string batch_label = batch_pool_nodes_map.first; + for (auto &pool_nodes_map : batch_pool_nodes_map.second) { + int64_t buffer_pool_id = pool_nodes_map.first; + auto iter_buffer_id_size = buffer_pool_size_[batch_label].find(buffer_pool_id); + if (iter_buffer_id_size == buffer_pool_size_[batch_label].end()) { + GELOGE(INTERNAL_ERROR, "[Get][BufferPoolSize]Pool id:%ld.", buffer_pool_id); + REPORT_INNER_ERROR("E19999", "Failed to get buffer pool size, pool id:%ld.", buffer_pool_id); + return INTERNAL_ERROR; + } + auto iter_buffer_id_offset = buffer_pool_offset_base_[batch_label].find(buffer_pool_id); + if (iter_buffer_id_offset == buffer_pool_offset_base_[batch_label].end()) { + GELOGE(INTERNAL_ERROR, "[Get][BufferPoolBaseOffset]Pool id:%ld.", buffer_pool_id); + REPORT_INNER_ERROR("E19999", "Failed to get buffer pool base offset, pool id:%ld.", buffer_pool_id); + return INTERNAL_ERROR; + } + int64_t buffer_pool_size = iter_buffer_id_size->second; + int64_t output_offset_base = iter_buffer_id_offset->second; + Status ret = AssignOutputInOneBufferPool(batch_label, output_offset_base, pool_nodes_map.second); + if (ret != SUCCESS) { + GELOGE(ret, "[Assign][OneBufferPool]Batch label:%s, pool id:%ld, pool size:%ld, offset base:%ld.", + batch_label.c_str(), buffer_pool_id, buffer_pool_size, output_offset_base); + REPORT_INNER_ERROR("E19999", "Failed to assign output memory, batch label:%s, " + "pool id:%ld, pool size:%ld, offset base:%ld.", + batch_label.c_str(), buffer_pool_id, buffer_pool_size, output_offset_base); + return ret; + } + GELOGI("[Assign][Output]Assign output successfully, batch label:%s, pool id:%ld, pool size:%ld, offset 
base:%ld.", + batch_label.c_str(), buffer_pool_id, buffer_pool_size, output_offset_base); + } + } + return SUCCESS; +} + +Status BufferPoolMemAssigner::AssignOutputInOneBufferPool(const std::string &batch_label, + int64_t output_offset_base, + const std::vector &buffer_pool_nodes) { + for (const NodePtr &node : buffer_pool_nodes) { + int64_t output_size = 0; + Status ret = GetMemorySize(node, output_size); + if (ret != SUCCESS) { + GELOGE(ret, "[Get][MemSize]Node:%s.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to get output size, node:%s.", node->GetName().c_str()); + return ret; + } + OpDescPtr op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + vector memory_size_and_offset; + bool get_attr = AttrUtils::GetListInt(op_desc, ATTR_NAME_BUFFER_POOL_NODE_SIZE_AND_OFFSET, memory_size_and_offset); + if (!get_attr || memory_size_and_offset.size() != kBufferPoolNodeMemInfoLength) { + GELOGE(PARAM_INVALID, "[Get][Attr]Node:%s, mem info size:%zu, required size:%zu.", + node->GetName().c_str(), memory_size_and_offset.size(), kBufferPoolNodeMemInfoLength); + REPORT_INNER_ERROR("E19999", "Failed to get pool node memory info, node:%s, info size:%zu, required size:%zu.", + node->GetName().c_str(), memory_size_and_offset.size(), kBufferPoolNodeMemInfoLength); + return PARAM_INVALID; + } + if (output_size != memory_size_and_offset[kBufferPoolNodeOutputSizeIndex]) { + GELOGE(PARAM_INVALID, "[Check][MemSize]Something wrong with memory size, pre size:%ld, curr size:%ld, node:%s.", + memory_size_and_offset[kBufferPoolNodeOutputSizeIndex], output_size, node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Something wrong with memory size, pre size:%ld, curr size:%ld, node:%s.", + memory_size_and_offset[kBufferPoolNodeOutputSizeIndex], output_size, node->GetName().c_str()); + return PARAM_INVALID; + } + + int64_t logical_offset = memory_size_and_offset[kBufferPoolNodeOutputOffsetIndex]; + vector output_list = {(output_offset_base + logical_offset)}; + 
op_desc->SetOutputOffset(output_list); + // log for IMAS tools + GELOGI("[IMAS]Set %s name[%s] optype[%s] %s[%u] offset to [%ld] streamid[%ld] memtype[%ld] " + "size[%zu] realsize[%zu] noalignsize[%zu] life time begin[%d] life time end[%d] " + "child[%d:%d:%d:%d:%d] isref[%d] batch[%s]", + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), + "output", kBufferPoolNodeOutIndex, output_list[kBufferPoolNodeOutIndex], op_desc->GetStreamId(), mem_type_, + static_cast(output_size), static_cast(output_size), static_cast(output_size), + 0, 0, 0, 0, 0, 0, 0, 0, batch_label.c_str()); + } + return SUCCESS; +} + +} // namespace ge diff --git a/ge/graph/build/memory/buffer_pool_mem_assigner.h b/ge/graph/build/memory/buffer_pool_mem_assigner.h new file mode 100644 index 00000000..6caed031 --- /dev/null +++ b/ge/graph/build/memory/buffer_pool_mem_assigner.h @@ -0,0 +1,83 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_GRAPH_BUILD_MEMORY_BUFFER_POOL_MEM_ASSIGNER_H_ +#define GE_GRAPH_BUILD_MEMORY_BUFFER_POOL_MEM_ASSIGNER_H_ + +#include +#include +#include +#include "graph/build/memory/mem_assigner.h" +#include "runtime/mem.h" + +namespace ge { +class BufferPoolMemAssigner : public MemAssigner { + public: + BufferPoolMemAssigner(ComputeGraphPtr compute_graph, const std::map &mem_type_to_offset) + : MemAssigner(), compute_graph_(compute_graph), + mem_type_(0), + mem_offset_(0), + mem_offset_base_(0), + init_offset_base_(false), + mem_type_to_offset_(mem_type_to_offset) {} + + BufferPoolMemAssigner(const BufferPoolMemAssigner &) = delete; + + BufferPoolMemAssigner &operator=(const BufferPoolMemAssigner &) = delete; + + ~BufferPoolMemAssigner() override = default; + + Status Assign() override; + + size_t GetMemOffset() const { return mem_offset_; } + + int64_t GetMemType() const { return mem_type_; } + + private: + static Status GetOutputMemoryType(const NodePtr &node, size_t idx, int64_t &memory_type); + + Status InitAssigner(const ComputeGraphPtr &graph); + + Status InitMemOffsetBase(const NodePtr &node); + + Status AssignOutput(); + + Status AssignOutputInOneBufferPool(const std::string &batch_label, + int64_t output_offset_base, + const std::vector &buffer_pool_nodes); + + ComputeGraphPtr compute_graph_; + + int64_t mem_type_; + + size_t mem_offset_; + + int64_t mem_offset_base_; + + bool init_offset_base_; + + std::map mem_type_to_offset_; + + // Use map to ensure that each visit is in the order of pool id + std::unordered_map>> buffer_pool_nodes_; + + // Use map to ensure that each visit is in the order of pool id + std::unordered_map> buffer_pool_size_; + + std::unordered_map> buffer_pool_offset_base_; +}; +} // namespace ge +#endif // GE_GRAPH_BUILD_MEMORY_BUFFER_POOL_MEM_ASSIGNER_H_ diff --git a/ge/graph/build/memory/graph_mem_assigner.cc b/ge/graph/build/memory/graph_mem_assigner.cc index e3736ee4..540626bb 100755 --- 
a/ge/graph/build/memory/graph_mem_assigner.cc +++ b/ge/graph/build/memory/graph_mem_assigner.cc @@ -30,6 +30,7 @@ #include "graph/manager/graph_var_manager.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" +#include "graph/build/memory/buffer_pool_mem_assigner.h" namespace { const int kAllInputAddrIsAtomic = -1; @@ -99,7 +100,8 @@ Status VariableMemoryAssigner::AssignMemory2HasRefAttrNode() { Status GraphMemoryAssigner::AssignMemory() { ge::HybridMemAssignerPtr mem_assigner(new(std::nothrow) HybridMemAssigner(compute_graph_)); if (mem_assigner->Assign() != ge::SUCCESS) { - GELOGE(ge::FAILED, "Memory assigner failed"); + GELOGE(ge::FAILED, "[Assign][GraphMem]graph_id:%u, graph_name:%s", + compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return ge::FAILED; } MemoryOffset memory_offset(RT_MEMORY_HBM, mem_assigner->GetMemOffset()); @@ -115,7 +117,10 @@ Status GraphMemoryAssigner::AssignMemory() { auto variable_assigner = std::unique_ptr(new(std::nothrow) ge::VariableMemoryAssigner(compute_graph_)); if (variable_assigner == nullptr) { - GELOGE(ge::FAILED, "Alloc VariableMemoryAssigner failed."); + GELOGE(ge::FAILED, "[New][Object:VariableMemoryAssigner]graph_id:%u, graph_name:%s", + compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "New Object:VariableMemoryAssigner failed, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return ge::FAILED; } @@ -134,7 +139,10 @@ ge::Status GraphMemoryAssigner::AssignVarAttr2Nodes() { auto variable_assigner = std::unique_ptr(new(std::nothrow) ge::VariableMemoryAssigner(compute_graph_)); if (variable_assigner == nullptr) { - GELOGE(ge::FAILED, "Alloc VariableMemoryAssigner failed."); + GELOGE(ge::FAILED, "[New][Object:VariableMemoryAssigner]graph_id:%u, graph_name:%s", + compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "New Object:VariableMemoryAssigner 
failed, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return ge::FAILED; } if (variable_assigner->AssignVarAttr2Nodes() != ge::SUCCESS) { @@ -147,8 +155,10 @@ ge::Status GraphMemoryAssigner::AssignMemory2HasRefAttrNode() { auto variable_assigner = std::unique_ptr(new(std::nothrow) ge::VariableMemoryAssigner(compute_graph_)); if (variable_assigner == nullptr) { - GELOGE(ge::FAILED, "Alloc VariableMemoryAssigner failed."); - return ge::FAILED; + GELOGE(ge::FAILED, "[New][Object:VariableMemoryAssigner]graph_id:%u, graph_name:%s", + compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "New Object:VariableMemoryAssigner failed, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); } if (variable_assigner->AssignMemory2HasRefAttrNode() != ge::SUCCESS) { return ge::FAILED; @@ -161,17 +171,18 @@ ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &out int64_t &batch_dim_num, int64_t &out_size) { graphStatus graph_status = ge::TensorUtils::GetSize(*output_desc, out_size); if (graph_status != GRAPH_SUCCESS) { - GELOGE(FAILED, "Opdesc GetSize failed!"); + GELOGE(FAILED, "[Get][TensorSize]"); + REPORT_CALL_ERROR("E19999", "Get tensor size failed"); return FAILED; } GeShape output_shape = output_desc->GetShape(); std::vector output_dims = output_shape.GetDims(); if (dim_index >= static_cast(output_dims.size())) { - std::string error = "Invaild value" + FmtToStr(dim_index) + - " of attr _reuse_input_on_dim_index, which is out of data range [0," - + std::to_string(output_dims.size()) + ")"; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "Inner param dim_index value:%ld invalid, bigger than dim size:%lu in shape:%s", + dim_index, output_dims.size(), output_shape.ToString().c_str()); + GELOGE(FAILED, "[Check][Param:dim_index]value:%ld invalid, bigger than dim size:%lu in shape:%s", + 
dim_index, output_dims.size(), output_shape.ToString().c_str()); return FAILED; } @@ -187,14 +198,23 @@ ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &out graph_status = ge::TensorUtils::CalcTensorMemSize(output_shape, out_format, data_type, output_mem_size); if (graph_status != GRAPH_SUCCESS) { - GELOGE(graph_status, "Opdesc CalcTensorMemSize failed!"); + GELOGE(graph_status, "[Calc][TensorSize]"); return FAILED; } if (output_mem_size < 0) { - std::string error = "After calculating tensor memory size, output_mem_size" + FmtToStr(output_mem_size) + - " is out of data range [0," + std::to_string(INT64_MAX) + "]"; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "After calculating, tensor memory size:%ld invalid, less than 0. " + "shape:%s, format:%s, dtype:%s, maybe has dynamic shape", + output_mem_size, + output_shape.ToString().c_str(), + TypeUtils::FormatToSerialString(out_format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + GELOGE(FAILED, "[Check][TensorSize]value:%ld invalid after calc, less than 0. 
shape:%s, format:%s, dtype:%s, " + "maybe has dynamic shape", + output_mem_size, + output_shape.ToString().c_str(), + TypeUtils::FormatToSerialString(out_format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); return FAILED; } @@ -203,12 +223,16 @@ ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &out Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map &mem_type_to_offset) { if (memory_offset_.empty()) { - GELOGE(FAILED, "memory_offset_ is empty."); + REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ empty, not expected, graph_id:%u, graph_name:%s", + compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData:memory_offset_]empty is not expected, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return ge::FAILED; } GE_CHK_STATUS_RET(ReAssignContinuousMemory(is_loop_graph), "ReAssignContinuousMemory Failed!"); GE_CHK_STATUS_RET(ReAssignAtomicMemory(is_loop_graph), "ReAssignAtomicMemory Failed!"); + GE_CHK_STATUS_RET(AssignBufferPoolMemory(), "AssignBufferPoolMemory Failed!"); size_t total_mem_offset = 0; for (auto pair : memory_offset_) { @@ -218,8 +242,10 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, mapGetSessionID(); if (total_mem_offset > VarManager::Instance(session_id)->GetGraphMemoryMaxSize()) { - GELOGE(ge::FAILED, "Current memoffset %zu is greater than memory manager malloc max size %zu", total_mem_offset, - VarManager::Instance(session_id)->GetGraphMemoryMaxSize()); + GELOGE(ge::FAILED, "[Check][TotalMemOffset] %zu is greater than memory manager malloc max size %zu, " + "graph_id:%u, graph_name:%s, reduce your batchsize or scale your model may solve problem", + total_mem_offset, VarManager::Instance(session_id)->GetGraphMemoryMaxSize(), + compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); for (auto iter : mem_type_to_offset) { 
ErrorManager::GetInstance().ATCReportErrMessage("E19022", {"memType", "size", "item", "maxsize"}, {std::to_string(iter.first), std::to_string(iter.second), "featuremap", @@ -234,7 +260,13 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map &mem_offset, size_t &zero_mem_copy_size) { BlockMemAssignerPtr priority_assigner = std::move(mem_assigner_->GetPriorityAssinger()); - GE_IF_BOOL_EXEC(priority_assigner == nullptr, GELOGE(FAILED, "Get priority_assigner failed."); return ge::FAILED;); + if (priority_assigner == nullptr) { + REPORT_INNER_ERROR("E19999", "InnerData priority_assigner nullptr, not expected, graph_id:%u, graph_name:%s", + compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData:priority_assigner]nullptr is invalid, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + return ge::FAILED; + } size_t mem_offset_tmp = mem_offset[RT_MEMORY_HBM]; @@ -254,8 +286,10 @@ Status GraphMemoryAssigner::AssignZeroCopyMemory(map &mem_offse zero_mem_copy_size = mem_offset[RT_MEMORY_HBM] - mem_offset_tmp; auto iter = memory_offset_.find(RT_MEMORY_HBM); if (iter == memory_offset_.end()) { - std::string error = "Memory offset does not have memory type[HBM]"; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ does not have type[HBM], not expected, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData]memory_offset_ does not have memory type[HBM]" + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return FAILED; } iter->second.mem_offset_ = mem_offset[RT_MEMORY_HBM]; @@ -304,7 +338,7 @@ uint32_t GetContinuousMemoryType(const OpDescPtr &op_desc) { } if (continuous_type != 0) { - GELOGI("Current node %s continuous type %d.", op_desc->GetName().c_str(), continuous_type); + 
GELOGI("[Get][MemType:Continuous]Current node %s, value is %d", op_desc->GetName().c_str(), continuous_type); } return continuous_type; } @@ -312,8 +346,8 @@ uint32_t GetContinuousMemoryType(const OpDescPtr &op_desc) { Status GetMemorySize(const OpDescPtr &op_desc, const ge::ConstGeTensorDescPtr &output_desc, uint32_t continuous_type, int64_t &tensor_size, int64_t &nopadding_size) { if ((op_desc == nullptr) || (output_desc == nullptr)) { - GELOGE(FAILED, "Input para is nullptr."); - return FAILED; + REPORT_INNER_ERROR("E19999", "InnerData param op_desc or output_desc is nullptr, not expected"); + GELOGE(FAILED, "[Check][Param]op_desc or output_desc is nullptr"); } tensor_size = 0; nopadding_size = 0; @@ -322,7 +356,10 @@ Status GetMemorySize(const OpDescPtr &op_desc, const ge::ConstGeTensorDescPtr &o int64_t attr_dim_index; bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index); if (!get_attr_dim_flag) { - GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed."); + REPORT_INNER_ERROR("E19999", "Get Attr:%s failed, op_name:%s", + ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX.c_str(), op_desc->GetName().c_str()); + GELOGE(FAILED, "[Get][Attr:%s]fail for op_name:%s", + ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX.c_str(), op_desc->GetName().c_str()); return FAILED; } @@ -330,17 +367,25 @@ Status GetMemorySize(const OpDescPtr &op_desc, const ge::ConstGeTensorDescPtr &o int64_t batch_dim_num = 1; if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, nopadding_size, batch_dim_num, tensor_size) != SUCCESS) { - GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s.", op_desc->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "CalculateTensorRealSizeAndOutSize failed, attr_dim_index:%ld, op_name:%s", + attr_dim_index, op_desc->GetName().c_str()); + GELOGE(FAILED, "[Calculate][NopaddingSize]failed for node %s, attr_dim_index:%ld", + op_desc->GetName().c_str(), attr_dim_index); return FAILED; } } else { if 
(ge::TensorUtils::GetSize(*output_desc, tensor_size) != ge::SUCCESS) { - GELOGE(FAILED, "GetSize failed."); + REPORT_INNER_ERROR("E19999", "Get Tensor Size failed, op_name:%s", op_desc->GetName().c_str()); + GELOGE(FAILED, "[Get][TensorSize]failed in padding case, op_name:%s", op_desc->GetName().c_str()); return FAILED; } } if ((tensor_size < 0) || (nopadding_size < 0)) { - GELOGE(FAILED, "GetMemorySize for node %s failed.", op_desc->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "GetMemorySize fail, " + "tensor_size:%ld or nopadding_size:%ld less than 0, invalid, op_name:%s", + tensor_size, nopadding_size, op_desc->GetName().c_str()); + GELOGE(FAILED, "[Get][MemorySize]tensor_size:%ld or nopadding_size:%ld less than 0, invalid, op_name:%s", + tensor_size, nopadding_size, op_desc->GetName().c_str()); return FAILED; } return SUCCESS; @@ -374,7 +419,7 @@ bool IsContinuousInputConflict(const ge::NodePtr &node, const OpDescPtr &peer_op // If GetBool fail, is_peer_reference is false. (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference); GE_IF_BOOL_EXEC(is_peer_reference, - std::string warning = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) + + std::string warning = "[Check][Continuous]Current op" + FmtToStr(node->GetOpDesc()->GetName()) + " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) + " is ref. 
There may be conflict between the two."; GELOGW("%s", warning.c_str()); @@ -404,7 +449,7 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { if (continuous_input) { if (AssignContinuousInputMemoryWithAtomicProcessDirectly(node, node_2_continuous_type)) { GE_CHK_STATUS_RET(AssignContinuousInputMemoryWithAtomicProcess(node, continuous_type), - "Assign node %s continuous input memory failed.", node->GetName().c_str()) + "[Assign][Memory:Continuous:Input]fail for node:%s", node->GetName().c_str()) } else { nodes_stack.push_back(node); } @@ -413,10 +458,11 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { int64_t memory_type = RT_MEMORY_HBM; bool continuous_output = ((continuous_type & kTypeOutput) != 0) || ((continuous_type & kTypeOutputNoPadding) != 0); if (continuous_output) { - GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "output"), "Get node memory type failed."); + GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "output"), + "[Get][MemType]fail for node:%s", node->GetName().c_str()); ret = AssignContinuousOutputMemory(node, memory_type, continuous_type); if (ret != ge::SUCCESS) { - GELOGE(ret, "Assign continuous output memory failed!"); + GELOGE(ret, "[Assign][Memory:Continuous:Ouput]fail for node:%s", node->GetName().c_str()); return ret; } } @@ -427,14 +473,16 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { nodes_stack.pop_back(); auto iter = node_2_continuous_type.find(node); if (iter == node_2_continuous_type.end()) { - GELOGE(FAILED, "node %s has no continuous type!", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Get ContinuousType from node_2_continuous_type map failed for node:%s", + node->GetName().c_str()); + GELOGE(FAILED, "[Get][ContinuousType] find fail for node:%s", node->GetName().c_str()); return FAILED; } GE_CHK_STATUS_RET(AssignContinuousInputMemoryWithAtomicProcess(node, iter->second, true), - "Assign node %s continuous input memory 
failed.", node->GetName().c_str()) + "[Assign][Memory:Continuous:Input]fail for node:%s.", node->GetName().c_str()) } for (auto pair : memory_offset_) { - GELOGD("After reassign continuous memory, memory type = %ld, mem_offset = %zu.", pair.first, + GELOGD("[Reassign][Memory:Continuous]At last, memory type = %ld, mem offset = %zu", pair.first, pair.second.mem_offset_); } return ge::SUCCESS; @@ -442,11 +490,13 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start, int64_t &continuous_mem_size, int64_t memory_type, uint32_t continuous_type, bool reverse_refresh) { - GELOGI("Current node %s needs continuous input.", node->GetName().c_str()); + GELOGI("[Assign][Memory:Input:Continuous]start for Current node %s", node->GetName().c_str()); auto iter = memory_offset_.find(memory_type); if (iter == memory_offset_.end()) { - std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type); - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "find memory offset fail for mem_type:%ld, " + "for node:%s, ", memory_type, node->GetName().c_str()); + GELOGE(FAILED, "[Find][MemOffset]fail for mem_type:%ld, when AssignContinuousInputMemory for node:%s", + memory_type, node->GetName().c_str()); return FAILED; } // The head and tail of hcom continuous input should be added 512 @@ -459,8 +509,9 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, GE_CHECK_NOTNULL(op_desc); vector output_list_this = op_desc->GetOutputOffset(); if (output_list_this.empty()) { - std::string error = "node:" + FmtToStr(op_desc->GetName()) + "has no output offset"; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "No output offset in node :%s, not expected", + node->GetName().c_str()); + GELOGE(FAILED, "[Get][OutputOffset] empty is invalid, node:%s", 
node->GetName().c_str()); return FAILED; } (void) ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, is_continuous_input_allocated); @@ -480,8 +531,9 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, lx_fusion = lx_fusion && !offsets_of_fusion.empty(); if (lx_fusion) { if (peer_out_data_anchor->GetIdx() >= static_cast(offsets_of_fusion.size())) { - std::string error = "fusion: peer node" + FmtToStr(peer_op_desc->GetName()) + - " index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range."; + std::string error = "fusion: peer node:" + FmtToStr(peer_op_desc->GetName()) + + " anchor_index:" + FmtToStr(peer_out_data_anchor->GetIdx()) + + " is out of range:" + FmtToStr(offsets_of_fusion.size()); GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); return FAILED; } @@ -497,7 +549,9 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, bool is_nopadding = ((continuous_type & kTypeInputNoPadding) != 0) || lx_fusion; vector output_list = peer_op_desc->GetOutputOffset(); if (peer_out_data_anchor->GetIdx() >= static_cast(output_list.size())) { - std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range."; + std::string error = "peer node:" + FmtToStr(peer_op_desc->GetName()) + + " anchor_index:" + FmtToStr(peer_out_data_anchor->GetIdx()) + + " is out of range:" + FmtToStr(output_list.size()); GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); return FAILED; } @@ -506,15 +560,15 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, bool is_allocated_first_input = is_continuous_input_allocated && (in_data_anchor->GetIdx() == 0); if (is_allocated_first_input) { std::map out2ins; - GE_CHK_STATUS_RET(GetAllRef(node, out2ins), "Node: %s get all ref failed", node->GetName().c_str()); + GE_CHK_STATUS_RET(TryGetNodeRefIndexes(node, out2ins), "[Get][RefIndexes]fail for node: %s", node->GetName().c_str()); // output is beginning offset, set 
offset for input; only support this case now if ((out2ins.size() == 1) && (out2ins.begin()->second == 0) && (reverse_refresh)) { auto peer_output_offset = output_list.at(peer_out_data_anchor->GetIdx()); output_list.at(peer_out_data_anchor->GetIdx()) = output_list_this.at(out2ins.begin()->first); peer_op_desc->SetOutputOffset(output_list); - GELOGI("Node %s out %d ref in %d input node %s, use output offset %ld update %ld.", node->GetName().c_str(), - out2ins.begin()->first, out2ins.begin()->second, peer_op_desc->GetName().c_str(), - output_list_this.at(out2ins.begin()->first), peer_output_offset); + GELOGI("[Update][Offset]Node %s out %d ref in %d input node %s, use output offset %ld update %ld", + node->GetName().c_str(), out2ins.begin()->first, out2ins.begin()->second, + peer_op_desc->GetName().c_str(), output_list_this.at(out2ins.begin()->first), peer_output_offset); } else { GELOGD("Node %s out %d ref in %d input node %s with total ref numbers %zu.", node->GetName().c_str(), out2ins.begin()->first, out2ins.begin()->second, peer_op_desc->GetName().c_str(), out2ins.size()); @@ -542,7 +596,7 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, } GELOGI("[IMAS]Continuous input : Set %s name[%s] optype[%s] output[%d] offset to [%zu] stream_id[%ld] memtype[%ld] " - "size[%zu] realsize[%ld] nopadding size[%d].", node->GetOwnerComputeGraph()->GetName().c_str(), + "size[%zu] realsize[%ld] nopadding size[%d]", node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(), node->GetType().c_str(), peer_out_data_anchor->GetIdx(), output_list.at(peer_out_data_anchor->GetIdx()), peer_op_desc->GetStreamId(), memory_type, is_continuous_input_allocated ? 
0UL : align_size, real_size, is_nopadding); @@ -563,17 +617,29 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, Status GetFirstInputPeerOutOutputOffset(const ge::NodePtr &node, int64_t &mem_offset) { auto in_data_anchor_list = node->GetAllInDataAnchors(); if (in_data_anchor_list.empty()) { - GELOGE(FAILED, "Node %s's in data anchor is empty.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "InAnchor list empty in node:%s, not expect", + node->GetName().c_str()); + GELOGE(FAILED, "[Get][InAnchor]empty is invalid, node:%s", node->GetName().c_str()); return FAILED; } auto peer_out_data_anchor = in_data_anchor_list.at(0)->GetPeerOutAnchor(); - GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, GELOGE(ge::FAILED, "peer_out_data_anchor is null."); + GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, + REPORT_INNER_ERROR("E19999", "PeerAcnhor is null, not expect for node:%s", + node->GetName().c_str()); + GELOGE(ge::FAILED, "[Check][PeerAnchor]null is invalid, node:%s", node->GetName().c_str()); return ge::FAILED); auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc(); - GE_IF_BOOL_EXEC(peer_op_desc == nullptr, GELOGE(ge::FAILED, "peer_op_desc is null."); return ge::FAILED); + GE_IF_BOOL_EXEC(peer_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "PeerOpDesc is null, not expect for node:%s", + node->GetName().c_str()); + GELOGE(ge::FAILED, "[Check][PeerOpDesc]null is invalid, node:%s", node->GetName().c_str()); + return ge::FAILED); vector in_node_output_offsets = peer_op_desc->GetOutputOffset(); if (peer_out_data_anchor->GetIdx() >= static_cast(in_node_output_offsets.size())) { - GELOGE(FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx()); + REPORT_INNER_ERROR("E19999", "PeerAnchorIndex:%d bigger than in_offset size:%lu, judge invalid for node:%s", + peer_out_data_anchor->GetIdx(), in_node_output_offsets.size(), node->GetName().c_str()); + GELOGE(FAILED, 
"[Check][Index:PeerOutDataAnchor]PeerIndex:%d bigger than in_offset size:%lu, node:%s", + peer_out_data_anchor->GetIdx(), in_node_output_offsets.size(), node->GetName().c_str()); return FAILED; } mem_offset = in_node_output_offsets.at(peer_out_data_anchor->GetIdx()); @@ -584,11 +650,16 @@ Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node uint32_t continuous_type) { GELOGI("Current node %s needs continuous output.", node->GetName().c_str()); auto out_op_desc = node->GetOpDesc(); - GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED); + GE_IF_BOOL_EXEC(out_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "OpDesc is null, not expect for node:%s", + node->GetName().c_str()); + GELOGE(ge::FAILED, "[Check][OpDesc]null is invalid, node:%s", node->GetName().c_str())); vector output_list = out_op_desc->GetOutputOffset(); if ((out_op_desc->GetOutputsSize() > output_list.size()) || (output_list.size() == 0)) { - GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.", - out_op_desc->GetOutputsSize(), output_list.size()); + REPORT_INNER_ERROR("E19999", "Output size:%zu more than output offset size:%zu, invalid in node:%s", + out_op_desc->GetOutputsSize(), output_list.size(), node->GetName().c_str()); + GELOGE(ge::FAILED, "[Check][InnerData]Output size:%zu more than output offset size:%zu, invalid in node:%s", + out_op_desc->GetOutputsSize(), output_list.size(), node->GetName().c_str()); return ge::FAILED; } @@ -647,14 +718,17 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) { map> connecting_output_atomic_nodes; Status status = FilterAtomicNodesForMemoryAssign(normal_atomic_and_clean_nodes_map, connecting_output_atomic_nodes); if (status != SUCCESS) { - GELOGE(status, "Failed to filter atomic nodes for memory assignment."); + GELOGE(status, "[Filter][AtomicNode]failed in graph_id:%u, graph_name:%s", + compute_graph_->GetGraphID(), 
compute_graph_->GetName().c_str()); return status; } auto mem_iter = memory_offset_.find(RT_MEMORY_HBM); if (mem_iter == memory_offset_.end()) { - std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM); - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ does not have type[HBM], not expected, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData]memory_offset_ does not have memory type[HBM]" + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return FAILED; } @@ -670,7 +744,7 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) { vector mem_offset_end; status = AssignAtomicOutputAndWorkspaceMemory(atomic_node, mem_offset_end); if (status != SUCCESS) { - GELOGE(status, "Assign atomic output and workspace memory failed, node name is %s.", + GELOGE(status, "[Assign][Memory]output atomic mem and workspace mem, fail for node name is %s.", atomic_node->GetName().c_str()); return status; } @@ -679,7 +753,7 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) { int64_t atomic_mem_size = static_cast(mem_iter->second.mem_offset_) - atomic_mem_start; if (atomic_mem_size != 0) { GE_CHK_STATUS_RET(SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}, RT_MEMORY_HBM), - "Failed to set attr for atomic addr clean node %s.", iter.first->GetName().c_str()); + "[Set][Attr]fail for atomic addr clean node %s.", iter.first->GetName().c_str()); } } batch_max_mem_offset = std::max(batch_max_mem_offset, static_cast(mem_iter->second.mem_offset_)); @@ -690,7 +764,8 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) { for (auto &iter_batch : connecting_output_atomic_nodes) { mem_iter->second.mem_offset_ = batch_atomic_mem_start; if (AssignConnectNetOutputAtomicMemory(iter_batch.second) != SUCCESS) { - 
GELOGE(FAILED, "Failed to assign memory of nodes that connect to netoutput."); + GELOGE(FAILED, "[Assign][Memory]for nodes that connect to netoutput failed." + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return FAILED; } batch_max_mem_offset = std::max(batch_max_mem_offset, static_cast(mem_iter->second.mem_offset_)); @@ -721,9 +796,10 @@ Status GraphMemoryAssigner::FilterAtomicNodesForMemoryAssign( // If GetBool fail, is_reference is false. (void) ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_REFERENCE, is_reference); if (is_reference) { - std::string error = "Op" + FmtToStr(peer_in_node_desc->GetName()) + - " cannot have both atomic and is_reference attribute."; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "Op:%s cannot have both atomic and is_reference attribute, " + "not support now", peer_in_node_desc->GetName().c_str()); + GELOGE(FAILED, "[Check][Attr]Op:%s cannot have both atomic and is_reference attribute, " + "not support now", peer_in_node_desc->GetName().c_str()); return ge::PARAM_INVALID; } @@ -761,7 +837,7 @@ Status GraphMemoryAssigner::AssignAtomicOutputAndWorkspaceMemory(const ge::NodeP // Assign atomic node output memory Status ret = AssignAtomicOutputMemory(node, mem_offset_end); if (ret != SUCCESS) { - GELOGE(ret, "Failed to assign atomic output memory, node is %s.", node_op_desc->GetName().c_str()); + GELOGE(ret, "[Assign][Memory:Ouput:Atomic]Failed for node:%s.", node_op_desc->GetName().c_str()); return ret; } @@ -781,7 +857,7 @@ Status GraphMemoryAssigner::AssignAtomicOutputAndWorkspaceMemory(const ge::NodeP ret = AssignOrdinaryAtomicWorkspaceMemory(node_op_desc, atomic_workspace_info, mem_offset_end); } if (ret != SUCCESS) { - GELOGE(ret, "Assign atomic workspace memory failed, node is %s.", node_op_desc->GetName().c_str()); + GELOGE(ret, "[Assign][Memory:Atomic:Workspace]fail for node:%s.", node_op_desc->GetName().c_str()); return ret; } } else { @@ 
-794,8 +870,10 @@ Status GraphMemoryAssigner::AssignAtomicOutputAndWorkspaceMemory(const ge::NodeP Status GraphMemoryAssigner::AssignConnectNetOutputAtomicMemory(vector &connect_netoutput_nodes) { auto iter = memory_offset_.find(RT_MEMORY_HBM); if (iter == memory_offset_.end()) { - std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM); - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ does not have type[HBM], not expected, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData]memory_offset_ does not have memory type[HBM]" + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return FAILED; } for (auto &node : connect_netoutput_nodes) { @@ -811,13 +889,14 @@ Status GraphMemoryAssigner::AssignConnectNetOutputAtomicMemory(vector & node->GetName().c_str(), node->GetOpDesc()->GetType().c_str(), original_atomic_mem_start); vector mem_offset_end; if (AssignAtomicOutputAndWorkspaceMemory(node, mem_offset_end) != SUCCESS) { - GELOGE(FAILED, "Assign atomic output and workspace memory failed, node is %s.", node->GetName().c_str()); + GELOGE(FAILED, "[Assign][Memory]output atomic mem and workspace mem, fail for node name is %s.", + node->GetName().c_str()); return FAILED; } // All atomic nodes use atomic_addr_clean op independently, so we need to set the attr separately. 
if (SetIndependentAtomicAttr(node, original_atomic_mem_start, mem_offset_end, RT_MEMORY_HBM) != SUCCESS) { - GELOGE(FAILED, "Failed to set atomic attr separately."); + GELOGE(FAILED, "[Set][Attr:IndependentAtomic]fail for node:%s", node->GetName().c_str()); return FAILED; } } @@ -842,8 +921,10 @@ Status GraphMemoryAssigner::AssignReferenceMemory() { vector output_list = out_op_desc->GetOutputOffset(); if (out_op_desc->GetOutputsSize() > output_list.size()) { - GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.", - out_op_desc->GetOutputsSize(), output_list.size()); + REPORT_INNER_ERROR("E19999", "Output size:%zu more than output offset size:%zu, judge invalid in node:%s", + out_op_desc->GetOutputsSize(), output_list.size(), node->GetName().c_str()); + GELOGE(ge::FAILED, "[Check][InnerData]Output size:%zu more than output offset size:%zu, invalid in node:%s", + out_op_desc->GetOutputsSize(), output_list.size(), node->GetName().c_str()); return ge::FAILED; } @@ -896,9 +977,12 @@ bool GraphMemoryAssigner::CheckInputIsSupportAtomic(const ge::NodePtr &node) { } if ((peer_op_desc->GetType() == CONSTANTOP) || (peer_op_desc->GetType() == AIPP_DATA_TYPE) || (peer_op_desc->GetType() == VARIABLE)) { - std::string error = "Op" + FmtToStr(node->GetName()) + "'s peer out node" + - FmtToStr(peer_op_desc->GetName()) + " is invalid, Constant/AippData/Variable is not supported"; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "node(type:%s, name:%s) link to atomic node(name:%s), " + "this situation not supported now", + peer_op_desc->GetType().c_str(), peer_op_desc->GetName().c_str(), node->GetName().c_str()); + GELOGE(ge::FAILED, "[Check][Link]node(type:%s, name:%s) link to atomic node(name:%s), " + "this situation not supported now", + peer_op_desc->GetType().c_str(), peer_op_desc->GetName().c_str(), node->GetName().c_str()); return false; } } @@ -918,22 +1002,26 @@ Status 
GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node, ve // Check atomic output vector output_list = op_desc->GetOutputOffset(); if (atomic_output_index.size() > output_list.size()) { - std::string error = "Op" + FmtToStr(node->GetName()) + - "'s size of atomic_output_index is more than the size of output_list"; + std::string error = + "Op:" + FmtToStr(node->GetName()) + "'s size:" + FmtToStr(atomic_output_index.size()) + + " of atomic_output_index is more than the size:" + FmtToStr(output_list.size()) + " of output_list"; GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); return ge::FAILED; } auto output_list_size = static_cast(output_list.size()); auto iter = memory_offset_.find(RT_MEMORY_HBM); if (iter == memory_offset_.end()) { - std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM); - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ does not have type[HBM], not expected, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData]memory_offset_ does not have memory type[HBM]" + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return FAILED; } for (auto &output_index : atomic_output_index) { if (output_index >= output_list_size) { - std::string error = "Op" + FmtToStr(node->GetName()) + "'s output index" + FmtToStr(output_index) + - " is more than the size" + FmtToStr(output_list_size) + " of output_list."; + std::string error = + "Op:" + FmtToStr(node->GetName()) + "'s atomic_output index:" + FmtToStr(output_index) + + " is more than the size:" + FmtToStr(output_list_size) + " of output_list."; GE_ERRORLOG_AND_ERRORMSG(ge::PARAM_INVALID, error.c_str()); return ge::PARAM_INVALID; } @@ -941,7 +1029,8 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node, ve // If the input of the cascade op needs to clear the 
atomic addr, there is no need to clear it separately here bool is_assigned_mem = false; if (GetMemoryAssignmentStatus(node, output_index, is_assigned_mem) != SUCCESS) { - GELOGE(ge::FAILED, "Failed to get memory assignment of node %s.", node->GetName().c_str()); + GELOGE(ge::FAILED, "[Get][MemoryAssignmentStatus]fail for node %s, out_index:%ld", + node->GetName().c_str(), output_index); return ge::FAILED; } @@ -981,8 +1070,9 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node, ve Status GraphMemoryAssigner::GetMemoryAssignmentStatus(const ge::NodePtr &node, int64_t output_index, bool &is_mem_assigned) { if (static_cast(output_index) >= node->GetAllOutDataAnchors().size()) { - std::string error = "Op" + FmtToStr(node->GetName()) + "'s output index" + FmtToStr(output_index) + - " is more than the size of node's AllOutDataAnchors."; + std::string error = + "Op:" + FmtToStr(node->GetName()) + "'s output index:" + FmtToStr(output_index) + + " is more than the size:" + FmtToStr(node->GetAllOutDataAnchors().size()) + " of node's AllOutDataAnchors."; GE_ERRORLOG_AND_ERRORMSG(ge::PARAM_INVALID, error.c_str()); return ge::PARAM_INVALID; } @@ -1010,8 +1100,10 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc GELOGI("Begin to reassign normal atomic memory, node = %s.", op_desc->GetName().c_str()); auto mem_type_iter = memory_offset_.find(RT_MEMORY_HBM); if (mem_type_iter == memory_offset_.end()) { - std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM); - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ does not have type[HBM], not expected, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData]memory_offset_ does not have memory type[HBM]" + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return 
FAILED; } vector workspace_vector = op_desc->GetWorkspace(); @@ -1032,8 +1124,9 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc auto workspace_index = static_cast(info_iter.first); auto workspace_size = info_iter.second; if (workspace_index >= workspace_vector.size()) { - std::string error = "The workspace index" + FmtToStr(workspace_index) + - " is more than the size" + FmtToStr(workspace_vector.size()) + " of workspace vector."; + std::string error = "The workspace index:" + FmtToStr(workspace_index) + + " is more than the size:" + FmtToStr(workspace_vector.size()) + " of workspace vector in op:" + + op_desc->GetName().c_str(); GE_ERRORLOG_AND_ERRORMSG(ge::PARAM_INVALID, error.c_str()); return ge::PARAM_INVALID; } @@ -1063,8 +1156,10 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt GELOGI("Begin to reassign fusion atomic memory, node = %s.", op_desc->GetName().c_str()); auto mem_type_iter = memory_offset_.find(RT_MEMORY_HBM); if (mem_type_iter == memory_offset_.end()) { - std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM); - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ does not have type[HBM], not expected, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData]memory_offset_ does not have memory type[HBM]" + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); return FAILED; } map> sub_node_workspace_offset; @@ -1095,7 +1190,10 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt sub_node_workspace_offset.insert(std::make_pair(iter.first, index_offset)); } if (!(op_desc->SetExtAttr(EXT_ATTR_ATOMIC_WORKSPACE_OFFSET, sub_node_workspace_offset))) { - GELOGE(FAILED, "Set EXT_ATTR_ATOMIC_WORKSPACE_OFFSET failed, op name:%s.", 
op_desc->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for node:%s", + EXT_ATTR_ATOMIC_WORKSPACE_OFFSET.c_str(), op_desc->GetName().c_str()); + GELOGE(FAILED, "[Set][Attr:%s]fail for node:%s.", + EXT_ATTR_ATOMIC_WORKSPACE_OFFSET.c_str(), op_desc->GetName().c_str()); return FAILED; } @@ -1106,7 +1204,8 @@ Status GraphMemoryAssigner::CheckOffset() { std::map anchor_to_symbol; std::map> symbol_to_anchors; if (GraphUtils::GetRefMapping(compute_graph_, symbol_to_anchors, anchor_to_symbol) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Get ref-mapping for graph %s failed.", compute_graph_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Get ref-mapping for graph %s failed", compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Get][RefMapping]fail for graph %s", compute_graph_->GetName().c_str()); return FAILED; } for (const ge::NodePtr &node : compute_graph_->GetAllNodes()) { @@ -1148,18 +1247,55 @@ Status GraphMemoryAssigner::CheckOffset() { std::string error = "Invalid workspace" + FmtToStr(ge::kInvalidOffset) + + " in node" + FmtToStr(node->GetName()); GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - GELOGE(FAILED, "Invalid workspace in node: %s workspace: %ld.", node->GetName().c_str(), ge::kInvalidOffset); return FAILED; } } + // check reuse input and output + GE_CHK_STATUS_RET(CheckRefNodeOffset(node), "[Check][Offset]fail for node: %s", node->GetName().c_str()); } + return SUCCESS; } +ge::Status GraphMemoryAssigner::CheckRefNodeOffset(const NodePtr &node) { + std::map out2ins; + GE_CHK_STATUS_RET(TryGetNodeRefIndexes(node, out2ins), "[Get][RefIndexes]fail for node: %s", node->GetName().c_str()); + auto opdesc = node->GetOpDesc(); + GE_CHECK_NOTNULL(opdesc); + auto output_list = opdesc->GetOutputOffset(); + auto input_list = opdesc->GetInputOffset(); + for (const auto &out2in : out2ins) { + auto out_i = out2in.first; + if (static_cast(out_i) >= output_list.size()) { + std::string error = "Node" + FmtToStr(opdesc->GetName()) + "output offset 
size" + + FmtToStr(output_list.size()) + "should bigger than ref out index" + FmtToStr(out_i); + GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); + return ge::FAILED; + } + auto in_i = out2in.second; + if (static_cast(in_i) >= input_list.size()) { + std::string error = "Node" + FmtToStr(opdesc->GetName()) + "input offset size" + + FmtToStr(input_list.size()) + "should bigger than ref input index" + FmtToStr(in_i); + GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); + return ge::FAILED; + } + if (output_list[out_i] != input_list[in_i]) { + std::string error = "Node" + FmtToStr(opdesc->GetName()) + "input offset " + FmtToStr(input_list[in_i]) + + "should equal to output offset" + FmtToStr(output_list[out_i]) + "with ref in" + + FmtToStr(in_i) + "to output" + FmtToStr(out_i); + GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); + return ge::FAILED; + } + } + return ge::SUCCESS; +} + ge::Status GraphMemoryAssigner::SetInputOffset() { if (memory_offset_.empty()) { - GELOGE(FAILED, "memory_offset_ is empty."); - return FAILED; + REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ empty, not expected, graph_id:%u, graph_name:%s", + compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); + GELOGE(FAILED, "[Check][InnerData:memory_offset_]empty is not expected, " + "graph_id:%u, graph_name:%s", compute_graph_->GetGraphID(), compute_graph_->GetName().c_str()); } for (auto pair : memory_offset_) { GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memtype[%ld]", compute_graph_->GetName().c_str(), @@ -1168,7 +1304,7 @@ ge::Status GraphMemoryAssigner::SetInputOffset() { for (const ge::NodePtr &node : compute_graph_->GetAllNodes()) { if (UpdateOpInputOffset(node) != ge::SUCCESS) { - GELOGE(ge::FAILED, "Update op input offset failed"); + GELOGE(ge::FAILED, "[Update][Offset:Input]fail for op:%s", node->GetName().c_str()); return ge::FAILED; } } @@ -1230,6 +1366,8 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< 
origin_input_list = tmp_op_desc->GetInputOffset(); int64_t valid_input_index = 0; bool has_mem_type_attr = ge::AttrUtils::GetListInt(tmp_op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, memory_type); + std::map out2ins; + GE_CHK_STATUS_RET(TryGetNodeRefIndexes(node, out2ins), "[Get][RefIndexes]fail for node: %s", node->GetName().c_str()); for (const auto &anchor : node->GetAllInDataAnchors()) { vector output_list; auto peer_out_anchor = anchor->GetPeerOutAnchor(); @@ -1250,17 +1388,25 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< auto ori_input_offset_list_size = origin_input_list.size(); auto mem_type_size = memory_type.size(); if ((input_size != mem_type_size) || (input_size != ori_input_offset_list_size)) { - std::string error = "fusion: node" + FmtToStr(tmp_op_desc->GetName()) + + std::string error = "Node" + FmtToStr(tmp_op_desc->GetName()) + + " input_size" + FmtToStr(input_size) + " diff from memory_type_size" + FmtToStr(mem_type_size) + " from ori_input_offset_list_size" + FmtToStr(ori_input_offset_list_size); GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); return ge::FAILED; } - // not hbm keep orignal inputoffest - // hbm inputoffset = original inputoffset + outputoffset - input_offset = (memory_type[valid_input_index] == RT_MEMORY_L1 ? 
origin_input_list[valid_input_index] - : origin_input_list[valid_input_index] + output_list.at(out_index)); + GELOGD("Node[%s] input[%d] has origin offset[%ld]", tmp_op_desc->GetName().c_str(), anchor->GetIdx(), + origin_input_list[valid_input_index]); + // L1 keep original input_offset + if (memory_type[valid_input_index] == RT_MEMORY_L1) { + input_offset = origin_input_list[valid_input_index]; + } else { + // hbm input_offset = original input_offset + output_offset + input_offset = origin_input_list[valid_input_index] + output_list.at(out_index); + // update ref output_offset when input change + GE_CHK_STATUS_RET(UpdateRefOpOutputOffset(node, out2ins, anchor->GetIdx(), input_offset), + "[Update][RefOffset]fail for node: %s", node->GetName().c_str()); + } } const auto &in_node = GetKnownInputNode(peer_out_anchor->GetOwnerNode()); if (in_node->GetType() == CONSTANT) { @@ -1268,12 +1414,8 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, input_offset)); } - GELOGD("%s node[%s] input[%ld] is set from node[%s] out index[%lu] offset[%ld]", - has_mem_type_attr ? 
"Fusion" : "", - tmp_op_desc->GetName().c_str(), - valid_input_index, - peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), - out_index, + GELOGD("Node[%s] input[%d] is set from node[%s] out index[%lu] offset[%ld]", tmp_op_desc->GetName().c_str(), + anchor->GetIdx(), peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), out_index, input_offset); input_list.emplace_back(input_offset); valid_input_index++; @@ -1282,6 +1424,30 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< return ge::SUCCESS; } +ge::Status GraphMemoryAssigner::UpdateRefOpOutputOffset(const NodePtr &node, const std::map &out2ins, + const int ref_in, const int64_t input_offset) const { + auto opdesc = node->GetOpDesc(); + GE_CHECK_NOTNULL(opdesc); + for (const auto &out2in : out2ins) { + auto out_i = out2in.first; + auto in_i = out2in.second; + if (in_i == ref_in) { + auto origin_output_list = opdesc->GetOutputOffset(); + if (static_cast(out_i) >= origin_output_list.size()) { + std::string error = "Node" + FmtToStr(opdesc->GetName()) + "output offset size" + + FmtToStr(origin_output_list.size()) + "should bigger than ref out index" + FmtToStr(out_i); + GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); + return ge::FAILED; + } + origin_output_list[out_i] = input_offset; + opdesc->SetOutputOffset(origin_output_list); + GELOGI("Node[%s] output[%d] is updated from reuse input index[%d] to offset[%ld]", opdesc->GetName().c_str(), + out_i, ref_in, input_offset); + } + } + return ge::SUCCESS; +} + ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const { GE_CHECK_NOTNULL(node->GetOpDesc()); vector input_list; @@ -1316,12 +1482,12 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const { } } else if (node->GetType() == DATA_TYPE) { if (UpdateConstArgsOffset(node, input_list) != SUCCESS) { - GELOGE(FAILED, "Update data: %s args offset failed.", node->GetName().c_str()); + GELOGE(FAILED, 
"[Update][Offset:Input:Const]fail for node:%s ", node->GetName().c_str()); return FAILED; } } else { if (UpdateOpInputOffset(node, input_list) != SUCCESS) { - GELOGE(FAILED, "Update node: %s input offset failed.", node->GetName().c_str()); + GELOGE(FAILED, "[Update][Offset:Input]fail for node:%s", node->GetName().c_str()); return FAILED; } } @@ -1361,7 +1527,7 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in peer_out_node_desc->GetName().c_str(), peer_out_node_desc->GetType().c_str()); if (peer_out_node_desc->GetType() == ATOMICADDRCLEAN) { if (SetAtomicCleanAttr(peer_out_node, memory_offset_start, memory_offset_size, memory_type) != SUCCESS) { - GELOGE(FAILED, "Set atomic clean attr failed."); + GELOGE(FAILED, "[Set][AtomicCleanAttr]fail for node:%s", peer_out_node->GetName().c_str()); return FAILED; } } @@ -1387,7 +1553,10 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const ve (void) ge::AttrUtils::GetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_START, mem_start_vector); mem_start_vector.insert(mem_start_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end()); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_START, mem_start_vector), - GELOGE(FAILED, "SetListInt failed."); + REPORT_INNER_ERROR("E19999", "Set Attr:%s failed, op_name:%s", + ATTR_NAME_AUTOMIC_ADD_START.c_str(), node_op_desc->GetName().c_str()); + GELOGE(FAILED, "[Set][Attr:%s]fail for op_name:%s", + ATTR_NAME_AUTOMIC_ADD_START.c_str(), node_op_desc->GetName().c_str()); return FAILED); std::vector mem_size_vector; @@ -1395,7 +1564,10 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const ve (void) ge::AttrUtils::GetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_MEM_SIZE, mem_size_vector); mem_size_vector.insert(mem_size_vector.end(), atomic_mem_size.begin(), atomic_mem_size.end()); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_MEM_SIZE, 
mem_size_vector), - GELOGE(FAILED, "SetListInt failed."); + REPORT_INNER_ERROR("E19999", "Set Attr:%s failed, op_name:%s", + ATTR_NAME_AUTOMIC_ADD_MEM_SIZE.c_str(), node_op_desc->GetName().c_str()); + GELOGE(FAILED, "[Set][Attr:%s]fail for op_name:%s", + ATTR_NAME_AUTOMIC_ADD_MEM_SIZE.c_str(), node_op_desc->GetName().c_str()); return FAILED); std::stringstream ss; @@ -1437,12 +1609,14 @@ ge::Status GraphMemoryAssigner::GetNodeListMemoryType(const vector &nod // In the dynamic batch scenario, the memory attributes of nodes are the same. for (auto &n : nodes) { if (mem_reuse_model == kVirtualInputNodeMemoryReuse) { - GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "input"), "Get node memory type failed.") + GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "input"), + "[Get][MemType:input]fail for node:%s", n->GetName().c_str()) break; } if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) { - GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "output"), "Get node memory type failed."); + GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "output"), + "[Get][MemType:output]fail for node:%s", n->GetName().c_str()) break; } } @@ -1478,7 +1652,7 @@ ge::Status GraphMemoryAssigner::GetNodeMemoryType(const NodePtr &node, int64_t & } if (!CheckContinuousMemType(mem_type_list)) { - GELOGE(FAILED, "Check continuous memory type failed."); + GELOGE(FAILED, "[Check][MemType:Continuous]fail for node:%s", node->GetName().c_str()); return FAILED; } // It is continuous memory and memory type is the same, so use the first memory. 
@@ -1518,7 +1692,7 @@ void GraphMemoryAssigner::PrintMemoryOffset() { } } -ge::Status GraphMemoryAssigner::GetAllRef(const NodePtr &node, map &out2ins) { +ge::Status GraphMemoryAssigner::TryGetNodeRefIndexes(const NodePtr &node, map &out2ins) const{ for (const auto &out_data_anchor : node->GetAllOutDataAnchors()) { int32_t reuse_in_index = -1; bool reuse_input_flag = GraphUtils::IsRefFromInput(out_data_anchor, reuse_in_index); @@ -1526,7 +1700,11 @@ ge::Status GraphMemoryAssigner::GetAllRef(const NodePtr &node, mapGetInDataAnchor(reuse_in_index) != nullptr) { out2ins.emplace(out_data_anchor->GetIdx(), reuse_in_index); } else { - GELOGE(FAILED, "Invalid reuse_input value %d on output %d of node %s, please check attr reuse_input", + REPORT_INNER_ERROR("E19999", "Invalid reuse_input value %d on output %d of node %s, " + "please check attr reuse_input", + reuse_in_index, out_data_anchor->GetIdx(), node->GetName().c_str()); + GELOGE(FAILED, "[Check][Attr]Invalid reuse_input value %d on output %d of node %s, " + "please check attr reuse_input", reuse_in_index, out_data_anchor->GetIdx(), node->GetName().c_str()); return FAILED; } @@ -1549,7 +1727,7 @@ bool GraphMemoryAssigner::AssignContinuousInputMemoryWithAtomicProcessDirectly( auto continuous_type = iter->second; bool continuous_input = ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0); if (continuous_input) { - GELOGI("Node %s 's precursor node %s need assign continuous input memory, store node firstly.", + GELOGI("[Store][Node] of %s cause it's precursor node %s need assign continuous input memory", input_continuous_node->GetName().c_str(), in_node->GetName().c_str()); return false; } @@ -1559,7 +1737,7 @@ bool GraphMemoryAssigner::AssignContinuousInputMemoryWithAtomicProcessDirectly( node_2_continuous_type.emplace(out_node, continuous_type); bool continuous_input = ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0); if (continuous_input) { - 
GELOGI("Node %s 's succeed node %s need assign continuous input memory, store node firstly.", + GELOGI("[Store][Node] of %s cause it's succeed node %s need assign continuous input memory", input_continuous_node->GetName().c_str(), out_node->GetName().c_str()); return false; } @@ -1575,11 +1753,12 @@ ge::Status GraphMemoryAssigner::AssignContinuousInputMemoryWithAtomicProcess(con int64_t mem_clean_size = 0; int64_t memory_type = RT_MEMORY_HBM; - GE_CHK_STATUS_RET(GetNodeMemoryType(input_continuous_node, memory_type, "input"), "Get node memory type failed."); + GE_CHK_STATUS_RET(GetNodeMemoryType(input_continuous_node, memory_type, "input"), + "[Get][MemType]fail for node:%s", input_continuous_node->GetName().c_str()); auto ret = AssignContinuousInputMemory(input_continuous_node, mem_clean_start, mem_clean_size, memory_type, continuous_type, reverse_refresh); if (ret != ge::SUCCESS) { - GELOGE(ret, "Assign continuous input memory failed!"); + GELOGE(ret, "[Assign][Memory:Input:continuous]fail for node:%s", input_continuous_node->GetName().c_str()); return ret; } @@ -1590,7 +1769,6 @@ ge::Status GraphMemoryAssigner::AssignContinuousInputMemoryWithAtomicProcess(con if (!input_indexes.empty() && input_indexes[0] == kAllInputAddrIsAtomic) { // check whether there is an atomic conflict between the current node and the peer out node if (!CheckInputIsSupportAtomic(input_continuous_node)) { - GELOGE(ge::FAILED, "There is an atomic conflict between the current node and the peer out node, not supported!"); return ge::FAILED; } @@ -1602,7 +1780,7 @@ ge::Status GraphMemoryAssigner::AssignContinuousInputMemoryWithAtomicProcess(con if (peer_out_node->GetType() == ATOMICADDRCLEAN) { ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size}, memory_type); if (ret != SUCCESS) { - GELOGE(ret, "Failed to set attr for atomic addr clean node %s.", peer_out_node->GetName().c_str()); + GELOGE(ret, "[Set][AtomicCleanAttr]fail for node:%s", 
peer_out_node->GetName().c_str()); return ret; } } @@ -1612,4 +1790,54 @@ ge::Status GraphMemoryAssigner::AssignContinuousInputMemoryWithAtomicProcess(con return ge::SUCCESS; } +Status GraphMemoryAssigner::AssignBufferPoolMemory() { + auto is_buffer_pool_mem_enable = [] (const ComputeGraphPtr &graph) -> bool { + for (NodePtr &node : graph->GetAllNodes()) { + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + bool has_attrs = op_desc->HasAttr(ATTR_NAME_BUFFER_POOL_ID) && op_desc->HasAttr(ATTR_NAME_BUFFER_POOL_SIZE); + if (has_attrs) { + return true; + } + } + return false; + }; + auto root_graph = GraphUtils::FindRootGraph(compute_graph_); + GE_CHECK_NOTNULL(root_graph); + if (root_graph->GetGraphUnknownFlag()) { + GELOGI("[Check][Enable]Unknown root graph does not support buffer pool memory, graph:%s.", + compute_graph_->GetName().c_str()); + return SUCCESS; + } + if (!is_buffer_pool_mem_enable(compute_graph_)) { + GELOGD("[Check][Enable]Buffer pool memory is not enable, graph:%s.", compute_graph_->GetName().c_str()); + return SUCCESS; + } + map mem_type_to_offset; + for (const auto &pair : memory_offset_) { + mem_type_to_offset[pair.first] = pair.second.mem_offset_; + } + BufferPoolMemAssigner buffer_pool_mem_assigner(compute_graph_, mem_type_to_offset); + Status status = buffer_pool_mem_assigner.Assign(); + if (status != SUCCESS) { + GELOGE(status, "[Assign][BufferPoolMem]Graph:%s.", compute_graph_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to assign buffer pool memory, graph:%s.", compute_graph_->GetName().c_str()); + return status; + } + int64_t mem_type = buffer_pool_mem_assigner.GetMemType(); + auto iter = memory_offset_.find(mem_type); + if (iter == memory_offset_.end()) { + GELOGE(FAILED, "[Check][MemType]Memory type is not supported, graph:%s, mem type:%ld.", + compute_graph_->GetName().c_str(), mem_type); + REPORT_INNER_ERROR("E19999", "Memory type is not supported, graph:%s, mem type:%ld.", + 
compute_graph_->GetName().c_str(), mem_type); + return FAILED; + } + iter->second.mem_offset_ = buffer_pool_mem_assigner.GetMemOffset(); + GELOGI("[Assign][BufferPoolMem]Assign buffer pool memory successfully, graph:%s, mem type:%ld, mem offset:%zu.", + compute_graph_->GetName().c_str(), mem_type, buffer_pool_mem_assigner.GetMemOffset()); + return SUCCESS; +} } // namespace ge diff --git a/ge/graph/build/memory/graph_mem_assigner.h b/ge/graph/build/memory/graph_mem_assigner.h index 756781fe..773df4e6 100755 --- a/ge/graph/build/memory/graph_mem_assigner.h +++ b/ge/graph/build/memory/graph_mem_assigner.h @@ -110,8 +110,11 @@ class GraphMemoryAssigner { ge::Status SetInputOffset(); ge::Status UpdateOpInputOffset(const NodePtr &node) const; + ge::Status UpdateRefOpOutputOffset(const NodePtr &node, const std::map &out2ins, const int ref_in, + const int64_t input_offset) const; ge::Status CheckOffset(); + ge::Status CheckRefNodeOffset(const NodePtr &node); ge::Status AssignReferenceMemory(); @@ -125,7 +128,7 @@ class GraphMemoryAssigner { ge::Status ReAssignAtomicMemory(bool is_loop_graph); - ge::Status GetAllRef(const NodePtr &node, std::map &out2ins); + ge::Status TryGetNodeRefIndexes(const NodePtr &node, std::map &out2ins) const; bool AssignContinuousInputMemoryWithAtomicProcessDirectly(const NodePtr &input_continuous_node, std::map &node_2_continuous_type); @@ -188,6 +191,8 @@ class GraphMemoryAssigner { void PrintMemoryOffset(); + Status AssignBufferPoolMemory(); + MemoryOffsetMap memory_offset_; ge::ComputeGraphPtr compute_graph_; HybridMemAssignerPtr mem_assigner_; diff --git a/ge/graph/build/memory/hybrid_mem_assigner.cc b/ge/graph/build/memory/hybrid_mem_assigner.cc index 462e190a..4ea52d9d 100755 --- a/ge/graph/build/memory/hybrid_mem_assigner.cc +++ b/ge/graph/build/memory/hybrid_mem_assigner.cc @@ -42,6 +42,7 @@ Status HybridMemAssigner::AssignMemory(std::unique_ptr &block_ Status HybridMemAssigner::Assign() { if (GraphUtils::GetRefMapping(compute_graph_, 
symbol_to_anchors_, anchor_to_symbol_) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get ref-mapping for graph %s failed", compute_graph_->GetName().c_str()); GELOGE(FAILED, "Get ref-mapping for graph %s failed.", compute_graph_->GetName().c_str()); return FAILED; } diff --git a/ge/graph/build/memory/module.mk b/ge/graph/build/memory/module.mk index 73617794..232c2fed 100755 --- a/ge/graph/build/memory/module.mk +++ b/ge/graph/build/memory/module.mk @@ -8,6 +8,7 @@ local_lib_src_files := memory_assigner.cc \ hybrid_mem_assigner.cc \ max_block_mem_assigner.cc \ var_mem_assign_util.cc \ + buffer_pool_mem_assigner.cc \ local_lib_inc_path := ${LOCAL_PATH} \ ${TOPDIR}inc \ diff --git a/ge/graph/build/memory/var_mem_assign_util.cc b/ge/graph/build/memory/var_mem_assign_util.cc index f910d2e2..4cbde2af 100755 --- a/ge/graph/build/memory/var_mem_assign_util.cc +++ b/ge/graph/build/memory/var_mem_assign_util.cc @@ -53,6 +53,8 @@ Status VarMemAssignUtil::AssignStaticMemory2Node(ge::ComputeGraphPtr &compute_gr GE_IF_BOOL_EXEC(ge::AttrUtils::GetStr(n->GetOpDesc(), REF_VAR_SRC_VAR_NAME, ref_var_src_var_name), continue); string node_name = n->GetName(); GE_IF_BOOL_EXEC(n->GetOpDesc()->GetAllOutputsDesc().empty(), + REPORT_INNER_ERROR("E19999", "check node:%s has no OutputDesc", + n->GetName().c_str()); GELOGE(FAILED, "node:%s has no OutputDesc.", n->GetName().c_str()); return FAILED); ge::ConstGeTensorDescPtr tensor_desc = n->GetOpDesc()->GetOutputDescPtr(0); @@ -116,6 +118,8 @@ Status VarMemAssignUtil::SetOutVariableAttr(const ge::NodePtr &node, const ge::N GE_CHECK_NOTNULL(node->GetOpDesc()); output_list = node->GetOpDesc()->GetOutputOffset(); if (output_list.empty()) { + REPORT_INNER_ERROR("E19999", "check node:%s output_offset_list is empty", + node->GetName().c_str()); GELOGE(PARAM_INVALID, "Output_list is empty"); return PARAM_INVALID; } @@ -126,7 +130,12 @@ Status VarMemAssignUtil::SetOutVariableAttr(const ge::NodePtr &node, const ge::N 
VarManager::Instance(session_id)->GetVarAddr(var_node->GetName(), var_tensor_desc, &dev_ptr, memory_type)); int out_list_size = static_cast(output_list.size()); - GE_CHK_BOOL_RET_STATUS(index < out_list_size, FAILED, "index %d >= output_list.size() %d", index, out_list_size); + if (index >= out_list_size) { + REPORT_INNER_ERROR("E19999", "param index:%d >= output_list.size() %d in node %s, check invalid", + index, out_list_size, node->GetName().c_str()); + GELOGE(FAILED, "index %d >= output_list.size() %d", index, out_list_size); + return FAILED; + } output_list[index] = static_cast(reinterpret_cast(dev_ptr)); GELOGI("Assign node outputOffset[index] is: %ld", output_list[index]); @@ -168,9 +177,13 @@ Status VarMemAssignUtil::DealBroadCastNode(uint32_t graph_id, const ge::NodePtr auto broad_cast_index = static_cast(broad_cast_info.idx); auto input_tensor_desc_ptr_vistor = op_desc->GetAllInputsDescPtr(); - GE_CHK_BOOL_RET_STATUS(input_tensor_desc_ptr_vistor.size() > broad_cast_index, FAILED, - "Get broadcast op %s input tensor desc size [%zu] < idx [%d]", node->GetName().c_str(), - input_tensor_desc_ptr_vistor.size(), broad_cast_info.idx); + if (input_tensor_desc_ptr_vistor.size() <= broad_cast_index) { + REPORT_INNER_ERROR("E19999", "Get broadcast op %s input tensor desc size [%zu] < idx [%d]", + node->GetName().c_str(), input_tensor_desc_ptr_vistor.size(), broad_cast_info.idx); + GELOGE(FAILED, "Get broadcast op %s input tensor desc size [%zu] < idx [%d]", node->GetName().c_str(), + input_tensor_desc_ptr_vistor.size(), broad_cast_info.idx); + return FAILED; + } const ge::GeTensorDescPtr input_tensor_desc = input_tensor_desc_ptr_vistor.at(static_cast(broad_cast_info.idx)); int64_t input_size = 0; @@ -298,6 +311,7 @@ Status VarMemAssignUtil::SetOutTransNodeToAssign(const ge::NodePtr &node, const } Status VarMemAssignUtil::AssignMemory2HasRefAttrNode(ge::ComputeGraphPtr &compute_graph) { + GraphToNodeMap graph_to_node; for (const ge::NodePtr &n : 
compute_graph->GetAllNodes()) { string ref_var_src_var_name; auto op_desc = n->GetOpDesc(); @@ -305,7 +319,8 @@ Status VarMemAssignUtil::AssignMemory2HasRefAttrNode(ge::ComputeGraphPtr &comput for (uint32_t idx = 0; idx < op_desc->GetOutputsSize(); idx += 1) { const auto out_desc = op_desc->MutableOutputDesc(idx); if (ge::AttrUtils::GetStr(out_desc, REF_VAR_SRC_VAR_NAME, ref_var_src_var_name)) { - GE_CHK_STATUS_RET(AssignData2VarRef(n, ref_var_src_var_name, compute_graph->GetSessionID(), idx)); + GE_CHK_STATUS_RET( + AssignData2VarRef(n, ref_var_src_var_name, compute_graph->GetSessionID(), idx, graph_to_node)); } } } @@ -313,16 +328,37 @@ Status VarMemAssignUtil::AssignMemory2HasRefAttrNode(ge::ComputeGraphPtr &comput } Status VarMemAssignUtil::AssignData2VarRef(const ge::NodePtr &has_ref_attr_node, const string &src_var_name, - uint64_t session_id, uint32_t out_index) { + uint64_t session_id, uint32_t out_index, + GraphToNodeMap &graph_to_node) { // Get ref_var_src_var address auto root_graph = GraphUtils::FindRootGraph(has_ref_attr_node->GetOwnerComputeGraph()); GE_CHECK_NOTNULL(root_graph); - ge::NodePtr var_ref_src_var = root_graph->FindNode(src_var_name); - if (var_ref_src_var == nullptr) { + // Cache mapping (name to nodeptr) simproves query performance + auto &name_to_node = graph_to_node[root_graph]; + if (name_to_node.empty()) { + for (const ge::NodePtr &n : root_graph->GetDirectNode()) { + name_to_node.emplace(n->GetName(), n); + } + for (auto sub_graph : root_graph->GetAllSubgraphs()) { + auto &name_to_node_sub = graph_to_node[sub_graph]; + if (name_to_node_sub.empty()) { + for (const ge::NodePtr &n : sub_graph->GetDirectNode()) { + name_to_node_sub.emplace(n->GetName(), n); + } + } + } + } + + ge::NodePtr var_ref_src_var = nullptr; + auto it = name_to_node.find(src_var_name); + if ((it != name_to_node.end()) && (it->second != nullptr)) { + var_ref_src_var = it->second; + } else { for (auto sub_graph : root_graph->GetAllSubgraphs()) { - auto node_ptr = 
sub_graph->FindNode(src_var_name); - if (node_ptr != nullptr) { - var_ref_src_var = node_ptr; + auto &name_to_node_sub = graph_to_node[sub_graph]; + it = name_to_node_sub.find(src_var_name); + if ((it != name_to_node_sub.end()) && (it->second != nullptr)) { + var_ref_src_var = it->second; break; } } diff --git a/ge/graph/build/memory/var_mem_assign_util.h b/ge/graph/build/memory/var_mem_assign_util.h index f0e6270d..9528dbdb 100644 --- a/ge/graph/build/memory/var_mem_assign_util.h +++ b/ge/graph/build/memory/var_mem_assign_util.h @@ -22,6 +22,8 @@ #include "graph/utils/node_utils.h" namespace ge { +using GraphToNodeMap = std::map>; + class VarMemAssignUtil { public: static Status AssignVarMemory(ge::ComputeGraphPtr &compute_graph); @@ -47,7 +49,7 @@ class VarMemAssignUtil { static Status DealTransNode(const ge::NodePtr &final_trans_node); static Status DealExportTransNode(const ge::NodePtr &node, const ge::NodePtr &final_trans_node); static Status AssignData2VarRef(const ge::NodePtr &variable_ref, const std::string &src_var_name, uint64_t session_id, - uint32_t out_index); + uint32_t out_index, GraphToNodeMap &graph_to_node); static Status SetOutTransNodeToAssign(const ge::NodePtr &node, const ge::NodePtr &final_trans_node, size_t index); }; diff --git a/ge/graph/build/model_builder.cc b/ge/graph/build/model_builder.cc index 1a14374d..6f427683 100755 --- a/ge/graph/build/model_builder.cc +++ b/ge/graph/build/model_builder.cc @@ -116,11 +116,15 @@ Status ModelBuilder::CalcOutputSize(const ge::NodePtr &n) { int64_t size_temp = 0; graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(desc_temp, size_temp); if (graph_status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get tensor size in bytes failed for op:%s(%s) index:%u", + node_op_desc->GetName().c_str(), node_op_desc->GetType().c_str(), index); GELOGE(graph_status, "GetTensorMemorySizeInBytes failed!"); return FAILED; } TensorUtils::SetSize(desc_temp, size_temp); if 
(node_op_desc->UpdateOutputDesc(index, desc_temp) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update Output desc size failed for op:%s(%s) index:%u", + node_op_desc->GetName().c_str(), node_op_desc->GetType().c_str(), index); GELOGE(FAILED, "UpdateOutputDesc failed."); return FAILED; } @@ -197,8 +201,7 @@ void ModelBuilder::SetInputIsConst(const ge::NodePtr &n) { } } - std::string input_const_info = ToString(is_input_const); - GELOGD("update opdesc:%s InputConst:%s", node_op_desc->GetName().c_str(), input_const_info.c_str()); + GELOGD("update opdesc:%s InputConst:%s", node_op_desc->GetName().c_str(), ToString(is_input_const).c_str()); node_op_desc->SetIsInputConst(is_input_const); } @@ -207,11 +210,15 @@ Status ModelBuilder::AdjustConstWeightSize(const ge::NodePtr &node, size_t &mem_ if (node->GetType() == CONSTANT) { vector weights = OpDescUtils::MutableWeights(node); if (weights.empty()) { + REPORT_INNER_ERROR("E19999", "Check weights size of node %s(%s) is empty", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "weights size of node %s is empty", node->GetName().c_str()); return FAILED; } GeTensorPtr weight = weights[0]; if (weight == nullptr) { + REPORT_INNER_ERROR("E19999", "Check weight of node %s(%s) is nullptr", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "weights[0] is null."); return FAILED; } @@ -353,6 +360,9 @@ Status ModelBuilder::AdjustInputTensorFlag() { auto input_desc = owner_node_op_desc->GetInputDesc(in_anchors->GetIdx()); ge::TensorUtils::SetInputTensor(input_desc, true); if (owner_node_op_desc->UpdateInputDesc(in_anchors->GetIdx(), input_desc) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update Input desc size failed for op:%s(%s) index:%u", + owner_node_op_desc->GetName().c_str(), owner_node_op_desc->GetType().c_str(), + in_anchors->GetIdx()); GELOGE(FAILED, "UpdateOutputDesc failed."); return FAILED; } @@ -381,33 +391,51 @@ Status ModelBuilder::BuildModelDef(ge::Model &model) { max_mem_offset_ = 
mem_type_to_mem_offset_[RT_MEMORY_HBM]; GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_MEMORY_SIZE, max_mem_offset_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_MEMORY_SIZE.c_str()); GELOGE(FAILED, "SetInt of ATTR_MODEL_MEMORY_SIZE failed."); return FAILED); if (mem_type_to_mem_offset_.find(RT_MEMORY_P2P_DDR) != mem_type_to_mem_offset_.end()) { p2p_mem_offset_ = mem_type_to_mem_offset_[RT_MEMORY_P2P_DDR]; } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_P2P_MEMORY_SIZE, p2p_mem_offset_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_P2P_MEMORY_SIZE.c_str()); GELOGE(FAILED, "SetInt of ATTR_MODEL_P2P_MEMORY_SIZE failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_WEIGHT_SIZE, weight_offset_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_WEIGHT_SIZE.c_str()); GELOGE(FAILED, "SetInt of ATTR_MODEL_WEIGHT_SIZE failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_STREAM_NUM, stream_num_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_STREAM_NUM.c_str()); GELOGE(FAILED, "SetInt of ATTR_MODEL_STREAM_NUM failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_EVENT_NUM, event_num_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_EVENT_NUM.c_str()); GELOGE(FAILED, "SetInt of ATTR_MODEL_EVENT_NUM failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(&model, ATTR_MODEL_HUGE_STREAM_LIST, huge_streams_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_HUGE_STREAM_LIST.c_str()); GELOGE(FAILED, "SetInt of ATTR_MODEL_HUGE_STREAM_LIST failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_LABEL_NUM, label_num_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_LABEL_NUM.c_str()); GELOGE(FAILED, "SetInt of ATTR_MODEL_LABEL_NUM 
failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_ZERO_COPY_MEMORY_SIZE, zero_copy_mem_size_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_ZERO_COPY_MEMORY_SIZE.c_str()); GELOGE(FAILED, "SetInt of ATTR_MODEL_ZERO_COPY_MEMORY_SIZE failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, ATTR_MODEL_OUT_NODES_NAME, GetLocalOmgContext().net_out_nodes), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_OUT_NODES_NAME.c_str()); GELOGE(FAILED, "SetListStr of ATTR_MODEL_OUT_NODES_NAME failed."); return FAILED); GELOGI("For model, max_mem_offset_: %zu, p2p_mem_size: %zu, zero_copy_mem_size_: %zu", max_mem_offset_, @@ -415,6 +443,8 @@ Status ModelBuilder::BuildModelDef(ge::Model &model) { string fp_ceiling_mode; if (ge::GetContext().GetOption("ge.fpCeilingMode", fp_ceiling_mode) == SUCCESS) { if (!ge::AttrUtils::SetStr(&model, ATTR_FP_CEILING_MODE, fp_ceiling_mode)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_FP_CEILING_MODE.c_str()); GELOGE(FAILED, "Failed to set attr ATTR_FP_CEILING_MODE"); return FAILED; } @@ -429,22 +459,30 @@ Status ModelBuilder::BuildModelDef(ge::Model &model) { int64_t core_type = (ge_core_type == kVectorCore) ? 
1 : 0; GELOGI("core_type: %ld", core_type); if (!ge::AttrUtils::SetInt(&model, ATTR_MODEL_CORE_TYPE, core_type)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_MODEL_CORE_TYPE.c_str()); GELOGE(FAILED, "SetInt of ATTR_CORE_TYPE failed."); } InitL1FusionOption(); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetBool(&model, ATTR_NAME_SWITCH_FOR_L1_FUSION, is_l1_fusion_enable_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_NAME_SWITCH_FOR_L1_FUSION.c_str()); GELOGE(FAILED, "SetBool of ATTR_NAME_SWITCH_FOR_L1_FUSION failed."); return FAILED); const DumpProperties &dump_properties = DumpManager::GetInstance().GetDumpProperties(session_id_); bool is_op_debug = dump_properties.IsOpDebugOpen(); if (is_op_debug) { if (!ge::AttrUtils::SetBool(&model, ATTR_OP_DEBUG_FLAG, is_op_debug)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_OP_DEBUG_FLAG.c_str()); GELOGE(FAILED, "SetBool of ATTR_OP_DEBUG_FLAG failed."); return FAILED; } uint32_t op_debug_mode = dump_properties.GetOpDebugMode(); GELOGI("Get op debug mode:%d", op_debug_mode); if (!ge::AttrUtils::SetInt(&model, ATTR_OP_DEBUG_MODE, op_debug_mode)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s in model failed", + ATTR_OP_DEBUG_MODE.c_str()); GELOGE(FAILED, "SetBool of ATTR_OP_DEBUG_MODE failed."); return FAILED; } @@ -516,6 +554,8 @@ Status ModelBuilder::MergeWeights() { // If MutableTensor failed, weight is nullptr. 
(void)ge::AttrUtils::MutableTensor(op_desc, ATTR_NAME_WEIGHTS, weight); if (weight == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get const weight in op:%s(%s)", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Can't get const op weight, name: %s", node->GetName().c_str()); return FAILED; } @@ -538,8 +578,15 @@ Status ModelBuilder::MergeWeights() { continue; } if (weight_data.data() != nullptr) { - GE_IF_BOOL_EXEC(base_addr == nullptr, GELOGE(FAILED, "Base addr is nullptr."); return FAILED); + GE_IF_BOOL_EXEC(base_addr == nullptr, + REPORT_INNER_ERROR("E19999", "Check weight in op:%s(%s) is nullptr", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(FAILED, "Base addr is nullptr."); + return FAILED); if (weight_offset_ - offset < weight_data.size()) { + REPORT_INNER_ERROR("E19999", "left weight size not enough for op:%s(%s) left_size:%zu, weight_size:%zu", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + weight_offset_ - offset, weight_data.size()); GELOGE(FAILED, "left weight size not enough. left_size:%lu, weight_size:%lu", weight_offset_ - offset, weight_data.size()); return FAILED; @@ -551,6 +598,9 @@ Status ModelBuilder::MergeWeights() { auto err = memcpy_s(reinterpret_cast(dst_ptr), SECUREC_MEM_MAX_LEN, reinterpret_cast(src_ptr), SECUREC_MEM_MAX_LEN); if (err != EOK) { + REPORT_CALL_ERROR("E19999", "mem copy failed. errret:%u, " + "dst_ptr:%lx, dst_size:%lu, src_ptr:%lx, src_size:%lu,", + err, dst_ptr, SECUREC_MEM_MAX_LEN, src_ptr, SECUREC_MEM_MAX_LEN); GELOGE(FAILED, "mem copy failed. errret:%u, " "dst_ptr:%lx, dst_size:%lu, src_ptr:%lx, src_size:%lu", err, dst_ptr, SECUREC_MEM_MAX_LEN, src_ptr, SECUREC_MEM_MAX_LEN); @@ -562,6 +612,9 @@ Status ModelBuilder::MergeWeights() { } auto err = memcpy_s(reinterpret_cast(dst_ptr), left_size, reinterpret_cast(src_ptr), left_size); if (err != EOK) { + REPORT_CALL_ERROR("E19999", "mem copy failed. 
errret:%u, " + "dst_ptr:%lx, dst_size:%lu, src_ptr:%lx, src_size:%lu,", + err, dst_ptr, SECUREC_MEM_MAX_LEN, src_ptr, SECUREC_MEM_MAX_LEN); GELOGE(FAILED, "mem copy failed. errret:%u, " "dst_ptr:%lx, dst_size:%lu, src_ptr:%lx, src_size:%lu", err, dst_ptr, SECUREC_MEM_MAX_LEN, src_ptr, SECUREC_MEM_MAX_LEN); @@ -574,6 +627,50 @@ Status ModelBuilder::MergeWeights() { return SUCCESS; } +Status ModelBuilder::SaveAtomicTBEKernel(const OpDescPtr &op_desc) { + ge::NodePtr atomic_clean_node = nullptr; + atomic_clean_node = op_desc->TryGetExtAttr("atomic_clean_node_ptr", atomic_clean_node); + if (atomic_clean_node == nullptr) { + return SUCCESS; + } + + ge::OpDescPtr atomic_op_desc = atomic_clean_node->GetOpDesc(); + GE_CHECK_NOTNULL(atomic_op_desc); + TBEKernelPtr tbe_kernel = atomic_op_desc->TryGetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); + if (tbe_kernel == nullptr) { + std::string kernel_name; + GeAttrValue::BYTES kernel_buffer; + (void) AttrUtils::GetStr(atomic_op_desc, ATTR_NAME_TBE_KERNEL_NAME, kernel_name); + (void) AttrUtils::GetBytes(atomic_op_desc, ATTR_NAME_TBE_KERNEL_BUFFER, kernel_buffer); + if (!kernel_name.empty() && (kernel_buffer.GetSize() > 0)) { + GE_CHECK_NOTNULL(kernel_buffer.GetData()); + std::vector data(kernel_buffer.GetData(), kernel_buffer.GetData() + kernel_buffer.GetSize()); + tbe_kernel = MakeShared(kernel_name, std::move(data)); + GE_CHECK_NOTNULL(tbe_kernel); + } + } + if (tbe_kernel == nullptr) { + GELOGD("Atomic_clean_node doesn't have tbe_kernel."); + return SUCCESS; + } + tbe_kernel_store_.AddTBEKernel(tbe_kernel); + GELOGD("Atomic_clean_node tbe_kernel_name %s!", tbe_kernel->GetName().c_str()); + (void) AttrUtils::SetStr(op_desc, ATOMIC_ATTR_TBE_KERNEL_NAME, tbe_kernel->GetName()); + + std::string kernel_name; + (void) AttrUtils::GetStr(atomic_op_desc, atomic_op_desc->GetName() + "_kernelname", kernel_name); + (void) AttrUtils::SetStr(op_desc, op_desc->GetName() + "_atomic_kernelname", kernel_name); + + std::string 
meta_data; + (void) AttrUtils::GetStr(atomic_op_desc, TVM_ATTR_NAME_METADATA, meta_data); + (void) AttrUtils::SetStr(op_desc, ATOMIC_ATTR_TVM_METADATA, meta_data); + + std::string json_string; + (void) AttrUtils::GetStr(atomic_op_desc, TVM_ATTR_NAME_MAGIC, json_string); + (void) AttrUtils::SetStr(op_desc, ATOMIC_ATTR_TVM_MAGIC, json_string); + return SUCCESS; +} + Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) { // Add weight ge_model.SetWeight(weight_buffer_); @@ -602,11 +699,15 @@ Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) { } GE_IF_BOOL_EXEC(tbe_kernel == nullptr, continue); if (tbe_name_set.count(tbe_kernel->GetName()) > 0) { + REPORT_INNER_ERROR("E19999", "tbe_kernel name %s can't be the same, judge for op:%s(%s),", + tbe_kernel->GetName().c_str(), n->GetName().c_str(), n->GetType().c_str()); GELOGE(FAILED, "tbe_kernel name %s can't be the same", tbe_kernel->GetName().c_str()); return FAILED; } tbe_name_set.insert(tbe_kernel->GetName()); tbe_kernel_store_.AddTBEKernel(tbe_kernel); + + GE_CHK_STATUS_RET(SaveAtomicTBEKernel(node_op_desc), "[Save][TBEKernel] save atomic tbekernel failed!"); } SetModelCheckAicpuAttr(model, aicpu_op_types, aicpu_tf_op_types); @@ -618,6 +719,8 @@ Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) { node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr()); GE_IF_BOOL_EXEC(cust_aicpu_kernel == nullptr, continue); if (aicpu_name_set.count(cust_aicpu_kernel->GetName()) > 0) { + REPORT_INNER_ERROR("E19999", "aicpu_kernel name %s can't be the same, judge for op:%s(%s),", + cust_aicpu_kernel->GetName().c_str(), n->GetName().c_str(), n->GetType().c_str()); GELOGE(FAILED, "aicpu_kernel name %s can't be the same", cust_aicpu_kernel->GetName().c_str()); return FAILED; } @@ -640,6 +743,7 @@ Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) { // Add task GeAttrValue::BYTES task_def_bytes; if 
(!AttrUtils::GetZeroCopyBytes(model, MODEL_ATTR_TASKS, task_def_bytes)) { + REPORT_CALL_ERROR("E19999", "Get attr:%s in model failed", MODEL_ATTR_TASKS.c_str()); GELOGE(INTERNAL_ERROR, "Get zero copy bytes fail."); return INTERNAL_ERROR; } @@ -675,6 +779,7 @@ void ModelBuilder::SetModelVersion(ge::Model &model) { Status ModelBuilder::PreBuildModel() { if ((compute_graph_ == nullptr) || !(compute_graph_->IsValid())) { + REPORT_INNER_ERROR("E19999", "Check compute_graph no valid"); GELOGE(FAILED, "Graph_ is not valid."); return FAILED; } @@ -754,6 +859,7 @@ Status ModelBuilder::CompileSingleOp() { // Create ge instance std::shared_ptr instance = ge::GELib::GetInstance(); if ((instance == nullptr) || !instance->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Check GELib instance not init before"); GELOGE(ge::GE_CLI_GE_NOT_INITIALIZED, "CompileSingleOp failed."); return ge::GE_CLI_GE_NOT_INITIALIZED; } @@ -775,6 +881,8 @@ Status ModelBuilder::CompileSingleOp() { (void)instance->DNNEngineManagerObj().GetDNNEngineName(node); kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { + REPORT_INNER_ERROR("E19999", "Check kernel lib name empty of op:%s(%s)", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(ge::INTERNAL_ERROR, "Get node:%s(%s) kernel lib failed.", node->GetName().c_str(), node->GetType().c_str()); return ge::INTERNAL_ERROR; @@ -785,6 +893,8 @@ Status ModelBuilder::CompileSingleOp() { if (kernel_info != nullptr) { node_vector_map[kernel_lib_name].emplace_back(node); } else { + REPORT_INNER_ERROR("E19999", "Get ops kernel info store failed for op:%s(%s), op_kernel_name:%s,", + node->GetName().c_str(), node->GetType().c_str(), kernel_lib_name.c_str()); GELOGE(ge::GE_GRAPH_PARAM_NULLPTR, "Get op %s ops kernel info store failed", node->GetName().c_str()); return ge::GE_GRAPH_PARAM_NULLPTR; } @@ -800,6 +910,8 @@ Status ModelBuilder::CompileSingleOp() { GELOGI("[GEPERFTRACE] The node size of compile op of %s is %zu", 
kernel_lib_name.c_str(), node_vector.size()); GE_TIMESTAMP_ADD(BatchCompileOp); if (ret != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Batch compile op failed, kernel lib name, node size:%zu,", + node_vector.size()); GELOGE(ret, "Compile op failed, kernel lib name is %s", kernel_lib_name.c_str()); return ret; } diff --git a/ge/graph/build/model_builder.h b/ge/graph/build/model_builder.h index 12420614..67def859 100644 --- a/ge/graph/build/model_builder.h +++ b/ge/graph/build/model_builder.h @@ -89,6 +89,8 @@ class ModelBuilder { void SetModelCheckAicpuAttr(ge::Model &model, std::set &aicpu_op_types, std::set &aicpu_tf_op_types); + Status SaveAtomicTBEKernel(const OpDescPtr &op_desc); + uint64_t session_id_; map mem_type_to_mem_offset_; diff --git a/ge/graph/build/run_context.cc b/ge/graph/build/run_context.cc index ba328840..eca8b31b 100644 --- a/ge/graph/build/run_context.cc +++ b/ge/graph/build/run_context.cc @@ -18,6 +18,7 @@ #include "common/util.h" #include "framework/common/debug/ge_log.h" #include "graph/debug/ge_attr_define.h" +#include "graph/common/omg_util.h" namespace ge { RunContextUtil::~RunContextUtil() { DestroyRtModelResources(); } @@ -27,15 +28,21 @@ Status RunContextUtil::InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_siz std::map mem_type_to_data_mem_size, uint8_t *weight_mem_base, uint64_t weight_mem_size) { if ((data_mem_size > 0) && (data_mem_base == nullptr)) { + REPORT_INNER_ERROR("E19999", "InitMemInfo param data_mem_base is null but data_mem_size = %lu", data_mem_size); GELOGE(PARAM_INVALID, "InitMemInfo param data_mem_base is null but data_mem_size = %lu.", data_mem_size); return PARAM_INVALID; } if ((weight_mem_size > 0) && (weight_mem_base == nullptr)) { + REPORT_INNER_ERROR("E19999", "InitMemInfo param weight_mem_base is null but weight_mem_size = %lu", + weight_mem_size); GELOGE(PARAM_INVALID, "InitMemInfo param weight_mem_base is null but weight_mem_size = %lu.", weight_mem_size); return PARAM_INVALID; } if 
(mem_type_to_data_mem_base.empty() || mem_type_to_data_mem_size.empty() || mem_type_to_data_mem_base.size() != mem_type_to_data_mem_size.size()) { + REPORT_INNER_ERROR("E19999", "InitMemInfo param mem_type_to_data_mem_base size[%zu] " + "is not equal to the size of mem_type_to_data_mem_size[%zu].", + mem_type_to_data_mem_base.size(), mem_type_to_data_mem_size.size()); GELOGE(PARAM_INVALID, "InitMemInfo param mem_type_to_data_mem_base size[%zu] is not equal to the size of " "mem_type_to_data_mem_size[%zu].", @@ -55,6 +62,7 @@ Status RunContextUtil::CreateRtModelResources(uint32_t stream_num, uint32_t even // Create rt model rtError_t rt_ret = rtModelCreate(&rt_model_, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "call rtModelCreate failed, ret:%d,", static_cast(rt_ret)); GELOGE(RT_FAILED, "rtModelCreate failed. rt_ret = %d", static_cast(rt_ret)); return RT_FAILED; } @@ -64,6 +72,8 @@ Status RunContextUtil::CreateRtModelResources(uint32_t stream_num, uint32_t even rtStream_t stream = nullptr; rt_ret = rtStreamCreate(&stream, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "call rtStreamCreate failed, ret:%d, index:%u,", + static_cast(rt_ret), i); GELOGE(RT_FAILED, "rtStreamCreate failed. rt_ret = %d, index = %u", static_cast(rt_ret), i); return RT_FAILED; } @@ -71,16 +81,22 @@ Status RunContextUtil::CreateRtModelResources(uint32_t stream_num, uint32_t even rt_ret = rtModelBindStream(rt_model_, stream, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "call rtModelBindStream failed, ret:%d, index:%u,", + static_cast(rt_ret), i); GELOGE(RT_FAILED, "Bind stream and model failed. rt_ret = %d, index = %u", static_cast(rt_ret), i); return RT_FAILED; } } // Create rt event + uint32_t create_flag = static_cast((event_num > kEventReuseThreshold) ? 
RT_EVENT_WITH_FLAG : + RT_EVENT_DEFAULT); for (uint32_t i = 0; i < event_num; ++i) { rtEvent_t event = nullptr; - rt_ret = rtEventCreate(&event); + rt_ret = rtEventCreateWithFlag(&event, create_flag); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "call rtEventCreate failed, ret:%d, index:%u,", + static_cast(rt_ret), i); GELOGE(RT_FAILED, "rtEventCreate failed. rt_ret = %d, index = %u", static_cast(rt_ret), i); return RT_FAILED; } @@ -92,6 +108,8 @@ Status RunContextUtil::CreateRtModelResources(uint32_t stream_num, uint32_t even rtLabel_t label = nullptr; rt_ret = rtLabelCreateV2(&label, rt_model_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "call rtLabelCreateV2 failed, ret:%d, index:%u,", + static_cast(rt_ret), i); GELOGE(RT_FAILED, "rtLabelCreate failed. rt_ret = %d, index = %u", static_cast(rt_ret), i); return RT_FAILED; } @@ -143,12 +161,15 @@ Status RunContextUtil::CreateRunContext(Model &model, const ComputeGraphPtr &gra GELOGD("Begin to Create RunContext, session_id = %lu", session_id); // check params if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param graph nullptr, session_id:%lu,", session_id); GELOGE(PARAM_INVALID, "CreateRunContext param graph is null. session_id=%lu", session_id); return PARAM_INVALID; } uint32_t stream_num = 0; if (!AttrUtils::GetInt(&model, ATTR_MODEL_STREAM_NUM, stream_num)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s failed for model, session_id:%lu,", + ATTR_MODEL_STREAM_NUM.c_str(), session_id); GELOGE(INTERNAL_ERROR, "Get stream_num attr from model_def failed. 
session_id=%lu", session_id); return INTERNAL_ERROR; } @@ -156,6 +177,8 @@ Status RunContextUtil::CreateRunContext(Model &model, const ComputeGraphPtr &gra uint32_t event_num = 0; if (!AttrUtils::GetInt(&model, ATTR_MODEL_EVENT_NUM, event_num)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s failed for model, session_id:%lu,", + ATTR_MODEL_EVENT_NUM.c_str(), session_id); GELOGE(INTERNAL_ERROR, "Get event_num attr from model failed. session_id=%lu", session_id); return INTERNAL_ERROR; } @@ -163,6 +186,8 @@ Status RunContextUtil::CreateRunContext(Model &model, const ComputeGraphPtr &gra uint32_t label_num = 0; if (!AttrUtils::GetInt(&model, ATTR_MODEL_LABEL_NUM, label_num)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s failed for model, session_id:%lu,", + ATTR_MODEL_LABEL_NUM.c_str(), session_id); GELOGE(INTERNAL_ERROR, "Get label_num attr from model failed. session_id=%lu", session_id); return INTERNAL_ERROR; } diff --git a/ge/graph/build/stream_allocator.cc b/ge/graph/build/stream_allocator.cc index bd7cf7d1..8218588f 100644 --- a/ge/graph/build/stream_allocator.cc +++ b/ge/graph/build/stream_allocator.cc @@ -27,6 +27,8 @@ #include "graph/ge_context.h" #include "graph/utils/graph_utils.h" #include "init/gelib.h" +#include "common/string_util.h" +#include "common/util/error_manager/error_manager.h" using std::map; using std::set; @@ -38,6 +40,13 @@ const int64_t kTaskNumPerNormalNode = 3; const int64_t kTaskNumPerHcclNode = 245; const char *const kTrueStr = "true"; const char *const kFalseStr = "false"; +const size_t kEventMultiplexingItemCount = 3; +const size_t kKeyWordIndex = 0; +const size_t kNodeNameIndex = 1; +const size_t kEventIdIndex = 2; +const char *const kSend = "SendTo"; +const char *const kRecv = "RecvFrom"; +const char kDelim = ';'; inline bool HasContinuousStreamLabel(const ge::OpDescPtr &op_desc, std::string &continuous_stream_label) { if (ge::AttrUtils::GetStr(op_desc, ge::ATTR_NAME_CONTINUOUS_STREAM_LABEL, continuous_stream_label)) { @@ -52,6 
+61,97 @@ bool IsHcclOp(const string &op_type) { ge::HCOMALLREDUCE, ge::HCOMREDUCESCATTER, ge::HCOMREDUCE}); return hccl_op_types.find(op_type) != hccl_op_types.end(); } + +ge::Status ParseNodeEventMultiplexing(const ge::NodePtr &node, + const std::vector &raw_event_multiplexing, + std::unordered_map>> &node_to_send, + std::unordered_map>> &node_to_recv) { + GE_CHECK_NOTNULL(node); + for (const auto &str : raw_event_multiplexing) { + std::vector ele = ge::StringUtils::Split(str, kDelim); + if (ele.size() != kEventMultiplexingItemCount) { + GELOGE(ge::PARAM_INVALID, "[Check][RawMultiplexing]Size error, node:%s, require size:%zu, actually:%zu.", + node->GetName().c_str(), kEventMultiplexingItemCount, ele.size()); + REPORT_INNER_ERROR("E19999", "Raw event multiplexing is invalid, node:%s, require size:%zu, actually:%zu.", + node->GetName().c_str(), kEventMultiplexingItemCount, ele.size()); + return ge::PARAM_INVALID; + } + int value; + try { + value = std::stoi(ele[kEventIdIndex]); + } catch (std::invalid_argument &) { + GELOGE(ge::PARAM_INVALID, "[Throw][Exception]Event id is invalid, node:%s, raw:%s.", + node->GetName().c_str(), ele[kEventIdIndex].c_str()); + REPORT_INNER_ERROR("E19999", "Event id is invalid, node:%s, raw:%s.", + node->GetName().c_str(), ele[kEventIdIndex].c_str()); + return ge::PARAM_INVALID; + } catch (std::out_of_range &) { + GELOGE(ge::PARAM_INVALID, "[Throw][Exception]Event id is out of range, node:%s, raw:%s.", + node->GetName().c_str(), ele[kEventIdIndex].c_str()); + REPORT_INNER_ERROR("E19999", "Event id is out of range, node:%s, raw:%s.", + node->GetName().c_str(), ele[kEventIdIndex].c_str()); + return ge::PARAM_INVALID; + } + if (value < 0) { + GELOGE(ge::PARAM_INVALID, "[Check][EventId]Event id is out of range, node:%s, raw:%s, value:%d.", + node->GetName().c_str(), ele[kEventIdIndex].c_str(), value); + REPORT_INNER_ERROR("E19999", "Event id is out of range, node:%s, raw:%s, value:%d.", + node->GetName().c_str(), 
ele[kEventIdIndex].c_str(), value); + return ge::PARAM_INVALID; + } + if (ele[kKeyWordIndex] == kSend) { + node_to_send[node].emplace_back(std::make_pair(ele[kNodeNameIndex], static_cast(value))); + } else if (ele[kKeyWordIndex] == kRecv) { + node_to_recv[node].emplace_back(std::make_pair(ele[kNodeNameIndex], static_cast(value))); + } else { + GELOGE(ge::PARAM_INVALID, "[Check][KeyWord]Key word is not supported, node:%s, key:%s.", + node->GetName().c_str(), ele[kEventIdIndex].c_str()); + REPORT_INNER_ERROR("E19999", "Key word is not supported, node:%s, key:%s.", + node->GetName().c_str(), ele[kEventIdIndex].c_str()); + return ge::PARAM_INVALID; + } + } + return ge::SUCCESS; +} + +ge::Status ParseAllNodeEventMultiplexing(const ge::ComputeGraphPtr &graph, + std::unordered_map &name_to_node_map, + std::unordered_map>> &node_to_send, + std::unordered_map>> &node_to_recv) { + for (const auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { + ge::OpDescPtr op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + name_to_node_map.insert({node->GetName(), node}); + std::vector raw_event_multiplexing; + if (!(op_desc->HasAttr(ge::ATTR_NAME_EVENT_MULTIPLEXING))) { + continue; + } + bool get_attr = ge::AttrUtils::GetListStr(op_desc, ge::ATTR_NAME_EVENT_MULTIPLEXING, raw_event_multiplexing); + if (!get_attr) { + GELOGE(ge::PARAM_INVALID, "[Get][Attr]Node:%s.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to get raw event multiplexing, node:%s.", node->GetName().c_str()); + return ge::PARAM_INVALID; + } + auto parse_ret = ParseNodeEventMultiplexing(node, raw_event_multiplexing, node_to_send, node_to_recv); + if (parse_ret != ge::SUCCESS) { + GELOGE(parse_ret, "[Parse][Eventmultiplexing]Node:%s.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to parse node event multiplexing, node:%s.", node->GetName().c_str()); + return parse_ret; + } + } + return ge::SUCCESS; +} + +std::vector GetIntersection(std::vector &a, std::vector &b) { + 
std::unordered_set ele_of_a(a.begin(), a.end()); + std::vector res; + for (auto &ele : b) { + if (ele_of_a.count(ele) > 0) { + res.emplace_back(ele); + } + } + return res; +} } // namespace namespace ge { @@ -76,6 +176,7 @@ Status StreamAllocator::AssignLogicalStreams(const std::map &m auto gelib = GELib::GetInstance(); if (gelib == nullptr) { + REPORT_INNER_ERROR("E19999", "Check GELib instance nullptr"); GELOGE(FAILED, "Get GELib instance failed."); return FAILED; } @@ -149,6 +250,12 @@ Status StreamAllocator::RefreshRealStream(int64_t &stream_num, int64_t &event_nu return status; } + status = RefreshEventsWithReuse(); + if (status != SUCCESS) { + GELOGE(status, "[Refresh][Events]RefreshEventsWithReuse failed!"); + return status; + } + status = InsertSyncEventNodes(); if (status != SUCCESS) { GELOGE(status, "InsertSyncEventNode failed!"); @@ -184,6 +291,8 @@ Status StreamAllocator::AssignSingleStream() { } if (stream_num_ > 1) { + REPORT_INNER_ERROR("E19999", "The number of ts streams is %ld, only one is supported", + stream_num_); GELOGE(FAILED, "The number of ts streams is %ld, only one is supported.", stream_num_); return FAILED; } @@ -257,6 +366,9 @@ Status StreamAllocator::SetActiveStreamsByLabel() { } } GE_CHK_BOOL_EXEC(AttrUtils::SetListInt(node->GetOpDesc(), ATTR_NAME_ACTIVE_STREAM_LIST, activated_stream_list), + REPORT_INNER_ERROR("E19999", "Set Attr:%s for op:%s(%s) failed", + ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "SetListInt failed."); return FAILED); } @@ -307,6 +419,9 @@ Status StreamAllocator::SetActiveStreamsForSubgraphs() { } if (!AttrUtils::SetListInt(first_active_node->GetOpDesc(), ATTR_NAME_ACTIVE_STREAM_LIST, active_streams)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s for op:%s(%s) failed", + ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + first_active_node->GetName().c_str(), first_active_node->GetType().c_str()); GELOGE(FAILED, "Set active streams for node %s failed.", 
first_active_node->GetName().c_str()); return FAILED; } @@ -376,6 +491,8 @@ Status StreamAllocator::InsertOneEventInTwoNodes(const NodePtr &cur_node, const } if (next_stream_id == kInvalidStream) { + REPORT_INNER_ERROR("E19999", "Stream id of next_node %s(%s) should not be %ld", + next_node->GetName().c_str(), next_node->GetType().c_str(), kInvalidStream); GELOGE(FAILED, "Stream id of next_node %s should not be %ld", next_node->GetName().c_str(), kInvalidStream); return FAILED; } @@ -589,8 +706,14 @@ Status StreamAllocator::OptimizeByStreamActivate() { // -> stream(streamSwitch) -> stream(streamActivate) -> stream(stream true or false) // No need to insert an event between node in stream(normal) and node in stream(stream true or false) bool StreamAllocator::IsRecvNodeActivatedBySendNode(const NodePtr &send_node_ptr, const NodePtr &recv_node_ptr) const { - GE_CHECK_NOTNULL_EXEC(send_node_ptr->GetOpDesc(), GELOGE(FAILED, "op desc is nullptr"); return false); - GE_CHECK_NOTNULL_EXEC(recv_node_ptr->GetOpDesc(), GELOGE(FAILED, "op desc is nullptr"); return false); + GE_CHECK_NOTNULL_EXEC(send_node_ptr->GetOpDesc(), + REPORT_INNER_ERROR("E19999", "Check param send_node_ptr nullptr"); + GELOGE(FAILED, "op desc is nullptr"); + return false); + GE_CHECK_NOTNULL_EXEC(recv_node_ptr->GetOpDesc(), + REPORT_INNER_ERROR("E19999", "Check param recv_node_ptr nullptr"); + GELOGE(FAILED, "op desc is nullptr"); + return false); auto cur_stream_id = send_node_ptr->GetOpDesc()->GetStreamId(); if (AttrUtils::HasAttr(recv_node_ptr->GetOpDesc(), ATTR_NAME_STREAM_LABEL)) { // find streamActivate node @@ -714,6 +837,8 @@ Status StreamAllocator::SplitStreams(vector> &split_streams) { continue; } if (stream_id > last_stream_id) { + REPORT_INNER_ERROR("E19999", "streamid(%ld) > last_stream_id(%ld), check invalid", + stream_id, last_stream_id); GELOGE(FAILED, "SplitStreams:streamid(%ld) > last_stream_id(%ld)", stream_id, last_stream_id); return FAILED; } @@ -727,6 +852,8 @@ Status 
StreamAllocator::SplitStreams(vector> &split_streams) { stream_continuous_2_node_num_map[continuous_stream_label]++; // return error if (stream_continuous_2_node_num_map[continuous_stream_label] > max_node_num_one_stream) { + REPORT_INNER_ERROR("E19999", "Check node[%s] stream_id[%ld] continuous stream label[%s] unsatisfied", + op_desc->GetName().c_str(), stream_id, continuous_stream_label.c_str()); GELOGE(FAILED, "SplitStreams:node[%s] stream_id[%ld] continuous stream label[%s] unsatisfied ", op_desc->GetName().c_str(), stream_id, continuous_stream_label.c_str()); return FAILED; @@ -881,6 +1008,8 @@ Status StreamAllocator::UpdateActiveStreamsForSwitchNode(NodePtr &switch_node) { GE_CHECK_NOTNULL(op_desc); if (!AttrUtils::SetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, stream_ids)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt failed."); return FAILED; } @@ -895,6 +1024,8 @@ Status StreamAllocator::InsertActiveNodesAfterSwitch(NodePtr &switch_node, vecto vector ori_active_label_list; if (!AttrUtils::GetListStr(switch_desc, ATTR_NAME_ACTIVE_LABEL_LIST, ori_active_label_list) || ori_active_label_list.empty()) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s fail for op:%s(%s)", ATTR_NAME_ACTIVE_LABEL_LIST.c_str(), + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get active label list of switch %s failed.", switch_node->GetName().c_str()); return INTERNAL_ERROR; } @@ -918,6 +1049,8 @@ Status StreamAllocator::InsertActiveNodesAfterSwitch(NodePtr &switch_node, vecto for (auto &active_node : added_active_nodes) { GE_CHECK_NOTNULL(switch_node->GetOutControlAnchor()); if (switch_node->GetOutControlAnchor()->LinkTo(active_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Link from %s to %s failed", + switch_node->GetName().c_str(), 
active_node->GetName().c_str()); GELOGE(FAILED, "Link %s to %s failed.", switch_node->GetName().c_str(), active_node->GetName().c_str()); return FAILED; } @@ -933,6 +1066,8 @@ Status StreamAllocator::UpdateActiveStreamsForActiveNode(const vector new_active_streams = active_streams; for (uint32_t logical_stream : active_streams) { if (static_cast(logical_stream) >= split_streams.size()) { + REPORT_INNER_ERROR("E19999", "Check logical stream:%u is out of range:%zu", + logical_stream, split_streams.size()); GELOGE(FAILED, "logical stream is out of range."); return FAILED; } @@ -951,6 +1086,8 @@ Status StreamAllocator::UpdateActiveStreamsForActiveNode(const vectorGetOpDesc(), ATTR_NAME_ACTIVE_STREAM_LIST, new_active_streams)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Set active streams for node %s failed.", node->GetName().c_str()); return FAILED; } @@ -991,6 +1128,8 @@ Status StreamAllocator::UpdateActiveStreamsForSubgraphs() const { new_active_streams.emplace(static_cast(new_split_stream)); active_streams.assign(new_active_streams.begin(), new_active_streams.end()); if (!AttrUtils::SetListInt(active_op, ATTR_NAME_ACTIVE_STREAM_LIST, active_streams)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + active_op->GetName().c_str(), active_op->GetType().c_str()); GELOGE(FAILED, "Set active streams for node %s failed.", active_node->GetName().c_str()); return FAILED; } @@ -1059,6 +1198,8 @@ Status StreamAllocator::SetActiveStreamsForLoop() { NodePtr pre_switch_node = FindSwitchNodeBeforeLoopActiveNode(node); if (pre_switch_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Find switch node before loop active node %s fail", + node->GetName().c_str()); GELOGE(FAILED, "find switch node before loop active node %s failed", node->GetName().c_str()); return FAILED; } @@ -1066,6 +1207,9 @@ 
Status StreamAllocator::SetActiveStreamsForLoop() { if (!AttrUtils::GetListStr(node->GetOpDesc(), ATTR_NAME_ACTIVE_LABEL_LIST, activated_label_list) || activated_label_list.empty()) { GE_CHK_BOOL_EXEC(AttrUtils::SetListInt(node->GetOpDesc(), ATTR_NAME_ACTIVE_STREAM_LIST, loop_active_streams), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", + ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "SetListInt failed."); return FAILED); for (const auto &stream_id : loop_active_streams) { @@ -1112,6 +1256,8 @@ Status StreamAllocator::CheckStreamActived() const { uint32_t stream_id = static_cast(node->GetOpDesc()->GetStreamId()); auto iter = find(active_streams.begin(), active_streams.end(), stream_id); if (iter != active_streams.end()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) cannot active its own stream %u, check invalid ", + node->GetName().c_str(), node->GetType().c_str(), stream_id); GELOGE(FAILED, "Node %s cannot active its own stream %u.", node->GetName().c_str(), stream_id); return FAILED; } @@ -1121,6 +1267,94 @@ Status StreamAllocator::CheckStreamActived() const { return SUCCESS; } +Status StreamAllocator::ReuseEvent(bool send_to, + const std::unordered_map &name_to_node_map, + const std::unordered_map>> &node_to_event_id) { + for (const auto &node_event_id : node_to_event_id) { + ge::NodePtr curr_node = node_event_id.first; + NodePtr send_node = send_to ? curr_node : nullptr; + NodePtr recv_node = send_to ? nullptr : curr_node; + for (const auto &event_pair : node_event_id.second) { + auto peer_node_iter = name_to_node_map.find(event_pair.first); + if (peer_node_iter == name_to_node_map.end()) { + GELOGE(PARAM_INVALID, "[Get][Node]Name:%s.", event_pair.first.c_str()); + REPORT_INNER_ERROR("E19999", "Failed to find node, name:%s.", event_pair.first.c_str()); + return PARAM_INVALID; + } + recv_node = send_to ? peer_node_iter->second : recv_node; + send_node = send_to ? 
send_node : peer_node_iter->second; + GE_CHECK_NOTNULL(send_node); + GE_CHECK_NOTNULL(recv_node); + auto event_id = GetIntersection(node_to_send_events_[send_node], node_to_recv_events_[recv_node]); + uint32_t new_event = event_pair.second + event_num_; + if (event_id.empty()) { + GELOGI("[Check][Optimized]Send:%s, recv:%s.", send_node->GetName().c_str(), recv_node->GetName().c_str()); + continue; + } else if (event_id.size() != 1) { + GELOGW("[Check][Event]More than one event are found between %s and %s, event num:%zu.", + send_node->GetName().c_str(), recv_node->GetName().c_str(), event_id.size()); + } + uint32_t old_event = event_id[0]; + auto reuse_event_id = [] (vector &event_list, uint32_t old_event, uint32_t new_event) -> void { + event_list.erase(std::remove(event_list.begin(), event_list.end(), old_event), event_list.end()); + event_list.push_back(new_event); + return; + }; + reuse_event_id(node_to_send_events_[send_node], old_event, new_event); + reuse_event_id(node_to_recv_events_[recv_node], old_event, new_event); + GELOGI("[Reuse][Event]Replace event successfully, send node:%s, recv node:%s, old id:%u, new id:%u.", + send_node->GetName().c_str(), recv_node->GetName().c_str(), old_event, new_event); + } + } + return ge::SUCCESS; +} + +// Refresh events to reuse events +Status StreamAllocator::RefreshEventsWithReuse() { + GELOGI("[Refresh][Events]Refresh events with reuse, stream num:%ld, original event num:%u.", stream_num_, event_num_); + if (event_num_ <= kEventReuseThreshold) { + GELOGI("[Check][ReuseThreshold]Event used num is %u, less than %u, skip reuse.", + event_num_, kEventReuseThreshold); + return SUCCESS; + } + std::unordered_map name_to_node_map; + std::unordered_map>> node_to_send; + std::unordered_map>> node_to_recv; + Status ret = ParseAllNodeEventMultiplexing(whole_graph_, name_to_node_map, node_to_send, node_to_recv); + if (ret != SUCCESS) { + GELOGE(ret, "[Parse][AllNodeEventMultiplexing]Graph:%s.", whole_graph_->GetName().c_str()); + 
REPORT_INNER_ERROR("E19999", "Failed to parse all node event multiplexing, graph:%s.", + whole_graph_->GetName().c_str()); + return ret; + } + if (node_to_send.empty() && node_to_recv.empty()) { + return SUCCESS; + } + + ret = ReuseEvent(true, name_to_node_map, node_to_send); + if (ret != SUCCESS) { + GELOGE(ret, "[Reuse][Event]Phase:Send, graph:%s.", whole_graph_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to reuse event, phase:Send, graph:%s.", whole_graph_->GetName().c_str()); + return ret; + } + + ret = ReuseEvent(false, name_to_node_map, node_to_recv); + if (ret != SUCCESS) { + GELOGE(ret, "[Reuse][Event]Phase:Recv, graph:%s.", whole_graph_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to reuse event, phase:Recv, graph:%s.", whole_graph_->GetName().c_str()); + return ret; + } + + Status status = RefreshContinuousEvents(); + if (status != SUCCESS) { + GELOGE(status, "[Refresh][ContinuousEvents]Graph:%s.", whole_graph_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to refresh continuous events, graph:%s.", whole_graph_->GetName().c_str()); + return status; + } + GELOGI("[Refresh][Events]RefreshEventsWithReuse successfully, event num:%u.", event_num_); + return SUCCESS; +} + // Refresh events to continuous events Status StreamAllocator::RefreshContinuousEvents() { // Establish a mapping relationship from old to new event id @@ -1128,8 +1362,10 @@ Status StreamAllocator::RefreshContinuousEvents() { uint32_t new_event_id = 0; for (const auto &one_pair : node_to_send_events_) { for (const auto &event_id : one_pair.second) { - old_to_new_events[event_id] = new_event_id; - new_event_id++; + if (old_to_new_events.find(event_id) == old_to_new_events.end()) { + old_to_new_events[event_id] = new_event_id; + new_event_id++; + } } } @@ -1139,6 +1375,7 @@ Status StreamAllocator::RefreshContinuousEvents() { for (size_t i = 0; i < send_events.size(); i++) { auto find_it = old_to_new_events.find(send_events[i]); if (find_it == 
old_to_new_events.end()) { + REPORT_INNER_ERROR("E19999", "Check invalid send event %u", send_events[i]); GELOGE(FAILED, "RefreshContinuousEvents: invalid send event %u", send_events[i]); return FAILED; } @@ -1152,6 +1389,7 @@ Status StreamAllocator::RefreshContinuousEvents() { for (size_t i = 0; i < recv_events.size(); i++) { auto find_it = old_to_new_events.find(recv_events[i]); if (find_it == old_to_new_events.end()) { + REPORT_INNER_ERROR("E19999", "Check invalid recv event %u", recv_events[i]); GELOGE(FAILED, "RefreshContinuousEvents: invalid recv event %u", recv_events[i]); return FAILED; } @@ -1166,6 +1404,7 @@ Status StreamAllocator::RefreshContinuousEvents() { // Insert the real send/recv node in the graph Status StreamAllocator::InsertSyncEventNodes() { + unordered_map sync_event_name; for (const auto &node : whole_graph_->GetNodes(whole_graph_->GetGraphUnknownFlag())) { // Add the node corresponding to the recv event vector recv_event_id_list; @@ -1175,12 +1414,23 @@ Status StreamAllocator::InsertSyncEventNodes() { GE_CHECK_NOTNULL(node->GetOutControlAnchor()); for (auto &event_id : recv_event_id_list) { string recv_node_name = whole_graph_->GetName() + "_Recv_" + to_string(event_id); + auto iter = sync_event_name.find(recv_node_name); + if (iter == sync_event_name.end()) { + sync_event_name[recv_node_name] = 1; + } else { + recv_node_name = recv_node_name + "_Reuse_" + to_string(iter->second); + ++(iter->second); + } OpDescPtr op_desc_ptr = MakeShared(recv_node_name, RECV); GE_CHECK_NOTNULL(op_desc_ptr); int64_t temp_stream_id = node->GetOpDesc()->GetStreamId(); op_desc_ptr->SetStreamId(temp_stream_id); - GE_CHK_BOOL_EXEC(AttrUtils::SetInt(op_desc_ptr, RECV_ATTR_EVENT_ID, event_id), GELOGE(FAILED, "SetInt failed."); + GE_CHK_BOOL_EXEC(AttrUtils::SetInt(op_desc_ptr, RECV_ATTR_EVENT_ID, event_id), + REPORT_INNER_ERROR("E19999", "Set Attr:%s for op:%s(%s) failed, event_id:%u,", + RECV_ATTR_EVENT_ID.c_str(), + node->GetName().c_str(), 
node->GetType().c_str(), event_id); + GELOGE(FAILED, "SetInt failed."); return FAILED); (void)AttrUtils::SetListStr(op_desc_ptr, ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, std::move(std::vector())); @@ -1189,6 +1439,8 @@ Status StreamAllocator::InsertSyncEventNodes() { GE_CHECK_NOTNULL(recv_node->GetOutControlAnchor()); Status status = GraphUtils::AddEdge(recv_node->GetOutControlAnchor(), node->GetInControlAnchor()); if (status != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Add edge from node %s to node %s failed", + recv_node->GetName().c_str(), node->GetName().c_str()); GELOGE(status, "Add edge for node %s and node %s failed.", recv_node->GetName().c_str(), node->GetName().c_str()); return status; @@ -1203,6 +1455,13 @@ Status StreamAllocator::InsertSyncEventNodes() { for (auto &event_id : send_event_id_list) { string send_node_name = whole_graph_->GetName() + "_Send_" + to_string(event_id); + auto iter = sync_event_name.find(send_node_name); + if (iter == sync_event_name.end()) { + sync_event_name[send_node_name] = 1; + } else { + send_node_name = send_node_name + "_Reuse_" + to_string(iter->second); + ++(iter->second); + } OpDescPtr op_desc_ptr = MakeShared(send_node_name, SEND); GE_CHECK_NOTNULL(op_desc_ptr); @@ -1217,6 +1476,8 @@ Status StreamAllocator::InsertSyncEventNodes() { GE_CHECK_NOTNULL(send_node->GetInControlAnchor()); Status status = GraphUtils::AddEdge(node->GetOutControlAnchor(), send_node->GetInControlAnchor()); if (status != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Add edge from node %s to node %s failed", + node->GetName().c_str(), send_node->GetName().c_str()); GELOGE(status, "Add edge for node %s and node %s failed.", node->GetName().c_str(), send_node->GetName().c_str()); return status; @@ -1228,6 +1489,8 @@ Status StreamAllocator::InsertSyncEventNodes() { Status status = whole_graph_->InsertGraphEvents(); if (status != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Insert Graph Events fail, graph:%s,", + whole_graph_->GetName().c_str()); 
GELOGE(status, "Graph ReorderEventNodes failed"); return status; } @@ -1248,12 +1511,16 @@ void StreamAllocator::DumpEvents() { GELOGD("After RefreshRealStream: stream %ld.", stream_id); for (const auto &node : one_pair.second) { + if (node == nullptr || node->GetOpDesc() == nullptr) { + continue; + } string send_event_str; for (const auto &send_event_id : node_to_send_events_[node]) { send_event_str += " " + to_string(send_event_id); } if (!send_event_str.empty()) { - GELOGI("node: %s, send events: %s", node->GetName().c_str(), send_event_str.c_str()); + GELOGI("node: %s, id: %ld, stream id :%ld, send events: %s.", node->GetName().c_str(), + node->GetOpDesc()->GetId(), node->GetOpDesc()->GetStreamId(), send_event_str.c_str()); } string recv_event_str; @@ -1261,7 +1528,8 @@ void StreamAllocator::DumpEvents() { recv_event_str += " " + to_string(recv_event_id); } if (!recv_event_str.empty()) { - GELOGI("node: %s, recv events: %s", node->GetName().c_str(), recv_event_str.c_str()); + GELOGI("node: %s, id: %ld, stream id :%ld, recv events: %s.", node->GetName().c_str(), + node->GetOpDesc()->GetId(), node->GetOpDesc()->GetStreamId(), recv_event_str.c_str()); } } } @@ -1274,6 +1542,8 @@ Status StreamAllocator::GetMaxStreamAndTask(bool huge_stream, uint32_t &max_stre } rtError_t ret = rtGetMaxStreamAndTask(stream_type, &max_stream_count, &max_task_count); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "call rtGetMaxStreamAndTask fail, ret:%d, stream_type:%u,", + static_cast(ret), stream_type); GELOGE(FAILED, "Get max stream and task count by rts failed."); return FAILED; } @@ -1416,6 +1686,7 @@ Status StreamAllocator::AddActiveNodes(NodePtr &switch_node, const vectorGetOutControlAnchor()); if (switch_node->GetOutControlAnchor()->Unlink(node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Unlink %s to %s failed", + switch_node->GetName().c_str(), node->GetName().c_str()); GELOGE(FAILED, "Unlink %s to %s failed.", 
switch_node->GetName().c_str(), node->GetName().c_str()); return FAILED; } GE_CHECK_NOTNULL(active_node->GetOutControlAnchor()); if (active_node->GetOutControlAnchor()->LinkTo(node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Link %s to %s failed", + active_node->GetName().c_str(), node->GetName().c_str()); GELOGE(FAILED, "Link %s to %s failed.", active_node->GetName().c_str(), node->GetName().c_str()); return FAILED; } @@ -1477,12 +1752,15 @@ Status StreamAllocator::AddActiveNodes(NodePtr &switch_node, const vector &streams = labeled_streams_[active_label]; vector active_streams(streams.begin(), streams.end()); if (!AttrUtils::SetListInt(active_node->GetOpDesc(), ATTR_NAME_ACTIVE_STREAM_LIST, active_streams)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(FAILED, "SetListInt of %s failed.", ATTR_NAME_ACTIVE_STREAM_LIST.c_str()); return FAILED; } diff --git a/ge/graph/build/stream_allocator.h b/ge/graph/build/stream_allocator.h index dd82700d..44dcd673 100644 --- a/ge/graph/build/stream_allocator.h +++ b/ge/graph/build/stream_allocator.h @@ -71,6 +71,10 @@ class StreamAllocator { Status SetActiveStreamsForLoop(); Status CheckStreamActived() const; + Status ReuseEvent(bool send_to, + const std::unordered_map &name_to_node_map, + const std::unordered_map>> &node_to_event_id); + Status RefreshEventsWithReuse(); Status RefreshContinuousEvents(); Status InsertSyncEventNodes(); diff --git a/ge/graph/build/stream_graph_optimizer.cc b/ge/graph/build/stream_graph_optimizer.cc index 05049818..c71c31be 100644 --- a/ge/graph/build/stream_graph_optimizer.cc +++ b/ge/graph/build/stream_graph_optimizer.cc @@ -14,6 +14,9 @@ * limitations under the License. 
*/ #include "stream_graph_optimizer.h" + +#include + #include "common/util.h" #include "framework/common/debug/ge_log.h" #include "graph/utils/node_utils.h" @@ -122,13 +125,16 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com GE_CHECK_NOTNULL(op_desc); int64_t stream_id = op_desc->GetStreamId(); if (static_cast(stream_id) >= run_context.graphStreamList.size()) { + REPORT_INNER_ERROR("E19999", "Check stream_id:%ld in op:%s(%s) is bigger than " + "run_context.graphStreamList.size():%zu", stream_id, op_desc->GetName().c_str(), + op_desc->GetType().c_str(), run_context.graphStreamList.size()); GELOGE(FAILED, "stream_id %ld is bigger than run_context.graphStreamList.size() %zu", stream_id, run_context.graphStreamList.size()); return FAILED; } run_context.stream = run_context.graphStreamList[stream_id]; - std::string batch_label; - (void)AttrUtils::GetStr(subgraph, ATTR_NAME_BATCH_LABEL, batch_label); + std::string batch_label; + (void)AttrUtils::GetStr(subgraph, ATTR_NAME_BATCH_LABEL, batch_label); GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu, " "batch_label: %s", subgraph->GetName().c_str(), engine_name.c_str(), stream_id, static_cast(reinterpret_cast(run_context.stream)), batch_label.c_str()); @@ -136,6 +142,9 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com GE_CHECK_NOTNULL(*iter); Status ret = (*iter)->OptimizeStreamGraph(*subgraph, run_context); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call optimize streamed subgraph failed, subgraph: %s, engine_name: %s, graph " + "Optimizer num: %zu, ret: %u", subgraph->GetName().c_str(), engine_name.c_str(), + graph_optimizers.size(), ret); GELOGE( ret, "[optimizeStreamedSubGraph]: optimize streamed subgraph failed, subgraph: %s, engine_name: %s, graph " diff --git a/ge/graph/build/task_generator.cc b/ge/graph/build/task_generator.cc index 4eda4020..633f541c 100755 --- 
a/ge/graph/build/task_generator.cc +++ b/ge/graph/build/task_generator.cc @@ -49,6 +49,7 @@ const char *const kIsLastNode = "is_last_node"; const char *const kIsInputVar = "INPUT_IS_VAR"; const char *const kIsOutputVar = "OUTPUT_IS_VAR"; const char *const kProfilingMode = "PROFILING_MODE"; +const char *const kIteratorV2 = "IteratorV2"; const uint32_t kProfilingArStep = 2; const uint64_t kProfilingFpStartLogid = 1; const uint64_t kProfilingBpEndLogid = 2; @@ -57,6 +58,7 @@ const uint64_t kProfilingArEndLogid = 4; const uint64_t kProfilingIterEndLogid = 65535; const int64_t kHashFactor = 100000; const int64_t kInvalidGroupId = -1; +const std::set kFpNodeTypes = {ge::DATA, ge::GETNEXT, kIteratorV2}; } // namespace namespace ge { TaskGenerator::TaskGenerator(uint8_t *var_mem_base, uint64_t var_mem_size) { @@ -69,6 +71,7 @@ Status TaskGenerator::GetTaskInfo(Model &model, ComputeGraphPtr &graph, uint64_t GELOGD("Begin to Get TaskInfo. session_id=%lu", session_id); // Check params if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param graph is null, session_id:%lu", session_id); GELOGE(PARAM_INVALID, "GetTaskInfo param graph is null. 
session_id=%lu", session_id); return PARAM_INVALID; } @@ -93,6 +96,8 @@ Status TaskGenerator::GetTaskInfo(Model &model, ComputeGraphPtr &graph, uint64_t op_name.push_back(iter.second); } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(model, ATTR_MODEL_TASK_INDEX_OP_NAME, op_name), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for model:%s", + ATTR_MODEL_TASK_INDEX_OP_NAME.c_str(), model.GetName().c_str()); GELOGE(FAILED, "SetListStr failed."); return FAILED); @@ -106,6 +111,8 @@ Status TaskGenerator::GetTaskInfo(Model &model, ComputeGraphPtr &graph, uint64_t for (const TaskDef &task_def_temp : task_def_list) { TaskDef *task_def = model_task_def.add_task(); if (task_def == nullptr) { + REPORT_INNER_ERROR("E19999", "Add task_def in ModelTaskDef fail, session_id:%lu, graph:%s, model:%s", + session_id, graph->GetName().c_str(), model.GetName().c_str()); GELOGE(FAILED, "task_def is nullptr."); return FAILED; } @@ -126,30 +133,44 @@ Status TaskGenerator::AddModelTaskToModel(const ModelTaskDef &model_task_def, ui RunContext &run_context) { GE_CHK_BOOL_EXEC( AttrUtils::SetInt(model, MODEL_ATTR_TASK_GEN_BASE_ADDR, reinterpret_cast(run_context.dataMemBase)), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for model:%s", + MODEL_ATTR_TASK_GEN_BASE_ADDR.c_str(), model.GetName().c_str()); GELOGE(FAILED, "SetInt MODEL_ATTR_TASK_GEN_BASE_ADDR failed."); return FAILED); GE_CHK_BOOL_EXEC( AttrUtils::SetInt(model, MODEL_ATTR_TASK_GEN_WEIGHT_ADDR, reinterpret_cast(run_context.weightMemBase)), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for model:%s", + MODEL_ATTR_TASK_GEN_WEIGHT_ADDR.c_str(), model.GetName().c_str()); GELOGE(FAILED, "SetInt MODEL_ATTR_TASK_GEN_WEIGHT_ADDR failed."); return FAILED); GE_CHK_BOOL_EXEC(AttrUtils::SetInt(model, ATTR_MODEL_TASK_GEN_VAR_ADDR, reinterpret_cast(var_mem_base_)), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for model:%s", + ATTR_MODEL_TASK_GEN_VAR_ADDR.c_str(), model.GetName().c_str()); GELOGE(FAILED, "SetInt 
ATTR_MODEL_TASK_GEN_VAR_ADDR failed."); return FAILED); GE_CHK_BOOL_EXEC(AttrUtils::SetInt(model, ATTR_MODEL_VAR_SIZE, var_mem_size_), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for model:%s", + ATTR_MODEL_VAR_SIZE.c_str(), model.GetName().c_str()); GELOGE(FAILED, "SetInt ATTR_MODEL_VAR_SIZE failed."); return FAILED); GE_CHK_BOOL_EXEC(AttrUtils::SetInt(model, MODEL_ATTR_SESSION_ID, session_id), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for mode:%s", + MODEL_ATTR_SESSION_ID.c_str(), model.GetName().c_str()); GELOGE(FAILED, "SetInt MODEL_ATTR_SESSION_ID failed."); return FAILED); size_t task_size = model_task_def.ByteSizeLong(); ge::Buffer serial_buff(task_size); if (!model_task_def.SerializePartialToArray(serial_buff.GetData(), static_cast(task_size))) { + REPORT_INNER_ERROR("E19999", "model_task_def's serialize failed, model name = %s, task_size=%zu", + model.GetName().c_str(), task_size); GELOGE(FAILED, "model_task_def's serialize failed, model name = %s, task_size=%zu.", model.GetName().c_str(), task_size); return FAILED; } if (!AttrUtils::SetZeroCopyBytes(model, MODEL_ATTR_TASKS, std::move(serial_buff))) { + REPORT_INNER_ERROR("E19999", "Set model task to model failed, model name = %s, task_size=%zu", + model.GetName().c_str(), task_size); GELOGE(FAILED, "Set model task to model failed, model name = %s, task_size=%zu.", model.GetName().c_str(), task_size); return FAILED; @@ -167,7 +188,10 @@ Status TaskGenerator::UpdateOpIsVarAttr(const OpDescPtr &op_desc, uint64_t sessi for (int64_t input : input_offsets) { input_var.push_back(VarManager::Instance(session_id)->IsVarAddr(input)); } - GE_CHK_BOOL_EXEC(AttrUtils::SetListBool(op_desc, kIsInputVar, input_var), GELOGE(FAILED, "SetListBool failed."); + GE_CHK_BOOL_EXEC(AttrUtils::SetListBool(op_desc, kIsInputVar, input_var), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", kIsInputVar, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(FAILED, "SetListBool failed."); 
return FAILED); } @@ -177,7 +201,10 @@ Status TaskGenerator::UpdateOpIsVarAttr(const OpDescPtr &op_desc, uint64_t sessi for (int64_t output : output_offsets) { output_var.push_back(VarManager::Instance(session_id)->IsVarAddr(output)); } - GE_CHK_BOOL_EXEC(AttrUtils::SetListBool(op_desc, kIsOutputVar, output_var), GELOGE(FAILED, "SetListBool failed."); + GE_CHK_BOOL_EXEC(AttrUtils::SetListBool(op_desc, kIsOutputVar, output_var), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", kIsOutputVar, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(FAILED, "SetListBool failed."); return FAILED); } return SUCCESS; @@ -252,6 +279,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra GELOGD("Beign to generate task, graph name is %s.", graph->GetName().c_str()); std::shared_ptr ge_lib = GELib::GetInstance(); if ((ge_lib == nullptr) || !ge_lib->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Check GELib instance not init before"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GenerateTask failed."); return GE_CLI_GE_NOT_INITIALIZED; } @@ -319,6 +347,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra } auto kernel_info_store = ops_kernel_manager.GetOpsKernelInfoStore(op_kernel_lib_name); if (kernel_info_store == nullptr) { + REPORT_INNER_ERROR("E19999", "Get ops kernel info store failed for op:%s(%s), op_kernel_name:%s", + node->GetName().c_str(), node->GetType().c_str(), op_kernel_lib_name.c_str()); GELOGE(INTERNAL_ERROR, "No ops kernel store or ops kernel builder found. 
node:%s(%s), op_kernel_lib_name=%s.", name.c_str(), @@ -344,6 +374,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra auto ret = OpsKernelBuilderManager::Instance().GenerateTask(*node, run_context, task_def_list); GE_TIMESTAMP_ADD(GenerateTask); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call OpsKernelBuilderManager GenerateTask fail for op:%s(%s)", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(ret, "Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task failed.", op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id); return ret; @@ -353,6 +385,9 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra size_t task_list_size_after = task_def_list.size(); // If tasks is reduced if (task_list_size_after < task_list_size_before) { + REPORT_INNER_ERROR("E19999", "Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task " + "but task num from %zu to %zu, check invalid", op_kernel_lib_name.c_str(), name.c_str(), + type.c_str(), op_id, stream_id, task_list_size_before, task_list_size_after); GELOGE(FAILED, "Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task. but task num from %zu to %zu.", op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id, task_list_size_before, task_list_size_after); @@ -417,6 +452,9 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info size_t task_list_size_before = task_def_list.size(); OpsKernelInfoStorePtr kernel_info_store = ops_kernel_manager.GetOpsKernelInfoStore(op_kernel_lib_name); if (kernel_info_store == nullptr) { + REPORT_INNER_ERROR("E19999", "Get ops kernel info store failed for op:%s(%s), op_kernel_name:%s", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + op_kernel_lib_name.c_str()); GELOGE(INTERNAL_ERROR, "Fusion: No ops kernel store or ops kernel builder found. 
fusion_node:%s(%s), op_kernel_lib_name=%s.", fusion_node_name.c_str(), fusion_node_type.c_str(), op_kernel_lib_name.c_str()); @@ -433,6 +471,9 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info int64_t op_id = op_desc->GetId(); int64_t stream_id = op_desc->GetStreamId(); if (stream_id < 0 || stream_id >= (int64_t)run_context.graphStreamList.size()) { + REPORT_INNER_ERROR("E19999", "Fusion: fusion_node[name:%s(%s), id:%ld] stream id is invalid, " + "stream list size=%zu", fusion_node_name.c_str(), fusion_node_type.c_str(), + op_id, run_context.graphStreamList.size()); GELOGE(INTERNAL_ERROR, "Fusion: fusion_node[name:%s(%s), id:%ld] stream id is invalid, stream list size=%zu", fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, run_context.graphStreamList.size()); return INTERNAL_ERROR; @@ -444,6 +485,9 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); ret = OpsKernelBuilderManager::Instance().GenerateTask(*fusion_node, run_context, task_def_list); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", " Call %s to generate fusion_node:[fusion_node_name:%s(%s), " + "id:%ld, stream_id:%ld] task failed", op_kernel_lib_name.c_str(), + fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); GELOGE(ret, "Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), " "id:%ld, stream_id:%ld] task failed.", @@ -455,6 +499,10 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info size_t task_list_size_after = task_def_list.size(); // if tasks is reduced if (task_list_size_after < task_list_size_before) { + REPORT_INNER_ERROR("E19999", "InsertProfilingTask for fusion_node:[fusion_node_name:%s(%s), kernel_name:%s" + "id:%ld, stream_id:%ld] task, but task num from %zu to %zu, check invalid", + fusion_node_name.c_str(), fusion_node_type.c_str(), 
op_kernel_lib_name.c_str(), + op_id, stream_id, task_list_size_before, task_list_size_after); GELOGE(FAILED, "Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), " "id:%ld, stream_id:%ld] task. but task num from %zu to %zu.", @@ -489,6 +537,8 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info Status TaskGenerator::UpdateAnchorStatus(const NodePtr &node) { if (NodeUtils::SetAllAnchorStatus(node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "SetAllAnchorStatus fail for op:%s(%s)", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "NodeUtils::SetAllAnchorStatus failed."); return INTERNAL_ERROR; } @@ -496,6 +546,8 @@ Status TaskGenerator::UpdateAnchorStatus(const NodePtr &node) { auto peer_anchor = anchor->GetPeerOutAnchor(); if (peer_anchor == nullptr) { if (AnchorUtils::SetStatus(anchor, ANCHOR_SUSPEND) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set in peer anchor status fail for op:%s(%s), anchor_index:%d,", + node->GetName().c_str(), node->GetType().c_str(), anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "AnchorUtils::SetStatus failed."); return INTERNAL_ERROR; } @@ -506,11 +558,15 @@ Status TaskGenerator::UpdateAnchorStatus(const NodePtr &node) { bool is_const = NodeUtils::GetConstOpType(peer_anchor->GetOwnerNode(), const_type); if (is_const && (const_type == CONSTANT)) { if (AnchorUtils::SetStatus(anchor, ANCHOR_CONST) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set in anchor CONST status fail for op:%s(%s), anchor_index:%d,", + node->GetName().c_str(), node->GetType().c_str(), anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "AnchorUtils::SetStatus failed."); return INTERNAL_ERROR; } } else { if (AnchorUtils::SetStatus(anchor, ANCHOR_DATA) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set in anchor DATA status fail for op:%s(%s), anchor_index:%d,", + node->GetName().c_str(), node->GetType().c_str(), anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "AnchorUtils::SetStatus failed."); 
return INTERNAL_ERROR; } @@ -523,12 +579,15 @@ Status TaskGenerator::UpdateAnchorStatus(const NodePtr &node) { Status TaskGenerator::MarkNodeAndSetIndex(ComputeGraphPtr &graph) { auto ge_lib = GELib::GetInstance(); if ((ge_lib == nullptr) || !ge_lib->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Check GELib instance not init before"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized or is finalized."); return GE_CLI_GE_NOT_INITIALIZED; } const auto all_nodes = graph->GetNodes(graph->GetGraphUnknownFlag()); if (all_nodes.empty()) { + REPORT_INNER_ERROR("E19999", "Check param all_nodes empty in graph:%s", + graph->GetName().c_str()); GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "Graph's node is empty"); return GE_GRAPH_GRAPH_NODE_NULL; } @@ -584,6 +643,9 @@ Status TaskGenerator::MarkFirstAndLastOps(const vector &ops, bool is_ for (auto &op_desc : continuous_ops) { string op_kernel_lib_name = op_desc->GetOpKernelLibName(); if (op_kernel_lib_name.empty()) { + REPORT_INNER_ERROR("E19999", "Get ops kernel info store failed for op:%s(%s), op_kernel_name:%s", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + op_kernel_lib_name.c_str()); GELOGE(INTERNAL_ERROR, "node:%s(%s) get op kernel lib failed.", op_desc->GetName().c_str(), op_desc->GetType().c_str()); return INTERNAL_ERROR; @@ -599,9 +661,15 @@ Status TaskGenerator::MarkFirstAndLastOps(const vector &ops, bool is_ for (auto &it : first_and_last_ops) { auto &op_pair = it.second; - GE_CHK_BOOL_EXEC(ge::AttrUtils::SetBool(op_pair.first, kIsFirstNode, true), GELOGE(FAILED, "SetBool failed."); + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetBool(op_pair.first, kIsFirstNode, true), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", kIsFirstNode, + op_pair.first->GetName().c_str(), op_pair.first->GetType().c_str()); + GELOGE(FAILED, "SetBool failed."); return FAILED); - GE_CHK_BOOL_EXEC(ge::AttrUtils::SetBool(op_pair.second, kIsLastNode, true), GELOGE(FAILED, "SetBool failed."); + 
GE_CHK_BOOL_EXEC(ge::AttrUtils::SetBool(op_pair.second, kIsLastNode, true), + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", kIsLastNode, + op_pair.second->GetName().c_str(), op_pair.second->GetType().c_str()); + GELOGE(FAILED, "SetBool failed."); return FAILED); } } @@ -621,8 +689,10 @@ Status TaskGenerator::AutoFindFpOpIndex(const ComputeGraphPtr &graph, ProfilingP if (op_kernel_lib_name.empty()) { continue; } - - if (op_desc->GetType() == GETNEXT || op_desc->GetType() == DATA) { + auto type = op_desc->GetType(); + std::string original_type; + (void)AttrUtils::GetStr(op_desc, ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, original_type); + if (kFpNodeTypes.find(type) != kFpNodeTypes.end() || kFpNodeTypes.find(original_type) != kFpNodeTypes.end()) { auto out_anchor = node->GetOutDataAnchor(0); for (auto &peer_in_anchor : out_anchor->GetPeerInDataAnchors()) { GE_CHECK_NOTNULL(peer_in_anchor); @@ -723,7 +793,9 @@ uint32_t TaskGenerator::FindLastBpFromBpNode(const ComputeGraphPtr &graph, const GELOGI("bp_op_desc is %s, id is %ld", bp_op_desc->GetName().c_str(), bp_op_desc->GetId()); } - GE_CHECK_NOTNULL(bp_op_desc); + if (bp_op_desc == nullptr) { + return last_bp; + } uint32_t current_idx = 0; for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { OpDescPtr op_desc = node->GetOpDesc(); @@ -906,6 +978,8 @@ Status TaskGenerator::InsertProfilingArTaskBefore(const OpDescPtr &op_desc, std: for (size_t i = 0; i < all_reduce_nodes.size(); i++) { if (all_reduce_nodes[i] == node_index) { GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep), + REPORT_INNER_ERROR("E19999", "Multiply result is out of range when calc profiling ar log id " + "for node:%s(%s)", op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Multiply result is out of range."); return FAILED); ar_log_id = i * kProfilingArStep + kProfilingArStartLogid; @@ -998,6 +1072,8 @@ Status TaskGenerator::InsertProfilingArTaskAfter(const OpDescPtr &op_desc, std:: for 
(size_t i = 0; i < all_reduce_nodes.size(); i++) { if (all_reduce_nodes[i] == node_index) { GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep), + REPORT_INNER_ERROR("E19999", "Multiply result is out of range when calc profiling ar log id " + "for node:%s(%s)", op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Multiply result is out of range."); return FAILED); ar_log_id = i * kProfilingArStep + kProfilingArEndLogid; @@ -1107,6 +1183,7 @@ Status TaskGenerator::SetUnknownShapeStream(RunContext &run_context, rtStream_t run_context.stream = stream; rtError_t rt_ret = rtModelBindStream(run_context.model, stream, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelBindStream failed, ret:0x%X", rt_ret); GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); GE_CHK_RT_RET(rtStreamDestroy(stream)); return FAILED; diff --git a/ge/graph/common/bcast.cc b/ge/graph/common/bcast.cc index 7948ff14..95a93897 100644 --- a/ge/graph/common/bcast.cc +++ b/ge/graph/common/bcast.cc @@ -73,6 +73,8 @@ Status BCast::SetShapeDifferentInfo(const kVecInt &x, const kVecInt &y) { y_bcast_i = x_i; grad_y_reduce_idx_.push_back(n - 1 - i); } else { + REPORT_INNER_ERROR("E19999", "SetShapeDifferentInfo failed. Two tensor shapes are not compatible " + "according to the broadcasting rule."); GELOGE(domi::PARAM_INVALID, "SetShapeDifferentInfo failed. 
Two tensor shapes are not compatible " "according to the broadcasting rule."); diff --git a/ge/graph/common/bcast.h b/ge/graph/common/bcast.h index 9df1c422..a8399896 100644 --- a/ge/graph/common/bcast.h +++ b/ge/graph/common/bcast.h @@ -111,11 +111,14 @@ class BCast { const std::function &func) { Status ret; if (func == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param func nullptr"); GELOGE(domi::PARAM_INVALID, "Param func is null"); return domi::PARAM_INVALID; } // Min input num is 2 if (input.size() < kMinDimNum) { + REPORT_INNER_ERROR("E19999", "Param input.size():%zu < %zu, check invalid", + input.size(), kMinDimNum); GELOGE(domi::PARAM_INVALID, "Input size is smaller than two."); return domi::PARAM_INVALID; } @@ -149,11 +152,14 @@ class BCast { Status BCastComputeCheck(const std::vector &input, std::vector &v_output, const std::function &func) { if (func == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param func nullptr"); GELOGE(PARAM_INVALID, "Param func is null"); return PARAM_INVALID; } // Min input num is 2 if (input.size() < kMinDimNum) { + REPORT_INNER_ERROR("E19999", "Param input.size():%zu < %zu, check invalid", + input.size(), kMinDimNum); GELOGE(PARAM_INVALID, "Input size is smaller than two."); return PARAM_INVALID; } @@ -179,6 +185,7 @@ class BCast { auto value = func((*(reinterpret_cast(x1_data) + x_index)), (*(reinterpret_cast(x2_data) + y_index)), data_type, ret); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "BCastComputeCheck func execute failed, datatype is %d.", data_type); GELOGE(ret, "BCastComputeCheck func execute failed, datatype is %d.", data_type); return ret; } diff --git a/ge/graph/common/omg_util.cc b/ge/graph/common/omg_util.cc index 5c76d0a1..7fe11f23 100644 --- a/ge/graph/common/omg_util.cc +++ b/ge/graph/common/omg_util.cc @@ -21,6 +21,8 @@ #include "framework/common/debug/ge_log.h" #include "graph/debug/ge_attr_define.h" #include "graph/utils/graph_utils.h" +#include "graph/utils/tensor_utils.h" +#include 
"common/math/math_util.h" namespace ge { /// @@ -36,6 +38,8 @@ Status GetOriginalType(const ge::NodePtr &node, string &type) { GE_CHECK_NOTNULL(node->GetOpDesc()); bool ret = ge::AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type); if (!ret) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s fail for op:%s(%s)", ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get FrameWorkOp original type [%s]", type.c_str()); return INTERNAL_ERROR; } @@ -55,6 +59,8 @@ Status SetStreamLabel(const ge::NodePtr &node, const std::string &label) { GE_CHECK_NOTNULL(tmp_desc); if (!AttrUtils::SetStr(tmp_desc, ge::ATTR_NAME_STREAM_LABEL, label)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_STREAM_LABEL.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Op: %s set ATTR_NAME_STREAM_LABEL failed", node->GetName().c_str()); return FAILED; } @@ -72,6 +78,8 @@ Status SetCycleEvent(const ge::NodePtr &node) { OpDescPtr tmp_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(tmp_desc); if (!AttrUtils::SetBool(tmp_desc, ge::ATTR_NAME_STREAM_CYCLE_EVENT_FLAG, true)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_STREAM_CYCLE_EVENT_FLAG.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Op: %s set ATTR_NAME_STREAM_CYCLE_EVENT_FLAG failed", node->GetName().c_str()); return FAILED; } @@ -90,6 +98,8 @@ Status SetActiveLabelList(const ge::NodePtr &node, const std::vectorGetOpDesc(); GE_CHECK_NOTNULL(tmp_desc); if (!AttrUtils::SetListStr(tmp_desc, ge::ATTR_NAME_ACTIVE_LABEL_LIST, active_label_list)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_ACTIVE_LABEL_LIST.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Op: %s set ATTR_NAME_ACTIVE_LABEL_LIST failed", node->GetName().c_str()); return FAILED; } @@ -108,6 +118,8 @@ Status 
SetSwitchBranchNodeLabel(const ge::NodePtr &node, const std::string &bran OpDescPtr tmp_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(tmp_desc); if (!AttrUtils::SetStr(tmp_desc, ge::ATTR_NAME_SWITCH_BRANCH_NODE_LABEL, branch_label)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_SWITCH_BRANCH_NODE_LABEL.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Op: %s set ATTR_NAME_SWITCH_BRANCH_NODE_LABEL failed", node->GetName().c_str()); return FAILED; } @@ -126,6 +138,8 @@ Status SetSwitchTrueBranchFlag(const ge::NodePtr &node, bool value) { OpDescPtr tmp_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(tmp_desc); if (!AttrUtils::SetBool(tmp_desc, ge::ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Op: %s set ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG failed", node->GetName().c_str()); return FAILED; } @@ -144,6 +158,8 @@ Status SetOriginalNodeName(const ge::NodePtr &node, const std::string &orig_name OpDescPtr tmp_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(tmp_desc); if (!AttrUtils::SetStr(tmp_desc, ge::ATTR_NAME_ORIG_NODE_NAME, orig_name)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_ORIG_NODE_NAME.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Op: %s set ATTR_NAME_ORIG_NODE_NAME failed", node->GetName().c_str()); return FAILED; } @@ -161,6 +177,8 @@ Status SetCyclicDependenceFlag(const ge::NodePtr &node) { OpDescPtr tmp_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(tmp_desc); if (!AttrUtils::SetBool(tmp_desc, ge::ATTR_NAME_CYCLIC_DEPENDENCE_FLAG, true)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_CYCLIC_DEPENDENCE_FLAG.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Op: %s set ATTR_NAME_CYCLIC_DEPENDENCE_FLAG failed", 
node->GetName().c_str()); return FAILED; } @@ -180,10 +198,50 @@ Status SetNextIteration(const ge::NodePtr &node, const std::string &next) { GE_CHECK_NOTNULL(tmp_desc); if (!AttrUtils::SetStr(tmp_desc, ge::ATTR_NAME_NEXT_ITERATION, next)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_NEXT_ITERATION.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Op: %s set ATTR_NAME_NEXT_ITERATION failed", node->GetName().c_str()); return FAILED; } return SUCCESS; } + +/// +/// @brief Align the memory +/// @param [in/out] memory size +/// @param [in] alinment +/// @return void +/// +void AlignMemSize(int64_t &mem_size, int64_t align_size) { + if (mem_size <= 0) { + return; + } + mem_size = (mem_size + align_size - 1) / align_size * align_size; +} + +/// +/// @brief Get memory size from tensor desc +/// @param [in] node +/// @param [out] memory size +/// @return Status +/// +Status GetMemorySize(const NodePtr &node, int64_t &output_size) { + GE_CHECK_NOTNULL(node->GetOpDesc()); + auto output_op_desc = node->GetOpDesc()->GetOutputDescPtr(kBufferPoolNodeOutIndex); + GE_CHECK_NOTNULL(output_op_desc); + int64_t size = 0; + auto ret = ge::TensorUtils::GetSize(*output_op_desc, size); + if (ret != ge::GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "[Get][Size]Node:%s.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to get output size, node:%s.", node->GetName().c_str()); + return INTERNAL_ERROR; + } + FMK_INT64_ADDCHECK(size, kBufferPoolMemAlignSize); + AlignMemSize(size, kBufferPoolMemAlignSize); + // The HCOM operator requires an additional 512 bytes before and after + FMK_INT64_ADDCHECK(size, (kBufferPoolMemAlignSize + kBufferPoolMemAlignSize)); + output_size = kBufferPoolMemAlignSize + size + kBufferPoolMemAlignSize; + return SUCCESS; +} } // namespace ge diff --git a/ge/graph/common/omg_util.h b/ge/graph/common/omg_util.h index 1f93c92b..561a12e0 100644 --- a/ge/graph/common/omg_util.h +++ 
b/ge/graph/common/omg_util.h @@ -27,6 +27,11 @@ #include "graph/node.h" namespace ge { +namespace { +const int64_t kBufferPoolMemAlignSize = 512; +const uint32_t kBufferPoolNodeOutIndex = 0; +const uint32_t kEventReuseThreshold = 65500; +} // namespace /// /// @brief get the Original Type of FrameworkOp /// @param [in] node @@ -96,6 +101,22 @@ Status SetCyclicDependenceFlag(const ge::NodePtr &node); /// @return Status /// Status SetNextIteration(const ge::NodePtr &node, const std::string &next); + +/// +/// @brief Align the memory +/// @param [in/out] memory size +/// @param [in] alinment +/// @return void +/// +void AlignMemSize(int64_t &mem_size, int64_t align_size); + +/// +/// @brief Get memory size from tensor desc +/// @param [in] node +/// @param [out] memory size +/// @return Status +/// +Status GetMemorySize(const NodePtr &node, int64_t &output_size); } // namespace ge #endif // GE_GRAPH_COMMON_OMG_UTIL_H_ diff --git a/ge/graph/execute/graph_execute.cc b/ge/graph/execute/graph_execute.cc index 1aee756c..5142e347 100755 --- a/ge/graph/execute/graph_execute.cc +++ b/ge/graph/execute/graph_execute.cc @@ -20,9 +20,12 @@ #include #include "graph/load/model_manager/model_manager.h" +#include "graph/load/model_manager/davinci_model.h" #include "omm/csa_interact.h" namespace ge { +using Uint32Pair = pair; +const uint32_t kInvalidModelId = UINT32_MAX; GraphExecutor::GraphExecutor() : init_flag_(false), train_graph_flag_(false), @@ -40,6 +43,7 @@ GraphExecutor::~GraphExecutor() { rtError_t rt_ret; rt_ret = rtFreeHost(buffer_addr); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFreeHost failed, ret:0x%X", rt_ret); GELOGE(RT_FAILED, "[GraphManager] subgraph free buffer failed, ret: 0x%X", rt_ret); } } @@ -51,14 +55,17 @@ GraphExecutor::~GraphExecutor() { Status GraphExecutor::SetCondition(std::mutex *mutex, std::condition_variable *cond, std::shared_ptr listener) { if (mutex == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param mutex 
nullptr"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "[SetCondition] input param mutex is nullptr."); return GE_GRAPH_PARAM_NULLPTR; } if (cond == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param cond nullptr"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "[SetCondition] input param cond is nullptr."); return GE_GRAPH_PARAM_NULLPTR; } if (listener == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param listener nullptr"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "[SetCondition] input param listener is nullptr."); return GE_GRAPH_PARAM_NULLPTR; } @@ -75,6 +82,7 @@ Status GraphExecutor::SetCondition(std::mutex *mutex, std::condition_variable *c Status GraphExecutor::SetGraphContext(GraphContextPtr graph_context_ptr) { if (graph_context_ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param graph_context_ptr nullptr"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "[SetGraphContext] input param graph_context_ptr is nullptr"); return GE_GRAPH_PARAM_NULLPTR; } @@ -101,6 +109,7 @@ Status GraphExecutor::FreeInOutBuffer() { rtError_t rt_ret; rt_ret = rtFreeHost(*iter); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFreeHost failed, ret:0x%X", rt_ret); GELOGE(RT_FAILED, "[GraphManager] subgraph free buffer failed, ret: 0x%X", rt_ret); (void)buffer_addr_.erase(buffer_addr_.begin(), iter); return GE_GRAPH_FREE_FAILED; @@ -146,6 +155,8 @@ Status GraphExecutor::MallocInOutBuffer(const std::vector &buffer_size void *tmp_buf = nullptr; rt_ret = rtMallocHost(&tmp_buf, buffer_size[i]); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%lu, ret:0x%X", + buffer_size[i], rt_ret); GELOGE(RT_FAILED, "[GraphManager] subgraph malloc buffer failed, ret: 0x%X", rt_ret); return GE_GRAPH_MALLOC_FAILED; } @@ -191,6 +202,8 @@ Status GraphExecutor::PrepareInputData(const std::vector &input_tensor rtError_t rt_ret = rtMemcpy(addrVec[i], bufferSizeVec[i], in_tensor->GetData().data(), in_tensor->GetData().size(), RT_MEMCPY_HOST_TO_HOST); if (rt_ret != RT_ERROR_NONE) { 
+ REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, dst_size:%lu, src_size:%zu, ret:0x%X", + bufferSizeVec[i], in_tensor->GetData().size(), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } @@ -250,6 +263,8 @@ Status GraphExecutor::SyncExecuteModel(uint32_t model_id, const std::vectorResetResult() != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call graph_run_listener_.ResetResult fail, model_id:%u", + model_id); GELOGE(GE_GRAPH_EXECUTE_FAILED, "Reset result failed"); return GE_GRAPH_EXECUTE_FAILED; } @@ -273,6 +288,8 @@ Status GraphExecutor::SyncExecuteModel(uint32_t model_id, const std::vectorGetResultCode(); if (result_code != SUCCESS && result_code != END_OF_SEQUENCE) { + REPORT_CALL_ERROR("E19999", "Graph_run_listener_ run fail, result:%u, model_id:%u", + result_code, model_id); GELOGE(GE_GRAPH_EXECUTE_FAILED, "[GraphExecutor] execute model failed, ret=%u, modelId=%u.", result_code, model_id); return GE_GRAPH_EXECUTE_FAILED; @@ -281,10 +298,14 @@ Status GraphExecutor::SyncExecuteModel(uint32_t model_id, const std::vector outBufTmp(new (std::nothrow) uint8_t[outputDataTmp.length]); if (outBufTmp == nullptr) { + REPORT_CALL_ERROR("E19999", "New output buffer fail, length:%lu, model:%u", + outputDataTmp.length, model_id); GELOGE(FAILED, "Failed to allocate memory."); return FAILED; } @@ -292,6 +313,8 @@ Status GraphExecutor::SyncExecuteModel(uint32_t model_id, const std::vector &input_tensor) { + const std::vector &input_tensor, + const RunAsyncCallback& callback) { GELOGI("[GraphExecutor] Start to async execute graph, graph_id=%u", graph_id); if (graph_id != last_graph_id_) { auto ret = FreeExecuteMemory(); @@ -368,7 +394,7 @@ Status GraphExecutor::ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr & } last_graph_id_ = graph_id; GE_CHECK_NOTNULL_EXEC(ge_root_model, return FAILED); - Status ret = AsyncExecuteModel(ge_root_model->GetModelId(), input_tensor); + Status ret = AsyncExecuteModel(ge_root_model, input_tensor, 
callback); if (ret != SUCCESS) { GELOGE(GE_GRAPH_SYNC_MODEL_FAILED, "[GraphExecutor] AsyncExecuteModel Error!"); return GE_GRAPH_SYNC_MODEL_FAILED; @@ -378,11 +404,81 @@ Status GraphExecutor::ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr & return SUCCESS; } -Status GraphExecutor::AsyncExecuteModel(uint32_t model_id, const std::vector &inputs) { +bool CompareByLoad(const Uint32Pair &lhs, const Uint32Pair &rhs) { + return lhs.second < rhs.second; +} + +uint32_t GraphExecutor::GetExecuteModelId(const GeRootModelPtr &ge_root_model) { + std::vector model_ids = ge_root_model->GetAllModelId(); + if (model_ids.empty()) { + return kInvalidModelId; + } + if (model_ids.size() == 1) { + return ge_root_model->GetModelId(); + } + std::vector model_id_to_loads; + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + for (auto model_id : model_ids) { + auto davinci_model = model_manager->GetModel(model_id); + auto hybrid_model = model_manager->GetHybridModel(model_id); + if (hybrid_model == nullptr) { + GE_CHECK_NOTNULL(davinci_model); + } + uint32_t input_load = hybrid_model != nullptr ? hybrid_model->GetDataInputerSize() : + davinci_model->GetDataInputerSize(); + uint32_t running_load = hybrid_model != nullptr ? 
static_cast(hybrid_model->GetRunningFlag()) : + static_cast(davinci_model->GetRunningFlag()); + uint32_t load = input_load + running_load; + if (load == 0) { + return model_id; + } + model_id_to_loads.emplace_back(model_id, load); + } + sort(model_id_to_loads.begin(), model_id_to_loads.end(), CompareByLoad); + if (model_id_to_loads.empty()) { + return kInvalidModelId; + } + return model_id_to_loads.begin()->first; +} + +Status GraphExecutor::SetCallback(uint32_t model_id, const GeRootModelPtr &ge_root_model, + const RunAsyncCallback &callback) { + auto model_manager = ge::ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + if (model_manager->IsNeedHybridLoad(*ge_root_model)) { + auto model = model_manager->GetHybridModel(model_id); + GE_CHECK_NOTNULL(model); + if (model->SetRunAsyncListenerCallback(callback) != SUCCESS) { + GELOGE(FAILED, "SetRunAsyncListenerCallback failed."); + return FAILED; + } + } else { + auto model = model_manager->GetModel(model_id); + GE_CHECK_NOTNULL(model); + if (model->SetRunAsyncListenerCallback(callback) != SUCCESS) { + GELOGE(FAILED, "SetRunAsyncListenerCallback failed."); + return FAILED; + } + } + return SUCCESS; +} + +Status GraphExecutor::AsyncExecuteModel(const GeRootModelPtr &ge_root_model, const std::vector &inputs, + const RunAsyncCallback &callback) { + uint32_t model_id = GetExecuteModelId(ge_root_model); + if (model_id == kInvalidModelId) { + GELOGE(INTERNAL_ERROR, "No valid model id."); + return INTERNAL_ERROR; + } try { auto model_manager = ge::ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); GELOGI("RunAsync begin.model_id %u", model_id); + if (SetCallback(model_id, ge_root_model, callback) != SUCCESS) { + GELOGE(FAILED, "RunAsync: SetCallBack for model fail"); + return FAILED; + } Status ret = model_manager->DataInputTensor(model_id, inputs); if (ret != SUCCESS) { @@ -392,10 +488,12 @@ Status GraphExecutor::AsyncExecuteModel(uint32_t model_id, const std::vector &output_tensor); ge::Status 
ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr &ge_root_model, - const std::vector &input_tensor); + const std::vector &input_tensor, const RunAsyncCallback &callback); Status SetCondition(std::mutex *mutex, std::condition_variable *cond, std::shared_ptr listener); @@ -116,6 +116,8 @@ class GraphExecutor { static Status GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info); + uint32_t GetExecuteModelId(const GeRootModelPtr &ge_root_model); + private: Status PrepareInputData(const std::vector &input_tensor, InputData &graph_input_data, OutputData &graph_output_data, std::vector &output_desc); @@ -123,7 +125,8 @@ class GraphExecutor { Status SyncExecuteModel(uint32_t model_id, const std::vector &input_tensor, std::vector &output_tensor); - Status AsyncExecuteModel(uint32_t model_id, const std::vector &input_tensor); + Status AsyncExecuteModel(const GeRootModelPtr &ge_root_model, const std::vector &input_tensor, + const RunAsyncCallback &callback); void InitModelIdInfo(std::vector &out_model_id_info, std::vector &sub_graph_vec, uint32_t output_size); @@ -132,6 +135,9 @@ class GraphExecutor { Status MallocInOutBuffer(const std::vector &buffer_size, std::vector &data_addr); + static Status SetCallback(uint32_t model_id, const GeRootModelPtr &ge_root_model, + const RunAsyncCallback &callback); + bool init_flag_; bool train_graph_flag_; diff --git a/ge/graph/label/case_label_maker.cc b/ge/graph/label/case_label_maker.cc index ea4b2a03..8bf5de71 100644 --- a/ge/graph/label/case_label_maker.cc +++ b/ge/graph/label/case_label_maker.cc @@ -42,6 +42,8 @@ Status CaseOpLabelMaker::Run(uint32_t &label_index) { const auto graph_names = case_desc->GetSubgraphInstanceNames(); if (graph_names.empty() || graph_names.size() > kMaxCaseBranch) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) subgraph size: %zu, check invalid", case_desc->GetName().c_str(), + case_desc->GetType().c_str(), graph_names.size()); GELOGE(INTERNAL_ERROR, "Node: 
%s has invalid subgraph, graph size: %zu.", case_desc->GetName().c_str(), graph_names.size()); return FAILED; @@ -67,6 +69,8 @@ Status CaseOpLabelMaker::Run(uint32_t &label_index) { parent_node_->GetName() + "/StreamActive_" + std::to_string(index); // rtStreamActive NodePtr stream_active = AddStreamActive(graph, stream_active_name); if (stream_active == nullptr) { + REPORT_CALL_ERROR("E19999", "Add StreamActive node in graph:%s fail", + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add stream active failed.", graph->GetName().c_str()); return FAILED; } @@ -75,6 +79,8 @@ Status CaseOpLabelMaker::Run(uint32_t &label_index) { std::string label_set_name = parent_node_->GetName() + "/LabelSet_" + std::to_string(index); // rtLabelSet NodePtr label = AddLabelSetEnter(graph, label_set_name, curr_label_index, stream_active); if (label == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSetEnter node in graph:%s fail", + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", graph->GetName().c_str()); return FAILED; } @@ -88,6 +94,8 @@ Status CaseOpLabelMaker::Run(uint32_t &label_index) { // middle node, add goto node to tail. std::string label_goto_name = parent_node_->GetName() + "/LabelGoto_" + std::to_string(index); // rtLabelGoto if (AddLabelGotoLeave(graph, label_goto_name, last_label_index) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelGotoLeave node in graph:%s fail", + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label goto failed.", graph->GetName().c_str()); return FAILED; } @@ -95,6 +103,8 @@ Status CaseOpLabelMaker::Run(uint32_t &label_index) { // last node, add label node to tail. 
std::string last_label_name = parent_node_->GetName() + "/LabelSet_Last"; // rtLabelSet if (AddLabelSetLeave(graph, last_label_name, last_label_index) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSetLeave node in graph:%s fail", + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", graph->GetName().c_str()); return FAILED; } @@ -110,12 +120,16 @@ Status CaseOpLabelMaker::Run(uint32_t &label_index) { const GeTensorDesc &pred_desc = case_desc->GetInputDesc(kCasePredIndex); NodePtr switch_node = AddLabelSwitchEnter(first_graph, label_switch_name, pred_desc, switch_labels); if (switch_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSwitchEnter node in graph:%s fail", + first_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label switch failed.", first_graph->GetName().c_str()); return FAILED; } // Link control edge to then branch head. if (GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), first_label->GetInControlAnchor()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge from %s to %s in graph:%s fail", switch_node->GetName().c_str(), + first_label->GetName().c_str(), first_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add ctrl edge to %s failed.", first_label->GetName().c_str()); return FAILED; } @@ -123,6 +137,8 @@ Status CaseOpLabelMaker::Run(uint32_t &label_index) { uint32_t parent_index = 0; // Case cond input is first. 
const std::string data_name = parent_node_->GetName() + "/SwitchIndexData"; if (AddLabelSwitchIndex(first_graph, data_name, pred_desc, switch_node, parent_index) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSwitchIndex node in graph:%s fail", + first_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add switch input failed.", first_graph->GetName().c_str()); return FAILED; } diff --git a/ge/graph/label/if_label_maker.cc b/ge/graph/label/if_label_maker.cc index d07f7984..cf4cdd39 100644 --- a/ge/graph/label/if_label_maker.cc +++ b/ge/graph/label/if_label_maker.cc @@ -43,6 +43,10 @@ Status IfOpLabelMaker::Run(uint32_t &label_index) { const std::string then_branch_name = if_desc->GetSubgraphInstanceName(kThenBranchIndex); const std::string else_branch_name = if_desc->GetSubgraphInstanceName(kElseBranchIndex); if (then_branch_name.empty() || else_branch_name.empty()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s), check subgraph invalid, " + "then branch graph: %s, else branch graph: %s", + if_desc->GetName().c_str(), if_desc->GetType().c_str(), + then_branch_name.c_str(), else_branch_name.c_str()); GELOGE(INTERNAL_ERROR, "Node: %s has invalid subgraph, then branch: %s, else branch: %s.", if_desc->GetName().c_str(), then_branch_name.c_str(), else_branch_name.c_str()); return FAILED; @@ -66,32 +70,44 @@ Status IfOpLabelMaker::Run(uint32_t &label_index) { NodePtr then_stream_active = AddStreamActive(then_sub_graph, then_active_name); if (then_stream_active == nullptr) { + REPORT_CALL_ERROR("E19999", "Add StreamActive node in graph:%s fail", + then_sub_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add stream active failed.", then_sub_graph->GetName().c_str()); return FAILED; } NodePtr then_enter_label = AddLabelSetEnter(then_sub_graph, then_label_name, then_enter_index, then_stream_active); if (then_enter_label == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSetEnter node in graph:%s fail", + 
then_sub_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", then_sub_graph->GetName().c_str()); return FAILED; } if (AddLabelGotoLeave(then_sub_graph, then_leave_name, else_leave_index) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelGotoLeave node in graph:%s fail", + then_sub_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label goto failed.", then_sub_graph->GetName().c_str()); return FAILED; } NodePtr else_stream_active = AddStreamActive(else_sub_graph, else_active_name); if (else_stream_active == nullptr) { + REPORT_CALL_ERROR("E19999", "Add StreamActive node in graph:%s fail", + else_stream_active->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add stream active failed.", else_sub_graph->GetName().c_str()); return FAILED; } if (AddLabelSetEnter(else_sub_graph, else_enter_name, else_enter_index, else_stream_active) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSetEnter node in graph:%s fail", + else_sub_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", else_sub_graph->GetName().c_str()); return FAILED; } if (AddLabelSetLeave(else_sub_graph, else_leave_name, else_leave_index) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSetLeave node in graph:%s fail", + else_sub_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", else_sub_graph->GetName().c_str()); return FAILED; } @@ -103,12 +119,16 @@ Status IfOpLabelMaker::Run(uint32_t &label_index) { const GeTensorDesc &pred_desc = if_desc->GetInputDesc(kIfPredIndex); NodePtr switch_node = AddLabelSwitchEnter(then_sub_graph, then_enter_name, pred_desc, switch_labels); if (switch_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSwitchEnter node in graph:%s fail", + then_sub_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label switch failed.", then_sub_graph->GetName().c_str()); return FAILED; } // Link control edge to then branch 
head. if (GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), then_enter_label->GetInControlAnchor()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge from %s to %s in graph:%s fail", switch_node->GetName().c_str(), + then_enter_label->GetName().c_str(), then_sub_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add ctrl edge to %s failed.", then_enter_label->GetName().c_str()); return FAILED; } @@ -116,6 +136,8 @@ Status IfOpLabelMaker::Run(uint32_t &label_index) { uint32_t parent_index = 0; // If cond input is first. const std::string data_name = parent_node_->GetName() + "/SwitchIndexData"; if (AddLabelSwitchIndex(then_sub_graph, data_name, pred_desc, switch_node, parent_index) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSwitchIndex node in graph:%s fail", + then_sub_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add switch input failed.", then_sub_graph->GetName().c_str()); return FAILED; } diff --git a/ge/graph/label/label_maker.cc b/ge/graph/label/label_maker.cc index 0e1e571c..156748e8 100644 --- a/ge/graph/label/label_maker.cc +++ b/ge/graph/label/label_maker.cc @@ -56,6 +56,8 @@ void LabelMaker::LinkToGraphHead(const ComputeGraphPtr &graph, const NodePtr &no } if (GraphUtils::AddEdge(node->GetOutControlAnchor(), n->GetInControlAnchor()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge from %s to %s in graph:%s fail", node->GetName().c_str(), + n->GetName().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Add ctrl edge from %s to %s failed.", node->GetName().c_str(), n->GetName().c_str()); } } @@ -78,6 +80,8 @@ void LabelMaker::LinkToGraphTail(const ComputeGraphPtr &graph, const NodePtr &no } if (GraphUtils::AddEdge(tail->GetOutControlAnchor(), node->GetInControlAnchor()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge from %s to %s in graph:%s fail", tail->GetName().c_str(), + node->GetName().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Add ctrl 
edge from %s to %s failed.", tail->GetName().c_str(), node->GetName().c_str()); } return; @@ -96,6 +100,7 @@ NodePtr LabelMaker::AddStreamActive(const ComputeGraphPtr &graph, const std::str const auto &node_list = graph->GetDirectNode(); if (node_list.empty()) { + REPORT_INNER_ERROR("E19999", "Check param graph:%s has no node", graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSet: Graph %s node is empty.", graph->GetName().c_str()); return nullptr; } @@ -131,6 +136,7 @@ NodePtr LabelMaker::AddLabelSetEnter(const ComputeGraphPtr &graph, const std::st const auto &node_list = graph->GetDirectNode(); if (node_list.empty()) { + REPORT_INNER_ERROR("E19999", "Check param graph:%s has no node", graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSet: Graph %s node is empty.", graph->GetName().c_str()); return nullptr; } @@ -145,6 +151,8 @@ NodePtr LabelMaker::AddLabelSetEnter(const ComputeGraphPtr &graph, const std::st GE_CHECK_NOTNULL_EXEC(label_set, return nullptr); if (GraphUtils::AddEdge(label_set->GetOutControlAnchor(), stream_active->GetInControlAnchor()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge from %s to %s in graph:%s fail", label_set->GetName().c_str(), + stream_active->GetName().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Add ctrl edge from %s to %s failed.", label_set->GetName().c_str(), stream_active->GetName().c_str()); return nullptr; @@ -193,6 +201,7 @@ NodePtr LabelMaker::AddLabelGotoEnter(const ComputeGraphPtr &graph, const std::s const auto &node_list = graph->GetDirectNode(); auto it = node_list.begin(); if (it == node_list.end()) { + REPORT_INNER_ERROR("E19999", "Check param graph:%s has no node", graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelGoto: Graph %s node is empty.", graph->GetName().c_str()); return nullptr; } @@ -205,6 +214,8 @@ NodePtr LabelMaker::AddLabelGotoEnter(const ComputeGraphPtr &graph, const std::s (void)AttrUtils::SetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, index); NodePtr 
label_goto = graph->AddNodeFront(op_desc); if (label_goto == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s fail", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelGoto: Add to graph %s failed.", graph->GetName().c_str()); return nullptr; } @@ -253,6 +264,7 @@ NodePtr LabelMaker::AddLabelSwitchEnter(const ComputeGraphPtr &graph, const std: const auto &node_list = graph->GetDirectNode(); auto it = node_list.begin(); if (it == node_list.end()) { + REPORT_INNER_ERROR("E19999", "Check param graph:%s has no node", graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Graph %s node is empty.", graph->GetName().c_str()); return nullptr; } @@ -263,17 +275,23 @@ NodePtr LabelMaker::AddLabelSwitchEnter(const ComputeGraphPtr &graph, const std: GELOGI("LabelSwitchByIndex: Create node %s.", op_desc->GetName().c_str()); if (op_desc->AddInputDesc(desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc into node:%s(%s) in graph:%s fail", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add input desc failed."); return nullptr; } if (!AttrUtils::SetListInt(op_desc, ATTR_NAME_LABEL_SWITCH_LIST, labels)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_LABEL_SWITCH_LIST.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add %s failed.", ATTR_NAME_LABEL_SWITCH_INDEX.c_str()); return nullptr; } NodePtr label_switch = graph->AddNodeFront(op_desc); if (label_switch == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s ahead fail", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add to graph %s failed.", graph->GetName().c_str()); return nullptr; } @@ -300,11 +318,15 @@ NodePtr 
LabelMaker::AddLabelSwitchLeave(const ComputeGraphPtr &graph, const std: GELOGI("LabelSwitchByIndex: Create node %s.", op_desc->GetName().c_str()); if (op_desc->AddInputDesc(desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc into node:%s(%s) in graph:%s fail", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add input desc failed."); return nullptr; } if (!AttrUtils::SetListInt(op_desc, ATTR_NAME_LABEL_SWITCH_LIST, labels)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_LABEL_SWITCH_LIST.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add %s failed.", ATTR_NAME_LABEL_SWITCH_INDEX.c_str()); return nullptr; } @@ -336,15 +358,21 @@ NodePtr LabelMaker::AddLabelSwitchIndex(const ComputeGraphPtr &graph, const std: GELOGI("Data: Create node %s.", op_desc->GetName().c_str()); if (op_desc->AddInputDesc(desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc into node:%s(%s) in graph:%s fail", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add data input desc failed."); return nullptr; } if (op_desc->AddOutputDesc(desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc into node:%s(%s) in graph:%s fail", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add data output desc failed."); return nullptr; } if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s fail for op:%s(%s)", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add %s failed.", ATTR_NAME_PARENT_NODE_INDEX.c_str()); return nullptr; } @@ -354,6 +382,8 @@ NodePtr 
LabelMaker::AddLabelSwitchIndex(const ComputeGraphPtr &graph, const std: // Link control edge to graph head. if (GraphUtils::AddEdge(op_data->GetOutDataAnchor(0), sw_node->GetInDataAnchor(0)) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge from %s to %s in graph:%s fail", op_data->GetName().c_str(), + sw_node->GetName().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add input edge to %s failed.", op_data->GetName().c_str()); return nullptr; } diff --git a/ge/graph/label/partitioned_call_label_maker.cc b/ge/graph/label/partitioned_call_label_maker.cc index 0be738f0..d9a89ef2 100644 --- a/ge/graph/label/partitioned_call_label_maker.cc +++ b/ge/graph/label/partitioned_call_label_maker.cc @@ -39,12 +39,17 @@ Status PartitionedCallLabelMaker::Run(uint32_t &label_index) { std::string sub_graph_name = call_desc->GetSubgraphInstanceName(kSubGraphIndex); if (sub_graph_name.empty()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) subgraph_index:%d name is empty, check invalid", + call_desc->GetName().c_str(), call_desc->GetType().c_str(), kSubGraphIndex); GELOGE(INTERNAL_ERROR, "Node: %s has no subgraph name.", sub_graph_name.c_str()); return FAILED; } ComputeGraphPtr sub_graph = parent_graph_->GetSubgraph(sub_graph_name); if (sub_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) subgraph_name:%s is not exist in parent_graph, check invalid", + call_desc->GetName().c_str(), call_desc->GetType().c_str(), + sub_graph_name.c_str()); GELOGE(INTERNAL_ERROR, "Node: %s has no subgraph.", sub_graph_name.c_str()); return FAILED; } @@ -52,6 +57,8 @@ Status PartitionedCallLabelMaker::Run(uint32_t &label_index) { const std::string stream_active_name = parent_node_->GetName() + "/StreamActive"; // rtStreamActive NodePtr stream_active = AddStreamActive(sub_graph, stream_active_name); if (stream_active == nullptr) { + REPORT_CALL_ERROR("E19999", "Add StreamActive node in graph:%s fail", + sub_graph->GetName().c_str()); 
GELOGE(INTERNAL_ERROR, "Subgraph: %s add stream active node failed.", sub_graph->GetName().c_str()); return FAILED; } diff --git a/ge/graph/label/while_label_maker.cc b/ge/graph/label/while_label_maker.cc index 83aad7c9..22e783e3 100644 --- a/ge/graph/label/while_label_maker.cc +++ b/ge/graph/label/while_label_maker.cc @@ -44,6 +44,9 @@ Status WhileOpLabelMaker::Run(uint32_t &label_index) { std::string cond_name = while_desc->GetSubgraphInstanceName(kCondBranchIndex); std::string body_name = while_desc->GetSubgraphInstanceName(kBodyBranchIndex); if (cond_name.empty() || body_name.empty()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) cond subgraph index:%d or body subgraph index:%d name is empty, " + "check invalid", while_desc->GetName().c_str(), while_desc->GetType().c_str(), + kCondBranchIndex, kBodyBranchIndex); GELOGE(INTERNAL_ERROR, "Node: %s has invalid subgraph, cond branch: %s, body branch: %s.", while_desc->GetName().c_str(), cond_name.c_str(), body_name.c_str()); return FAILED; @@ -67,32 +70,44 @@ Status WhileOpLabelMaker::Run(uint32_t &label_index) { NodePtr cond_stream_active = AddStreamActive(cond_graph, cond_active_name); if (cond_stream_active == nullptr) { + REPORT_CALL_ERROR("E19999", "Add StreamActive node in graph:%s fail", + cond_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add stream active failed.", cond_graph->GetName().c_str()); return FAILED; } if (AddLabelSetEnter(cond_graph, cond_enter_name, cond_enter_index, cond_stream_active) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSetEnter node in graph:%s fail", + cond_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", cond_graph->GetName().c_str()); return FAILED; } NodePtr body_stream_active = AddStreamActive(body_graph, body_active_name); if (body_stream_active == nullptr) { + REPORT_CALL_ERROR("E19999", "Add StreamActive node in graph:%s fail", + body_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add stream 
active failed.", body_graph->GetName().c_str()); return FAILED; } if (AddLabelSetEnter(body_graph, body_enter_name, body_enter_index, body_stream_active) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSetEnter node in graph:%s fail", + body_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", body_graph->GetName().c_str()); return FAILED; } if (AddLabelGotoLeave(body_graph, goto_leave_name, cond_enter_index) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelGotoLeave node in graph:%s fail", + body_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label goto failed.", body_graph->GetName().c_str()); return FAILED; } if (AddLabelSetLeave(body_graph, body_leave_name, body_leave_index) == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSetLeave node in graph:%s fail", + body_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", body_graph->GetName().c_str()); return FAILED; } @@ -109,6 +124,8 @@ Status WhileOpLabelMaker::Run(uint32_t &label_index) { const std::vector switch_labels = {body_leave_index, body_enter_index}; NodePtr switch_node = AddLabelSwitchLeave(cond_graph, cond_leave_name, pred_desc, switch_labels); if (switch_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add LabelSwitchLeave node in graph:%s fail", + cond_graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Subgraph: %s add label switch failed.", cond_graph->GetName().c_str()); return FAILED; } @@ -124,6 +141,9 @@ Status WhileOpLabelMaker::Run(uint32_t &label_index) { InDataAnchorPtr in_anchor = all_in_data.at(kCondOutputIndex); GE_CHECK_NOTNULL(in_anchor); if (GraphUtils::AddEdge(in_anchor->GetPeerOutAnchor(), switch_node->GetInDataAnchor(kCondOutputIndex)) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge from %s to %s in graph:%s fail", + in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetName().c_str(), + switch_node->GetName().c_str(), cond_graph->GetName().c_str()); GELOGE(FAILED, 
"Node: %s Add pred data input failed.", switch_node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/load/graph_loader.cc b/ge/graph/load/graph_loader.cc index 644880ce..bdf415a3 100755 --- a/ge/graph/load/graph_loader.cc +++ b/ge/graph/load/graph_loader.cc @@ -52,15 +52,17 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptrGetModelId(); auto model_manager = ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); @@ -71,6 +73,8 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr 0)) { + REPORT_INNER_ERROR("E19999", "Param args_ is not nullptr or args_size_:%u > 0," + "check invalid", args_size_); GELOGE(FAILED, "Task already initialized, size: %u", args_size_); return FAILED; } @@ -58,6 +60,8 @@ Status CpuTaskModelDequeue::Init(uint32_t queue_id, uintptr_t &in_mbuf) { args_size_ = sizeof(MbufQueueInfo) + sizeof(uintptr_t); // sizeof(uintptr_t) for save in_mbuf. rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -69,6 +73,8 @@ Status CpuTaskModelDequeue::Init(uint32_t queue_id, uintptr_t &in_mbuf) { queue_info.in_mbuf = in_mbuf; // Placeholder, input mbuf addr will save to this place. 
status = rtMemcpy(args_, args_size_, &queue_info, sizeof(MbufQueueInfo), RT_MEMCPY_HOST_TO_DEVICE); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -78,12 +84,16 @@ Status CpuTaskModelDequeue::Init(uint32_t queue_id, uintptr_t &in_mbuf) { Status CpuTaskModelDequeue::Distribute() { if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) { + REPORT_INNER_ERROR("E19999", "Param args_ is nullptr or args_size_:%u is 0 or stream_ is nullptr," + "check invalid", args_size_); GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); return FAILED; } rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskModelDequeue, kCoreDim, args_, args_size_, nullptr, stream_); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCpuKernelLaunch failed, ret:0x%X", + status); GELOGE(RT_FAILED, "Call rt CpuKernelLaunch ModelDequeue failed, status: 0x%X", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -101,6 +111,8 @@ Status CpuTaskModelDequeue::Distribute() { /// Status CpuTaskZeroCopy::Init(std::vector &mbuf_list, const map &outside_addrs) { if ((args_ != nullptr) || (args_size_ > 0)) { + REPORT_INNER_ERROR("E19999", "Param args_ is not nullptr or args_size_:%u > 0," + "check invalid", args_size_); GELOGE(FAILED, "Task already initialized, size: %u", args_size_); return FAILED; } @@ -155,12 +167,16 @@ Status CpuTaskZeroCopy::Init(std::vector &mbuf_list, const map 0)) { + REPORT_INNER_ERROR("E19999", "Param args_ is not nullptr or args_size_:%u > 0," + "check invalid", args_size_); GELOGE(FAILED, "Task already initialized, size: %u", args_size_); return FAILED; } @@ -206,6 +224,8 @@ Status CpuTaskPrepareOutput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mb args_size_ = sizeof(PrepareOutputInfo) + sizeof(uintptr_t); // sizeof(uintptr_t) for 
save out_mbuf. rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -220,6 +240,8 @@ Status CpuTaskPrepareOutput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mb prepare.out_mbuf = out_mbuf; // Placeholder, output mbuf addr will save to this place. status = rtMemcpy(args_, args_size_, &prepare, sizeof(PrepareOutputInfo), RT_MEMCPY_HOST_TO_DEVICE); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -229,12 +251,16 @@ Status CpuTaskPrepareOutput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mb Status CpuTaskPrepareOutput::Distribute() { if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) { + REPORT_INNER_ERROR("E19999", "Param args_ is nullptr or args_size_:%u is 0 or stream_ is nullptr," + "check invalid", args_size_); GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); return FAILED; } rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskPrepareOutput, kCoreDim, args_, args_size_, nullptr, stream_); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCpuKernelLaunch failed, ret:0x%X", + status); GELOGE(RT_FAILED, "Call rt CpuKernelLaunch PrepareOutput failed, status: 0x%X", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -252,6 +278,8 @@ Status CpuTaskPrepareOutput::Distribute() { /// Status CpuTaskModelEnqueue::Init(uint32_t queue_id, uintptr_t out_mbuf) { if ((args_ != nullptr) || (args_size_ > 0)) { + REPORT_INNER_ERROR("E19999", "Param args_ is not nullptr or args_size_:%u > 0," + "check invalid", args_size_); GELOGE(FAILED, "Task already initialized, size: %u", 
args_size_); return FAILED; } @@ -260,6 +288,8 @@ Status CpuTaskModelEnqueue::Init(uint32_t queue_id, uintptr_t out_mbuf) { args_size_ = sizeof(MbufQueueInfo); rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -270,6 +300,8 @@ Status CpuTaskModelEnqueue::Init(uint32_t queue_id, uintptr_t out_mbuf) { queue_info.in_mbuf = out_mbuf; status = rtMemcpy(args_, args_size_, &queue_info, args_size_, RT_MEMCPY_HOST_TO_DEVICE); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -279,12 +311,16 @@ Status CpuTaskModelEnqueue::Init(uint32_t queue_id, uintptr_t out_mbuf) { Status CpuTaskModelEnqueue::Distribute() { if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) { + REPORT_INNER_ERROR("E19999", "Param args_ is nullptr or args_size_ is 0 or stream_ is nullptr, arg_size:%u," + "check invalid", args_size_); GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); return FAILED; } rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskModelEnqueue, kCoreDim, args_, args_size_, nullptr, stream_); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCpuKernelLaunch failed, ret:0x%X", + status); GELOGE(RT_FAILED, "Call rt CpuKernelLaunch ModelEnqueue failed, status: 0x%X", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -301,6 +337,7 @@ Status CpuTaskModelEnqueue::Distribute() { /// Status CpuTaskActiveEntry::Init(rtStream_t stream) { if (stream == nullptr) { + REPORT_INNER_ERROR("E19999", "Param stream is nullptr, check invalid"); GELOGE(FAILED, "Task active stream not valid"); return FAILED; } 
@@ -311,12 +348,16 @@ Status CpuTaskActiveEntry::Init(rtStream_t stream) { Status CpuTaskActiveEntry::Distribute() { if ((active_stream_ == nullptr) || (stream_ == nullptr)) { + REPORT_INNER_ERROR("E19999", "Param stream is nullptr or active_stream_ is nullptr, " + "check invalid"); GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); return FAILED; } rtError_t ret = rtStreamActive(active_stream_, stream_); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamActive failed, ret:0x%X", + ret); GELOGE(RT_FAILED, "Call rt StreamActive failed, ret: 0x%X", ret); return RT_ERROR_TO_GE_STATUS(ret); } @@ -333,6 +374,8 @@ Status CpuTaskActiveEntry::Distribute() { /// Status CpuTaskWaitEndGraph::Init(uint32_t model_id) { if ((args_ != nullptr) || (args_size_ > 0)) { + REPORT_INNER_ERROR("E19999", "Param args_ is not nullptr or args_size_:%u > 0," + "check invalid", args_size_); GELOGE(FAILED, "Task already initialized, size: %u", args_size_); return FAILED; } @@ -340,6 +383,8 @@ Status CpuTaskWaitEndGraph::Init(uint32_t model_id) { args_size_ = sizeof(model_id); rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -347,6 +392,8 @@ Status CpuTaskWaitEndGraph::Init(uint32_t model_id) { status = rtMemcpy(args_, args_size_, &model_id, args_size_, RT_MEMCPY_HOST_TO_DEVICE); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -356,12 +403,16 @@ Status CpuTaskWaitEndGraph::Init(uint32_t model_id) { Status CpuTaskWaitEndGraph::Distribute() { if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) { + 
REPORT_INNER_ERROR("E19999", "Param args_ is nullptr or args_size_:%u is 0 or stream_ is nullptr," + "check invalid", args_size_); GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); return FAILED; } rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskWaitEndGraph, kCoreDim, args_, args_size_, nullptr, stream_); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCpuKernelLaunch failed, ret:0x%X", + status); GELOGE(RT_FAILED, "Call rt CpuKernelLaunch WaitEndGraph failed, status: 0x%X", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -378,6 +429,8 @@ Status CpuTaskWaitEndGraph::Distribute() { /// Status CpuTaskModelRepeat::Init(uint32_t model_id) { if ((args_ != nullptr) || (args_size_ > 0)) { + REPORT_INNER_ERROR("E19999", "Param args_ is not nullptr or args_size_:%u > 0," + "check invalid", args_size_); GELOGE(FAILED, "Task already initialized, size: %u", args_size_); return FAILED; } @@ -385,6 +438,8 @@ Status CpuTaskModelRepeat::Init(uint32_t model_id) { args_size_ = sizeof(model_id); rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -392,6 +447,8 @@ Status CpuTaskModelRepeat::Init(uint32_t model_id) { status = rtMemcpy(args_, args_size_, &model_id, args_size_, RT_MEMCPY_HOST_TO_DEVICE); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + args_size_, status); GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -401,12 +458,16 @@ Status CpuTaskModelRepeat::Init(uint32_t model_id) { Status CpuTaskModelRepeat::Distribute() { if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) { + REPORT_INNER_ERROR("E19999", "Param args_ is nullptr or 
args_size_:%u is 0 or stream_ is nullptr," + "check invalid", args_size_); GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); return FAILED; } rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskModelRepeat, kCoreDim, args_, args_size_, nullptr, stream_); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCpuKernelLaunch failed, ret:0x%X", + status); GELOGE(RT_FAILED, "Call rt CpuKernelLaunch ModelRepeat failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } diff --git a/ge/graph/load/model_manager/data_dumper.cc b/ge/graph/load/model_manager/data_dumper.cc index 235cffa9..f74272a5 100644 --- a/ge/graph/load/model_manager/data_dumper.cc +++ b/ge/graph/load/model_manager/data_dumper.cc @@ -72,24 +72,6 @@ static bool ParseNameIndex(const std::string &node_name_index, std::string &node static bool IsTensorDescWithSkipDumpAddrType(bool has_mem_type_attr, vector v_memory_type, size_t i) { return has_mem_type_attr && (v_memory_type[i] == RT_MEMORY_L1); } - -static uint64_t GetNowTime() { - uint64_t ret = 0; - mmTimeval tv; - if (mmGetTimeOfDay(&tv, nullptr) == 0) { - ret = tv.tv_sec * 1000000ULL + tv.tv_usec; - } - - return ret; -} - -static void ReplaceStringElem(std::string &str) { - for_each(str.begin(), str.end(), [](char &ch) { - if ((ch == ' ') || (ch == '.') || (ch == '/') || (ch == '\\')) { - ch = '_'; - } - }); -} } // namespace static int32_t GetIrDataType(ge::DataType data_type) { @@ -194,66 +176,6 @@ void DataDumper::SaveOpDebugId(uint32_t task_id, uint32_t stream_id, void *op_de is_op_debug_ = is_op_debug; } -void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, - uint32_t stream_id) { - GELOGD("Start SaveDumpOpInfo of task_id: %u, stream_id: %u", task_id, stream_id); - OpDescInfo op_desc_info; - op_desc_info.op_name = op->GetName(); - op_desc_info.op_type = op->GetType(); - op_desc_info.task_id = task_id; - op_desc_info.stream_id = stream_id; 
- for (size_t i = 0; i < op->GetAllInputsSize(); ++i) { - GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i); - if (input_tensor_desc == nullptr) { - continue; - } - op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat()); - op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims()); - op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType()); - int64_t input_size = 0; - - if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) { - GELOGW("Get input size failed"); - return; - } - GELOGD("Save dump op info, the input size is %ld", input_size); - op_desc_info.input_size.emplace_back(input_size); - } - for (size_t j = 0; j < op->GetOutputsSize(); ++j) { - GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j); - if (output_tensor_desc == nullptr) { - continue; - } - op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat()); - op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims()); - op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType()); - int64_t output_size = 0; - if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) { - GELOGW("Get input size failed"); - return; - } - GELOGD("Save dump op info, the output size is %ld", output_size); - op_desc_info.output_size.emplace_back(output_size); - } - op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op); - op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op); - - op_desc_info_.emplace_back(op_desc_info); -} - -bool DataDumper::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { - GELOGI("There are %zu op need to dump.", op_desc_info_.size()); - for (size_t index = 0; index < op_desc_info_.size(); ++index) { - OpDescInfo dump_op_info = op_desc_info_.at(index); - if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) { - GELOGI("find exception 
op of task_id: %u, stream_id: %u.", task_id, stream_id); - op_desc_info = dump_op_info; - return true; - } - } - return false; -} - void DataDumper::SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr &op_desc, uintptr_t args) { if (op_desc == nullptr) { @@ -325,6 +247,7 @@ Status DataDumper::GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vis } int64_t output_size = 0; if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(index), output_size) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get tensor size fail"); GELOGE(PARAM_INVALID, "Get output size filed"); return PARAM_INVALID; } @@ -385,8 +308,11 @@ Status DataDumper::DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_inf Status DataDumper::DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { const auto &output_descs = inner_dump_info.op->GetAllOutputsDesc(); - const std::vector output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, inner_dump_info.op); + const std::vector output_addrs = ModelUtils::GetOutputDataAddrs(*runtime_param_, inner_dump_info.op); if (output_descs.size() != output_addrs.size()) { + REPORT_INNER_ERROR("E19999", "output_desc size:%zu != output addr size:%zu in op:%s(%s)", + output_descs.size(), output_addrs.size(), + inner_dump_info.op->GetName().c_str(), inner_dump_info.op->GetType().c_str()); GELOGE(PARAM_INVALID, "Invalid output desc addrs size %zu, op %s has %zu output desc.", output_addrs.size(), inner_dump_info.op->GetName().c_str(), output_descs.size()); return PARAM_INVALID; @@ -411,6 +337,8 @@ Status DataDumper::DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicp GELOGI("[L1Fusion] DumpOutputWithTask[%s] output[%zu] is l1 addr.", inner_dump_info.op->GetName().c_str(), i); int64_t output_size = 0; if (TensorUtils::GetTensorSizeInBytes(output_descs.at(i), output_size) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get output tensor size fail in op:%s(%s), index:%zu", + 
inner_dump_info.op->GetName().c_str(), inner_dump_info.op->GetType().c_str(), i); GELOGE(PARAM_INVALID, "Get output size failed."); return PARAM_INVALID; } @@ -436,8 +364,12 @@ Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump: // else data, const or variable op aicpu::dump::Output output; auto output_tensor = inner_dump_info.op->GetOutputDescPtr(inner_dump_info.output_anchor_index); - const std::vector output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, inner_dump_info.op); + const std::vector output_addrs = ModelUtils::GetOutputDataAddrs(*runtime_param_, inner_dump_info.op); if (output_tensor == nullptr) { + REPORT_INNER_ERROR("E19999", "output_desc tensor is nullptr in op:%s(%s), index:%u, " + "check invalid", + inner_dump_info.op->GetName().c_str(), inner_dump_info.op->GetType().c_str(), + inner_dump_info.output_anchor_index); GELOGE(PARAM_INVALID, "output_tensor is null, index: %d, size: %zu.", inner_dump_info.output_anchor_index, inner_dump_info.op->GetOutputsSize()); return PARAM_INVALID; @@ -461,6 +393,9 @@ Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump: output.set_original_output_data_type(static_cast(output_tensor->GetOriginDataType())); // due to lhisi virtual addr bug, cannot use args now if (inner_dump_info.output_anchor_index >= static_cast(output_addrs.size())) { + REPORT_INNER_ERROR("E19999", "output_anchor_index:%u >= output addr size:%zu in op:%s(%s), " + "check invalid", inner_dump_info.output_anchor_index, output_addrs.size(), + inner_dump_info.op->GetName().c_str(), inner_dump_info.op->GetType().c_str()); GELOGE(FAILED, "Index is out of range."); return FAILED; } @@ -487,6 +422,7 @@ Status DataDumper::GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor if (AttrUtils::GetInt(tensor_descs.at(index), ATTR_NAME_INPUT_ORIGIN_SIZE, input_size)) { GELOGI("Get aipp input size according to attr is %ld", input_size); } else if 
(TensorUtils::GetTensorSizeInBytes(tensor_descs.at(index), input_size) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get tensor size fail"); GELOGE(PARAM_INVALID, "Get input size filed"); return PARAM_INVALID; } @@ -540,8 +476,11 @@ Status DataDumper::DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info Status DataDumper::DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { GELOGI("Start dump input"); const auto &input_descs = inner_dump_info.op->GetAllInputsDesc(); - const std::vector input_addrs = ModelUtils::GetInputDataAddrs(runtime_param_, inner_dump_info.op); + const std::vector input_addrs = ModelUtils::GetInputDataAddrs(*runtime_param_, inner_dump_info.op); if (input_descs.size() != input_addrs.size()) { + REPORT_INNER_ERROR("E19999", "input_desc size:%zu != input addr size:%zu in op:%s(%s)", + input_descs.size(), input_addrs.size(), + inner_dump_info.op->GetName().c_str(), inner_dump_info.op->GetType().c_str()); GELOGE(PARAM_INVALID, "Invalid input desc addrs size %zu, op %s has %zu input desc.", input_addrs.size(), inner_dump_info.op->GetName().c_str(), input_descs.size()); return PARAM_INVALID; @@ -567,6 +506,8 @@ Status DataDumper::DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump:: if (AttrUtils::GetInt(input_descs.at(i), ATTR_NAME_INPUT_ORIGIN_SIZE, input_size)) { GELOGI("Get aipp input size according to attr is %ld", input_size); } else if (TensorUtils::GetTensorSizeInBytes(input_descs.at(i), input_size) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get input tensor size fail in op:%s(%s), index:%zu", + inner_dump_info.op->GetName().c_str(), inner_dump_info.op->GetType().c_str(), i); GELOGE(PARAM_INVALID, "Get input size failed."); return PARAM_INVALID; } @@ -595,6 +536,7 @@ Status DataDumper::ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_in size_t proto_size = op_mapping_info.ByteSizeLong(); bool ret = op_mapping_info.SerializeToString(&proto_str); if (!ret || proto_size == 0) { + 
REPORT_INNER_ERROR("E19999", "Serialize proto to string fail"); GELOGE(PARAM_INVALID, "Protobuf SerializeToString failed, proto size %zu.", proto_size); return PARAM_INVALID; } @@ -606,6 +548,8 @@ Status DataDumper::ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_in rtError_t rt_ret = rtMalloc(&dev_mem_load_, proto_size, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", + proto_size, rt_ret); GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -613,12 +557,15 @@ Status DataDumper::ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_in rt_ret = rtMemcpy(dev_mem_load_, proto_size, proto_str.c_str(), proto_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + proto_size, rt_ret); GELOGE(RT_FAILED, "Call rtMemcpy failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtDatadumpInfoLoad(dev_mem_load_, proto_size); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtDatadumpInfoLoad failed, ret:0x%X", rt_ret); GELOGE(RT_FAILED, "Call rtDatadumpInfoLoad failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -633,6 +580,7 @@ Status DataDumper::ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_ size_t proto_size = op_mapping_info.ByteSizeLong(); bool ret = op_mapping_info.SerializeToString(&proto_str); if (!ret || proto_size == 0) { + REPORT_INNER_ERROR("E19999", "Serialize proto to string fail"); GELOGE(PARAM_INVALID, "Protobuf SerializeToString failed, proto size %zu.", proto_size); return PARAM_INVALID; } @@ -644,6 +592,8 @@ Status DataDumper::ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_ rtError_t rt_ret = rtMalloc(&dev_mem_unload_, proto_size, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", 
+ proto_size, rt_ret); GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -651,12 +601,15 @@ Status DataDumper::ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_ rt_ret = rtMemcpy(dev_mem_unload_, proto_size, proto_str.c_str(), proto_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + proto_size, rt_ret); GELOGE(RT_FAILED, "Call rtMemcpy failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtDatadumpInfoLoad(dev_mem_unload_, proto_size); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtDatadumpInfoLoad failed, ret:0x%X", rt_ret); GELOGE(RT_FAILED, "Call rtDatadumpInfoLoad failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -873,97 +826,4 @@ void DataDumper::PrintCheckLog(string &dump_list_key) { } } } - -Status DataDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) { - GELOGI("Start to dump exception input"); - for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) { - if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i); - return PARAM_INVALID; - } - } - return SUCCESS; -} - -Status DataDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) { - GELOGI("Start to dump exception output"); - for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) { - if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) != - SUCCESS) { - GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i); - return PARAM_INVALID; - } - } - return SUCCESS; -} - -Status DataDumper::DumpExceptionInfo(const std::vector exception_infos) { - GELOGI("Start to dump exception info"); - for (const rtExceptionInfo &iter : exception_infos) { - 
OpDescInfo op_desc_info; - if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) { - toolkit::dumpdata::DumpData dump_data; - dump_data.set_version("2.0"); - dump_data.set_dump_time(GetNowTime()); - dump_data.set_op_name(op_desc_info.op_name); - for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) { - toolkit::dumpdata::OpInput input; - input.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.input_data_type[i]))); - input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i])); - for (auto dim : op_desc_info.input_shape[i]) { - input.mutable_shape()->add_dim(dim); - } - input.set_size(op_desc_info.input_size[i]); - GELOGI("The input size int exception is %ld", op_desc_info.input_size[i]); - dump_data.mutable_input()->Add(std::move(input)); - } - for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) { - toolkit::dumpdata::OpOutput output; - output.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.output_data_type[j]))); - output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j])); - for (auto dim : op_desc_info.output_shape[j]) { - output.mutable_shape()->add_dim(dim); - } - output.set_size(op_desc_info.output_size[j]); - GELOGI("The output size int exception is %ld", op_desc_info.output_size[j]); - dump_data.mutable_output()->Add(std::move(output)); - } - uint64_t now_time = GetNowTime(); - std::string op_name = op_desc_info.op_name; - std::string op_type = op_desc_info.op_type; - ReplaceStringElem(op_name); - ReplaceStringElem(op_type); - string dump_file_path = - "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." 
+ std::to_string(now_time); - GELOGI("The exception dump file path is %s", dump_file_path.c_str()); - - uint64_t proto_size = dump_data.ByteSizeLong(); - std::unique_ptr proto_msg(new (std::nothrow) char[proto_size]); - bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); - if (!ret || proto_size == 0) { - GELOGE(PARAM_INVALID, "Dump data proto serialize failed"); - return PARAM_INVALID; - } - - GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)), - "Failed to dump proto size"); - GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size), - "Failed to dump proto msg"); - if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump exception input failed"); - return PARAM_INVALID; - } - - if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump exception output failed"); - return PARAM_INVALID; - } - GELOGI("Dump exception info SUCCESS"); - } else { - GELOGE(PARAM_INVALID, "Get op desc info failed,task id:%u,stream id:%u", iter.taskid, iter.streamid); - return PARAM_INVALID; - } - } - return SUCCESS; -} } // namespace ge diff --git a/ge/graph/load/model_manager/data_dumper.h b/ge/graph/load/model_manager/data_dumper.h index fbe70cf0..8af07d86 100755 --- a/ge/graph/load/model_manager/data_dumper.h +++ b/ge/graph/load/model_manager/data_dumper.h @@ -36,9 +36,21 @@ namespace ge { class DataDumper { public: - DataDumper() : runtime_param_{} {} - - explicit DataDumper(const RuntimeParam &rsh) : runtime_param_(rsh) {} + explicit DataDumper(RuntimeParam *rsh) + : model_name_(), + model_id_(0), + runtime_param_(rsh), + dev_mem_load_(nullptr), + dev_mem_unload_(nullptr), + op_list_(), + input_map_(), + load_flag_(false), + device_id_(0), + global_step_(0), + loop_per_iter_(0), + loop_cond_(0), + compute_graph_(nullptr), + ref_info_() {} ~DataDumper(); @@ -58,8 +70,6 @@ class DataDumper { void 
SaveDumpInput(const std::shared_ptr &node); - void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id); - // args is device memory stored first output addr void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr &op_desc, uintptr_t args); void SaveEndGraphId(uint32_t task_id, uint32_t stream_id); @@ -75,14 +85,8 @@ class DataDumper { void SetDumpProperties(const DumpProperties &dump_properties) { dump_properties_ = dump_properties; } const DumpProperties &GetDumpProperties() const { return dump_properties_; } - bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; const std::vector &GetAllOpDescInfo() const { return op_desc_info_; } - // Dump exception info - Status DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file); - Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file); - Status DumpExceptionInfo(const std::vector exception_infos); - private: void ReleaseDevMem(void **ptr) noexcept; @@ -93,10 +97,10 @@ class DataDumper { // for inference data dump std::string om_name_; - uint32_t model_id_ = 0; - const RuntimeParam &runtime_param_; - void *dev_mem_load_ = nullptr; - void *dev_mem_unload_ = nullptr; + uint32_t model_id_; + RuntimeParam *runtime_param_; + void *dev_mem_load_; + void *dev_mem_unload_; struct InnerDumpInfo; struct InnerInputMapping; @@ -107,12 +111,12 @@ class DataDumper { uint32_t end_graph_stream_id_ = 0; bool is_end_graph_ = false; std::multimap input_map_; // release after DavinciModel::Init - bool load_flag_ = false; - uint32_t device_id_ = 0; - uintptr_t global_step_ = 0; - uintptr_t loop_per_iter_ = 0; - uintptr_t loop_cond_ = 0; - ComputeGraphPtr compute_graph_ = nullptr; // release after DavinciModel::Init + bool load_flag_; + uint32_t device_id_; + uintptr_t global_step_; + uintptr_t loop_per_iter_; + uintptr_t loop_cond_; + ComputeGraphPtr compute_graph_; // release after 
DavinciModel::Init std::map ref_info_; // release after DavinciModel::Init void *l1_fusion_addr_ = nullptr; diff --git a/ge/graph/load/model_manager/data_inputer.h b/ge/graph/load/model_manager/data_inputer.h index 14ebcea5..b8d145d4 100755 --- a/ge/graph/load/model_manager/data_inputer.h +++ b/ge/graph/load/model_manager/data_inputer.h @@ -134,6 +134,8 @@ class DataInputer { /// void Stop() { queue_.Stop(); } + uint32_t Size() { return queue_.Size(); } + private: /// /// @ingroup domi_ome diff --git a/ge/graph/load/model_manager/davinci_model.cc b/ge/graph/load/model_manager/davinci_model.cc index 933aba5a..0d4b5b84 100755 --- a/ge/graph/load/model_manager/davinci_model.cc +++ b/ge/graph/load/model_manager/davinci_model.cc @@ -31,6 +31,7 @@ #include "common/scope_guard.h" #include "common/thread_pool.h" #include "framework/common/debug/ge_log.h" +#include "framework/common/util.h" #include "graph/common/ge_call_wrapper.h" #include "graph/compute_graph.h" #include "graph/debug/ge_attr_define.h" @@ -59,6 +60,7 @@ #include "securec.h" #include "graph/common/local_context.h" #include "common/formats/utils/formats_trans_utils.h" +#include "graph/common/omg_util.h" // create std::thread, catch exceptions using try/catch #define CREATE_STD_THREAD(thread_id, func, args) \ @@ -66,6 +68,8 @@ try { \ thread_id = std::thread(func, args); \ } catch (const std::system_error &e) { \ + REPORT_CALL_ERROR("E19999", "Create thread fail, ecode:%d, emsg:%s", \ + e.code().value(), e.what()); \ GELOGE(FAILED, "Caught system_error with code:%d, meaning:%s", e.code().value(), e.what()); \ GELOGE(FAILED, "Thread creat FAIL, Please check the left resource!"); \ return FAILED; \ @@ -184,7 +188,7 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptrRelease(), "Release task failed."); } } + + for (auto &item : label_goto_args_) { + GE_FREE_RT_LOG(item.second.first); + } + label_goto_args_.clear(); } Status DavinciModel::Assign(const GeModelPtr &ge_model) { @@ -323,6 +332,8 @@ 
void DavinciModel::Shrink() { Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weight_size) { if (is_weight_mem_has_inited_) { + REPORT_INNER_ERROR("E19999", "Call InitWeightMem more than once, model_id:%u, check invalid", + model_id_); GELOGE(FAILED, "call InitWeightMem more than once."); return FAILED; } @@ -333,6 +344,8 @@ Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weigh GE_CHECK_LE(weights_size, ALLOC_MEMORY_MAX_SIZE); if ((weight_ptr != nullptr) && (weight_size < weights_size)) { + REPORT_INNER_ERROR("E19999", "Param weight_ptr is nullptr or ge_model.weight.size:%zu < param weights_size:%zu, " + "model_id:%u, check invalid", weight_size, weights_size, model_id_); GELOGE(FAILED, "Invalid mem param: weight_size=%zu totalsize=%zu.", weight_size, weights_size); return FAILED; } @@ -346,6 +359,8 @@ Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weigh if (weight_ptr == nullptr) { weights_mem_base_ = MallocWeightsMem(weights_size); if (weights_mem_base_ == nullptr) { + REPORT_CALL_ERROR("E19999", "MallocWeightsMem fail, weights_size:%zu, model_id:%u, check invalid", + weights_size, model_id_); GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc weight memory failed. 
size: %zu", weights_size); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -364,6 +379,8 @@ Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weigh Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { if (is_feature_map_mem_has_inited_) { + REPORT_INNER_ERROR("E19999", "Call InitFeatureMapMem more than once, model_id:%u, check invalid", + model_id_); GELOGE(PARAM_INVALID, "call InitFeatureMapMem more than once"); return PARAM_INVALID; } @@ -373,6 +390,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { std::size_t p2p_data_size = P2PMemInfos().at(RT_MEMORY_P2P_DDR).memory_size; if ((dev_ptr != nullptr) && (mem_size < TotalMemSize())) { + REPORT_INNER_ERROR("E19999", "Param dev_ptr is nullptr or mem_size:%zu < ge_model.mem_size:%zu, " + "model_id:%u, check invalid", mem_size, TotalMemSize(), model_id_); GELOGE(PARAM_INVALID, "Invalid mem param: mem_size=%zu totalsize=%zu.", mem_size, TotalMemSize()); return PARAM_INVALID; } @@ -384,6 +403,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { if (TotalMemSize() && mem_base_ == nullptr) { mem_base_ = MallocFeatureMapMem(data_size); if (mem_base_ == nullptr) { + REPORT_CALL_ERROR("E19999", "MallocFeatureMapMem fail, data_size:%zu, model_id:%u, check invalid", + data_size, model_id_); GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc feature map memory failed. 
size: %zu", data_size); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -400,6 +421,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { if (p2p_data_size != 0) { p2p_mem_base_ = MallocP2PMem(p2p_data_size); if (p2p_mem_base_ == nullptr) { + REPORT_CALL_ERROR("E19999", "MallocFeatureMapMem fail, p2p_data_size:%zu, model_id:%u, check invalid", + p2p_data_size, model_id_); GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc p2p memory failed,size: %zu", p2p_data_size); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -421,6 +444,8 @@ Status DavinciModel::InitVariableMem() { if (TotalVarMemSize() && (var_mem_base_ == nullptr)) { Status ret = VarManager::Instance(session_id_)->MallocVarMemory(TotalVarMemSize()); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "MallocVarMemory fail, var_size:%zu, model_id:%u, check invalid", + TotalVarMemSize(), model_id_); GELOGE(ret, "Malloc variable memory failed."); return ret; } @@ -561,6 +586,8 @@ Status DavinciModel::SetTSDevice() { GELOGD("SetTSDevice: %u.", core_type); rtError_t rt_ret = rtSetTSDevice(core_type); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtSetTSDevice failed, core_type:%u, model_id:%u", + core_type, model_id_); GELOGE(RT_FAILED, "SetTSDevice failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -569,7 +596,7 @@ Status DavinciModel::SetTSDevice() { Status DavinciModel::OpDebugRegister() { if (GetDumpProperties().IsOpDebugOpen()) { - uint32_t op_debug_mode = GetDumpProperties().GetOpDebugMode(); + uint32_t op_debug_mode = GetDumpProperties().GetOpDebugMode(); auto ret = opdebug_register_.RegisterDebugForModel(rt_model_handle_, op_debug_mode, data_dumper_); if (ret != SUCCESS) { GELOGE(ret,"Register known shape op debug failed, ret: 0x%X",ret); @@ -638,9 +665,12 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size GELOGI("Logical stream index:%u, stream:%p, rtstream: %d.", i, stream, rt_stream_id); } - for (uint32_t i = 
0; i < EventNum(); i++) { - rtEvent_t rt_event; - GE_CHK_RT_RET(rtEventCreate(&rt_event)); + uint32_t event_num = EventNum(); + uint32_t create_flag = static_cast((event_num > kEventReuseThreshold) ? RT_EVENT_WITH_FLAG : + RT_EVENT_DEFAULT); + for (uint32_t i = 0; i < event_num; ++i) { + rtEvent_t rt_event = nullptr; + GE_CHK_RT_RET(rtEventCreateWithFlag(&rt_event, create_flag)); event_list_.push_back(rt_event); } @@ -654,12 +684,12 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size runtime_param_.graph_id = compute_graph->GetGraphID(); // op debug register - GE_CHK_STATUS_RET(OpDebugRegister(), "OpDebugRegister failed."); + GE_CHK_STATUS_RET(OpDebugRegister(), "OpDebugRegister failed"); GE_TIMESTAMP_START(TransAllVarData); - GE_CHK_STATUS_RET(TransAllVarData(compute_graph, runtime_param_.graph_id), "TransAllVarData failed."); + GE_CHK_STATUS_RET(TransAllVarData(compute_graph, runtime_param_.graph_id), "TransAllVarData failed"); GE_TIMESTAMP_END(TransAllVarData, "GraphLoader::TransAllVarData"); - GE_CHK_STATUS_RET(TransVarDataUtils::CopyVarData(compute_graph, session_id_, device_id_), "copy var data failed."); + GE_CHK_STATUS_RET(TransVarDataUtils::CopyVarData(compute_graph, session_id_, device_id_), "copy var data failed"); GE_TIMESTAMP_START(InitModelMem); GELOGD("Known node is %d.", known_node_); @@ -667,7 +697,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size if (!known_node_) { GE_CHK_STATUS_RET_NOLOG(InitFeatureMapAndP2PMem(dev_ptr, mem_size)); data_inputer_ = new (std::nothrow) DataInputer(); - GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, MEMALLOC_FAILED, "data_inputer_ is nullptr."); + GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, MEMALLOC_FAILED, "data_inputer_ is nullptr"); } fixed_mem_base_ = reinterpret_cast(mem_base_); GE_TIMESTAMP_END(InitModelMem, "GraphLoader::InitModelMem"); @@ -853,6 +883,8 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { continue; 
} + // for dynamic shape with control flow + SetLabelForDynamic(node); auto it = op_desc_handle.find(op_desc->GetType()); if (it != op_desc_handle.end()) { if ((this->*it->second)(op_desc) != SUCCESS) { @@ -861,8 +893,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { } continue; } - // for dynamic shape with control flow - SetLabelForDynamic(node); + if (IsNoTaskAndDumpNeeded(op_desc)) { GELOGD("node[%s] without task, and save op_desc and addr for dump", op_desc->GetName().c_str()); const RuntimeParam &rts_param = GetRuntimeParam(); @@ -879,6 +910,8 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { rtError_t rt_ret = rtMemcpy(addr, size, tensor_device_addrs.data(), size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", + size, rt_ret); GELOGE(RT_FAILED, "rtMemcpy error, ret: 0x%X", rt_ret); GE_CHK_RT(rtFree(addr)); return RT_ERROR_TO_GE_STATUS(rt_ret); @@ -904,11 +937,12 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { } void DavinciModel::SetLabelForDynamic(const NodePtr &node) { - if (known_node_ && node->GetOpDesc()->GetType() == LABELSWITCHBYINDEX) { + if (known_node_ && (node->GetType() == LABELSWITCHBYINDEX || node->GetType() == STREAMSWITCH)) { for (auto &in_data_anchor : node->GetAllInDataAnchors()) { auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); if (peer_out_data_anchor != nullptr) { - string tensor_name = node->GetName(); + // name+index as the label of switch input + string tensor_name = node->GetName() + std::to_string(in_data_anchor->GetIdx()); auto peer_node = peer_out_data_anchor->GetOwnerNode(); (void)AttrUtils::SetStr(peer_node->GetOpDesc(), ATTR_DYNAMIC_SHAPE_FIXED_ADDR, tensor_name); (void)AttrUtils::SetInt(peer_node->GetOpDesc(), ATTR_DYNAMIC_SHAPE_FIXED_ADDR_INDEX, 0); @@ -953,6 +987,11 @@ Status DavinciModel::InitDataOp(const ComputeGraphPtr &graph, const NodePtr &nod 
const vector output_offset_list = op_desc->GetOutputOffset(); if (output_size_list.empty() || virtual_addr_list.empty() || (output_size_list.size() != virtual_addr_list.size()) || (output_offset_list.size() != virtual_addr_list.size())) { + REPORT_INNER_ERROR( + "E19999", "Check data fail in op:%s(%s), output_desc size:%zu output addr size:%zu output offset size:%zu " + "not equal or has empty, model_id:%u", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + output_size_list.size(), virtual_addr_list.size(), output_offset_list.size(), model_id_); GELOGE(PARAM_INVALID, "Data[%s] init failed: output size is %zu, virtual_addr size is %zu, offset size is %zu.", op_desc->GetName().c_str(), output_size_list.size(), virtual_addr_list.size(), output_offset_list.size()); return PARAM_INVALID; @@ -1070,12 +1109,22 @@ Status DavinciModel::InitNetOutput(const ComputeGraphPtr &graph, const NodePtr & const vector virtual_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc); const vector input_offset_list = op_desc->GetInputOffset(); GE_IF_BOOL_EXEC(input_offset_list.size() != virtual_addr_list.size(), - GELOGE(PARAM_INVALID, "virtual_addr size should be equal to offset size."); return PARAM_INVALID;); + REPORT_INNER_ERROR( + "E19999", "Check data fail in op:%s(%s), input addr size:%zu input offset size:%zu " + "not equal, model_id:%u", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + virtual_addr_list.size(), input_offset_list.size(), model_id_); + GELOGE(PARAM_INVALID, "virtual_addr size should be equal to offset size."); + return PARAM_INVALID;); if (input_size_list.empty() && virtual_addr_list.empty()) { GELOGI("NetOutput[%s] is empty.", op_desc->GetName().c_str()); return SUCCESS; } if (input_size_list.empty() || input_size_list.size() != virtual_addr_list.size()) { + REPORT_INNER_ERROR( + "E19999", "Check data fail in op:%s(%s), input_desc size:%zu input addr size:%zu not equal or has empty, " + "model_id:%u", op_desc->GetName().c_str(), 
op_desc->GetType().c_str(), + input_size_list.size(), virtual_addr_list.size(), model_id_); GELOGE(PARAM_INVALID, "NetOutput[%s] init failed: Input size is %zu, Input addr is %zu", op_desc->GetName().c_str(), input_size_list.size(), virtual_addr_list.size()); return PARAM_INVALID; @@ -1173,6 +1222,9 @@ Status DavinciModel::GetGetDynamicDimsNodeInfo(const NodePtr &node) { auto in_anchor = node->GetAllInDataAnchors().at(get_dynamic_dims_index); auto peer_out_anchor = in_anchor->GetPeerOutAnchor(); if (peer_out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "In anchor index:%zu in op:%s(%s) peer anchor is nullptr, model_id:%u, check invalid", + get_dynamic_dims_index, + node->GetName().c_str(), node->GetType().c_str(), model_id_); GELOGE(PARAM_INVALID, "Out anchor of getdynmaicdims node should not be nullptr."); return PARAM_INVALID; } @@ -1184,12 +1236,18 @@ Status DavinciModel::GetGetDynamicDimsNodeInfo(const NodePtr &node) { auto input_addr = ModelUtils::GetInputDataAddrs(runtime_param_, node->GetOpDesc()); auto input_size = ModelUtils::GetInputSize(node->GetOpDesc()); if (input_addr.empty() || input_size.empty()) { + REPORT_INNER_ERROR("E19999", "input_addr size:%zu or input_length size:%zu in op:%s(%s) has empty, model_id:%u " + "check invalid", input_addr.size(), input_size.size(), + node->GetName().c_str(), node->GetType().c_str(), model_id_); GELOGE(PARAM_INVALID, "Not set output of %s", op_desc->GetName().c_str()); return PARAM_INVALID; } auto input_desc = node->GetOpDesc()->GetInputDescPtr(get_dynamic_dims_index); GE_CHECK_NOTNULL(input_desc); if (input_desc->GetShape().GetDims().empty()) { + REPORT_INNER_ERROR("E19999", "input_desc_index:%zu in op:%s(%s) shape dim is empty, model_id:%u, check invalid", + get_dynamic_dims_index, + node->GetName().c_str(), node->GetType().c_str(), model_id_); GELOGE(PARAM_INVALID, "Not set output desc shape of %s.", op_desc->GetName().c_str()); return PARAM_INVALID; } @@ -1234,6 +1292,8 @@ Status 
DavinciModel::GetRealOutputSizeOfCase(const ComputeGraphPtr &graph, size_ for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_INNER_ERROR("E19999", "Get name:%s subgraph in graph:%s fail, model_id:%u, check invalid", + name.c_str(), graph->GetName().c_str(), model_id_); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s.", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -1246,6 +1306,10 @@ Status DavinciModel::GetRealOutputSizeOfCase(const ComputeGraphPtr &graph, size_ size_t batch_index = static_cast(stoi(batch_label.substr(batch_label.rfind('_') + 1))); GELOGD("Batch index of %s is %zu.", op_desc->GetName().c_str(), batch_index); if (batch_index > all_gears_info_.size()) { + REPORT_INNER_ERROR("E19999", "Batch_index:%zu in op:%s(%s) > all_gears_info.size:%zu, model_id:%u, " + "check invalid", batch_index, + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + all_gears_info_.size(), model_id_); GELOGE(PARAM_INVALID, "The value of ATTR_NAME_BATCH_LABEL is invalid."); return PARAM_INVALID; } @@ -1255,6 +1319,9 @@ Status DavinciModel::GetRealOutputSizeOfCase(const ComputeGraphPtr &graph, size_ GE_CHECK_NOTNULL(tensor_desc); int64_t data_size = 0; if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, data_size) != GRAPH_SUCCESS) { + REPORT_INNER_ERROR("E19999", "Get input TensorSize in op:%s(%s) failed, input_index:%zu, model_id:%u", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + input_index, model_id_); GELOGE(FAILED, "Get tensor size in bytes failed."); return FAILED; } @@ -1296,6 +1363,9 @@ Status DavinciModel::GetGearAndRealOutShapeInfo(const ComputeGraphPtr &graph, co for (auto &it : dynamic_output_shape) { auto gear_index = static_cast(it[0]); if (gear_index > all_gears_info_.size()) { + REPORT_INNER_ERROR("E19999", "gear index:%zu in op:%s(%s) > all_gears_info.size:%zu in model:%u " + "check invalid", gear_index, 
op_desc->GetName().c_str(), op_desc->GetType().c_str(), + all_gears_info_.size(), model_id_); GELOGE(PARAM_INVALID, "The value of cur index: %zu is invalid.", static_cast(it[0])); return PARAM_INVALID; } @@ -1334,6 +1404,44 @@ void DavinciModel::ParseDynamicOutShape(const std::vector &str_info } } +Status DavinciModel::GetLabelGotoAddr(uint32_t label_index, rtMemType_t mem_type, void *&arg_addr, uint32_t &arg_size) { + std::lock_guard lock(label_args_mutex_); + auto it = label_goto_args_.find(label_index); + if (it != label_goto_args_.end()) { + arg_addr = it->second.first; + arg_size = it->second.second; + return SUCCESS; + } + + if (label_index >= label_list_.size()) { + REPORT_INNER_ERROR("E19999", "Param label index:%u >= label_list_.size:%zu in model:%u, check invalid", + label_index, label_list_.size(), model_id_); + GELOGE(INTERNAL_ERROR, "Invalid label id:%u, label size:%zu", label_index, label_list_.size()); + return INTERNAL_ERROR; + } + GE_CHECK_NOTNULL(label_list_[label_index]); + vector label_used = { label_list_[label_index] }; + + arg_size = label_used.size() * sizeof(rtLabelDevInfo); + rtError_t rt_ret = rtMalloc(&arg_addr, arg_size, mem_type); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret: 0x%X", + arg_size, rt_ret); + GELOGE(RT_FAILED, "Call rtMalloc failed, error: %#x", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + + label_goto_args_[label_index] = { arg_addr, arg_size }; + rt_ret = rtLabelListCpy(label_used.data(), label_used.size(), arg_addr, arg_size); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtLabelListCpy failed, ret: 0x%X", rt_ret); + GELOGE(RT_FAILED, "Call rtLabelListCpy failed, error: %#x", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + + return SUCCESS; +} + /// @ingroup ge /// @brief LabelSet Op Initialize. /// @param [in] op_desc: LabelSet Op descriptor. 
@@ -1341,15 +1449,24 @@ void DavinciModel::ParseDynamicOutShape(const std::vector &str_info Status DavinciModel::InitLabelSet(const OpDescPtr &op_desc) { uint32_t label_index = 0; if (!AttrUtils::GetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, label_index)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail, model_id:%u, check invalid", + ATTR_NAME_LABEL_SWITCH_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), model_id_); GELOGE(INTERNAL_ERROR, "InitLabelSet: %s attr [%s] not exist.", op_desc->GetName().c_str(), ATTR_NAME_LABEL_SWITCH_INDEX.c_str()); return INTERNAL_ERROR; } if (label_index >= LabelNum()) { + REPORT_INNER_ERROR("E19999", "label_switch_index:%u in op:%s(%s) >= label_num:%u in model:%u, check invalid", + label_index, op_desc->GetName().c_str(), op_desc->GetType().c_str(), + LabelNum(), model_id_); GELOGE(INTERNAL_ERROR, "InitLabelSet: label index: %u >= label size: %u.", label_index, LabelNum()); return INTERNAL_ERROR; } if (label_id_indication_.count(label_index) > 0) { + REPORT_INNER_ERROR("E19999", "label_switch_index:%u in op:%s(%s) is already used in model:%u, check invalid", + label_index, op_desc->GetName().c_str(), op_desc->GetType().c_str(), + model_id_); GELOGE(INTERNAL_ERROR, "InitLabelSet: %s label index: %u already used.", op_desc->GetName().c_str(), label_index); return INTERNAL_ERROR; } @@ -1361,6 +1478,9 @@ Status DavinciModel::InitLabelSet(const OpDescPtr &op_desc) { } else if (stream_list_.size() > stream_id) { stream = stream_list_[stream_id]; } else { + REPORT_INNER_ERROR("E19999", "stream_id:%u in op:%s(%s) >= stream size:%zu in model:%u, check invalid", + stream_id, op_desc->GetName().c_str(), op_desc->GetType().c_str(), + stream_list_.size(), model_id_); GELOGE(INTERNAL_ERROR, "InitLabelSet: stream index: %u >= stream size: %zu.", stream_id, stream_list_.size()); return INTERNAL_ERROR; } @@ -1368,6 +1488,8 @@ Status DavinciModel::InitLabelSet(const OpDescPtr &op_desc) { rtLabel_t rt_label = 
nullptr; rtError_t rt_error = rtLabelCreateExV2(&rt_label, rt_model_handle_, stream); if (rt_error != RT_ERROR_NONE || rt_label == nullptr) { + REPORT_CALL_ERROR("E19999", "Call rtLabelCreateExV2 failed, ret: 0x%X", + rt_error); GELOGE(INTERNAL_ERROR, "InitLabelSet: %s create label failed, error=0x%x.", op_desc->GetName().c_str(), rt_error); return INTERNAL_ERROR; } @@ -1406,6 +1528,9 @@ Status DavinciModel::InitVariable(const OpDescPtr &op_desc, map &input_queue_ids, const std::vector &output_queue_ids) { if (input_queue_ids.empty() && output_queue_ids.empty()) { + REPORT_INNER_ERROR("E19999", "Param input_queue_ids.size:%zu or output_queue_ids.size:%zu is empty, model_id:%u," + "check invalid", input_queue_ids.size(), output_queue_ids.size(), + model_id_); GELOGE(ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID, "Param is empty"); return ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID; } @@ -1428,12 +1553,18 @@ Status DavinciModel::LoadWithQueue() { } if (input_queue_ids_.size() != input_data_info_.size()) { + REPORT_INNER_ERROR("E19999", "Param input_queue_ids_.size:%zu != input_data_info_.size:%zu, model_id:%u," + "check invalid", input_queue_ids_.size(), input_data_info_.size(), + model_id_); GELOGE(ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID, "Input queue ids not match model: input_queue=%zu input_data=%zu", input_queue_ids_.size(), input_data_info_.size()); return ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID; } if (output_queue_ids_.size() != output_data_info_.size()) { + REPORT_INNER_ERROR("E19999", "Param output_queue_ids_.size:%zu != output_data_info_.size:%zu, model_id:%u," + "check invalid", output_queue_ids_.size(), output_data_info_.size(), + model_id_); GELOGE(ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID, "Output queue ids not match model: output_queue=%zu output_data=%zu", output_queue_ids_.size(), output_data_info_.size()); @@ -1465,6 +1596,7 @@ Status DavinciModel::BindInputQueue() { for (size_t i = 0; i < input_queue_ids_.size(); ++i) { auto it = input_data_info_.find(i); 
if (it == input_data_info_.end()) { + GELOGE(FAILED, "Input not match: tensor num=%zu, Queue id index=%zu", input_data_info_.size(), i); return FAILED; } @@ -1481,6 +1613,7 @@ Status DavinciModel::BindInputQueue() { rtError_t rt_ret = rtModelBindQueue(rt_model_handle_, queue_id, RT_MODEL_INPUT_QUEUE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelBindQueue failed, ret: 0x%X", rt_ret); GELOGE(RT_FAILED, "Call rtModelBindQueue failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1501,6 +1634,8 @@ Status DavinciModel::CpuModelDequeue(uint32_t queue_id) { GELOGI("Set CpuKernel model dequeue task enter."); std::shared_ptr dequeue_task = MakeShared(rt_entry_stream_); if (dequeue_task == nullptr) { + REPORT_CALL_ERROR("E19999", "New CpuTaskModelDequeue failed, model_id:%u", + model_id_); GELOGE(MEMALLOC_FAILED, "Make CpuTaskModelDequeue task failed."); return MEMALLOC_FAILED; } @@ -1523,6 +1658,8 @@ Status DavinciModel::CpuTaskModelZeroCopy(std::vector &mbuf_list, GELOGI("Set CpuKernel model zero_copy task enter."); std::shared_ptr zero_copy = MakeShared(rt_entry_stream_); if (zero_copy == nullptr) { + REPORT_CALL_ERROR("E19999", "New CpuTaskZeroCopy failed, model_id:%u", + model_id_); GELOGE(MEMALLOC_FAILED, "Make CpuTaskZeroCopy task failed."); return MEMALLOC_FAILED; } @@ -1545,12 +1682,16 @@ Status DavinciModel::BindOutputQueue() { for (size_t i = 0; i < output_queue_ids_.size(); ++i) { auto it = output_data_info_.find(i); if (it == output_data_info_.end()) { + REPORT_INNER_ERROR("E19999", "Index:%zu can't find in output_data_info_ size:%zu in model_id:%u, check invalid", + i, output_data_info_.size(), model_id_); GELOGE(FAILED, "Output not match: tensor num=%zu, Queue id index=%zu", output_data_info_.size(), i); return FAILED; } uint32_t queue_id = output_queue_ids_[i]; if (it->second.GetDataInfo().empty()) { + REPORT_INNER_ERROR("E19999", "Index:%zu out_data_info in model:%u is empty, check invalid", + i, model_id_); 
GELOGE(INTERNAL_ERROR, "the %zu output_queue not set data_info.", i); return INTERNAL_ERROR; } @@ -1561,6 +1702,8 @@ Status DavinciModel::BindOutputQueue() { rtError_t rt_ret = rtModelBindQueue(rt_model_handle_, queue_id, RT_MODEL_OUTPUT_QUEUE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelBindQueue failed, queue_id:%u, ret: 0x%X", + queue_id, rt_ret); GELOGE(RT_FAILED, "Call rtModelBindQueue failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1582,12 +1725,16 @@ Status DavinciModel::BindOutputQueue() { Status DavinciModel::CpuModelPrepareOutput(uintptr_t addr, uint32_t size) { GELOGI("Set CpuKernel model enqueue task enter."); if (input_mbuf_list_.empty()) { + REPORT_INNER_ERROR("E19999", "input_mbuf_list_ is empty, model_id:%u, check invalid", + model_id_); GELOGE(FAILED, "Need input mbuf for fill output mbuf head info."); return FAILED; } std::shared_ptr prepare_output = MakeShared(rt_entry_stream_); if (prepare_output == nullptr) { + REPORT_CALL_ERROR("E19999", "New CpuTaskPrepareOutput failed, model_id:%u", + model_id_); GELOGE(MEMALLOC_FAILED, "Make CpuTaskPrepareOutput task failed."); return MEMALLOC_FAILED; } @@ -1612,6 +1759,8 @@ Status DavinciModel::CpuActiveStream() { GELOGI("Set CpuKernel active stream task enter."); std::shared_ptr active_entry = MakeShared(rt_entry_stream_); if (active_entry == nullptr) { + REPORT_CALL_ERROR("E19999", "New CpuTaskActiveEntry failed, model_id:%u", + model_id_); GELOGE(MEMALLOC_FAILED, "Make CpuTaskActiveEntry task failed."); return MEMALLOC_FAILED; } @@ -1633,6 +1782,8 @@ Status DavinciModel::CpuWaitEndGraph() { GELOGI("Set CpuKernel wait end graph task enter."); std::shared_ptr wait_endgraph = MakeShared(rt_entry_stream_); if (wait_endgraph == nullptr) { + REPORT_CALL_ERROR("E19999", "New CpuTaskWaitEndGraph failed, model_id:%u", + model_id_); GELOGE(MEMALLOC_FAILED, "Make CpuTaskWaitEndGraph task failed."); return MEMALLOC_FAILED; } @@ -1651,6 +1802,8 @@ Status 
DavinciModel::BindEnqueue() { for (size_t i = 0; i < output_queue_ids_.size(); ++i) { auto it = output_data_info_.find(i); if (it == output_data_info_.end()) { + REPORT_INNER_ERROR("E19999", "Index:%zu can't find in output_data_info_ size:%zu in model_id:%u, check invalid", + i, output_data_info_.size(), model_id_); GELOGE(FAILED, "Output not match: tensor num=%zu, Queue id index=%zu", output_data_info_.size(), i); return FAILED; } @@ -1667,6 +1820,8 @@ Status DavinciModel::CpuModelEnqueue(uint32_t queue_id, uintptr_t out_mbuf) { GELOGI("Set CpuKernel model enqueue task enter."); std::shared_ptr model_enqueue = MakeShared(rt_entry_stream_); if (model_enqueue == nullptr) { + REPORT_CALL_ERROR("E19999", "New CpuTaskModelEnqueue failed, model_id:%u", + model_id_); GELOGE(MEMALLOC_FAILED, "Make CpuTaskModelEnqueue task failed."); return MEMALLOC_FAILED; } @@ -1687,6 +1842,8 @@ Status DavinciModel::CpuModelRepeat() { GELOGI("Set CpuKernel repeat task enter."); std::shared_ptr model_repeat = MakeShared(rt_entry_stream_); if (model_repeat == nullptr) { + REPORT_CALL_ERROR("E19999", "New CpuTaskModelRepeat failed, model_id:%u", + model_id_); GELOGE(MEMALLOC_FAILED, "Make CpuTaskModelRepeat task failed."); return MEMALLOC_FAILED; } @@ -1720,6 +1877,8 @@ Status DavinciModel::GetInputOutputDescInfo(vector &input_d vector &input_formats, vector &output_formats, bool by_dims) { if (input_addrs_list_.empty() || input_addrs_list_[0].size() != 1) { + REPORT_INNER_ERROR("E19999", "input_addrs_list_ is empty or first member size != 1, model_id:%u, " + "check invalid", model_id_); GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1!"); return FAILED; } @@ -1830,6 +1989,9 @@ Status DavinciModel::InitAippType(uint32_t index, const OpDescPtr &op_desc, cons } else if (data_mode == "dynamic_aipp_conf") { aipp_type = DYNAMIC_AIPP_NODE; } else { + REPORT_INNER_ERROR("E19999", "Attr:%s data_mode:%s in op:%s(%s), model_id:%u, check invalid", + 
ATTR_DATA_RELATED_AIPP_MODE.c_str(), data_mode.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), model_id_); GELOGE(ACL_ERROR_GE_AIPP_MODE_INVALID, "The info of aipp releated info %s is invalid with index %u.", data_mode.c_str(), index); return ACL_ERROR_GE_AIPP_MODE_INVALID; @@ -1979,7 +2141,11 @@ Status DavinciModel::GetInputDescInfo(vector &input_descs, void DavinciModel::CreateOutput(uint32_t index, const OpDescPtr &op_desc, InputOutputDescInfo &output, uint32_t &format_result) { /// netoutput input tensor desc - GE_IF_BOOL_EXEC(op_desc->GetInputDescPtr(index) == nullptr, GELOGE(FAILED, "OpDesc GetInputDescPtr is nullptr"); + GE_IF_BOOL_EXEC(op_desc->GetInputDescPtr(index) == nullptr, + REPORT_INNER_ERROR("E19999", "input_desc index:%u in op:%s(%s) not exist, model_id:%u, " + "check invalid", index, + op_desc->GetName().c_str(), op_desc->GetType().c_str(), model_id_); + GELOGE(FAILED, "OpDesc GetInputDescPtr is nullptr"); return); Format format = op_desc->GetInputDescPtr(index)->GetFormat(); GeShape shape = op_desc->GetInputDescPtr(index)->GetShape(); @@ -2069,6 +2235,8 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data const std::vector &blobs = input_data.blobs; for (const auto &data : input_data_info_) { if (data.first >= blobs.size()) { + REPORT_INNER_ERROR("E19999", "index:%u in input_data_info_ >= input_data.blobs.size:%zu, model_id:%u, " + "check invalid", data.first, blobs.size(), model_id_); GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld, op_name(%s)", blobs.size(), input_data_info_.size(), data.first, data.second.GetDataInfo().at(0).first, data.second.GetOpName().c_str()); @@ -2098,7 +2266,6 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data Status DavinciModel::SyncVarData() { GELOGI("Sync var data, model id:%u", model_id_); - Status ret = SUCCESS; if (global_step_addr_ != nullptr && global_step_size_ != 0) { const vector v_step = { 
iterator_count_ }; @@ -2106,7 +2273,7 @@ Status DavinciModel::SyncVarData() { RT_MEMCPY_HOST_TO_DEVICE)); } - return ret; + return SUCCESS; } Status DavinciModel::InitModelProfile() { @@ -2219,8 +2386,12 @@ Status DavinciModel::SinkModelProfile() { try { reported_data = model_load_info.dump(kInteval, ' ', false, Json::error_handler_t::ignore); } catch (std::exception &e) { + REPORT_INNER_ERROR("E19999", "Convert model_load_info JSON to string failed, model_id:%u, reason:%s", + model_id_, e.what()); GELOGE(FAILED, "Failed to convert JSON to string, reason: %s.", e.what()); } catch (...) { + REPORT_INNER_ERROR("E19999", "Convert model_load_info JSON to string failed, model_id:%u", + model_id_); GELOGE(FAILED, "Failed to convert JSON to string."); } reported_data.append(",") @@ -2255,8 +2426,12 @@ Status DavinciModel::SinkTimeProfile(const InputData ¤t_data) { try { reported_data = model_time_info.dump(kInteval, ' ', false, Json::error_handler_t::ignore); } catch (std::exception &e) { + REPORT_INNER_ERROR("E19999", "Convert model_time_info JSON to string failed, model_id:%u, reason:%s", + model_id_, e.what()); GELOGE(FAILED, "Failed to convert JSON to string, reason: %s.", e.what()); } catch (...) 
{ + REPORT_INNER_ERROR("E19999", "Convert model_time_info JSON to string failed, model_id:%u", + model_id_); GELOGE(FAILED, "Failed to convert JSON to string."); } reported_data.append(",") @@ -2323,6 +2498,9 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r output_data.index = data_id; output_data.model_id = model_id_; if (output_data.blobs.size() != output_data_info_.size()) { + REPORT_INNER_ERROR("E19999", "output_data.blobs.size:%zu != output_data_info.size:%zu, model_id:%u, " + "check invalid", + output_data.blobs.size(), output_data_info_.size(), model_id_); GELOGE(FAILED, "Output data buffer num=%zu not equal model data num=%zu", output_data.blobs.size(), output_data_info_.size()); return FAILED; @@ -2332,6 +2510,8 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r size_t idx = 0; for (const auto &output : output_data_info_) { if (output.first >= blobs.size()) { + REPORT_INNER_ERROR("E19999", "index:%u in output_data_info_ >= output_data.blobs.size:%zu, model_id:%u, " + "check invalid", output.first, blobs.size(), model_id_); GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld", blobs.size(), input_data_info_.size(), output.first, output.second.GetDataInfo().at(0).first); return FAILED; @@ -2350,6 +2530,9 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r if (is_dynamic_) { GELOGI("No need to check output data size."); } else if (buffer.length < mem_size) { + REPORT_INNER_ERROR("E19999", "Buffer.length:%lu in output blob < mem_size:%lu in output_data_info, index:%u, " + "model_id:%u, check invalid", buffer.length, mem_size, output.first, + model_id_); GELOGE(FAILED, "Tensor data size=%lu, buffer size=%lu", mem_size, buffer.length); return FAILED; } else if (buffer.length > mem_size) { @@ -2386,6 +2569,10 @@ Status DavinciModel::InitOutputTensorInfo(const OpDescPtr &op_desc) { GE_CHECK_NOTNULL(input_desc); auto ret = 
TensorUtils::GetTensorSizeInBytes(*input_desc, size); GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_INNER_ERROR("E19999", "Get input TensorSize in op:%s(%s) failed, input_index:%zu, " + "model_id:%u", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), i, + model_id_); GELOGE(ret, "Get size from TensorDesc failed, op:%s, input id:%zu", op_desc->GetName().c_str(), i); return ret); const GeShape &shape = input_desc->GetShape(); @@ -2428,6 +2615,8 @@ Status DavinciModel::GenOutputTensorInfo(OutputData *output_data, vector data_buf(new (std::nothrow) uint8_t[output_buffer_size[i]]); if (data_buf == nullptr) { + REPORT_CALL_ERROR("E19999", "New buffer failed, size:%ld, model_id:%u", + output_buffer_size[i], model_id_); GELOGE(GE_GRAPH_MALLOC_FAILED, "Malloc buffer failed."); return GE_GRAPH_MALLOC_FAILED; } @@ -2467,9 +2656,9 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b GE_CHECK_NOTNULL(model_manager); auto exception_infos = model_manager->GetExceptionInfos(); if (exception_infos.size() > 0) { - GE_CHK_STATUS_RET(data_dumper_.DumpExceptionInfo(exception_infos), "Dump exception info failed"); + GE_CHK_STATUS_RET(DumpExceptionInfo(exception_infos), "[Dump][Exception] Dump exception info failed."); } else { - GELOGI("Exception info is null"); + GELOGI("[Dump][Exception] Exception info is null."); } GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed."); return INTERNAL_ERROR; @@ -2539,6 +2728,7 @@ void *DavinciModel::Run(DavinciModel *model) { GELOGI("Model Run thread start, model_id:%u.", model_id); rtError_t rt_ret = rtSetDevice(static_cast(device_id)); if (rt_ret != RT_ERROR_NONE) { + GELOGE(FAILED, "Model run rtsetdevice failed."); return nullptr; } @@ -2547,6 +2737,8 @@ void *DavinciModel::Run(DavinciModel *model) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); while (model->RunFlag()) { + // Model hasn't truly started 
runing before received data + model->SetRunningFlag(false); bool rslt_flg = true; if (model->GetDataInputer() == nullptr) { GELOGW("Data inputer is nullptr."); @@ -2556,6 +2748,8 @@ void *DavinciModel::Run(DavinciModel *model) { std::shared_ptr data_wrapper; Status ret = model->GetDataInputer()->Pop(data_wrapper); + // Model run indeedly start after received data. + model->SetRunningFlag(true); if (data_wrapper == nullptr || ret != SUCCESS) { GELOGI("data_wrapper is null!"); continue; @@ -2642,7 +2836,9 @@ void *DavinciModel::Run(DavinciModel *model) { model->iterator_count_++; model->is_first_execute_ = false; - GELOGI("run iterator count is %lu", model->iterator_count_); + // model run finished + model->SetRunningFlag(false); + GELOGI("run iterator count is %lu, model_id:%u", model->iterator_count_, model->model_id_); } CsaInteract::GetInstance().WriteInternalErrorCode(); @@ -2700,7 +2896,7 @@ Status DavinciModel::ModelRunStart() { error_context_ = ErrorManager::GetInstance().GetErrorContext(); CREATE_STD_THREAD(thread_id_, DavinciModel::Run, this); - GELOGI("model tread create success, model id:%u.", model_id_); + GELOGI("model thread create success, model id:%u.", model_id_); return SUCCESS; } @@ -2836,23 +3032,16 @@ Status DavinciModel::UpdateKnownNodeArgs(const vector &inputs, const vec GELOGI("DavinciModel::UpdateKnownNodeArgs in"); GE_CHK_STATUS_RET(CreateKnownZeroCopyMap(inputs, outputs), "DavinciModel::UpdateKnownNodeArgs create map for input/output zero copy."); - if (!base_addr_not_changed_) { - total_io_addrs_.clear(); - orig_total_io_addrs_.clear(); - for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) { - auto &task = task_list_[task_index]; - if (task != nullptr) { - Status ret = task->UpdateArgs(); - if (ret != SUCCESS) { - GELOGE(FAILED, "task %zu created by davinci model is nullptr.", task_index); - return FAILED; - } + total_io_addrs_.clear(); + for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) { + 
auto &task = task_list_[task_index]; + if (task != nullptr) { + Status ret = task->UpdateArgs(); + if (ret != SUCCESS) { + GELOGE(FAILED, "task %zu created by davinci model is nullptr.", task_index); + return FAILED; } } - // cache latest iterator io addr - orig_total_io_addrs_ = total_io_addrs_; - } else { - total_io_addrs_ = orig_total_io_addrs_; } GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(total_io_addrs_, false), "DavinciModel::UpdateKnownZeroCopyAddr failed."); @@ -2892,6 +3081,14 @@ Status DavinciModel::InitTaskInfo(domi::ModelTaskDef &model_task_def) { return SUCCESS; } +Status DavinciModel::CheckCapability(rtFeatureType_t featureType, int32_t featureInfo, bool &is_support) const { + int64_t value = RT_CAPABILITY_SUPPORT; + auto rt_ret = rtGetRtCapability(featureType, featureInfo, &value); + GE_CHK_BOOL_RET_STATUS(rt_ret == RT_ERROR_NONE, FAILED, "call rtGetRtCapability failed!"); + is_support = (value == RT_CAPABILITY_SUPPORT) ? true : false; + return SUCCESS; +} + Status DavinciModel::MallocKnownArgs() { GELOGI("DavinciModel::MallocKnownArgs in"); const auto &model_task_def = ge_model_->GetModelTaskDefPtr(); @@ -2910,21 +3107,26 @@ Status DavinciModel::MallocKnownArgs() { return ret; } } + rtError_t rt_ret; + bool is_support = false; + GE_CHK_STATUS_RET_NOLOG(CheckCapability(FEATURE_TYPE_MEMORY, MEMORY_INFO_TS_4G_LIMITED, is_support)); + auto mem_type = is_support ? 
RT_MEMORY_TS_4G : RT_MEMORY_HBM; // malloc args memory - if (total_args_size_ == 0) { - GELOGW("DavinciModel::MallocKnownArgs total_args_size_ equals to zero."); - return SUCCESS; - } - - rtError_t rt_ret = rtMalloc(&args_, total_args_size_, RT_MEMORY_HBM); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); - return RT_ERROR_TO_GE_STATUS(rt_ret); + if (total_args_size_ != 0) { + rt_ret = rtMalloc(&args_, total_args_size_, mem_type); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret: 0x%X", + total_args_size_, rt_ret); + GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } } // malloc dynamic and static hybrid memory if (total_hybrid_args_size_ != 0) { - rt_ret = rtMalloc(&hybrid_addrs_, total_hybrid_args_size_, RT_MEMORY_HBM); + rt_ret = rtMalloc(&hybrid_addrs_, total_hybrid_args_size_, mem_type); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret: 0x%X", + total_hybrid_args_size_, rt_ret); GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -2932,8 +3134,10 @@ Status DavinciModel::MallocKnownArgs() { // malloc fixed addr memory, eg: rts op if (total_fixed_addr_size_ != 0) { GELOGI("Begin to allocate fixed addr."); - rt_ret = rtMalloc(&fixed_addrs_, total_fixed_addr_size_, RT_MEMORY_HBM); + rt_ret = rtMalloc(&fixed_addrs_, total_fixed_addr_size_, mem_type); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret: 0x%X", + total_hybrid_args_size_, rt_ret); GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -3025,9 +3229,8 @@ Status DavinciModel::DistributeTask() { task_def.kernel_ex().op_index()); OpDescPtr op = GetOpByIndex(op_index); GE_CHECK_NOTNULL(op); - if (reinterpret_cast(task->GetDumpArgs()) != nullptr) { - bool call_dump 
= GetDumpProperties().IsLayerNeedDump(name_, om_name_, op->GetName()) && task->CallSaveDumpInfo(); + bool call_dump = OpNeedDump(op->GetName()) && task->CallSaveDumpInfo(); if (call_dump || is_op_debug_reg_) { SaveDumpTask(task->GetTaskID(), task->GetStreamId(), op, task->GetDumpArgs()); } @@ -3047,11 +3250,16 @@ Status DavinciModel::DistributeTask() { return SUCCESS; } -void DavinciModel::SetEndGraphId(uint32_t task_id, uint32_t stream_id) { +bool DavinciModel::ModelNeedDump() { auto all_dump_model = GetDumpProperties().GetAllDumpModel(); - bool findByOmName = all_dump_model.find(om_name_) != all_dump_model.end(); - bool findByModelName = all_dump_model.find(name_) != all_dump_model.end(); - if (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end() || findByOmName || findByModelName) { + bool ret = all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end() || + all_dump_model.find(dump_model_name_) != all_dump_model.end() || + all_dump_model.find(om_name_) != all_dump_model.end(); + return ret; +} + +void DavinciModel::SetEndGraphId(uint32_t task_id, uint32_t stream_id) { + if (ModelNeedDump()) { GELOGI("start save end_graph_info to dumper, task_id is %u, stream_id is %u", task_id, stream_id); data_dumper_.SaveEndGraphId(task_id, stream_id); } @@ -3177,6 +3385,9 @@ bool DavinciModel::CheckInputAndModelSize(const int64_t &input_size, const int64 } // The input and model input size can not be exactly equal because user input is not definite. 
if ((input_size + kDataMemAlignSizeCompare) < op_size) { + REPORT_INNER_ERROR("E19999", "input size:%ld from user add align:%u > input_op_size:%ld in model, model_id:%u, " + "check invalid", + input_size, kDataMemAlignSizeCompare, op_size, model_id_); GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input size [%ld] can not be smaller than op size [%ld] after 64-byte alignment", input_size, op_size); return false; @@ -3225,24 +3436,31 @@ Status DavinciModel::CopyModelData(const InputData &input_data, OutputData &outp /// Status DavinciModel::UpdateIoTaskArgs(const std::map &data_info, bool is_input, const vector &blobs, bool is_dynamic, const string &batch_label) { - string input_or_output; - is_input ? input_or_output = "input" : input_or_output = "output"; if (blobs.size() != data_info.size()) { + REPORT_INNER_ERROR("E19999", "is_input:%d blob size:%ld from user != op_size:%ld in model, mode_id:%u" + "check invalid", is_input, + blobs.size(), data_info.size(), model_id_); GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Verify %s data num failed: model requires %zu, but user actually feeds %zu", - input_or_output.c_str(), data_info.size(), blobs.size()); + is_input ? "input" : "output", data_info.size(), blobs.size()); return ACL_ERROR_GE_PARAM_INVALID; } for (const auto &data : data_info) { if (data.first >= blobs.size()) { // check data index. + REPORT_INNER_ERROR("E19999", "is_input:%d, data index:%u from model >= blobs.size:%zu from user, mode_id:%u" + "check invalid", is_input, + data.first, blobs.size(), model_id_); GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Verify %s data num failed: can not find No.%u data, because user only feeds %zu", - input_or_output.c_str(), data.first, blobs.size()); + is_input ? "input" : "output", data.first, blobs.size()); return ACL_ERROR_GE_PARAM_INVALID; } const DataBuffer &buffer = blobs[data.first]; // index of data. 
if (buffer.data == nullptr) { + REPORT_INNER_ERROR("E19999", "is_input:%d buffer from user is nullptr, index:%u, mode_id:%u" + "check invalid", is_input, + data.first, model_id_); GELOGE(ACL_ERROR_GE_PARAM_INVALID, "data_buf.data is nullptr, index=%u", data.first); return ACL_ERROR_GE_PARAM_INVALID; } @@ -3256,10 +3474,12 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map & void *basic_addr = data.second.GetBasicAddr(); uint64_t data_size = data.second.GetDataSize(); if (copy_only_addrs_.count(basic_addr) > 0) { - if (is_input) { + if (is_input && buffer.length > 0) { GELOGI("[IMAS] Find addr %p need direct copy from user malloc input %p", basic_addr, buffer.data); rtError_t rt_ret = rtMemcpy(basic_addr, data_size, buffer.data, buffer.length, RT_MEMCPY_DEVICE_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%lu, model_id:%u", + data_size, model_id_); GELOGE(rt_ret, "Non-zero copy data node copy failed"); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -3269,21 +3489,20 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map & } for (size_t count = 0; count < data.second.GetDataCount(); ++count) { - int64_t size = data.second.GetDataInfo().at(count).first; void *addr = data.second.GetDataInfo().at(count).second; void *buffer_addr = reinterpret_cast(reinterpret_cast(buffer.data) + data.second.GetRelativeOffset().at(count)); GELOGI("[ZCPY] Copy %s blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p, batch_label: %s", - input_or_output.c_str(), data.first, addr, size, buffer_addr, batch_label.c_str()); + is_input ? "input" : "output", data.first, addr, data.second.GetDataInfo().at(count).first, + buffer_addr, batch_label.c_str()); // For input data, just copy for rts task. 
- for (ZeroCopyTask &task : zero_copy_tasks_) { - if (task.GetBatchLabel() != kDefaultBatchLable && task.GetBatchLabel() != batch_label) { + for (auto &task : zero_copy_tasks_) { + bool not_same_batch = (task.GetBatchLabel() != kDefaultBatchLable && task.GetBatchLabel() != batch_label); + if (not_same_batch) { continue; } uintptr_t addr_val = reinterpret_cast(addr); - if (task.UpdateTaskParam(addr_val, buffer_addr) != SUCCESS) { - return ACL_ERROR_GE_PARAM_INVALID; - } + (void)task.UpdateTaskParam(addr_val, buffer_addr); } } } @@ -3342,11 +3561,18 @@ Status DavinciModel::InitConstant(const OpDescPtr &op_desc) { auto v_output_size = ModelUtils::GetOutputSize(op_desc); auto v_output_addr = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc); GE_IF_BOOL_EXEC(v_weights.empty() || v_output_size.empty() || v_output_addr.empty(), + REPORT_INNER_ERROR("E19999", "weight.size:%zu output_length.size:%zu output_addr.size:%zu in " + "op:%s(%s) has empty, model_id:%u, check invalid", + v_weights.size(),v_output_size.size(), v_output_addr.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str() ,model_id_); GELOGE(PARAM_INVALID, "const op:%s not set output", op_desc->GetName().c_str()); return PARAM_INVALID;); GeTensor *tensor = const_cast(v_weights[0].get()); GE_IF_BOOL_EXEC(static_cast(v_output_size[0]) < tensor->GetData().size(), + REPORT_INNER_ERROR("E19999", "Output size:%zu < weight size:%zu in op:%s(%s) model_id:%u, " + "check invalid", v_output_size[0], tensor->GetData().size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str() ,model_id_); GELOGE(PARAM_INVALID, "output size:%ld less than weight data size:%zu", v_output_size[0], tensor->GetData().size()); return PARAM_INVALID;); @@ -3396,6 +3622,8 @@ Status DavinciModel::InitTbeHandle(const OpDescPtr &op_desc) { auto kernel = ge_model_->GetTBEKernelStore().FindKernel(op_desc->GetName()); auto tbe_kernel = (kernel != nullptr) ? 
kernel : op_desc->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); if (tbe_kernel == nullptr) { + REPORT_INNER_ERROR("E19999", "Get tbe_kernel for op:%s(%s) fail, model_id:%u", + op_desc->GetName().c_str(), op_desc->GetType().c_str() ,model_id_); GELOGE(INTERNAL_ERROR, "TBE: %s can't find tvm bin file!", op_desc->GetName().c_str()); return INTERNAL_ERROR; } @@ -3422,6 +3650,9 @@ Status DavinciModel::InitTbeHandle(const OpDescPtr &op_desc) { } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AIVEC") { binary.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC; } else { + REPORT_INNER_ERROR("E19999", "Attr:%s value:%s in op:%s(%s), model_id:%u, check invalid", + TVM_ATTR_NAME_MAGIC.c_str(), json_string.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str() ,model_id_); GELOGE(PARAM_INVALID, "TBE: Invalid parameter magic number! json: %s", json_string.c_str()); return PARAM_INVALID; } @@ -3511,6 +3742,11 @@ Status DavinciModel::InitStreamSwitch(const OpDescPtr &op_desc) { GE_LOGI_IF(!ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), "GetInt ACTIVE_STREAM_LIST failed."); if (active_stream_list.size() != kTrueBranchStreamNum) { + REPORT_INNER_ERROR("E19999", "Attr:%s active_stream_list.size:%zu in op:%s(%s) != kTrueBranchStreamNum:%u, " + "model_id:%u, check invalid", + ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), active_stream_list.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + kTrueBranchStreamNum, model_id_); GELOGE(INTERNAL_ERROR, "Stream num of switch true branch must be %u.", kTrueBranchStreamNum); return INTERNAL_ERROR; } @@ -3525,6 +3761,9 @@ Status DavinciModel::InitStreamSwitch(const OpDescPtr &op_desc) { Status DavinciModel::InitStreamSwitchN(const OpDescPtr &op_desc) { std::vector active_stream_list; if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s from op:%s(%s) fail, model_id:%u", + 
ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), model_id_); GELOGE(INTERNAL_ERROR, "StreamSwitchNOp get attr ACTIVE_STREAM failed."); return INTERNAL_ERROR; } @@ -3536,6 +3775,9 @@ Status DavinciModel::InitStreamSwitchN(const OpDescPtr &op_desc) { uint32_t batch_num = 0; if (!AttrUtils::GetInt(op_desc, ATTR_NAME_BATCH_NUM, batch_num)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s from op:%s(%s) fail, model_id:%u", + ATTR_NAME_BATCH_NUM.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), model_id_); GELOGE(FAILED, "Failed to get attr ATTR_NAME_BATCH_NUM, StreamSwitchN: %s.", op_desc->GetName().c_str()); return FAILED; } @@ -3553,6 +3795,9 @@ Status DavinciModel::SetDynamicBatchInfo(const OpDescPtr &op_desc, uint32_t batc std::vector batch_shape; const std::string attr_name = ATTR_NAME_PRED_VALUE + "_" + std::to_string(i); if (!AttrUtils::GetListInt(op_desc, attr_name, batch_shape)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s from op:%s(%s) fail, model_id:%u", + attr_name.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), model_id_); GELOGE(FAILED, "Get attr ATTR_NAME_PRED_VALUE failed, Node: %s", op_desc->GetName().c_str()); batch_info_.clear(); return FAILED; @@ -3646,33 +3891,50 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa GE_CHK_STATUS_RET(InitModelStream(stream), "Init model stream failed."); is_dynamic_ = input_data.is_dynamic_batch; - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_PRE_PROC_START)); + bool profiling_model_execute_on = ProfilingManager::Instance().ProfilingModelExecuteOn(); + bool profiling_model_load_on = ProfilingManager::Instance().ProfilingModelLoadOn(); + GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_PRE_PROC_START)); Status ret = CopyModelData(input_data, output_data, is_dynamic_); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Copy input data 
to model failed. model id: %u", model_id_); GELOGD("current_data.index=%u", input_data.index); - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_PRE_PROC_END)); + GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_PRE_PROC_END)); if (!task_list_.empty()) { + uint64_t index_id = iterator_count_ + 1; + uint64_t model_id = static_cast(model_id_); + int32_t device_id = static_cast(device_id_); + // tag_id 0 means step begin, 1 meas step end. + if (profiling_model_load_on) { + GE_CHK_STATUS_RET_NOLOG( + ProfilingManager::Instance().ProfileStepInfo(index_id, model_id, 0, rt_model_stream_, device_id)); + } + GELOGD("rtModelExecute do"); - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_INFER_START)); + GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_INFER_START)); rtError_t rt_ret = rtModelExecute(rt_model_handle_, rt_model_stream_, 0); GE_CHK_RT_EXEC(rt_ret, return RT_ERROR_TO_GE_STATUS(rt_ret)); - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_INFER_END)); + GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_INFER_END)); GELOGD("rtModelExecute end"); + + if (profiling_model_load_on) { + GE_CHK_STATUS_RET_NOLOG( + ProfilingManager::Instance().ProfileStepInfo(index_id, model_id, 1, rt_model_stream_, device_id)); + } + iterator_count_++; } if (!is_async_mode_) { - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_AFTER_PROC_START)); + GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_START)); ret = CopyOutputData(input_data.index, output_data, RT_MEMCPY_DEVICE_TO_DEVICE); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_INTERNAL_ERROR, "Copy Output data to user failed."); - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_AFTER_PROC_END)); + 
GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_END)); } // report model time data - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), (void)SinkTimeProfile(input_data)); + GE_IF_BOOL_EXEC(profiling_model_execute_on, (void)SinkTimeProfile(input_data)); GELOGD("Model run end, model id:%u", model_id_); return SUCCESS; } @@ -3680,6 +3942,8 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa // Add active entry stream for special env. Status DavinciModel::AddHeadStream() { if (active_stream_list_.empty()) { + REPORT_INNER_ERROR("E19999", "active_stream_list is empty in model:%u, check invalid", + model_id_); GELOGE(INTERNAL_ERROR, "Active stream is empty, stream list size: %zu, stream indication size: %zu.", stream_list_.size(), active_stream_indication_.size()); return INTERNAL_ERROR; @@ -3699,6 +3963,8 @@ Status DavinciModel::AddHeadStream() { for (auto s : active_stream_list_) { std::shared_ptr active_entry = MakeShared(rt_head_stream_); if (active_entry == nullptr) { + REPORT_CALL_ERROR("E19999", "New CpuTaskActiveEntry failed, model_id:%u", + model_id_); GELOGE(MEMALLOC_FAILED, "Make CpuTaskActiveEntry task failed."); return MEMALLOC_FAILED; } @@ -3830,12 +4096,14 @@ Status DavinciModel::TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id) rtContext_t ctx = nullptr; rtError_t rt_ret = rtCtxGetCurrent(&ctx); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, model_id:%u", + model_id_); GELOGE(RT_FAILED, "Failed to get current context, error_code is: 0x%X.", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } std::vector variable_node_list; - for (ge::NodePtr &node : graph->GetDirectNode()) { + for (ge::NodePtr &node : graph->GetAllNodes()) { if (node == nullptr) { continue; } @@ -3851,7 +4119,10 @@ Status DavinciModel::TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id) } void DavinciModel::SetDataDumperArgs(const ComputeGraphPtr 
&graph, const map &variable_by_name) { - data_dumper_.SetModelName(name_); + if(dump_model_name_.empty()) { + dump_model_name_ = name_; + } + data_dumper_.SetModelName(dump_model_name_); data_dumper_.SetModelId(model_id_); data_dumper_.SetOmName(om_name_); data_dumper_.SetComputeGraph(graph); @@ -3860,6 +4131,7 @@ void DavinciModel::SetDataDumperArgs(const ComputeGraphPtr &graph, const map infos = ge::StringUtils::Split(input, ':'); if (infos.size() != kAippInfoNum) { + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), aipp input size:%zu != kAippInfoNum:%u, model_id:%u, " + "check invalid", ATTR_NAME_AIPP_INPUTS.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), infos.size(), kAippInfoNum, + model_id_); GELOGE(ACL_ERROR_GE_AIPP_MODE_INVALID, "origin input str is invalid[%zu, %u].", infos.size(), kAippInfoNum); return ACL_ERROR_GE_AIPP_MODE_INVALID; } @@ -3942,6 +4218,8 @@ Status DavinciModel::InitOrigInputInfo(uint32_t index, const OpDescPtr &op_desc) Status DavinciModel::GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) const { const auto it = orig_input_info_.find(index); if (it == orig_input_info_.end()) { + REPORT_INNER_ERROR("E19999", "Get index:%u from orig_input_info_ fail, model_id:%u", + index, model_id_); GELOGE(ACL_ERROR_GE_AIPP_NOT_EXIST, "there is not AIPP related with index %u.", index); return ACL_ERROR_GE_AIPP_NOT_EXIST; } @@ -3958,6 +4236,9 @@ void DavinciModel::ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_ GELOGI("ParseAIPPInfo: origin str: %s", in_out_info.c_str()); std::vector infos = ge::StringUtils::Split(in_out_info, ':'); if (infos.size() != kAippInfoNum) { + REPORT_INNER_ERROR("E19999", "in_out_info:%s size:%zu != kAippInfoNum:%u, model_id:%u, " + "check invalid", in_out_info.c_str(), infos.size(), kAippInfoNum, + model_id_); GELOGE(ACL_ERROR_GE_AIPP_MODE_INVALID, "origin input str is invalid[%zu, %u].", infos.size(), kAippInfoNum); return; } @@ -3976,7 +4257,7 @@ void 
DavinciModel::ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_ Status DavinciModel::InitAippInputOutputDims(uint32_t index, const OpDescPtr &op_desc) { if (!op_desc->HasAttr(ATTR_NAME_AIPP_INPUTS) || !op_desc->HasAttr(ATTR_NAME_AIPP_OUTPUTS)) { - GELOGI("there is not AIPP related with index %u.", index); + GELOGI("There is not AIPP related with index %u.", index); return SUCCESS; } @@ -3993,7 +4274,7 @@ Status DavinciModel::InitAippInputOutputDims(uint32_t index, const OpDescPtr &op ConstGeTensorDescPtr data_input_desc = op_desc->GetInputDescPtr(kDataIndex); int64_t data_input_size; (void)TensorUtils::GetSize(*(op_desc->GetInputDescPtr(kDataIndex)), data_input_size); - GELOGD("related Data[%d]: tensor_name: %s, dim_num: %zu, tensor_size: %zu, format: %s, data_type: %s, shape: %s.", + GELOGD("Related Data[%d]: tensor_name: %s, dim_num: %zu, tensor_size: %zu, format: %s, data_type: %s, shape: %s.", index, op_desc->GetName().c_str(), data_input_desc->GetShape().GetDimNum(), data_input_size, TypeUtils::FormatToSerialString(data_input_desc->GetFormat()).c_str(), TypeUtils::DataTypeToSerialString(data_input_desc->GetDataType()).c_str(), @@ -4020,6 +4301,8 @@ Status DavinciModel::GetAllAippInputOutputDims(uint32_t index, vector &output_dims) const { const auto it = aipp_dims_info_.find(index); if (it == aipp_dims_info_.end()) { + REPORT_INNER_ERROR("E19999", "Get index:%u from aipp_dims_info_ fail, model_id:%u", + index, model_id_); GELOGE(ACL_ERROR_GE_AIPP_NOT_EXIST, "there is not AIPP related with index %u.", index); return ACL_ERROR_GE_AIPP_NOT_EXIST; } @@ -4040,7 +4323,7 @@ int64_t DavinciModel::GetFixedAddrsSize(string tensor_name) { Status DavinciModel::InitL1DataDumperArgs() { auto all_dump_model = GetDumpProperties().GetAllDumpModel(); bool find_by_om_name = all_dump_model.find(om_name_) != all_dump_model.end(); - bool find_by_model_name = all_dump_model.find(name_) != all_dump_model.end(); + bool find_by_model_name = 
all_dump_model.find(dump_model_name_) != all_dump_model.end(); bool dump_l1fusion_op = (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end()) || find_by_om_name || find_by_model_name; if (dump_l1fusion_op) { @@ -4051,6 +4334,8 @@ Status DavinciModel::InitL1DataDumperArgs() { if (rtDumpAddrSet(rt_model_handle_, l1_fusion_addr_, kDumpL1FusionOpMByteSize, kDumpFlagOfL1Fusion) != RT_ERROR_NONE) { // l1_fusion_addr_ will be free when DavinciModel destruct + REPORT_CALL_ERROR("E19999", "Call rtDumpAddrSet failed, model_id:%u", + model_id_); GELOGE(FAILED, "Call rtDumpAddrSet failed"); return FAILED; } @@ -4061,4 +4346,43 @@ Status DavinciModel::InitL1DataDumperArgs() { return SUCCESS; } +Status DavinciModel::SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { + auto listener = dynamic_cast(listener_.get()); + GE_CHECK_NOTNULL(listener); + listener->SetCallback(callback); + return SUCCESS; +} + +void DavinciModel::UpdateOpIOAddrs(uint32_t task_id, uint32_t stream_id, const std::vector &io_addrs) { + if (fixed_mem_base_ == reinterpret_cast(mem_base_)) { + GELOGD("[Update][OpIOAddrs] No need to update op input output addr."); + return; + } + + OpDescInfo *op_desc_info = exception_dumper_.MutableOpDescInfo(task_id, stream_id); + if (op_desc_info == nullptr) { + GELOGW("[Update][OpIOAddrs] Find op desc failed, task_id: %u, stream_id: %u.", task_id, stream_id); + return; + } + size_t input_size = op_desc_info->input_addrs.size(); + size_t output_size = op_desc_info->output_addrs.size(); + if (input_size + output_size != io_addrs.size()) { + GELOGW("[Update][OpIOAddrs] Op[%s] input size[%zu] and output size[%zu] is not equal to io addr size[%zu]", + op_desc_info->op_name.c_str(), input_size, output_size, io_addrs.size()); + return; + } + + vector input_addrs; + vector output_addrs; + for (size_t i = 0; i < io_addrs.size(); i++) { + if (i < input_size) { + input_addrs.emplace_back(GetRunAddress(io_addrs[i])); + } else { + 
output_addrs.emplace_back(GetRunAddress(io_addrs[i])); + } + } + op_desc_info->input_addrs = input_addrs; + op_desc_info->output_addrs = output_addrs; + GELOGD("[Update][OpIOAddrs] Op [%s] update input output addr success.", op_desc_info->op_name.c_str()); +} } // namespace ge diff --git a/ge/graph/load/model_manager/davinci_model.h b/ge/graph/load/model_manager/davinci_model.h index 70c0f687..ac6169ad 100755 --- a/ge/graph/load/model_manager/davinci_model.h +++ b/ge/graph/load/model_manager/davinci_model.h @@ -29,6 +29,7 @@ #include "common/helper/om_file_helper.h" #include "common/opskernel/ge_task_info.h" #include "common/properties_manager.h" +#include "common/dump/exception_dumper.h" #include "common/dump/opdebug_register.h" #include "common/types.h" #include "framework/common/util.h" @@ -221,6 +222,11 @@ class DavinciModel { /// DataInputer *const GetDataInputer() const { return data_inputer_; } + uint32_t GetDataInputerSize() { + GE_CHECK_NOTNULL(data_inputer_); + return data_inputer_->Size(); + } + // get Stream number uint32_t StreamNum() const { return runtime_param_.stream_num; } @@ -248,7 +254,10 @@ class DavinciModel { string Name() const { return name_; } // om_name - string OmName() const { return om_name_; } + const string &OmName() const { return om_name_; } + + // dump_model_name + const string &DumpModelName() const { return dump_model_name_; } // version uint32_t Version() const { return version_; } @@ -273,6 +282,8 @@ class DavinciModel { const vector &GetLabelList() const { return label_list_; } + Status GetLabelGotoAddr(uint32_t label_index, rtMemType_t memory_type, void *&addr, uint32_t &size); + Status DestroyThread(); // get Op @@ -466,13 +477,17 @@ class DavinciModel { Status ReportProfilingData(); void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) { - data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id); + exception_dumper_.SaveDumpOpInfo(model_param, op, task_id, 
stream_id); } void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const shared_ptr &op_desc, uintptr_t args) { data_dumper_.SaveDumpTask(task_id, stream_id, op_desc, args); } + Status DumpExceptionInfo(const std::vector &exception_infos) const { + return exception_dumper_.DumpExceptionInfo(exception_infos); + } + void SetKnownShapeGlobalStep(void *global_step) { known_shape_global_step_ = global_step; } @@ -481,6 +496,12 @@ class DavinciModel { data_dumper_.DumpShrink(); } + bool OpNeedDump(const string &op_name) { + return GetDumpProperties().IsLayerNeedDump(dump_model_name_, om_name_, op_name); + } + + bool ModelNeedDump(); + void SetEndGraphId(uint32_t task_id, uint32_t stream_id); DavinciModel &operator=(const DavinciModel &model) = delete; @@ -529,10 +550,10 @@ class DavinciModel { void SetKnownNode(bool known_node) { known_node_ = known_node; } bool IsKnownNode() { return known_node_; } Status MallocKnownArgs(); + Status CheckCapability(rtFeatureType_t featureType, int32_t featureInfo, bool &is_support) const; Status UpdateKnownNodeArgs(const vector &inputs, const vector &outputs); Status CreateKnownZeroCopyMap(const vector &inputs, const vector &outputs); Status UpdateKnownZeroCopyAddr(vector &total_io_addrs, bool update_args = true); - void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; } Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) const; Status GetAllAippInputOutputDims(uint32_t index, vector &input_dims, @@ -540,13 +561,19 @@ class DavinciModel { // om file name void SetOmName(const string &om_name) { om_name_ = om_name; } + void SetDumpModelName(const string &dump_model_name) { dump_model_name_ = dump_model_name; } void SetDumpProperties(const DumpProperties &dump_properties) { data_dumper_.SetDumpProperties(dump_properties); } const DumpProperties &GetDumpProperties() const { return data_dumper_.GetDumpProperties(); } bool GetOpDescInfo(uint32_t stream_id, 
uint32_t task_id, OpDescInfo &op_desc_info) const { - return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info); + return exception_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info); } + void UpdateOpIOAddrs(uint32_t task_id, uint32_t stream_id, const std::vector &io_addrs); + + bool GetRunningFlag() const { return running_flg_; } + void SetRunningFlag(bool flag) { running_flg_ = flag; } + Status SetRunAsyncListenerCallback(const RunAsyncCallback &callback); private: // memory address of weights @@ -886,6 +913,7 @@ class DavinciModel { // used for inference data dump string om_name_; + string dump_model_name_; uint32_t version_; GeModelPtr ge_model_; // release after DavinciModel::Init @@ -911,6 +939,8 @@ class DavinciModel { shared_ptr listener_; bool run_flg_; + // check whether model is running with data + bool running_flg_ = false; mutex mux_run_flg_; @@ -930,6 +960,9 @@ class DavinciModel { vector label_list_; set label_id_indication_; + mutex label_args_mutex_; + map> label_goto_args_; + mutex outside_addrs_mutex_; vector zero_copy_tasks_; // Task used Data or NetOutput addr. set copy_only_addrs_; // Address need copy to original place. 
@@ -985,6 +1018,7 @@ class DavinciModel { int64_t maxDumpOpNum_; // for data dump DataDumper data_dumper_; + ExceptionDumper exception_dumper_; OpdebugRegister opdebug_register_; uint64_t iterator_count_; bool is_l1_fusion_enable_; @@ -1002,8 +1036,6 @@ class DavinciModel { map known_input_data_info_; map known_output_data_info_; vector total_io_addrs_; - vector orig_total_io_addrs_; - bool base_addr_not_changed_ = false; vector> batch_info_; vector> combined_batch_info_; diff --git a/ge/graph/load/model_manager/model_manager.cc b/ge/graph/load/model_manager/model_manager.cc index e46bef88..6114467c 100755 --- a/ge/graph/load/model_manager/model_manager.cc +++ b/ge/graph/load/model_manager/model_manager.cc @@ -99,11 +99,17 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u auto kernel_size = sizeof(uint64_t) * (v_aicpu_kernel.size()); rtError_t rt_ret = rtMalloc(&aicpu_kernel_addr, kernel_size, RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret: 0x%X", + kernel_size, rt_ret); + GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtMemcpy(aicpu_kernel_addr, kernel_size, v_aicpu_kernel.data(), kernel_size, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", + kernel_size, rt_ret); + GELOGE(RT_FAILED, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret); GE_CHK_RT(rtFree(aicpu_kernel_addr)); return RT_ERROR_TO_GE_STATUS(rt_ret);) uint64_t kernel_id_addr = static_cast(reinterpret_cast(aicpu_kernel_addr)); param_base.fwkKernelBase.fwk_kernel.kernelID = kernel_id_addr; @@ -114,6 +120,8 @@ Status 
ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u rtError_t rt_ret = rtMalloc(&(devicebase), sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret: 0x%X", + sizeof(STR_FWK_OP_KERNEL), rt_ret); GELOGE(RT_FAILED, "malloc device memory failed. ret: 0x%X", rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); return RT_ERROR_TO_GE_STATUS(rt_ret); @@ -122,6 +130,8 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u rt_ret = rtMemcpy(devicebase, sizeof(STR_FWK_OP_KERNEL), ¶m_base, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", + sizeof(STR_FWK_OP_KERNEL), rt_ret); GELOGE(RT_FAILED, "memory copy to device failed. ret: 0x%X", rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); GE_CHK_RT(rtFree(devicebase)); @@ -131,6 +141,7 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u rtStream_t stream = nullptr; rt_ret = rtStreamCreate(&stream, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamCreate failed, ret: 0x%X", rt_ret); GELOGE(RT_FAILED, "create stream failed. ret: 0x%X", rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); GE_CHK_RT(rtFree(devicebase)); @@ -139,6 +150,7 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u rt_ret = rtKernelLaunchEx(devicebase, sizeof(STR_FWK_OP_KERNEL), 0, stream); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchEx failed, ret: 0x%X", rt_ret); GELOGE(RT_FAILED, "rtKernelLaunchEx failed. 
ret: 0x%X", rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); GE_CHK_RT(rtFree(devicebase)); @@ -147,6 +159,8 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u } rt_ret = rtStreamSynchronize(stream); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize failed, ret: 0x%X", + rt_ret); GELOGE(RT_FAILED, "rtStreamSynchronize failed. ret: 0x%X", rt_ret); GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr))); GE_CHK_RT(rtFree(devicebase)); @@ -156,6 +170,7 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u if (aicpu_kernel_addr != nullptr) { rt_ret = rtFree(aicpu_kernel_addr); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFree failed, ret: 0x%X", rt_ret); GELOGE(RT_FAILED, "free memory failed. ret: 0x%X", rt_ret); GE_CHK_RT(rtFree(devicebase)); GE_CHK_RT(rtStreamDestroy(stream)); @@ -164,12 +179,14 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u } rt_ret = rtFree(devicebase); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFree failed, ret: 0x%X", rt_ret); GELOGE(RT_FAILED, "free memory failed. ret: 0x%X", rt_ret); GE_CHK_RT(rtStreamDestroy(stream)); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtStreamDestroy(stream); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamDestroy failed, ret: 0x%X", rt_ret); GELOGE(RT_FAILED, "rtStreamDestroy failed. 
ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -216,6 +233,8 @@ ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) { auto it = model_map_.find(model_id); if (it == model_map_.end()) { + REPORT_INNER_ERROR("E19999", "Param model_id:%u can't find in model_map, check invalid", + model_id); GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id); return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID; } @@ -233,6 +252,8 @@ ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_ Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY, session_id, model_id, sub_model_id); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call KernelLaunchEx fail, model_id:%u, sub_model_id:%u, session_id:%lu", + model_id, sub_model_id, session_id); GELOGE(FAILED, "Destroy aicpu kernel failed."); return FAILED; } @@ -259,6 +280,7 @@ ModelManager::~ModelManager() { model_map_.clear(); model_aicpu_kernel_.clear(); cust_aicpu_so_.clear(); + dump_exception_flag_ = false; GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0))); } @@ -271,7 +293,7 @@ ge::Status ModelManager::SetDynamicSize(uint32_t model_id, const std::vector &ge_root_model, const shared_ptr &listener) { auto hybrid_model = hybrid::HybridDavinciModel::Create(ge_root_model); @@ -279,13 +301,26 @@ ge::Status ModelManager::DoLoadHybridModelOnline(uint32_t model_id, const string hybrid_model->SetListener(listener); hybrid_model->SetModelId(model_id); hybrid_model->SetDeviceId(GetContext().DeviceId()); - hybrid_model->SetModelName(model_name); + hybrid_model->SetOmName(om_name); GE_CHK_STATUS_RET(hybrid_model->Init(), "Failed to init hybrid model. 
model_id = %u", model_id); auto shared_model = std::shared_ptr(hybrid_model.release()); InsertModel(model_id, shared_model); return SUCCESS; } +bool ModelManager::IsNeedHybridLoad(ge::GeRootModel &ge_root_model) { + auto root_graph = ge_root_model.GetRootGraph(); + if (root_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "root graph in param ge_root_model is nullptr, model_id:%u, " + "check invalid", ge_root_model.GetModelId()); + GELOGE(FAILED, "no model on root model"); + return false; + } + bool is_shape_unknown = root_graph->GetGraphUnknownFlag(); + bool is_dsp_partitioned_graph = false; + (void)AttrUtils::GetBool(root_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dsp_partitioned_graph); + return is_shape_unknown || is_dsp_partitioned_graph || GetContext().GetHostExecFlag(); +} /// /// @ingroup domi_ome /// @brief load model online @@ -296,19 +331,18 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptrCheckIsUnknownShape(is_shape_unknown), "CheckIsUnknownShape failed, model id:%u", - model_id); - if (is_shape_unknown || GetContext().GetHostExecFlag()) { - return DoLoadHybridModelOnline(model_id, model_name, ge_root_model, listener); + auto name_to_model = ge_root_model->GetSubgraphInstanceNameToModel(); + string om_name; + if (IsNeedHybridLoad(*ge_root_model)) { + return DoLoadHybridModelOnline(model_id, om_name, ge_root_model, listener); } mmTimespec timespec = mmGetTickCount(); std::shared_ptr davinci_model = MakeShared(0, listener); if (davinci_model == nullptr) { + REPORT_CALL_ERROR("E19999", "New DavinciModel fail, model_id:%u", model_id); GELOGE(FAILED, "davinci_model is nullptr"); return FAILED; } @@ -324,7 +358,6 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptrGetRootGraph(); GE_CHECK_NOTNULL(root_graph); string root_model_name = root_graph->GetName(); - auto name_to_model = ge_root_model->GetSubgraphInstanceNameToModel(); GeModelPtr ge_model = name_to_model[root_model_name]; Status ret = SUCCESS; 
do { @@ -332,7 +365,18 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptrAssign(ge_model)), GELOGW("assign model to modeldef failed."); break;); GE_TIMESTAMP_END(Assign, "GraphLoader::ModelAssign"); - + /// In multi-threaded inference, using the same session_id among multiple threads may cause some threads to fail. + /// These session_ids come from the same model, so the values of session_id are the same. + /// Update session_id for infer in load model to avoid the same session_id. + if (!ge_root_model->GetTrainFlag()) { + uint64_t new_session_id; + ret = GenSessionId(new_session_id); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Generate session_id for infer failed."); + ret = davinci_model->UpdateSessionId(new_session_id); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Update session_id for infer failed."); + ge_model->InsertSessionMap(model_id, new_session_id); + GELOGD("Update new session id: %lu.", new_session_id); + } GE_TIMESTAMP_START(Init); GE_IF_BOOL_EXEC(SUCCESS != (ret = davinci_model->Init()), GELOGW("DavinciInit failed."); break;); GE_TIMESTAMP_END(Init, "GraphLoader::ModelInit"); @@ -345,16 +389,16 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr &davinci_model) { - GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "davinci_model ptr is null, id: %u", id); +void ModelManager::InsertModel(uint32_t model_id, std::shared_ptr &davinci_model) { + GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "davinci_model ptr is null, id: %u", model_id); std::lock_guard lock(map_mutex_); - model_map_[id] = davinci_model; + model_map_[model_id] = davinci_model; } -void ModelManager::InsertModel(uint32_t id, shared_ptr &hybrid_model) { - GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, id: %u", id); +void ModelManager::InsertModel(uint32_t model_id, shared_ptr &hybrid_model) { + GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, 
id: %u", model_id); std::lock_guard lock(map_mutex_); - hybrid_model_map_[id] = hybrid_model; + hybrid_model_map_[model_id] = hybrid_model; } Status ModelManager::DeleteModel(uint32_t id) { @@ -374,6 +418,8 @@ Status ModelManager::DeleteModel(uint32_t id) { } else if (hybrid_model_it != hybrid_model_map_.end()) { (void)hybrid_model_map_.erase(hybrid_model_it); } else { + REPORT_INNER_ERROR("E19999", "model_id:%u not exist in model_map, check invalid", + id); GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", id); return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID; } @@ -420,6 +466,7 @@ Status ModelManager::DataInput(const InputData &input_data, OutputData &output_d Status status = data_wrap->Init(input_data, output_data); if (status != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Init InputDataWrapper failed, input data index: %u", input_data.index); GELOGE(domi::PUSH_DATA_FAILED, "Init InputDataWrapper failed, input data index: %u.", input_data.index); return domi::PUSH_DATA_FAILED; } @@ -436,6 +483,7 @@ Status ModelManager::DataInput(const InputData &input_data, OutputData &output_d DataInputer *inputer = model->GetDataInputer(); GE_CHECK_NOTNULL(inputer); if (inputer->Push(data_wrap) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "DataInputer queue is full, please call again later, model_id %u", model_id); GELOGE(domi::DATA_QUEUE_ISFULL, "Data queue is full, please call again later, model_id %u ", model_id); return domi::DATA_QUEUE_ISFULL; } @@ -449,6 +497,9 @@ Status ModelManager::GetCurDynamicDims(const vector> &user_real_ vector &cur_dynamic_dims) { GELOGD("Start get cur dynamic dims."); if (user_real_input_dims.size() != user_input_dims.size()) { + REPORT_INNER_ERROR("E19999", "Param user_real_input_dims.size:%zu != user_input_dims.size:%zu, " + "check invalid", + user_real_input_dims.size(), user_input_dims.size()); GELOGE(INTERNAL_ERROR, "The input count of user: %zu should be equal to the data count of graph: %zu", user_real_input_dims.size(), 
user_input_dims.size()); @@ -457,6 +508,9 @@ Status ModelManager::GetCurDynamicDims(const vector> &user_real_ for (size_t i = 0; i < user_input_dims.size(); ++i) { if (user_real_input_dims[i].size() != user_input_dims[i].second.size()) { + REPORT_INNER_ERROR("E19999", "Param user_real_input_dims[%zu].size:%zu != user_input_dims[%zu].size:%zu, " + "check invalid", i, user_real_input_dims[i].size(), + i, user_input_dims[i].second.size()); GELOGE(INTERNAL_ERROR, "The shape size: %zu of dynamic input: %s should be equal to the shape size of input shape: %zu.", user_real_input_dims[i].size(), user_input_dims[i].first.c_str(), user_input_dims[i].second.size()); @@ -478,6 +532,8 @@ Status ModelManager::GetCurDynamicDims(const vector> &user_real_ } } if (!cur_dynamic_dims_valid) { + REPORT_INNER_ERROR("E19999", "cur dynamic dims is %s, not exist in options, check invalid", + formats::JoinToString(cur_dynamic_dims).c_str()); GELOGE(INTERNAL_ERROR, "Cur dynamic dims is %s, not exist in options.", formats::JoinToString(cur_dynamic_dims).c_str()); return INTERNAL_ERROR; @@ -629,6 +685,8 @@ Status ModelManager::HandleCommand(const Command &command) { auto iter = cmds.find(command.cmd_type); if (iter == cmds.end()) { + REPORT_INNER_ERROR("E19999", "Unsupported command:%s check", + command.cmd_type.c_str()); GELOGE(PARAM_INVALID, "Unsupported command: %s", command.cmd_type.c_str()); return PARAM_INVALID; } else { @@ -639,6 +697,9 @@ Status ModelManager::HandleCommand(const Command &command) { Status ModelManager::GetModelByCmd(const Command &command, std::shared_ptr &davinci_model) { if (command.cmd_params.size() < kCmdParSize) { + REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu < kCmdParSize:%u, command_type:%s, " + "check invalid", command.cmd_params.size(), kCmdParSize, + command.cmd_type.c_str()); GELOGE(PARAM_INVALID, "When the cmd_type is '%s', the size of cmd_params must larger than 2.", command.cmd_type.c_str()); return PARAM_INVALID; @@ -651,12 +712,18 @@ 
Status ModelManager::GetModelByCmd(const Command &command, try { model_id = std::stoi(value); } catch (std::invalid_argument &) { + REPORT_INNER_ERROR("E19999", "%s param:%s, check invalid", PROFILE_MODEL_ID.c_str(), + value.c_str()); GELOGE(PARAM_INVALID, "Model id: %s is invalid.", value.c_str()); return PARAM_INVALID; } catch (std::out_of_range &) { + REPORT_INNER_ERROR("E19999", "%s param:%s, check out of range", PROFILE_MODEL_ID.c_str(), + value.c_str()); GELOGE(PARAM_INVALID, "Model id: %s is out of range.", value.c_str()); return PARAM_INVALID; } catch (...) { + REPORT_INNER_ERROR("E19999", "%s param:%s, check cannot change to int", + PROFILE_MODEL_ID.c_str(), value.c_str()); GELOGE(FAILED, "Model id: %s cannot change to int.", value.c_str()); return FAILED; } @@ -665,10 +732,14 @@ Status ModelManager::GetModelByCmd(const Command &command, GE_CHECK_NOTNULL(model_manager); davinci_model = model_manager->GetModel(static_cast(model_id)); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "GetModel from model_manager fail, model_id:%u", + model_id); GELOGE(FAILED, "Model id: %d is invaild or model is not loaded.", model_id); return FAILED; } } else { + REPORT_INNER_ERROR("E19999", "First cmd_param not %s, check invalid", + PROFILE_MODEL_ID.c_str()); GELOGE(FAILED, "The model_id parameter is not found in the command."); return FAILED; } @@ -732,10 +803,14 @@ Status ModelManager::HandleProfFinalizeCommand(const Command &command) { */ Status ModelManager::HandleProfStartCommand(const Command &command) { if (command.cmd_params.size() < kProfStartCmdParaSize) { + REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu < %zu, check invalid", + command.cmd_params.size(), kProfStartCmdParaSize); GELOGE(PARAM_INVALID, "When the cmd_type is 'profile start', the size of cmd_params must larger than 2."); return PARAM_INVALID; } if (command.cmd_params.size() > kProfCmdParaMaxSize) { + REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu > %zu, check 
invalid", + command.cmd_params.size(), kProfCmdParaMaxSize); GELOGE(PARAM_INVALID, "Command para size[%zu] larger than max[1000].", command.cmd_params.size()); return PARAM_INVALID; } @@ -758,10 +833,14 @@ Status ModelManager::HandleProfStartCommand(const Command &command) { Status ModelManager::HandleProfStopCommand(const Command &command) { if (command.cmd_params.size() < kProfStartCmdParaSize) { + REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu < %zu, check invalid", + command.cmd_params.size(), kProfStartCmdParaSize); GELOGE(PARAM_INVALID, "When the cmd_type is 'profile stop', the size of cmd_params must larger than 2."); return PARAM_INVALID; } if (command.cmd_params.size() > kProfCmdParaMaxSize) { + REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu > %zu, check invalid", + command.cmd_params.size(), kProfCmdParaMaxSize); GELOGE(PARAM_INVALID, "Command para size[%zu] larger than max[1000].", command.cmd_params.size()); return PARAM_INVALID; } @@ -787,6 +866,8 @@ static Status ParserPara(const Command &command, const string &dump_key, string if (iter != command.cmd_params.end()) { ++iter; if (iter == command.cmd_params.end()) { + REPORT_INNER_ERROR("E19999", "dump_key:%s can't find in command.param, check invalid", + dump_key.c_str()); GELOGE(PARAM_INVALID, "Invalid access."); return PARAM_INVALID; } @@ -797,6 +878,8 @@ static Status ParserPara(const Command &command, const string &dump_key, string Status ModelManager::HandleDumpCommand(const Command &command) { if (command.cmd_params.size() % kDumpCmdPairSize != 0) { + REPORT_INNER_ERROR("E19999", "command.cmd_params.size:%zu MOD 2 != 0, check invalid", + command.cmd_params.size()); GELOGE(PARAM_INVALID, "When the cmd_type is 'dump', the size of cmd_params must be a even number."); return PARAM_INVALID; } @@ -1013,6 +1096,7 @@ Status ModelManager::GenSessionId(uint64_t &session_id) { mmTimeval tv; if (mmGetTimeOfDay(&tv, nullptr) != 0) { + REPORT_CALL_ERROR("E19999", "Call mmGetTimeOfDay 
fail"); GELOGE(INTERNAL_ERROR, "Failed to get current time."); return INTERNAL_ERROR; } @@ -1057,6 +1141,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model GeModelPtr ge_model = model_helper.GetGeModel(); shared_ptr davinci_model = MakeShared(model.priority, listener); if (davinci_model == nullptr) { + REPORT_CALL_ERROR("E19999", "New DavinciModel fail"); GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Make shared failed"); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -1072,6 +1157,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model int32_t device_id = 0; rtError_t rt_ret = rtGetDevice(&device_id); if (rt_ret != RT_ERROR_NONE || device_id < 0) { + REPORT_CALL_ERROR("E19999", "Call rtGetDevice failed, ret = 0x%X", rt_ret); GELOGE(rt_ret, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1130,6 +1216,7 @@ Status ModelManager::LoadModelWithQ(uint32_t &model_id, const ModelData &model_d shared_ptr davinci_model = MakeShared(model_data.priority, nullptr); if (davinci_model == nullptr) { + REPORT_CALL_ERROR("E19999", "New DavinciModel fail"); GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "create model failed."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -1250,6 +1337,8 @@ Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_ rtContext_t rt_cur_ctx = nullptr; auto rt_error = rtCtxGetCurrent(&rt_cur_ctx); if (rt_error != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, ret = 0x%X", + rt_error); GELOGE(RT_FAILED, "get current context failed, runtime result is %d", static_cast(rt_error)); return RT_FAILED; } @@ -1285,6 +1374,8 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { rtContext_t rt_cur_ctx = nullptr; auto rt_error = rtCtxGetCurrent(&rt_cur_ctx); if (rt_error != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, ret = 0x%X", + rt_error); 
GELOGE(RT_FAILED, "get current context failed, runtime result is %d", static_cast(rt_error)); return RT_FAILED; } @@ -1310,12 +1401,16 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { status = rtMalloc(&d_aicpu_data, aicpu_data_length, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret = 0x%X", + aicpu_data_length, status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } allocated_mem.push_back(d_aicpu_data); status = rtMalloc(&d_so_name, so_name.size(), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", + so_name.size(), status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1338,6 +1433,8 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { uint32_t args_size = sizeof(CustAicpuSoBuf) * v_cust_so.size(); status = rtMalloc(&args, args_size, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", + args_size, status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1352,6 +1449,8 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { uint32_t batch_args_size = sizeof(BatchLoadOpFromBufArgs); status = rtMalloc(&batch_args, batch_args_size, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", + batch_args_size, status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1364,6 +1463,8 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) { status = rtStreamSynchronize(stream); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize fail, ret = 0x%X", + status); 
GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); GE_CHK_RT(rtStreamDestroy(stream)); return RT_ERROR_TO_GE_STATUS(status); } @@ -1408,6 +1509,7 @@ Status ModelManager::GetModelMemAndWeightSize(const ModelData &model, size_t &me auto partition_table = reinterpret_cast(model_data); if (partition_table->num == 1) { + REPORT_INNER_ERROR("E19999", "partition_table num in model_data is 1, check invalid"); GELOGE(ACL_ERROR_GE_PARAM_INVALID, "om model is error,please use executable om model"); return ACL_ERROR_GE_PARAM_INVALID; } @@ -1474,6 +1576,7 @@ ge::Status ModelManager::SyncExecuteModel(uint32_t model_id, const vector &outputs) { auto model = GetHybridModel(model_id); if (model == nullptr) { + REPORT_INNER_ERROR("E19999", "Hybrid model not found, model_id:%u, check invalid", model_id); GELOGE(FAILED, "Hybrid model not found. model id = %u.", model_id); return FAILED; } @@ -1485,9 +1588,21 @@ Status ModelManager::GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint3 for (const auto &model : model_map_) { auto davinci_model = model.second; if (davinci_model->GetDeviceId() == device_id) { - GELOGI("Start to GetOpDescInfo of device_id: %u.", device_id); + GELOGI("[Get][OpDescInfo] Start to GetOpDescInfo of device_id: %u in davinci model.", device_id); if (davinci_model->GetOpDescInfo(stream_id, task_id, op_desc_info)) { - GELOGI("Find specific node of stream_id: %u, task_id: %u.", stream_id, task_id); + GELOGI("[Get][OpDescInfo] Find specific node of stream_id: %u, task_id: %u in davinci model.", + stream_id, task_id); + return SUCCESS; + } + } + } + for (const auto &model : hybrid_model_map_) { + auto hybrid_model = model.second; + if (hybrid_model->GetDeviceId() == device_id) { + GELOGI("[Get][OpDescInfo] Start to GetOpDescInfo of device_id: %u in hybrid model.", device_id); + if (hybrid_model->GetOpDescInfo(stream_id, task_id, op_desc_info)) { + GELOGI("[Get][OpDescInfo] Find specific node of stream_id: %u, task_id: %u in hybrid model.", + stream_id, task_id); + return SUCCESS; + } + } + } @@ 
-1500,8 +1615,11 @@ Status ModelManager::EnableExceptionDump(const std::map &options if (iter != options.end()) { GELOGI("Find option enable_exeception_dump is %s", iter->second.c_str()); if (iter->second == "1") { + dump_exception_flag_ = true; rtError_t rt_ret = rtSetTaskFailCallback(reinterpret_cast(ExceptionCallback)); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtSetTaskFailCallback fail, ret = 0x%X", + rt_ret); GELOGE(RT_FAILED, "rtSetTaskFailCallback failed"); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1549,6 +1667,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op // malloc sysOpInfoList in SysOpCheckInfo status = rtMalloc(&d_req_op_list, op_nums * sizeof(SysOpInfo), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", + op_nums * sizeof(SysOpInfo), status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1557,6 +1677,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op // malloc sysOpInfoList in SysOpCheckResp status = rtMalloc(&d_res_op_list, op_nums * sizeof(SysOpInfo), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", + op_nums * sizeof(SysOpInfo), status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1565,6 +1687,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op // malloc returnCodeList in SysOpCheckResp status = rtMalloc(&d_ret_code_list, op_nums * sizeof(ReturnCode), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%zu, ret = 0x%X", + op_nums * sizeof(ReturnCode), status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1576,6 +1700,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op void 
*d_op_type_name = nullptr; status = rtMalloc(&d_op_type_name, op_type.length(), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%lu, ret = 0x%X", + op_type.length(), status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1593,6 +1719,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op void *d_op_type_name = nullptr; status = rtMalloc(&d_op_type_name, op_type.size(), RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%lu, ret = 0x%X", + op_type.length(), status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1621,6 +1749,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op uint32_t args_size = sizeof(SysOpCheckInfo) + sizeof(SysOpCheckResp); status = rtMalloc(&args, args_size, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, size:%u, ret = 0x%X", + args_size, status); GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status); return RT_ERROR_TO_GE_STATUS(status); } @@ -1636,6 +1766,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op status = rtStreamSynchronize(stream); if (status != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize fail, ret = 0x%X", + status); GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status); GE_CHK_RT(rtStreamDestroy(stream)); return RT_ERROR_TO_GE_STATUS(status); @@ -1668,6 +1800,9 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op reinterpret_cast(static_cast(op_check_info_res.sysOpInfoList)), sizeof(SysOpInfo) * res_op_nums, RT_MEMCPY_DEVICE_TO_HOST)); if (res_ret_code_list.size() != res_aicpu_op_info_list.size() || res_ret_code_list.size() != res_op_nums) { + REPORT_INNER_ERROR("E19999", "res_ret_code_list.size:%zu res_aicpu_op_info_list.size:%zu 
res_op_nums:%lu " + "not equal, check invalid", + res_ret_code_list.size(), res_aicpu_op_info_list.size(), res_op_nums); GELOGE(FAILED, "Number of retcode is not equal to number of op type."); GE_CHK_RT(rtStreamDestroy(stream)); return FAILED; @@ -1691,6 +1826,8 @@ Status ModelManager::LaunchKernelCheckAicpuOp(std::vector &aicpu_op "<0: op_type, 1: format, 2: datatype> \n"; } fail_reason += "not support."; + REPORT_INNER_ERROR("E19999", "Check aicpu op_type failed, details:%s", + fail_reason.c_str()); GELOGE(FAILED, "Check aicpu op_type failed. details: %s", fail_reason.c_str()); GE_CHK_RT(rtStreamDestroy(stream)); return FAILED; diff --git a/ge/graph/load/model_manager/model_manager.h b/ge/graph/load/model_manager/model_manager.h index f2d55db7..bf804d32 100755 --- a/ge/graph/load/model_manager/model_manager.h +++ b/ge/graph/load/model_manager/model_manager.h @@ -294,6 +294,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::vector &output_dims); bool IsDynamicShape(uint32_t model_id); + bool IsNeedHybridLoad(ge::GeRootModel &ge_root_model); ge::Status GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info); ge::Status EnableExceptionDump(const std::map &options); @@ -312,6 +313,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { instance->AddExceptionInfo(*rt_exception_info); } + bool IsDumpExceptionOpen() { return dump_exception_flag_; } private: /// /// @ingroup domi_ome @@ -329,8 +331,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { /// @ingroup domi_ome /// @brief insert new model into model manager set /// - void InsertModel(uint32_t id, std::shared_ptr &davinci_model); - void InsertModel(uint32_t id, std::shared_ptr &hybrid_model); + void InsertModel(uint32_t model_id, std::shared_ptr &davinci_model); + void InsertModel(uint32_t model_id, std::shared_ptr &hybrid_model); /// /// @ingroup domi_ome @@ -340,6 +342,7 @@ class 
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { void GenModelId(uint32_t *id); + std::map> model_map_; std::map> hybrid_model_map_; std::map> model_aicpu_kernel_; @@ -354,6 +357,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::map> cust_aicpu_so_; static DumpProperties dump_properties_; + bool dump_exception_flag_ = false; }; } // namespace ge diff --git a/ge/graph/load/model_manager/model_utils.cc b/ge/graph/load/model_manager/model_utils.cc index 8648d892..80bdec9b 100755 --- a/ge/graph/load/model_manager/model_utils.cc +++ b/ge/graph/load/model_manager/model_utils.cc @@ -25,6 +25,9 @@ #define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \ do { \ if (SIZE <= static_cast(OFFSET)) { \ + REPORT_INNER_ERROR("E19999", \ + "Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \ + OP->GetName().c_str(), OP->GetType().c_str(), OFFSET, SIZE); \ GELOGE(OUT_OF_MEMORY, "Node: %s, memory out of range[%lu: %ld]", OP->GetName().c_str(), SIZE, OFFSET); \ return {}; \ } \ @@ -305,6 +308,9 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co vector v_memory_type; bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, v_memory_type); if (has_mem_type_attr && (v_memory_type.size() != inputs_size)) { + REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s), check invalid", + ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Fusion: check input size failed, op: %s, input v_memory_type size: %zu input numbers: %zu", op_desc->GetName().c_str(), v_memory_type.size(), inputs_size); return v_input_data_addr; @@ -384,7 +390,9 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc switch (mem_type) { case RT_MEMORY_RDMA_HBM: if (offset < 0) { - GELOGE(PARAM_INVALID, "rdma var addr is invalid, addr=%p", 
reinterpret_cast(offset)); + REPORT_INNER_ERROR("E19999", "Param offset:%ld < 0, check invalid", offset); + GELOGE(PARAM_INVALID, "rdma var addr is invalid, addr=%p", + reinterpret_cast(static_cast(offset))); return PARAM_INVALID; } var_addr = reinterpret_cast(static_cast(offset)); @@ -394,6 +402,8 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc var_addr = model_param.var_base + offset - model_param.logic_var_base; break; default: + REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid", + mem_type, offset); GELOGE(PARAM_INVALID, "unsupported memory type %u", mem_type); return PARAM_INVALID; } @@ -419,6 +429,9 @@ vector ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C vector v_memory_type; bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, v_memory_type); if (has_mem_type_attr && (v_memory_type.size() != outputs_size)) { + REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s), check invalid", + ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Fusion: check output size failed, op: %s, output v_memory_type size: %lu output numbers: %zu", op_desc->GetName().c_str(), v_memory_type.size(), outputs_size); @@ -567,6 +580,7 @@ Status ModelUtils::GetRtAddress(const RuntimeParam ¶m, uintptr_t logic_addr, param.var_size); } else if (logic_addr != 0) { mem_addr = nullptr; + REPORT_INNER_ERROR("E19999", "Check param logic addr:0x%lx abnormal", logic_addr); GELOGE(PARAM_INVALID, "The logic addr:0x%lx is abnormal", logic_addr); return PARAM_INVALID; } diff --git a/ge/graph/load/model_manager/task_info/end_graph_task_info.cc b/ge/graph/load/model_manager/task_info/end_graph_task_info.cc index c306c650..d3c98684 100644 --- a/ge/graph/load/model_manager/task_info/end_graph_task_info.cc +++ 
b/ge/graph/load/model_manager/task_info/end_graph_task_info.cc @@ -27,6 +27,7 @@ namespace ge { Status EndGraphTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("InitEndGraphTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -45,13 +46,11 @@ Status EndGraphTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin Status EndGraphTaskInfo::Distribute() { GELOGI("EndGraphTaskInfo Distribute Start."); GE_CHECK_NOTNULL(davinci_model_); - auto all_dump_model = davinci_model_->GetDumpProperties().GetAllDumpModel(); - if (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end() || - all_dump_model.find(davinci_model_->Name()) != all_dump_model.end() || - all_dump_model.find(davinci_model_->OmName()) != all_dump_model.end()) { + if (davinci_model_->ModelNeedDump()) { GELOGI("Start to call rtEndGraphEx"); rtError_t rt_ret = rtEndGraphEx(model_, stream_, kDumpFlag); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtEndGraphEx failed, ret:0x%X", rt_ret); GELOGE(RT_FAILED, "Call rtEndGraphEx failed, ret: 0x%x", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -59,6 +58,7 @@ Status EndGraphTaskInfo::Distribute() { GELOGI("Start to call rtEndGraph"); rtError_t rt_ret = rtEndGraph(model_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtEndGraph failed, ret:0x%X", rt_ret); GELOGE(RT_FAILED, "Call rtEndGraph failed, ret: 0x%x", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -68,6 +68,8 @@ Status EndGraphTaskInfo::Distribute() { uint32_t stream_id = 0; rtError_t rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &task_id, &stream_id); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelGetTaskId failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return 
RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/event_record_task_info.cc b/ge/graph/load/model_manager/task_info/event_record_task_info.cc index f736c386..13dae9ee 100755 --- a/ge/graph/load/model_manager/task_info/event_record_task_info.cc +++ b/ge/graph/load/model_manager/task_info/event_record_task_info.cc @@ -23,6 +23,7 @@ namespace ge { Status EventRecordTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("EventRecordTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -34,6 +35,8 @@ Status EventRecordTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da const auto &eventList = davinci_model->GetEventList(); if (task_def.event_id() >= eventList.size()) { + REPORT_INNER_ERROR("E19999", "Task event_id:%u > model event size:%zu, check invalid", + task_def.event_id(), eventList.size()); GELOGE(INTERNAL_ERROR, "event list size:%zu, cur:%u!", eventList.size(), task_def.event_id()); return INTERNAL_ERROR; } @@ -47,6 +50,8 @@ Status EventRecordTaskInfo::Distribute() { GELOGI("EventRecordTaskInfo Distribute Start."); rtError_t rt_ret = rtEventRecord(event_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtEventRecord failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/event_wait_task_info.cc b/ge/graph/load/model_manager/task_info/event_wait_task_info.cc index 34058502..8fae9225 100755 --- a/ge/graph/load/model_manager/task_info/event_wait_task_info.cc +++ b/ge/graph/load/model_manager/task_info/event_wait_task_info.cc @@ -23,6 +23,7 @@ namespace ge { Status EventWaitTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("EventWaitTaskInfo Init Start."); if 
(davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -34,6 +35,8 @@ Status EventWaitTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davi const auto &eventList = davinci_model->GetEventList(); if (task_def.event_id() >= eventList.size()) { + REPORT_INNER_ERROR("E19999", "Task event_id:%u > model event size:%zu, check invalid", + task_def.event_id(), eventList.size()); GELOGE(INTERNAL_ERROR, "event list size:%zu, cur:%u!", eventList.size(), task_def.event_id()); return INTERNAL_ERROR; } @@ -48,12 +51,16 @@ Status EventWaitTaskInfo::Distribute() { GELOGI("EventWaitTaskInfo Distribute Start."); rtError_t rt_ret = rtStreamWaitEvent(stream_, event_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamWaitEvent failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtEventReset(event_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtEventReset failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/fusion_start_task_info.cc b/ge/graph/load/model_manager/task_info/fusion_start_task_info.cc index 6feea9e4..b47ac097 100755 --- a/ge/graph/load/model_manager/task_info/fusion_start_task_info.cc +++ b/ge/graph/load/model_manager/task_info/fusion_start_task_info.cc @@ -23,6 +23,7 @@ namespace ge { Status FusionStartTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("FusionStartTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -39,6 +40,8 @@ Status FusionStartTaskInfo::Distribute() { GELOGI("FusionStartTaskInfo 
Distribute Start."); rtError_t rt_ret = rtKernelFusionStart(stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtKernelFusionStart failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/fusion_stop_task_info.cc b/ge/graph/load/model_manager/task_info/fusion_stop_task_info.cc index 22d1589c..6188cfc8 100755 --- a/ge/graph/load/model_manager/task_info/fusion_stop_task_info.cc +++ b/ge/graph/load/model_manager/task_info/fusion_stop_task_info.cc @@ -23,6 +23,7 @@ namespace ge { Status FusionStopTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("FusionStopTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -39,6 +40,7 @@ Status FusionStopTaskInfo::Distribute() { GELOGI("FusionStopTaskInfo Distribute Start."); rtError_t rt_ret = rtKernelFusionEnd(stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtKernelFusionEnd failed, ret:0x%X", rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/hccl_task_info.cc b/ge/graph/load/model_manager/task_info/hccl_task_info.cc index 2d0ad560..7a435f91 100644 --- a/ge/graph/load/model_manager/task_info/hccl_task_info.cc +++ b/ge/graph/load/model_manager/task_info/hccl_task_info.cc @@ -30,6 +30,7 @@ HcclTaskInfo::~HcclTaskInfo() { if (private_def_ != nullptr) { rtError_t ret = rtFreeHost(private_def_); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFreeHost failed, ret:0x%X", ret); GELOGE(RT_FAILED, "Call rtFree Fail, ret = 0x%X.", ret); } private_def_ = nullptr; @@ -41,6 +42,7 @@ HcclTaskInfo::~HcclTaskInfo() { Status HcclTaskInfo::Init(const domi::TaskDef 
&task_def, DavinciModel *davinci_model) { GELOGI("HcclTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -67,22 +69,30 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m // Only in Horovod scenario should get the inputName and GeShape ret = HcomOmeUtil::GetHorovodInputs(op_desc, kernel_hccl_infos_); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call GetHorovodInputs fail for op:%s(%s)", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(ret, "davinci_model: GetHorovodInputs fail! domi error: %u", ret); return ret; } Status dmrt = HcomOmeUtil::GetHcclDataType(op_desc, kernel_hccl_infos_); if (dmrt != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call GetHcclDataType fail for op:%s(%s)", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(dmrt, "davinci_model: GetHcomDataType fail! domi error: %u", dmrt); return dmrt; } dmrt = HcomOmeUtil::GetHcclCount(op_desc, kernel_hccl_infos_); if (dmrt != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call GetHcclCount fail for op:%s(%s)", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(dmrt, "davinci_model: GetHcomCount fail! domi error: %u", dmrt); return dmrt; } // Only HCOMBROADCAST and HVDCALLBACKBROADCAST need to get the rootId dmrt = HcomOmeUtil::GetAllRootId(op_desc, kernel_hccl_infos_); if (dmrt != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call GetAllRootId fail for op:%s(%s)", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(dmrt, "davinci_model: Get rootId fail! 
domi error: %u", dmrt); return dmrt; } @@ -169,12 +179,16 @@ Status HcclTaskInfo::CreateStream(int64_t stream_num, DavinciModel *davinci_mode rtError_t rt_ret = rtStreamCreateWithFlags(&stream, davinci_model->Priority(), RT_STREAM_PERSISTENT | RT_STREAM_FORCE_COPY); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamCreateWithFlags failed, ret:0x%X, stream_idx:%ld, stream_num:%ld", + rt_ret, i, stream_num); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } // Create slave stream, inactive by default, activated by hccl rt_ret = rtModelBindStream(davinci_model->GetRtModelHandle(), stream, RT_MODEL_WAIT_ACTIVE_STREAM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelBindStream failed, ret:0x%X, stream_idx:%ld, stream_num:%ld", + rt_ret, i, stream_num); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); (void)rtStreamDestroy(stream); return RT_ERROR_TO_GE_STATUS(rt_ret); @@ -192,6 +206,7 @@ Status HcclTaskInfo::CreateStream(int64_t stream_num, DavinciModel *davinci_mode Status HcclTaskInfo::Distribute() { GELOGI("HcclTaskInfo Distribute Start. 
begin to call function LoadTask in hccl."); if (ops_kernel_store_ == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param ops_kernel_store_ nullptr"); GELOGE(INTERNAL_ERROR, "ops kernel store is null."); return INTERNAL_ERROR; } @@ -201,6 +216,7 @@ Status HcclTaskInfo::Distribute() { TransToGETaskInfo(ge_task); auto result = ops_kernel_info_store->LoadTask(ge_task); if (result != HCCL_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call ops_kernel_info_store LoadTask fail"); GELOGE(INTERNAL_ERROR, "davinci_model : load task fail, return ret: %u", result); return INTERNAL_ERROR; } @@ -316,6 +332,8 @@ void HcclTaskInfo::GetPrivateDefByTaskDef(const domi::TaskDef &task) { private_def_len_ = private_def_temp.size(); rtError_t ret = rtMallocHost(&private_def_, private_def_len_); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, ret:0x%X, size:%u", + ret, private_def_len_); GELOGE(RT_FAILED, "Call rtMallocHost Fail, ret = 0x%X.", ret); return; } @@ -323,6 +341,8 @@ void HcclTaskInfo::GetPrivateDefByTaskDef(const domi::TaskDef &task) { ret = rtMemcpy(private_def_, private_def_len_, task.private_def().c_str(), private_def_len_, RT_MEMCPY_HOST_TO_HOST); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, ret:0x%X, size:%u", + ret, private_def_len_); GELOGE(RT_FAILED, "Call rtMemcpy Fail, ret = 0x%X.", ret); return; } diff --git a/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc b/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc index 2317f961..e2f600b3 100644 --- a/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc +++ b/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc @@ -75,11 +75,15 @@ Status KernelExTaskInfo::InitTaskExtInfo(const std::string &ext_info, const OpDe } auto rt_ret = rtMalloc(&ext_info_addr_, ext_handle->GetExtInfoLen(), RT_MEMORY_HBM); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", + 
ext_info.size(), rt_ret); GELOGE(RT_FAILED, "rtMalloc ext_info error: 0x%X, size=%zu", rt_ret, ext_info.size()); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtMemcpy(ext_info_addr_, ext_handle->GetExtInfoLen(), ext_handle->GetExtInfo(), ext_handle->GetExtInfoLen(), RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + ext_handle->GetExtInfoLen(), rt_ret); GELOGE(RT_FAILED, "rtMemcpy ext_info error: 0x%X, size=%zu", rt_ret, ext_info.size()); return RT_ERROR_TO_GE_STATUS(rt_ret);) return SUCCESS; @@ -101,6 +105,8 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin uint32_t op_index = kernel_ex_def.op_index(); OpDescPtr op_desc = davinci_model_->GetOpByIndex(op_index); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + op_index); GELOGE(INTERNAL_ERROR, "Init aicpu task info error, index is out of range!"); return INTERNAL_ERROR; } @@ -108,6 +114,8 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin // 2. 
Reconstruct kernelExDef.args to STR_FWK_OP_KERNEL STR_FWK_OP_KERNEL fwk_op_kernel = {0}; if (sizeof(STR_FWK_OP_KERNEL) < kernel_ex_def.args_size()) { + REPORT_INNER_ERROR("E19999", "Param kernel_ex_def.args_size():%u > sizeof(STR_FWK_OP_KERNEL):%zu, " + "check invalid", kernel_ex_def.args_size(), sizeof(STR_FWK_OP_KERNEL)); GELOGE(FAILED, "sizeof STR_FWK_OP_KERNEL is: %zu, but args_size is: %u", sizeof(STR_FWK_OP_KERNEL), kernel_ex_def.args_size()); return FAILED; @@ -115,6 +123,8 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin errno_t sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), kernel_ex_def.args().data(), kernel_ex_def.args_size()); if (sec_ret != EOK) { + REPORT_CALL_ERROR("E19999", "Call memcpy_s fail, size:%zu, ret:0x%X", + sizeof(STR_FWK_OP_KERNEL), sec_ret); GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); return FAILED; } @@ -136,12 +146,16 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin uint64_t kernel_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.kernelID; GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuKernel(session_id, davinci_model->Id(), davinci_model->SubModelId(), kernel_id) != SUCCESS, + REPORT_CALL_ERROR("E19999", "CreateAicpuKernel fail, session_id:%lu, model_id:%u, kernel_id:%lu", + session_id, davinci_model->Id(), kernel_id); GELOGE(FAILED, "CreateAicpuKernel error."); return FAILED;) // 2.3 Create session GE_CHECK_NOTNULL(ModelManager::GetInstance()); ret = ModelManager::GetInstance()->CreateAicpuSession(session_id); GE_IF_BOOL_EXEC(ret != SUCCESS, + REPORT_CALL_ERROR("E19999", "CreateAicpuSession fail, session_id:%lu", + session_id); GELOGE(ret, "CreateAicpuSession error. 
session id: %lu", session_id); return ret;) @@ -152,7 +166,10 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin static_cast(reinterpret_cast(input_output_addr)); void *workspace_base_addr = nullptr; rtError_t rt_ret = rtMalloc(&workspace_base_addr, kernel_ex_def.task_info_size(), RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc error, ret: Ox%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret:0x%X", + kernel_ex_def.task_info_size(), rt_ret); + GELOGE(RT_FAILED, "rtMalloc error, ret: Ox%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);); rt_ret = rtMemcpy(workspace_base_addr, kernel_ex_def.task_info_size(), kernel_ex_def.task_info().data(), kernel_ex_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE); @@ -163,12 +180,18 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr = reinterpret_cast(ext_info_addr_); rt_ret = rtMalloc(&kernel_buf_, kernel_buf_size_, RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, ret:0x%X, size:%u", + rt_ret, kernel_buf_size_); + GELOGE(RT_FAILED, "rtMalloc error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtMemcpy(kernel_buf_, kernel_buf_size_, static_cast(&fwk_op_kernel), kernel_buf_size_, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy error, ret: Ox%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, ret:0x%X, size:%u", + rt_ret, kernel_buf_size_); + GELOGE(RT_FAILED, "rtMemcpy error, ret: Ox%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) SetIoAddrs(op_desc); @@ -186,6 +209,8 @@ Status KernelExTaskInfo::Init(const domi::TaskDef 
&task_def, DavinciModel *davin const vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc); if (workspace_data_addrs.empty()) { + REPORT_CALL_ERROR("E19999", "workspace_data_addrs is empty in op:%s(%s), check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "workspace_data_addrs is empty."); return FAILED; } @@ -200,11 +225,17 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin auto addrs_size = sizeof(uint64_t) * (io_addrs.size()); if (addrs_size > 0) { rtError_t rt_ret = rtMalloc(&input_output_addr_, addrs_size, RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, ret:0x%X, size:%lu", + rt_ret, addrs_size); + GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtMemcpy(input_output_addr_, addrs_size, io_addrs.data(), addrs_size, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, ret:0x%X, size:%lu", + rt_ret, addrs_size); + GELOGE(RT_FAILED, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) InitDumpTask(input_output_addr_, op_desc); @@ -223,12 +254,18 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin // 4. 
Return result rtError_t rt_ret = rtMalloc(&kernel_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, ret:0x%X, size:%zu", + rt_ret, sizeof(STR_FWK_OP_KERNEL)); + GELOGE(RT_FAILED, "rtMalloc error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtMemcpy(kernel_buf_, sizeof(STR_FWK_OP_KERNEL), static_cast(&fwk_op_kernel), sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy error, ret: Ox%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, ret:0x%X, size:%zu", + rt_ret, sizeof(STR_FWK_OP_KERNEL)); + GELOGE(RT_FAILED, "rtMemcpy error, ret: Ox%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, io_addrs.data(), input_output_addr_, addrs_size, 0); @@ -238,8 +275,7 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin } void KernelExTaskInfo::InitDumpTask(void *addr, const OpDescPtr &op_desc) { - if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(), - op_desc->GetName())) { + if (davinci_model_->OpNeedDump(op_desc->GetName())) { dump_flag_ = RT_KERNEL_DUMPFLAG; dump_args_ = addr; } @@ -250,6 +286,8 @@ Status KernelExTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciMod uint32_t op_index = kernel_ex_def.op_index(); OpDescPtr op_desc = davinci_model->GetOpByIndex(op_index); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + op_index); GELOGE(INTERNAL_ERROR, "Init aicpu task info error, index is out of range!"); return INTERNAL_ERROR; } @@ -267,6 +305,9 @@ Status KernelExTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciMod if 
(AttrUtils::GetStr(op_desc, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name) && !peer_input_name.empty()) { uint32_t output_index = davinci_model->GetFixedAddrOutputIndex(peer_input_name); if (output_index > outputs_size) { + REPORT_INNER_ERROR("E19999", "The output size[%zu] and output index[%u] in op:%s(%s) are inconsistent, " + "check invalid", outputs_size, output_index, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "The output size[%zu] and output index[%u] are inconsistent.", outputs_size, output_index); return FAILED; } @@ -293,6 +334,9 @@ void KernelExTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) { if (AttrUtils::GetStr(op_desc, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) { uint32_t output_index = davinci_model_->GetFixedAddrOutputIndex(peer_input_name); if (output_index > output_data_addrs.size()) { + REPORT_INNER_ERROR("E19999", "The output data addr size[%zu] and output index[%u] in op:%s(%s) are inconsistent" + ", check invalid", output_data_addrs.size(), output_index, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "The output data addr size[%zu] and output index[%u] are inconsistent.", output_data_addrs.size(), output_index); return; @@ -313,6 +357,7 @@ void KernelExTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) { Status KernelExTaskInfo::UpdateArgs() { GELOGI("KernelExTaskInfo::UpdateArgs in."); davinci_model_->SetTotalIOAddrs(io_addrs_); + davinci_model_->UpdateOpIOAddrs(task_id_, stream_id_, io_addrs_); GELOGI("KernelExTaskInfo::UpdateArgs success."); return SUCCESS; } @@ -323,17 +368,25 @@ Status KernelExTaskInfo::CopyTaskInfo(const domi::KernelExDef &kernel_def, const const vector workspace_data_sizes = ModelUtils::GetWorkspaceSize(op_desc); const vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc); if (workspace_data_addrs.empty() || workspace_data_sizes.empty()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) workspace addr:%zu or size:%zu empty, check 
invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + workspace_data_addrs.size(), workspace_data_sizes.size()); GELOGE(FAILED, "Node:%s invalid workspace, addrs is %zu, size is %zu.", op_desc->GetName().c_str(), workspace_data_addrs.size(), workspace_data_sizes.size()); return FAILED; } if (workspace_data_addrs[0] == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) workspace addr is nullptr, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Node:%s workspace addrs is null.", op_desc->GetName().c_str()); return FAILED; } if (workspace_data_sizes[0] < static_cast(kernel_def.task_info_size())) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) workspace size:%ld < task info size:%d, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + workspace_data_sizes[0], kernel_def.task_info_size()); GELOGE(FAILED, "Node:%s workspace size is %ld, task info size is %d.", op_desc->GetName().c_str(), workspace_data_sizes[0], kernel_def.task_info_size()); return FAILED; @@ -342,6 +395,8 @@ Status KernelExTaskInfo::CopyTaskInfo(const domi::KernelExDef &kernel_def, const rtError_t rt_ret = rtMemcpy(workspace_data_addrs[0], kernel_def.task_info_size(), kernel_def.task_info().data(), kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, ret:0x%X, size:%d", + rt_ret, kernel_def.task_info_size()); GELOGE(RT_FAILED, "rtMemcpy error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -353,11 +408,14 @@ Status KernelExTaskInfo::Distribute() { GELOGI("KernelExTaskInfo Distribute Start."); rtError_t rt_ret = rtKernelLaunchEx(kernel_buf_, kernel_buf_size_, dump_flag_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchEx failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } if (davinci_model_ == nullptr) { + 
REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model_ is null."); return PARAM_INVALID; } @@ -366,6 +424,8 @@ Status KernelExTaskInfo::Distribute() { uint32_t stream_id = 0; // for profiling rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &task_id, &stream_id); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelGetTaskId failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/kernel_task_info.cc b/ge/graph/load/model_manager/task_info/kernel_task_info.cc index c8d9f97a..82c3e286 100755 --- a/ge/graph/load/model_manager/task_info/kernel_task_info.cc +++ b/ge/graph/load/model_manager/task_info/kernel_task_info.cc @@ -93,8 +93,13 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci // new aicpu kernel(rtCpuKernelLaunch) no need to check function if (kernel_type_ == ccKernelType::CCE_AI_CORE) { rtError_t rt_ret = rtGetFunctionByName(const_cast(kernel_def.stub_func().c_str()), &stub_func_); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. stub_func: %s", - kernel_def.stub_func().c_str()); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtGetFunctionByName failed for op:%s(%s), " + "bin_file_key:%s, ret:0x%X", + op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), + kernel_def.stub_func().c_str(), rt_ret); + GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. 
stub_func: %s", + kernel_def.stub_func().c_str()); return RT_ERROR_TO_GE_STATUS(rt_ret);); } else if (kernel_type_ == ccKernelType::TE) { // get bin_file_key @@ -103,11 +108,18 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci const char *bin_file_key = davinci_model_->GetRegisterStub(op_desc_->GetName(), session_graph_model_id); rtError_t rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtGetFunctionByName failed for op:%s(%s), " + "bin_file_key:%s, ret:0x%X", + op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), + bin_file_key, rt_ret); GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. bin_file_key: %s", bin_file_key); return RT_ERROR_TO_GE_STATUS(rt_ret);); } if (context.origin_op_index_size() > CC_FUSION_OP_MAX) { + REPORT_INNER_ERROR("E19999", "context.origin_op_index_size():%d is more than CC_FUSION_OP_MAX(%d), op:%s(%s) ," + "check invalid", context.origin_op_index_size(), CC_FUSION_OP_MAX, + op_desc_->GetName().c_str(), op_desc_->GetType().c_str()); GELOGE(PARAM_INVALID, "context.origin_op_index_size() is more than CC_FUSION_OP_MAX(%d)", CC_FUSION_OP_MAX); return PARAM_INVALID; } @@ -120,17 +132,23 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci ctx_.opIndex = context.op_index(); uint16_t *args_offset_tmp = reinterpret_cast(const_cast(context.args_offset().data())); if (context.args_offset().size() / sizeof(uint16_t) < 1) { + REPORT_INNER_ERROR("E19999", "context.args_offset().size():%zu / sizeof(uint16_t) less than 1, op:%s(%s) ," + "check invalid", context.args_offset().size(), + op_desc_->GetName().c_str(), op_desc_->GetType().c_str()); GELOGE(FAILED, "context.args_offset().size() / sizeof(uint16_t) less than 1"); return FAILED; } - ret = InitTVMTask(args_offset_tmp[0], kernel_def); + io_addr_offset_ = args_offset_tmp[0]; + ret = InitTVMTask(io_addr_offset_, kernel_def); } else if 
(kernel_type_ == ccKernelType::CUSTOMIZED) { ret = InitAICPUCustomTask(context.op_index(), kernel_def); } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { ret = InitAicpuTask(context.op_index(), kernel_def); } else { if (kernel_def.args().empty() || args_size_ == 0) { + REPORT_INNER_ERROR("E19999", "kernel_def.args() is empty, op:%s(%s), check invalid", + op_desc_->GetName().c_str(), op_desc_->GetType().c_str()); GELOGE(FAILED, "args is null."); return FAILED; } @@ -163,6 +181,8 @@ void KernelTaskInfo::UpdateSKTTaskId() { if (davinci_model_ != nullptr) { rtError_t rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &task_id, &stream_id); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelGetTaskId failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return; } @@ -181,6 +201,8 @@ void KernelTaskInfo::UpdateTaskId() { if (davinci_model_ != nullptr) { rtError_t rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &task_id, &stream_id); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelGetTaskId failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return; } @@ -236,6 +258,8 @@ Status KernelTaskInfo::SuperKernelLaunch() { static_cast(skt_info.last_sm_desc), skt_info.last_stream, skt_info.last_dump_flag); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchWithFlag failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "SuperKernelLaunch: Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -248,6 +272,8 @@ Status KernelTaskInfo::SuperKernelLaunch() { // Init super kernel factory Status ge_ret = factory->Init(); if (ge_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call SuperKernelFactory init fail, ret:0x%X", + ge_ret); GELOGE(ge_ret, "SuperKernelLaunch: SuperKernelFactory init failed"); return ge_ret; } @@ -255,6 +281,8 @@ Status 
KernelTaskInfo::SuperKernelLaunch() { std::unique_ptr superKernel = nullptr; ge_ret = factory->FuseKernels(skt_kernel_list, skt_arg_list, skt_info.last_block_dim, superKernel); if (ge_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call SuperKernelFactory FuseKernels fail, ret:0x%X", + ge_ret); GELOGE(ge_ret, "SuperKernelLaunch: fuse call failed"); return ge_ret; } @@ -262,6 +290,8 @@ Status KernelTaskInfo::SuperKernelLaunch() { skt_dump_flag_ = GetDumpFlag(); ge_ret = superKernel->Launch(skt_info.last_stream, skt_dump_flag_); if (ge_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call SuperKernelFactory Launch fail, ret:0x%X", + ge_ret); GELOGE(ge_ret, "SuperKernelLaunch: launch failed"); return ge_ret; } @@ -300,11 +330,14 @@ Status KernelTaskInfo::SaveSuperKernelInfo() { bool KernelTaskInfo::IsMarkedLastNode() { if (davinci_model_ == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return false; } OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + ctx_.opIndex); GELOGE(INTERNAL_ERROR, "InitTVMTaskInfo error, index is out of range!"); return false; } @@ -315,11 +348,14 @@ bool KernelTaskInfo::IsMarkedLastNode() { bool KernelTaskInfo::IsMarkedFirstNode() { if (davinci_model_ == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return false; } OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + ctx_.opIndex); GELOGE(INTERNAL_ERROR, "InitTVMTaskInfo error, index is out of range!"); return false; } @@ -360,6 +396,8 @@ Status KernelTaskInfo::SuperKernelDistribute() { rtError_t rt_ret = rtKernelLaunchWithFlag(stub_func_, block_dim_, args_, args_size_, 
static_cast(sm_desc_), stream_, dump_flag_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchWithFlag failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return rt_ret; } @@ -380,7 +418,8 @@ Status KernelTaskInfo::Distribute() { GELOGD("KernelTaskInfo Distribute Start."); if (davinci_model_->IsKnownNode()) { if (kernel_type_ == ccKernelType::TE) { - args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); + args_ = l2_buffer_on_ ? davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_) + : davinci_model_->GetCurrentArgsAddr(args_offset_); } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_); } @@ -407,10 +446,7 @@ Status KernelTaskInfo::Distribute() { call_skt, task_id_, skt_id_, skt_info.last_task_id, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); // l1 fusion enable and env flag open (kCloseSkt for skt debug) bool open_dump = false; - auto all_dump_model = davinci_model_->GetDumpProperties().GetAllDumpModel(); - if (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end() || - all_dump_model.find(davinci_model_->Name()) != all_dump_model.end() || - all_dump_model.find(davinci_model_->OmName()) != all_dump_model.end()) { + if (davinci_model_->ModelNeedDump()) { open_dump = true; } if (call_skt && (env_flag != kCloseSkt) && !open_dump) { @@ -423,6 +459,8 @@ Status KernelTaskInfo::Distribute() { } } if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchWithFlag or rtCpuKernelLaunchWithFlag failed, " + "ret:0x%X", rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -449,29 +487,46 @@ void KernelTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) { } } +Status KernelTaskInfo::CopyNoncontinuousArgs(uint16_t offset) { + GE_CHECK_NOTNULL(davinci_model_); + // copy new io addrs + 
vector io_addrs = io_addrs_; + davinci_model_->UpdateKnownZeroCopyAddr(io_addrs); + auto addr_size = kAddrLen * io_addrs.size(); + + // copy io addr + errno_t sec_ret = memcpy_s(args_addr.get() + offset, addr_size, io_addrs.data(), addr_size); + if (sec_ret != EOK) { + REPORT_CALL_ERROR("E19999", "Call memcpy_s fail, size:%zu, ret:0x%X", + addr_size, sec_ret); + GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); + return FAILED; + } + + // copy args to device + rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + args_size_, rt_ret); + GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + GELOGD("Copy noncontinuous args success, kernel type %d.", kernel_type_); + return SUCCESS; +} + Status KernelTaskInfo::UpdateArgs() { GELOGI("KernelTaskInfo::UpdateArgs in."); + GE_CHECK_NOTNULL(davinci_model_); if (kernel_type_ == ccKernelType::TE) { + if (l2_buffer_on_) { + return CopyNoncontinuousArgs(io_addr_offset_); + } davinci_model_->SetTotalIOAddrs(io_addrs_); + davinci_model_->UpdateOpIOAddrs(task_id_, stream_id_, io_addrs_); } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { - vector io_addrs = io_addrs_; - davinci_model_->UpdateKnownZeroCopyAddr(io_addrs); - uintptr_t io_addr = reinterpret_cast(args_addr.get()) + sizeof(aicpu::AicpuParamHead); - auto addrs_size = sizeof(uint64_t) * io_addrs.size(); - errno_t sec_ret = memcpy_s(reinterpret_cast(io_addr), addrs_size, io_addrs.data(), addrs_size); - if (sec_ret != EOK) { - GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); - return FAILED; - } - // copy args to device - rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 
0x%X", rt_ret); - return RT_ERROR_TO_GE_STATUS(rt_ret); - } + return CopyNoncontinuousArgs(sizeof(aicpu::AicpuParamHead)); } - - GELOGI("KernelTaskInfo::UpdateArgs success."); return SUCCESS; } @@ -502,6 +557,7 @@ Status KernelTaskInfo::Release() { ret = (sm_desc_ != nullptr) ? rtMemFreeManaged(sm_desc_) : RT_ERROR_NONE; if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemFreeManaged failed, ret:0x%X", ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", static_cast(ret)); return RT_ERROR_TO_GE_STATUS(ret); } @@ -516,8 +572,8 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) { return SUCCESS; } - char *sm_contrl = const_cast(sm_desc.data()); - rtL2Ctrl_t *l2_ctrl_info = reinterpret_cast(sm_contrl); + char *sm_control = const_cast(sm_desc.data()); + rtL2Ctrl_t *l2_ctrl_info = reinterpret_cast(sm_control); uint64_t gen_base_addr = davinci_model_->GetRtBaseAddr(); // There is no weight for te op now. Update L2_mirror_addr by data memory base. @@ -532,12 +588,16 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) { rtError_t rt_ret = rtMemAllocManaged(&sm_desc_, sm_desc.size(), RT_MEMORY_SPM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemAllocManaged failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtMemcpy(sm_desc_, sm_desc.size(), sm_desc.data(), sm_desc.size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + sm_desc.size(), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -545,19 +605,31 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) { return SUCCESS; } +void KernelTaskInfo::SetContinuousArgs(uint32_t args_size, DavinciModel *davinci_model) { + args_offset_ = davinci_model->GetTotalArgsSize(); + 
davinci_model->SetTotalArgsSize(args_size); +} + +void KernelTaskInfo::SetNoncontinuousArgs(uint32_t args_size, DavinciModel *davinci_model) { + hybrid_args_offset_ = davinci_model->GetHybridArgsSize(); + davinci_model->SetHybridArgsSize(args_size); +} + Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { + GE_CHECK_NOTNULL(davinci_model); const domi::KernelDef &kernel_def = task_def.kernel(); const domi::KernelContext &context = kernel_def.context(); kernel_type_ = static_cast(context.kernel_type()); + uint32_t args_size = kernel_def.args_size(); if (kernel_type_ == ccKernelType::TE) { - uint32_t args_size = kernel_def.args_size(); - args_offset_ = davinci_model->GetTotalArgsSize(); - davinci_model->SetTotalArgsSize(args_size); - GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_); + if (kernel_def.sm_desc().empty()) { + SetContinuousArgs(args_size, davinci_model); + return SUCCESS; + } + l2_buffer_on_ = true; + SetNoncontinuousArgs(args_size, davinci_model); } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { - hybrid_args_offset_ = davinci_model->GetHybridArgsSize(); - davinci_model->SetHybridArgsSize(kernel_def.args_size()); - GELOGI("aicpu kernel task name , args_size %u, args_offset %u", kernel_def.args_size(), hybrid_args_offset_); + SetNoncontinuousArgs(args_size, davinci_model); } return SUCCESS; } @@ -568,8 +640,25 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne // get tvm op desc OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex); GE_CHECK_NOTNULL(op_desc); + + args_addr = std::unique_ptr(new (std::nothrow) uint8_t[args_size_]); + errno_t sec_ret = memcpy_s(args_addr.get(), args_size_, kernel_def.args().data(), args_size_); + if (sec_ret != EOK) { + REPORT_CALL_ERROR("E19999", "Call memcpy_s fail, size:%u, ret:0x%X", + args_size_, sec_ret); + GELOGE(FAILED, "memcpy failed, ret: %d", 
sec_ret); + return FAILED; + } + + Status ge_ret = UpdateL2Data(kernel_def); + // update origin l2 data + if (ge_ret != SUCCESS) { + return ge_ret; + } + if (davinci_model_->IsKnownNode()) { - args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); + args_ = l2_buffer_on_ ? davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_) + : davinci_model_->GetCurrentArgsAddr(args_offset_); InitDumpTask(offset); return SUCCESS; } @@ -599,6 +688,8 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne // malloc args memory rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret:0x%X", + args_size_, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -606,17 +697,16 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne // copy orign args rt_ret = rtMemcpy(args_, args_size_, kernel_def.args().data(), args_size_, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + args_size_, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } - vector args_info(args_size_); - errno_t sec_ret = memcpy_s(args_info.data(), args_size_, kernel_def.args().data(), args_size_); - if (sec_ret != EOK) { - GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); - return FAILED; - } if ((args_size_ <= offset) || (args_size_ - offset < kAddrLen * tensor_device_addrs.size())) { + REPORT_INNER_ERROR("E19999", "offset:%u >= kernelInfo.argsSize:%u or copy content:%zu beyond applied memory:%u, " + "check invalid", + offset, args_size_, kAddrLen * tensor_device_addrs.size(), args_size_ - offset); GELOGE(FAILED, "offset >= kernelInfo.argsSize or copy content beyond applied memory."); return FAILED; } @@ -625,12 +715,16 @@ Status KernelTaskInfo::InitTVMTask(uint16_t 
offset, const domi::KernelDef &kerne rt_ret = rtMemcpy(static_cast(args_) + offset, args_size_ - offset, tensor_device_addrs.data(), kAddrLen * tensor_device_addrs.size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + args_size_ - offset, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } - sec_ret = memcpy_s(args_info.data() + offset, args_size_ - offset, tensor_device_addrs.data(), + sec_ret = memcpy_s(args_addr.get() + offset, args_size_ - offset, tensor_device_addrs.data(), kAddrLen * tensor_device_addrs.size()); if (sec_ret != EOK) { + REPORT_CALL_ERROR("E19999", "Call memcpy_s failed, size:%u, ret:0x%X", + args_size_ - offset, sec_ret); GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); return FAILED; } @@ -640,19 +734,13 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne GE_CHK_BOOL_TRUE_EXEC_INFO(davinci_model_->GetOpDugReg(), dump_args_ = static_cast(args_) + offset, "Op debug is open in TVM task info"); - Status ge_ret = UpdateL2Data(kernel_def); - // update origin l2 data - if (ge_ret != SUCCESS) { - return ge_ret; - } - vector virtual_io_addrs; // use virtual address for zero copy key. 
virtual_io_addrs.insert(virtual_io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); virtual_io_addrs.insert(virtual_io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); if (op_desc->GetType() == ATOMICADDRCLEAN) { virtual_io_addrs.insert(virtual_io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); } - davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, args_info.data(), args_, args_size_, offset); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, args_addr.get(), args_, args_size_, offset); GELOGD("Do InitTVMTask end"); return SUCCESS; @@ -681,6 +769,8 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel GELOGI("Do InitAICPUCustomTask"); OpDescPtr op_desc = davinci_model_->GetOpByIndex(op_index); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + op_index); GELOGE(INTERNAL_ERROR, "index is out of range, index: %u", op_index); return INTERNAL_ERROR; } @@ -691,11 +781,17 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel const uint32_t kCustomAicpuArgsLen = 5; ctx_.argsOffset = new (std::nothrow) uint16_t[kCustomAicpuArgsLen](); if (ctx_.argsOffset == nullptr) { + REPORT_CALL_ERROR("E19999", "New ctx_.argsOffset fail, size:%u, op:%s(%s)", + kCustomAicpuArgsLen, op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "ctx_.argsOffset is null!"); return PARAM_INVALID; } if (context.args_offset().size() / sizeof(uint16_t) < kCustomAicpuArgsLen) { + REPORT_INNER_ERROR("E19999", "context.args_offset().size():%zu / sizeof(uint16_t) is less than " + "kCustomAicpuArgsLen:%u, op:%s(%s), check invalid", + context.args_offset().size(), kCustomAicpuArgsLen, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "context.args_offset().size() / sizeof(uint16_t) is less than kCustomAicpuArgsLen"); return PARAM_INVALID; } @@ -716,24 +812,32 @@ 
Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel // attrHandle Buffer buffer; if (!AttrUtils::GetBytes(op_desc, ATTR_NAME_OPATTR, buffer)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", ATTR_NAME_OPATTR.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "can't find opattr bytes!."); return FAILED; } uint32_t op_attr_size = buffer.GetSize(); if (op_attr_size == 0) { + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s) size is 0, check invalid", + ATTR_NAME_OPATTR.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "param op_attr_size is out of range"); return PARAM_INVALID; } rtError_t rt_ret = rtMalloc(&custom_info_.attr_handle, op_attr_size, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed for op:%s(%s), size:%u, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), op_attr_size, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtMemcpy(custom_info_.attr_handle, op_attr_size, buffer.GetData(), op_attr_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed for op:%s(%s), size:%u, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), op_attr_size, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -743,6 +847,10 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel for (uint32_t i = 0; i < kCustomAicpuArgsLen; ++i) { if (kernel_def.args().size() < ((size_t)ctx_.argsOffset[i] + sizeof(uint64_t))) { + REPORT_INNER_ERROR("E19999", "ctx.argsOffset[%u]: %u + sizeof(uint64_t): %zu >= kernelDef.args().size():%zu, " + "op:%s(%s) check invalid", i, (uint32_t)ctx_.argsOffset[i], + sizeof(uint64_t), kernel_def.args().size(), + op_desc->GetName().c_str(), 
op_desc->GetType().c_str()); GELOGE(FAILED, "ctx.argsOffset[%u]: %u + sizeof(uint64_t): %zu >= kernelDef.args().size():%zu", i, (uint32_t)ctx_.argsOffset[i], sizeof(uint64_t), kernel_def.args().size()); return FAILED; @@ -761,6 +869,8 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed for op:%s(%s), size:%u, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), args_size_, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -768,6 +878,9 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel rt_ret = rtMemcpy(args_, kernel_def.args_size(), kernel_def.args().data(), kernel_def.args_size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed for op:%s(%s), size:%u, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + kernel_def.args_size(), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -782,6 +895,7 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) { GELOGI("Do InitCCETask"); if (davinci_model_ == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -796,6 +910,7 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) { if (context.is_flowtable()) { if (flowtable.empty()) { + REPORT_INNER_ERROR("E19999", "kernel_def.flowtable is empty, check invalid"); GELOGE(FAILED, "flowtable is null."); return FAILED; } @@ -830,6 +945,8 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) { // args rtError_t rt_ret = rtMalloc(&args_, 
kernel_def.args_size(), RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%u, ret:0x%X", + kernel_def.args_size(), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -838,6 +955,8 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) { rt_ret = rtMemcpy(args_, kernel_def.args_size(), kernel_def.args().data(), kernel_def.args_size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%u, ret:0x%X", + kernel_def.args_size(), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -846,12 +965,16 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) { if (!sm_desc.empty()) { rt_ret = rtMemAllocManaged(&sm_desc_, sm_desc.size(), RT_MEMORY_SPM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemAllocManaged failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtMemcpy(sm_desc_, sm_desc.size(), sm_desc.data(), sm_desc.size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + sm_desc.size(), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -866,6 +989,8 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k OpDescPtr op_desc = davinci_model_->GetOpByIndex(op_index); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + op_index); GELOGE(INTERNAL_ERROR, "index is out of range, index: %u", op_index); return INTERNAL_ERROR; } @@ -883,6 +1008,8 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k GE_PRINT_DYNAMIC_MEMORY(new, "cce task 
physical memory.", sizeof(uint8_t) * args_size_) errno_t sec_ret = memcpy_s(args_addr.get(), args_size_, kernel_def.args().data(), args_size_); if (sec_ret != EOK) { + REPORT_CALL_ERROR("E19999", "Call memcpy_s fail, size:%u, ret:0x%X", + args_size_, sec_ret); GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); return FAILED; } @@ -917,6 +1044,8 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k auto addrs_size = sizeof(uint64_t) * io_addrs.size(); sec_ret = memcpy_s(reinterpret_cast(io_addr), addrs_size, io_addrs.data(), addrs_size); if (sec_ret != EOK) { + REPORT_CALL_ERROR("E19999", "Call memcpy_s fail, size:%lu, ret:0x%X", + addrs_size, sec_ret); GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); return FAILED; } @@ -925,6 +1054,8 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k // malloc device memory for args rtError_t rt_ret = rtMalloc(static_cast(&args_), args_size_, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed for op:%s(%s), size:%u, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), args_size_, rt_ret); GELOGE(RT_FAILED, "Call rt api(rtMalloc) failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -933,6 +1064,8 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k // copy args to device rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed for op:%s(%s), size:%u, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), args_size_, rt_ret); GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -951,8 +1084,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k } void KernelTaskInfo::InitDumpTask(uint32_t offset) { - if 
(davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(), - op_desc_->GetName())) { + if (davinci_model_->OpNeedDump(op_desc_->GetName())) { if (IsL1FusionOp(op_desc_)) { dump_flag_ = RT_FUSION_KERNEL_DUMPFLAG; } else { @@ -1005,12 +1137,18 @@ Status KernelTaskInfo::InitAicpuTaskExtInfo(const std::string &ext_info) { } auto rt_ret = rtMalloc(&aicpu_ext_info_addr_, ext_handle->GetExtInfoLen(), RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed for op:%s(%s), size:%zu, ret:0x%X", + op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), + ext_handle->GetExtInfoLen(), rt_ret); GELOGE(RT_FAILED, "rtMalloc ext_info error: 0x%X, size=%zu", rt_ret, ext_info.size()); return RT_ERROR_TO_GE_STATUS(rt_ret); } rt_ret = rtMemcpy(aicpu_ext_info_addr_, ext_handle->GetExtInfoLen(), ext_handle->GetExtInfo(), ext_handle->GetExtInfoLen(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed for op:%s(%s), size:%zu, ret:0x%X", + op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), + ext_handle->GetExtInfoLen(), rt_ret); GELOGE(RT_FAILED, "rtMemcpy ext_info error: 0x%X, size=%zu", rt_ret, ext_info.size()); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1028,6 +1166,8 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d // inputDescs rtError_t rt_ret = rtMalloc(&custom_info_.input_descs, sizeof(opTensor_t) * input_size, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", + sizeof(opTensor_t) * input_size, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1036,6 +1176,8 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d rt_ret = rtMemcpy(static_cast(custom_info_.input_descs) + i, sizeof(opTensor_t), const_cast(&input_descs[i]), sizeof(opTensor_t), 
RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + sizeof(opTensor_t), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1044,6 +1186,8 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d // inputAddrs rt_ret = rtMalloc(&custom_info_.input_addrs, sizeof(opTensor_t) * input_size, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", + sizeof(opTensor_t) * input_size, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1052,6 +1196,8 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d rt_ret = rtMemcpy(custom_info_.input_addrs, kAddrLen * input_size, &input_data_addrs[0], kAddrLen * input_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + kAddrLen * input_size, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1060,6 +1206,8 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d // outputDescs rt_ret = rtMalloc(&custom_info_.output_descs, sizeof(opTensor_t) * output_size, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", + sizeof(opTensor_t) * output_size, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1067,6 +1215,8 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d rt_ret = rtMemcpy(static_cast(custom_info_.output_descs) + i, sizeof(opTensor_t), const_cast(&input_descs[i]), sizeof(opTensor_t), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, 
ret:0x%X", + sizeof(opTensor_t), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1075,6 +1225,8 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d // outputAddrs rt_ret = rtMalloc(&custom_info_.output_addrs, sizeof(opTensor_t) * output_size, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", + sizeof(opTensor_t) * output_size, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1083,6 +1235,8 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d rt_ret = rtMemcpy(custom_info_.output_addrs, kAddrLen * output_size, &output_data_addrs[0], kAddrLen * output_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + kAddrLen * output_size, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1099,11 +1253,15 @@ Status KernelTaskInfo::SetContext(const domi::KernelDef &kernel_def) { ctx_.isFlowtable = context.is_flowtable(); ctx_.argsCount = context.args_count(); if (ctx_.argsCount == 0) { + REPORT_INNER_ERROR("E19999", "kernel_def.context.args_count is 0, check invalid"); GELOGE(INTERNAL_ERROR, "check argsCount fail:%u.", ctx_.argsCount); return INTERNAL_ERROR; } if (context.args_offset().size() / sizeof(uint16_t) < ctx_.argsCount) { + REPORT_INNER_ERROR("E19999", "param [context.args_offset().size():%zu / sizeof(uint16_t)] " + "is less than [ctx_.argsCount:%u], check invalid", + context.args_offset().size(), ctx_.argsCount); GELOGE(PARAM_INVALID, "param [context.args_offset().size() / sizeof(uint16_t)] is less than [ctx_.argsCount]"); return PARAM_INVALID; } @@ -1111,6 +1269,8 @@ Status KernelTaskInfo::SetContext(const domi::KernelDef &kernel_def) { // ctx_.argsOffset stores the offset of the 
internal information of agrs_, equal to the ctx_.argsCount ctx_.argsOffset = new (std::nothrow) uint16_t[ctx_.argsCount](); if (ctx_.argsOffset == nullptr) { + REPORT_CALL_ERROR("E19999", "New ctx_.argsOffset fail, size:%u", + ctx_.argsCount); GELOGE(PARAM_INVALID, "(param [ctx_.argsOffset] must not be null."); return PARAM_INVALID; } @@ -1128,6 +1288,7 @@ void KernelTaskInfo::FreeRtMem(void **ptr) { } rtError_t ret = rtFree(*ptr); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFree failed, ret:0x%X", ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", ret); } @@ -1175,6 +1336,8 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u if (handle == nullptr) { error = mmDlerror(); GE_IF_BOOL_EXEC(error == nullptr, error = ""); + REPORT_INNER_ERROR("E19999", "Failed in dlopen:%s, dlerror:%s", + canonicalPath.c_str(), error); GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", error); return FAILED; } @@ -1183,6 +1346,8 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u auto cceUpdateKernelArgs = (ccStatus_t(*)(ccOpContext &, uint64_t, uint64_t, uint64_t, void *, uint64_t, void *))mmDlsym(handle, const_cast(update_kernel_args.c_str())); if (cceUpdateKernelArgs == nullptr) { + REPORT_INNER_ERROR("E19999", "No symbol:%s in %s, check invalid", + update_kernel_args.c_str(), canonicalPath.c_str()); GELOGE(FAILED, "Failed to invoke function ccUpdateKernelArgs"); if (mmDlclose(handle) != 0) { error = mmDlerror(); @@ -1207,6 +1372,8 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u return FAILED; } if (cc_ret != CC_STATUS_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call cceUpdateKernelArgs fail, ret:0x%X", + cc_ret); GELOGE(CCE_FAILED, "Call cce api failed, ret: 0x%X", cc_ret); return CCE_FAILED; } @@ -1220,6 +1387,8 @@ Status KernelTaskInfo::SetFlowtable(std::string &flowtable, const domi::KernelDe if (context.is_flowtable()) { rtError_t rt_ret = 
rtMalloc(&flowtable_, flowtable.size(), RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%zu, ret:0x%X", + flowtable.size(), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1227,6 +1396,8 @@ Status KernelTaskInfo::SetFlowtable(std::string &flowtable, const domi::KernelDe rt_ret = rtMemcpy(flowtable_, flowtable.size(), flowtable.data(), flowtable.size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", + flowtable.size(), rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -1236,6 +1407,11 @@ Status KernelTaskInfo::SetFlowtable(std::string &flowtable, const domi::KernelDe if (kernel_def.args().size() < ((reinterpret_cast(const_cast(context.args_offset().data())))[0] + sizeof(uint64_t))) { + REPORT_INNER_ERROR( + "E19999", "(context.args_offset().data()))[0]:%u + sizeof(uint64_t):%zu > " + "kernelDef.args().size():%zu, check invalid", + (uint32_t)((reinterpret_cast(const_cast(context.args_offset().data())))[0]), + sizeof(uint64_t), kernel_def.args().size()); GELOGE(FAILED, "(context.args_offset().data()))[0]:%u + sizeof(uint64_t):%zu > kernelDef.args().size():%zu", (uint32_t)((reinterpret_cast(const_cast(context.args_offset().data())))[0]), sizeof(uint64_t), kernel_def.args().size()); diff --git a/ge/graph/load/model_manager/task_info/kernel_task_info.h b/ge/graph/load/model_manager/task_info/kernel_task_info.h index 7cabf259..4156c511 100644 --- a/ge/graph/load/model_manager/task_info/kernel_task_info.h +++ b/ge/graph/load/model_manager/task_info/kernel_task_info.h @@ -129,6 +129,9 @@ class KernelTaskInfo : public TaskInfo { bool IsL1FusionOp(const OpDescPtr &op_desc); void SetIoAddrs(const OpDescPtr &op_desc); void InitDumpTask(uint32_t offset); + void SetContinuousArgs(uint32_t args_size, DavinciModel 
*davinci_model); + void SetNoncontinuousArgs(uint32_t args_size, DavinciModel *davinci_model); + Status CopyNoncontinuousArgs(uint16_t offset); // For super kernel Status SaveSKTDumpInfo(); @@ -163,6 +166,8 @@ class KernelTaskInfo : public TaskInfo { uint32_t hybrid_args_offset_ = 0; int64_t fixed_addr_offset_ = 0; std::unique_ptr args_addr = nullptr; + uint16_t io_addr_offset_ = 0; + bool l2_buffer_on_ = false; bool call_save_dump_ = false; // aicpu ext_info device mem diff --git a/ge/graph/load/model_manager/task_info/label_goto_ex_task_info.cc b/ge/graph/load/model_manager/task_info/label_goto_ex_task_info.cc index 1921c85d..b858259e 100755 --- a/ge/graph/load/model_manager/task_info/label_goto_ex_task_info.cc +++ b/ge/graph/load/model_manager/task_info/label_goto_ex_task_info.cc @@ -17,9 +17,15 @@ #include "graph/load/model_manager/task_info/label_goto_ex_task_info.h" #include "graph/load/model_manager/davinci_model.h" -#include "graph/debug/ge_attr_define.h" namespace ge { +constexpr uint8_t kGotoBranchMax = 1; + +LabelGotoExTaskInfo::~LabelGotoExTaskInfo() { + args_ = nullptr; + GE_FREE_RT_LOG(index_value_); +} + Status LabelGotoExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("LabelGotoExTaskInfo Init Start."); GE_CHECK_NOTNULL(davinci_model); @@ -28,36 +34,66 @@ Status LabelGotoExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da return FAILED; } - // Get LabelGoto task def + // Get LabelGotoEx task def const domi::LabelGotoExDef &label_goto = task_def.label_goto_ex(); OpDescPtr op_desc = davinci_model->GetOpByIndex(label_goto.op_index()); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + label_goto.op_index()); GELOGE(INTERNAL_ERROR, "Task op index:%u out of range!", label_goto.op_index()); return INTERNAL_ERROR; } uint32_t label_index = 0; if (!AttrUtils::GetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, label_index)) { + REPORT_INNER_ERROR("E19999", 
"Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_LABEL_SWITCH_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "LabelGotoExTaskInfo: %s attr [%s] not exist.", op_desc->GetName().c_str(), ATTR_NAME_LABEL_SWITCH_INDEX.c_str()); return INTERNAL_ERROR; } - const vector &label_list = davinci_model->GetLabelList(); - if (label_index >= label_list.size()) { - GELOGE(PARAM_INVALID, "LabelGotoExTaskInfo: Invalid label id:%u, label size:%zu", label_index, label_list.size()); - return INTERNAL_ERROR; + rtMemType_t memory_type = op_desc->HasAttr(ATTR_NAME_MEMORY_TYPE_RANGE) ? RT_MEMORY_TS_4G : RT_MEMORY_HBM; + GELOGI("memory_type: %u", memory_type); + + GE_CHK_STATUS_RET_NOLOG(davinci_model->GetLabelGotoAddr(label_index, memory_type, args_, args_size_)); + + rtError_t rt_ret = rtMalloc(&index_value_, sizeof(uint64_t), memory_type); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed for op:%s(%s), size:%lu, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), sizeof(uint64_t), rt_ret); + GELOGE(RT_FAILED, "Call rtMalloc failed, error: %#x", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); } - label_ = label_list[label_index]; - GELOGI("LabelGotoExTaskInfo Init Success, label id:%u, label:%p.", label_index, label_); + uint64_t branch_index = 0; + rt_ret = rtMemcpy(index_value_, sizeof(uint64_t), &branch_index, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed for op:%s(%s), size:%lu, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), sizeof(uint64_t), rt_ret); + GELOGE(RT_FAILED, "Call rtMemcpy failed, error: %#x", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + + GELOGI("LabelGotoExTaskInfo Init Success, label id:%u", label_index); return SUCCESS; } Status LabelGotoExTaskInfo::Distribute() { GELOGI("LabelGotoExTaskInfo Distribute Start."); - rtError_t rt_ret = rtLabelGotoEx(label_, 
stream_); + GE_CHECK_NOTNULL(args_); + GE_CHECK_NOTNULL(index_value_); + if (args_size_ == 0) { + REPORT_INNER_ERROR("E19999", "Param args_size_ is 0, check fail"); + GELOGE(PARAM_INVALID, "branch max: %u, args size: %u invalid.", kGotoBranchMax, args_size_); + return PARAM_INVALID; + } + + rtError_t rt_ret = rtLabelSwitchByIndex(index_value_, kGotoBranchMax, args_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtLabelSwitchByIndex failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/label_goto_ex_task_info.h b/ge/graph/load/model_manager/task_info/label_goto_ex_task_info.h index 25310368..a3668354 100755 --- a/ge/graph/load/model_manager/task_info/label_goto_ex_task_info.h +++ b/ge/graph/load/model_manager/task_info/label_goto_ex_task_info.h @@ -14,24 +14,26 @@ * limitations under the License. */ -#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_GOTO_EX_TASK_INFO_H_ -#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_GOTO_EX_TASK_INFO_H_ +#ifndef GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_GOTO_EX_TASK_INFO_H_ +#define GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_GOTO_EX_TASK_INFO_H_ #include "graph/load/model_manager/task_info/task_info.h" namespace ge { class LabelGotoExTaskInfo : public TaskInfo { public: - LabelGotoExTaskInfo() : label_(nullptr) {} + LabelGotoExTaskInfo() = default; - ~LabelGotoExTaskInfo() override { label_ = nullptr; } + ~LabelGotoExTaskInfo() override; Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; Status Distribute() override; private: - void *label_; + void *index_value_{nullptr}; // switch index input. + void *args_{nullptr}; // label info memory. + uint32_t args_size_{0}; // label info length. 
}; } // namespace ge -#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_GOTO_EX_TASK_INFO_H_ +#endif // GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_GOTO_EX_TASK_INFO_H_ diff --git a/ge/graph/load/model_manager/task_info/label_set_task_info.cc b/ge/graph/load/model_manager/task_info/label_set_task_info.cc index 45cb586a..c8cb7975 100644 --- a/ge/graph/load/model_manager/task_info/label_set_task_info.cc +++ b/ge/graph/load/model_manager/task_info/label_set_task_info.cc @@ -32,12 +32,17 @@ Status LabelSetTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin const domi::LabelSetDef &label_set = task_def.label_set(); OpDescPtr op_desc = davinci_model->GetOpByIndex(label_set.op_index()); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + label_set.op_index()); GELOGE(INTERNAL_ERROR, "Task op index:%u out of range!", label_set.op_index()); return INTERNAL_ERROR; } uint32_t label_index = 0; if (!AttrUtils::GetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, label_index)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_LABEL_SWITCH_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "LabelSetTaskInfo: %s attr [%s] not exist.", op_desc->GetName().c_str(), ATTR_NAME_LABEL_SWITCH_INDEX.c_str()); return INTERNAL_ERROR; @@ -45,6 +50,9 @@ Status LabelSetTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin const vector &label_list = davinci_model->GetLabelList(); if (label_index >= label_list.size()) { + REPORT_INNER_ERROR("E19999", "lable_index:%u >= label_list.size():%zu in model, op:%s(%s), " + "check invalid", label_index, label_list.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "LabelSetTaskInfo: Invalid label id:%u, label size:%zu", label_index, label_list.size()); return INTERNAL_ERROR; } @@ -58,6 +66,8 @@ Status LabelSetTaskInfo::Distribute() { GELOGI("LabelSetTaskInfo 
Distribute Start."); rtError_t rt_ret = rtLabelSet(label_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtLabelSet failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/label_set_task_info.h b/ge/graph/load/model_manager/task_info/label_set_task_info.h index 36e41f1b..64dabddf 100644 --- a/ge/graph/load/model_manager/task_info/label_set_task_info.h +++ b/ge/graph/load/model_manager/task_info/label_set_task_info.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_SET_TASK_INFO_H_ -#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_SET_TASK_INFO_H_ +#ifndef GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_SET_TASK_INFO_H_ +#define GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_SET_TASK_INFO_H_ #include "graph/load/model_manager/task_info/task_info.h" @@ -34,4 +34,4 @@ class LabelSetTaskInfo : public TaskInfo { void *label_; }; } // namespace ge -#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_SET_TASK_INFO_H_ +#endif // GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_SET_TASK_INFO_H_ diff --git a/ge/graph/load/model_manager/task_info/label_switch_by_index_task_info.cc b/ge/graph/load/model_manager/task_info/label_switch_by_index_task_info.cc index c2997678..b7ffdb84 100644 --- a/ge/graph/load/model_manager/task_info/label_switch_by_index_task_info.cc +++ b/ge/graph/load/model_manager/task_info/label_switch_by_index_task_info.cc @@ -16,20 +16,13 @@ #include "graph/load/model_manager/task_info/label_switch_by_index_task_info.h" -#include "graph/debug/ge_attr_define.h" #include "graph/load/model_manager/davinci_model.h" namespace ge { constexpr uint8_t kLabelSwitchIndexNum = 1; LabelSwitchByIndexTaskInfo::~LabelSwitchByIndexTaskInfo() { - if (args_ != nullptr) { - rtError_t ret = rtFree(args_); - if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call 
rt api failed, ret: 0x%X", ret); - } - } - args_ = nullptr; + GE_FREE_RT_LOG(args_); index_value_ = nullptr; } @@ -37,16 +30,17 @@ Status LabelSwitchByIndexTaskInfo::Init(const domi::TaskDef &task_def, DavinciMo GELOGI("LabelSwitchByIndexTaskInfo Init Start."); GE_CHECK_NOTNULL(davinci_model); - const vector &label_list = davinci_model->GetLabelList(); Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); if (ret != SUCCESS) { return FAILED; } - // Get LabelSwitch task def + // Get LabelSwitchByIndex task def const domi::LabelSwitchByIndexDef &label_switch = task_def.label_switch_by_index(); OpDescPtr op_desc = davinci_model->GetOpByIndex(label_switch.op_index()); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + label_switch.op_index()); GELOGE(INTERNAL_ERROR, "Task op index:%u out of range!", label_switch.op_index()); return INTERNAL_ERROR; } @@ -55,6 +49,9 @@ Status LabelSwitchByIndexTaskInfo::Init(const domi::TaskDef &task_def, DavinciMo auto input_data_addr = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); if (input_data_addr.size() != kLabelSwitchIndexNum) { + REPORT_INNER_ERROR("E19999", "input_data_addr size:%zu != kLabelSwitchIndexNum:%u, op:%s(%s), " + "check invalid", input_data_addr.size(), kLabelSwitchIndexNum, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndexTaskInfo: %s invalid addr size: %zu, num: %u!", op_desc->GetName().c_str(), input_data_addr.size(), kLabelSwitchIndexNum); return INTERNAL_ERROR; @@ -68,30 +65,40 @@ Status LabelSwitchByIndexTaskInfo::Init(const domi::TaskDef &task_def, DavinciMo davinci_model->DisableZeroCopy(index_value_); - std::vector label_idx_list; + vector label_idx_list; if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_LABEL_SWITCH_LIST, label_idx_list)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_LABEL_SWITCH_LIST.c_str(), + 
op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndexTaskInfo: %s Get attr %s failed.", op_desc->GetName().c_str(), ATTR_NAME_LABEL_SWITCH_LIST.c_str()); return INTERNAL_ERROR; } if (label_idx_list.empty() || label_idx_list.size() != branch_max_) { + REPORT_INNER_ERROR("E19999", "label_idx_list in op:%s(%s) is empty, or size:%zu != branch_max_:%u" + "check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + label_idx_list.size(), branch_max_); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndexTaskInfo: %s label index size: %zu, task branch max: %u.", op_desc->GetName().c_str(), label_idx_list.size(), branch_max_); return INTERNAL_ERROR; } - label_list_.resize(branch_max_, nullptr); + vector label_used(branch_max_, nullptr); + const vector &label_list = davinci_model->GetLabelList(); for (size_t idx = 0; idx < label_idx_list.size(); ++idx) { uint32_t label_id = label_idx_list[idx]; if (label_id >= label_list.size()) { + REPORT_INNER_ERROR("E19999", "label_id:%u in op:%s(%s) >= label_list.size():%zu in model" + "check invalid", label_id, + op_desc->GetName().c_str(), op_desc->GetType().c_str(), label_list.size()); GELOGE(INTERNAL_ERROR, "LabelSwitchByIndexTaskInfo: %s index: %zu, label index: %u, model label size: %zu.", op_desc->GetName().c_str(), idx, label_id, label_list.size()); return INTERNAL_ERROR; } GE_CHECK_NOTNULL(label_list[label_id]); - - label_list_[idx] = label_list[label_id]; + label_used[idx] = label_list[label_id]; } rtMemType_t memory_type = op_desc->HasAttr(ATTR_NAME_MEMORY_TYPE_RANGE) ? 
RT_MEMORY_TS_4G : RT_MEMORY_HBM; @@ -99,12 +106,16 @@ Status LabelSwitchByIndexTaskInfo::Init(const domi::TaskDef &task_def, DavinciMo args_size_ = branch_max_ * sizeof(rtLabelDevInfo); rtError_t rt_ret = rtMalloc(&args_, args_size_, memory_type); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed for op:%s(%s), size:%u, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), args_size_, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } - rt_ret = rtLabelListCpy(label_list_.data(), label_list_.size(), args_, args_size_); + rt_ret = rtLabelListCpy(label_used.data(), label_used.size(), args_, args_size_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtLabelListCpy failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -118,14 +129,18 @@ Status LabelSwitchByIndexTaskInfo::Distribute() { GE_CHECK_NOTNULL(args_); GE_CHECK_NOTNULL(index_value_); if (branch_max_ == 0 || args_size_ == 0) { + REPORT_INNER_ERROR("E19999", "branch_max_:%u or args_size_:%u is 0" + "check invalid", branch_max_, args_size_); GELOGE(PARAM_INVALID, "branch max: %u, args size: %u invalid.", branch_max_, args_size_); return PARAM_INVALID; } rtError_t rt_ret = rtLabelSwitchByIndex(index_value_, branch_max_, args_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtLabelSwitchByIndex failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); - return RT_FAILED; + return RT_ERROR_TO_GE_STATUS(rt_ret); } GELOGI("LabelSwitchByIndexTaskInfo Distribute Success."); @@ -141,6 +156,9 @@ Status LabelSwitchByIndexTaskInfo::CalculateArgs(const domi::TaskDef &task_def, GE_CHECK_NOTNULL(op_desc); GELOGI("Calc opType[%s] args size. 
Node name is [%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); if (op_desc->GetInputsSize() != kLabelSwitchIndexNum) { + REPORT_INNER_ERROR("E19999", "input size:%zu in op:%s(%s) != kLabelSwitchIndexNum" + "check invalid", op_desc->GetInputsSize(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Label switch op only have one data input. Now input size is %zu", op_desc->GetInputsSize()); return FAILED; } diff --git a/ge/graph/load/model_manager/task_info/label_switch_by_index_task_info.h b/ge/graph/load/model_manager/task_info/label_switch_by_index_task_info.h index 00ca0844..5a8ac05a 100644 --- a/ge/graph/load/model_manager/task_info/label_switch_by_index_task_info.h +++ b/ge/graph/load/model_manager/task_info/label_switch_by_index_task_info.h @@ -14,16 +14,15 @@ * limitations under the License. */ -#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_SWITCH_BY_INDEX_TASK_INFO_H_ -#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_SWITCH_BY_INDEX_TASK_INFO_H_ +#ifndef GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_SWITCH_BY_INDEX_TASK_INFO_H_ +#define GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_SWITCH_BY_INDEX_TASK_INFO_H_ #include "graph/load/model_manager/task_info/task_info.h" namespace ge { class LabelSwitchByIndexTaskInfo : public TaskInfo { public: - LabelSwitchByIndexTaskInfo() - : index_value_(nullptr), branch_max_(0), args_(nullptr), args_size_(0), fixed_addr_offset_(0) {} + LabelSwitchByIndexTaskInfo() = default; ~LabelSwitchByIndexTaskInfo() override; @@ -34,12 +33,11 @@ class LabelSwitchByIndexTaskInfo : public TaskInfo { Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; private: - void *index_value_; // switch index input. - uint32_t branch_max_; // max branch count. - void *args_; // label info memory. - uint32_t args_size_; // label info length. - std::vector label_list_; - int64_t fixed_addr_offset_; + void *index_value_{nullptr}; // switch index input. 
+ uint32_t branch_max_{0}; // max branch count. + void *args_{nullptr}; // label info memory. + uint32_t args_size_{0}; // label info length. + int64_t fixed_addr_offset_{0}; }; } // namespace ge -#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_LABEL_SWITCH_BY_INDEX_TASK_INFO_H_ \ No newline at end of file +#endif // GE_GRAPH_LOAD_MODEL_MANAGER_TASK_INFO_LABEL_SWITCH_BY_INDEX_TASK_INFO_H_ \ No newline at end of file diff --git a/ge/graph/load/model_manager/task_info/memcpy_addr_async_task_info.cc b/ge/graph/load/model_manager/task_info/memcpy_addr_async_task_info.cc index a1f58e42..960862b4 100755 --- a/ge/graph/load/model_manager/task_info/memcpy_addr_async_task_info.cc +++ b/ge/graph/load/model_manager/task_info/memcpy_addr_async_task_info.cc @@ -36,6 +36,8 @@ Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel const auto &memcpy_async = task_def.memcpy_async(); OpDescPtr op_desc = davinci_model->GetOpByIndex(memcpy_async.op_index()); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + memcpy_async.op_index()); GELOGE(INTERNAL_ERROR, "Task op index:%u out of range", memcpy_async.op_index()); return INTERNAL_ERROR; } @@ -61,6 +63,9 @@ Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel GELOGI("memory_type: %u", memory_type); rtError_t rt_ret = rtMalloc(&args_, args_size + kAlignBytes, memory_type); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed for op:%s(%s), size:%lu, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + args_size + kAlignBytes, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -71,6 +76,8 @@ Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel static_cast(args_align_) + args_size, dst_, io_addrs.size()); rt_ret = rtMemcpy(args_align_, args_size, io_addrs.data(), args_size, 
RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed for op:%s(%s), size:%zu, ret:0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), args_size, rt_ret); GELOGE(RT_FAILED, "Call rt api for src failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -91,6 +98,8 @@ Status MemcpyAddrAsyncTaskInfo::Distribute() { rtError_t rt_ret = rtMemcpyAsync(reinterpret_cast(reinterpret_cast(args_align_) + sizeof(void *)), dst_max_, args_align_, count_, static_cast(kind_), stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync failed, size:%lu, ret:0x%X", + dst_max_, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/memcpy_async_task_info.cc b/ge/graph/load/model_manager/task_info/memcpy_async_task_info.cc index 22f9267d..0bc8fb8d 100755 --- a/ge/graph/load/model_manager/task_info/memcpy_async_task_info.cc +++ b/ge/graph/load/model_manager/task_info/memcpy_async_task_info.cc @@ -36,6 +36,8 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da dst_max_ = memcpy_async.dst_max(); OpDescPtr op_desc = davinci_model_->GetOpByIndex(memcpy_async.op_index()); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + memcpy_async.op_index()); GELOGE(INTERNAL_ERROR, "Task op index:%u out of range", memcpy_async.op_index()); return INTERNAL_ERROR; } @@ -86,6 +88,8 @@ Status MemcpyAsyncTaskInfo::Distribute() { rtError_t rt_ret = rtMemcpyAsync(dst_, dst_max_, src_, count_, static_cast(kind_), stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync failed, size:%lu, ret:0x%X", + dst_max_, rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git 
a/ge/graph/load/model_manager/task_info/model_exit_task_info.cc b/ge/graph/load/model_manager/task_info/model_exit_task_info.cc index eb200e3f..f0e3dfb7 100644 --- a/ge/graph/load/model_manager/task_info/model_exit_task_info.cc +++ b/ge/graph/load/model_manager/task_info/model_exit_task_info.cc @@ -24,6 +24,7 @@ namespace ge { Status ModelExitTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("InitModelExitTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -43,6 +44,8 @@ Status ModelExitTaskInfo::Distribute() { GELOGI("ModelExitTaskInfo Distribute Start."); rtError_t rt_ret = rtModelExit(model_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelExit failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rtModelExit failed, ret: 0x%x", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/profiler_trace_task_info.cc b/ge/graph/load/model_manager/task_info/profiler_trace_task_info.cc index b8fd1828..4e829182 100755 --- a/ge/graph/load/model_manager/task_info/profiler_trace_task_info.cc +++ b/ge/graph/load/model_manager/task_info/profiler_trace_task_info.cc @@ -23,6 +23,7 @@ namespace ge { Status ProfilerTraceTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("ProfilerTraceTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -46,6 +47,8 @@ Status ProfilerTraceTaskInfo::Distribute() { rtError_t rt_ret = rtProfilerTrace(log_id_, notify_, flat_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtProfilerTrace failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return 
RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/stream_active_task_info.cc b/ge/graph/load/model_manager/task_info/stream_active_task_info.cc index ec807777..4ab4951d 100755 --- a/ge/graph/load/model_manager/task_info/stream_active_task_info.cc +++ b/ge/graph/load/model_manager/task_info/stream_active_task_info.cc @@ -26,6 +26,7 @@ namespace ge { Status StreamActiveTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("StreamActiveTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -45,17 +46,26 @@ Status StreamActiveTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d GE_CHECK_NOTNULL(op_desc); std::vector active_stream_index_list; if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_index_list)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "StreamActiveOp get attr ACTIVE_STREAM fail, node name:%s.", op_desc->GetName().c_str()); return INTERNAL_ERROR; } if (internal_index >= active_stream_index_list.size()) { + REPORT_INNER_ERROR("E19999", "flowctrl index:%u >= active_stream_list size:%zu in op:%s(%s), " + "check invalid", internal_index, active_stream_index_list.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "InitStreamSwitchTaskInfo stream id index invalid. 
index:%u, list size:%zu.", internal_index, active_stream_index_list.size()); return INTERNAL_ERROR; } if (active_stream_index_list[internal_index] >= davinci_model->GetStreamList().size()) { + REPORT_INNER_ERROR("E19999", "active_stream_index:%u in op:%s(%s) >= stream size:%zu in model, " + "check invalid", active_stream_index_list[internal_index], + op_desc->GetName().c_str(), op_desc->GetType().c_str(), davinci_model->GetStreamList().size()); GELOGE(INTERNAL_ERROR, "InitStreamSwitchTaskInfo stream index invalid. index:%u, stream list size:%zu.", active_stream_index_list[internal_index], davinci_model->GetStreamList().size()); return INTERNAL_ERROR; @@ -73,6 +83,8 @@ Status StreamActiveTaskInfo::Distribute() { GELOGI("StreamActiveTaskInfo Distribute Start."); rtError_t rt_ret = rtStreamActive(active_stream_, stream_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamActive failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } diff --git a/ge/graph/load/model_manager/task_info/stream_switch_task_info.cc b/ge/graph/load/model_manager/task_info/stream_switch_task_info.cc index f129950a..33dfacf7 100644 --- a/ge/graph/load/model_manager/task_info/stream_switch_task_info.cc +++ b/ge/graph/load/model_manager/task_info/stream_switch_task_info.cc @@ -31,6 +31,7 @@ const uint32_t kTrueBranchStreamNum = 1; Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("StreamSwitchTaskInfo Init Start."); if (davinci_model == nullptr) { + REPORT_INNER_ERROR("E19999", "Check param davinci_model nullptr"); GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } @@ -49,6 +50,9 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d SetInputAndValuePtr(davinci_model, input_data_addr); uint32_t cond = 0; if (!AttrUtils::GetInt(op_desc, ATTR_NAME_STREAM_SWITCH_COND, cond)) { + 
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_STREAM_SWITCH_COND.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "StreamSwitchOp get attr STREAM_SWITCH_COND fail."); return INTERNAL_ERROR; } @@ -56,6 +60,9 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d size_t input_size = op_desc->GetInputsSize(); if (input_data_addr.size() != STREAM_SWITCH_INPUT_NUM || input_size != STREAM_SWITCH_INPUT_NUM) { + REPORT_INNER_ERROR("E19999", "input_data_addr.size():%zu or input size:%zu != STREAM_SWITCH_INPUT_NUM:%u " + "in op:%s(%s), check invalid", input_data_addr.size(), input_size, + STREAM_SWITCH_INPUT_NUM, op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Input num should be %u. inputAddr size:%zu, inputDesc size:%zu.", STREAM_SWITCH_INPUT_NUM, input_data_addr.size(), input_size); return INTERNAL_ERROR; @@ -63,17 +70,26 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d vector active_stream_list; if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "StreamSwitchOp get attr ACTIVE_STREAM_LIST fail."); return INTERNAL_ERROR; } if (active_stream_list.size() != kTrueBranchStreamNum) { + REPORT_INNER_ERROR("E19999", "active_stream_list.size():%zu in op:%s(%s) != kTrueBranchStreamNum:%u, " + "check invalid", active_stream_list.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), kTrueBranchStreamNum); GELOGE(FAILED, "Stream num of switch true branch must be %u.", kTrueBranchStreamNum); return FAILED; } size_t true_stream_index = active_stream_list.front(); if (true_stream_index >= davinci_model->GetStreamList().size()) { + REPORT_INNER_ERROR("E19999", "active_stream_index:%zu in 
op:%s(%s) >= stream list size:%zu in model," + "check invalid", true_stream_index, + op_desc->GetName().c_str(), op_desc->GetType().c_str(), davinci_model->GetStreamList().size()); GELOGE(INTERNAL_ERROR, "InitStreamSwitchTaskInfo stream index invalid. index:%zu, stream list size:%zu.", true_stream_index, davinci_model->GetStreamList().size()); return INTERNAL_ERROR; @@ -87,6 +103,9 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d if (op_desc->HasAttr(ATTR_NAME_SWITCH_DATA_TYPE)) { int64_t data_type = 0; if (!AttrUtils::GetInt(op_desc, ATTR_NAME_SWITCH_DATA_TYPE, data_type)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_SWITCH_DATA_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "StreamSwitchOp[node:%s] get attr SWITCH_DATA_TYPE fail.", op_desc->GetName().c_str()); return FAILED; } @@ -103,6 +122,8 @@ Status StreamSwitchTaskInfo::Distribute() { GELOGI("StreamSwitchTaskInfo Distribute Start."); rtError_t rt_ret = rtStreamSwitchEx(input_ptr_, cond_, value_ptr_, true_stream_, stream_, data_type_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamSwitchEx fail, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -119,11 +140,14 @@ Status StreamSwitchTaskInfo::CalculateArgs(const domi::TaskDef &task_def, Davinc GE_CHECK_NOTNULL(op_desc); GELOGI("Calc opType[%s] args size. Node name is [%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); if (op_desc->GetInputsSize() != STREAM_SWITCH_INPUT_NUM) { + REPORT_INNER_ERROR("E19999", "input size:%zu in op:%s(%s) != STREAM_SWITCH_INPUT_NUM:%u," + "check invalid", op_desc->GetInputsSize(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), STREAM_SWITCH_INPUT_NUM); GELOGE(FAILED, "Stream switch op only have one data input. 
Now input size is %zu", op_desc->GetInputsSize()); return FAILED; } for (uint32_t i = 0; i < STREAM_SWITCH_INPUT_NUM; ++i) { - string input_tensor_name = op_desc->GetInputNameByIndex(i); + string input_tensor_name = op_desc->GetName() + std::to_string(i); int64_t fixed_addr_offset = davinci_model->GetFixedAddrsSize(input_tensor_name); fixed_addr_offset_.emplace_back(fixed_addr_offset); auto tensor_desc = op_desc->GetInputDesc(i); diff --git a/ge/graph/load/model_manager/task_info/stream_switchn_task_info.cc b/ge/graph/load/model_manager/task_info/stream_switchn_task_info.cc index 35eb23e3..40bbff02 100755 --- a/ge/graph/load/model_manager/task_info/stream_switchn_task_info.cc +++ b/ge/graph/load/model_manager/task_info/stream_switchn_task_info.cc @@ -36,6 +36,8 @@ Status StreamSwitchNTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel * auto stream_switchn_def = task_def.stream_switch_n(); OpDescPtr op_desc = davinci_model->GetOpByIndex(stream_switchn_def.op_index()); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Can't get op_desc from davinci_model by index:%u", + stream_switchn_def.op_index()); GELOGE(FAILED, "Index is out of range, index: %u", stream_switchn_def.op_index()); return FAILED; } @@ -46,6 +48,9 @@ Status StreamSwitchNTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel * // set value_ptr_ auto value = stream_switchn_def.target_value(); if (value.size() == 0) { + REPORT_INNER_ERROR("E19999", "task_Def.stream_switch_n.target_value:%d in op:%s(%s) is 0," + "check invalid", value.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "The number of gears in dynamic batch scenario can not be 0."); return FAILED; } @@ -57,6 +62,9 @@ Status StreamSwitchNTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel * // set element_size_ if (!AttrUtils::GetInt(op_desc, ATTR_NAME_BATCH_NUM, element_size_)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_BATCH_NUM.c_str(), + 
op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Get ATTR_NAME_BATCH_NUM of switchN op failed."); return FAILED; } @@ -84,6 +92,8 @@ Status StreamSwitchNTaskInfo::Distribute() { rtError_t rt_ret = rtStreamSwitchN(input_ptr_, input_size_, value_ptr_, true_stream_ptr_, element_size_, stream_, data_type_); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamSwitchN failed, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -96,11 +106,17 @@ Status StreamSwitchNTaskInfo::Distribute() { Status StreamSwitchNTaskInfo::GetTrueStreamPtr(const OpDescPtr &op_desc, DavinciModel *davinci_model) { vector true_stream_id_list; if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, true_stream_id_list)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_NAME_ACTIVE_STREAM_LIST.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "StreamSwitchNOp get attr ACTIVE_STREAM_LIST fail."); return FAILED; } if (true_stream_id_list.size() > davinci_model->GetStreamList().size()) { + REPORT_INNER_ERROR("E19999", "active_stream_list.size:%zu in op:%s(%s) >= stream list size:%zu in model," + "check invalid", true_stream_id_list.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), davinci_model->GetStreamList().size()); GELOGE(FAILED, "InitStreamSwitchNTaskInfo get true stream id list failed. 
true stream size:%zu, " "stream list size:%zu.", @@ -112,6 +128,9 @@ Status StreamSwitchNTaskInfo::GetTrueStreamPtr(const OpDescPtr &op_desc, Davinci for (size_t i = 0; i < true_stream_id_list.size(); ++i) { uint32_t true_stream_id = true_stream_id_list[i]; if (true_stream_id >= davinci_model->GetStreamList().size()) { + REPORT_INNER_ERROR("E19999", "active_stream_id:%u in op:%s(%s) >= stream list size:%zu in model," + "check invalid", true_stream_id, + op_desc->GetName().c_str(), op_desc->GetType().c_str(), davinci_model->GetStreamList().size()); GELOGE(FAILED, "InitStreamSwitchNTaskInfo stream id invalid. id:%u, stream list size:%zu.", true_stream_id, davinci_model->GetStreamList().size()); return FAILED; @@ -122,6 +141,9 @@ Status StreamSwitchNTaskInfo::GetTrueStreamPtr(const OpDescPtr &op_desc, Davinci } if (true_stream_list_.empty()) { + REPORT_INNER_ERROR("E19999", "active_stream_list.size():%zu in op:%s(%s) is empty, " + "check invalid", true_stream_id_list.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "true stream list is null."); return FAILED; } @@ -138,6 +160,9 @@ Status StreamSwitchNTaskInfo::CalculateArgs(const domi::TaskDef &task_def, Davin GE_CHECK_NOTNULL(op_desc); GELOGI("Calc opType[%s] args size. Node name is [%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str()); if (op_desc->GetInputsSize() != kStreamSwitchnInputNum) { + REPORT_INNER_ERROR("E19999", "input size:%zu in op:%s(%s) != kStreamSwitchnInputNum:%u ," + "check invalid", op_desc->GetInputsSize(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), kStreamSwitchnInputNum); GELOGE(FAILED, "Stream switchn op only have one data input. 
Now input size is %zu", op_desc->GetInputsSize()); return FAILED; } @@ -159,6 +184,9 @@ Status StreamSwitchNTaskInfo::InputPtrUpdate(const OpDescPtr &op_desc, DavinciMo const vector input_offset = op_desc->GetInputOffset(); const vector input_legnth = ModelUtils::GetInputSize(op_desc); if (input_offset.empty() || input_legnth.empty()) { + REPORT_INNER_ERROR("E19999", "input_offset size:%zu or input_length.size:%zu in op:%s(%s) is empty," + "check invalid", input_offset.size(), input_legnth.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "input offset size %zu, input legnth size: %zu", input_offset.size(), input_legnth.size()); return FAILED; } @@ -170,6 +198,9 @@ Status StreamSwitchNTaskInfo::InputPtrUpdate(const OpDescPtr &op_desc, DavinciMo } else { auto input_data_addr = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); if (input_data_addr.empty()) { + REPORT_INNER_ERROR("E19999", "input_data_addr size:%zu in op:%s(%s) is empty," + "check invalid", input_data_addr.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "input data addr is empty"); return FAILED; } diff --git a/ge/graph/load/model_manager/task_info/super_kernel/super_kernel.cc b/ge/graph/load/model_manager/task_info/super_kernel/super_kernel.cc index 65dca3b3..66bf5ab7 100644 --- a/ge/graph/load/model_manager/task_info/super_kernel/super_kernel.cc +++ b/ge/graph/load/model_manager/task_info/super_kernel/super_kernel.cc @@ -26,15 +26,24 @@ Status SuperKernel::Launch(rtStream_t stream, uint32_t dump_flag) { reinterpret_cast(static_cast(this->GetNavTableSize()))}; rtError_t rt_ret = rtMalloc(reinterpret_cast(&device_args_addr_), sizeof(args), RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc failied. 
error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%lu, ret:0x%X", + sizeof(args), rt_ret); + GELOGE(RT_FAILED, "rtMalloc failied. error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtMemcpy(reinterpret_cast(device_args_addr_), sizeof(args), reinterpret_cast(args), sizeof(args), RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%lu, ret:0x%X", + sizeof(args), rt_ret); + GELOGE(RT_FAILED, "rtMemcpy failied. error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtKernelLaunchWithFlag((void *const)func_stub_, block_dim_, device_args_addr_, sizeof(args), NULL, stream, dump_flag); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtKernelLaunchWithFlag failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchWithFlag failed, dump_flag:%u, ret:0x%X", + dump_flag, rt_ret); + GELOGE(RT_FAILED, "rtKernelLaunchWithFlag failied. error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) return SUCCESS; } diff --git a/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc b/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc index 4e22cd7c..9ba62475 100644 --- a/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc +++ b/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc @@ -35,14 +35,16 @@ Status SuperKernelFactory::Init() { } rtError_t rt_ret; rt_ret = rtGetFunctionByName(this->sk_stub_name_.c_str(), &this->func_stub_); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, - "rtGetFunctionByName " - "failed. 
stub_func: %s, please export LD_LIBRARY_PATH for " - "libcce_aicore.so", - this->sk_stub_name_.c_str()); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtGetFunctionByName failed, stub_func:%s, ret:0x%X", + this->sk_stub_name_.c_str(), rt_ret); + GELOGE(RT_FAILED, "rtGetFunctionByName failed. stub_func: %s, please export LD_LIBRARY_PATH for " + "libcce_aicore.so", this->sk_stub_name_.c_str()); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtGetAddrByFun(this->func_stub_, &this->func_ptr_); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtGetAddrByFun failed. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtGetAddrByFun failed, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "rtGetAddrByFun failed. error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) GELOGD( "SKT: fuseKernels super_kernel_template subFunc %p, device func " @@ -98,7 +100,9 @@ Status SuperKernelFactory::FuseKernels(const std::vector &stub_func_list for (unsigned i = 0; i < stub_func_list.size(); i++) { void *sub_device_func = nullptr; rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtGetAddrByFun failed. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtGetAddrByFun failed, ret:0x%X", rt_ret); + GELOGE(RT_FAILED, "rtGetAddrByFun failed. 
error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], sub_device_func); // store two uint64_t address @@ -109,11 +113,17 @@ Status SuperKernelFactory::FuseKernels(const std::vector &stub_func_list GELOGD("SKT: fuseKernels args base address %lu", nav_table[i * kFusedKernelSizeUnit + 1]); } rt_ret = rtMalloc(reinterpret_cast(&hbm_nav_table_addr), nav_table_size, RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc failed. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%lu, ret:0x%X", + nav_table_size, rt_ret); + GELOGE(RT_FAILED, "rtMalloc failed. error: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);) rt_ret = rtMemcpy(reinterpret_cast(hbm_nav_table_addr), nav_table_size, reinterpret_cast(nav_table.get()), nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy failed. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%lu, ret:0x%X", + nav_table_size, rt_ret); + GELOGE(RT_FAILED, "rtMemcpy failed. 
error: 0x%X", rt_ret); GE_CHK_RT(rtFree(hbm_nav_table_addr)); return RT_ERROR_TO_GE_STATUS(rt_ret);) // Create the necessary metadata for the super kernel h = diff --git a/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.h b/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.h index aeb5b49b..c5058b6a 100644 --- a/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.h +++ b/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.h @@ -28,7 +28,7 @@ class SuperKernelFactory { void *func_stub_ = nullptr; void *func_ptr_ = nullptr; void *handle_ = nullptr; - std::string sk_stub_name_ = "_Z21super_kernel_templatePmm"; + std::string sk_stub_name_ = "super_kernel_template"; bool is_init_ = false; SuperKernelFactory() {}; ~SuperKernelFactory() { diff --git a/ge/graph/load/model_manager/task_info/task_info.cc b/ge/graph/load/model_manager/task_info/task_info.cc index e521f95c..fb446bf7 100755 --- a/ge/graph/load/model_manager/task_info/task_info.cc +++ b/ge/graph/load/model_manager/task_info/task_info.cc @@ -25,6 +25,8 @@ Status TaskInfo::SetStream(uint32_t stream_id, const std::vector &st } else if (stream_list.size() > stream_id) { stream_ = stream_list[stream_id]; } else { + REPORT_INNER_ERROR("E19999", "stream_id:%u >= stream_list.size(): %zu, check invalid", + stream_id, stream_list.size()); GELOGE(FAILED, "index: %u >= stream_list.size(): %zu.", stream_id, stream_list.size()); return FAILED; } diff --git a/ge/graph/load/model_manager/tbe_handle_store.cc b/ge/graph/load/model_manager/tbe_handle_store.cc index 591e88d0..6efb6190 100755 --- a/ge/graph/load/model_manager/tbe_handle_store.cc +++ b/ge/graph/load/model_manager/tbe_handle_store.cc @@ -23,6 +23,7 @@ namespace ge { void TbeHandleInfo::used_inc(uint32_t num) { if (used_ > std::numeric_limits::max() - num) { + REPORT_INNER_ERROR("E19999", "Used:%u reach numeric max", used_); GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric max.", 
used_); return; } @@ -32,6 +33,7 @@ void TbeHandleInfo::used_inc(uint32_t num) { void TbeHandleInfo::used_dec(uint32_t num) { if (used_ < std::numeric_limits::min() + num) { + REPORT_INNER_ERROR("E19999", "Used:%u reach numeric min", used_); GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric min.", used_); return; } @@ -105,6 +107,8 @@ void TBEHandleStore::ReferTBEHandle(const std::string &name) { std::lock_guard lock(mutex_); auto it = kernels_.find(name); if (it == kernels_.end()) { + REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", + name.c_str()); GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", name.c_str()); return; } @@ -124,6 +128,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map &names for (auto &item : names) { auto it = kernels_.find(item.first); if (it == kernels_.end()) { + REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", + item.first.c_str()); GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", item.first.c_str()); continue; } @@ -134,6 +140,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map &names } else { rtError_t rt_ret = rtDevBinaryUnRegister(info.handle()); if (rt_ret != RT_ERROR_NONE) { + REPORT_INNER_ERROR("E19999", "Call rtDevBinaryUnRegister failed for Kernel:%s fail, ret:0x%X", + item.first.c_str(), rt_ret); GELOGE(INTERNAL_ERROR, "Kernel[%s] UnRegister handle fail:%u.", item.first.c_str(), rt_ret); } kernels_.erase(it); diff --git a/ge/graph/load/model_manager/zero_copy_offset.cc b/ge/graph/load/model_manager/zero_copy_offset.cc index 4a448869..9d6f4e4f 100644 --- a/ge/graph/load/model_manager/zero_copy_offset.cc +++ b/ge/graph/load/model_manager/zero_copy_offset.cc @@ -76,6 +76,8 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector &input_size_list auto tensor_desc = op_desc->GetInputDescPtr(idx); GE_CHECK_NOTNULL(tensor_desc); if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, size) != GRAPH_SUCCESS) { + REPORT_INNER_ERROR("E19999", "Get input TensorSize in 
op:%s(%s) failed, input_index:%zu", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); GELOGE(FAILED, "GetTensorSizeInBytes failed!"); return FAILED; } diff --git a/ge/graph/load/model_manager/zero_copy_offset.h b/ge/graph/load/model_manager/zero_copy_offset.h index 82e1bb6d..2dea5666 100644 --- a/ge/graph/load/model_manager/zero_copy_offset.h +++ b/ge/graph/load/model_manager/zero_copy_offset.h @@ -58,15 +58,15 @@ class ZeroCopyOffset { uint32_t GetDataCount() const { return data_count_; } uint32_t GetAddrCount() const { return addr_count_; } // value of *data_info_ from davinci_model - std::vector> GetDataInfo() const { return data_info_; } + const std::vector> &GetDataInfo() const { return data_info_; } // relative_offset from zero_copy_relative_offset_ - std::vector GetRelativeOffset() const { return relative_offset_; } + const std::vector &GetRelativeOffset() const { return relative_offset_; } // data_size of Data/Netoutput int64_t GetDataSize() const { return data_size_; } // value of *outside_addrs_ from davinci_model const std::vector>> &GetOutsideAddrs() const { return outside_addrs_; } // name of op - std::string GetOpName() const { return op_name_; } + const std::string &GetOpName() const { return op_name_; } const bool IsRelativeOffsetValid() const { return valid_relative_offset_; } private: diff --git a/ge/graph/load/model_manager/zero_copy_task.cc b/ge/graph/load/model_manager/zero_copy_task.cc index 367de87a..c96dd8b7 100755 --- a/ge/graph/load/model_manager/zero_copy_task.cc +++ b/ge/graph/load/model_manager/zero_copy_task.cc @@ -36,6 +36,8 @@ ZeroCopyTask::~ZeroCopyTask() { args_addr_ = nullptr; } */ Status ZeroCopyTask::SetTaskArgsOffset(uintptr_t addr, size_t offset) { if (offset + sizeof(uintptr_t) > args_size_) { + REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid", + offset, args_size_); GELOGE(FAILED, "[ZCPY] %s set task args failed, args size: %zu, offset: %zu", name_.c_str(), args_size_, 
offset); return FAILED; // unexpected error, need fix. } @@ -116,6 +118,8 @@ Status ZeroCopyTask::DistributeParam(bool async_mode, rtStream_t stream) { } if (rt_err != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret: 0x%X", + args_size_, rt_err); GELOGE(RT_FAILED, "[ZCPY] %s distribute task param failed, error=0x%x", name_.c_str(), rt_err); return RT_ERROR_TO_GE_STATUS(rt_err); } diff --git a/ge/graph/manager/graph_caching_allocator.cc b/ge/graph/manager/graph_caching_allocator.cc index 03ca352e..bfa1cdc7 100644 --- a/ge/graph/manager/graph_caching_allocator.cc +++ b/ge/graph/manager/graph_caching_allocator.cc @@ -40,7 +40,7 @@ static bool BlockComparator(const Block *left, const Block *right) { } bool CanMerge(Block *block) { - if (block == nullptr || block->allocated || !block->IsSplit()) { + if ((block == nullptr) || block->allocated || !block->IsSplit()) { return false; } return true; @@ -52,7 +52,7 @@ size_t GetBinIndex(size_t size) { if (size <= range) { break; } - ++index; + index++; } if (index > kNumBins - 1) { index = kNumBins - 1; @@ -87,15 +87,15 @@ bool ShouldSplit(const Block *block, size_t size) { void IncreaseCount(std::map &count, size_t size) { auto it = count.find(size); - if (it != count.end()) { - it->second++; - } else { + if (it == count.end()) { count.emplace(size, 1); + } else { + it->second++; } } CachingAllocator::CachingAllocator(rtMemType_t memory_type) : memory_type_(memory_type), memory_allocator_(nullptr) { - for (uint32_t i = 0; i < kNumBins; ++i) { + for (uint32_t i = 0; i < kNumBins; i++) { free_block_bins_[i] = nullptr; } } @@ -105,12 +105,13 @@ Status CachingAllocator::Initialize(uint32_t device_id) { // when redo Initialize free old memory FreeBlocks(); std::lock_guard lock(mutex_); - for (uint32_t i = 0; i < kNumBins; ++i) { + for (uint32_t i = 0; i < kNumBins; i++) { if (free_block_bins_[i] != nullptr) { continue; } auto bin_ptr = new (std::nothrow) 
BlockBin(BlockComparator); if (bin_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New BlockBin fail, device_id:%u", device_id); GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc BlockBin failed."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -132,20 +133,22 @@ void CachingAllocator::Finalize(uint32_t device_id) { uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device_id) { GELOGI("Start malloc pool memory, size = %zu, device id = %u", size, device_id); - uint8_t *ptr = nullptr; size = GetBlockSize(size); + uint8_t *ptr = nullptr; Block *block = FindFreeBlock(size, org_ptr, device_id); - if (block != nullptr) { - ptr = block->ptr; - } else { + if (block == nullptr) { if (ge::SUCCESS == TryExtendCache(size, device_id)) { block = FindFreeBlock(size, org_ptr, device_id); if (block != nullptr) { ptr = block->ptr; } } + } else { + ptr = block->ptr; } if (ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u", + size, device_id); GELOGE(FAILED, "Malloc failed device id = %u, size= %zu", device_id, size); } return ptr; @@ -154,6 +157,8 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) { GELOGI("Free device id = %u", device_id); if (ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid", + device_id); GELOGE(PARAM_INVALID, "Invalid memory pointer"); return ge::PARAM_INVALID; } @@ -161,6 +166,8 @@ Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) { std::lock_guard lock(mutex_); auto it = allocated_blocks_.find(ptr); if (it == allocated_blocks_.end()) { + REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid", + device_id); GELOGE(PARAM_INVALID, "Invalid memory pointer"); return ge::PARAM_INVALID; } @@ -171,7 +178,7 @@ Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) { } void CachingAllocator::FreeBlock(Block 
*block) { - if (block == nullptr || !block->allocated) { + if ((block == nullptr) || !block->allocated) { return; } GELOGI("Free block size = %zu", block->size); @@ -187,7 +194,7 @@ void CachingAllocator::FreeBlock(Block *block) { } void CachingAllocator::MergeBlocks(Block *dst, Block *src, BlockBin &bin) { - if (!CanMerge(dst) || !CanMerge(src)) { + if (!CanMerge(src) || !CanMerge(dst)) { return; } @@ -218,6 +225,8 @@ Block *CachingAllocator::FindFreeBlock(size_t size, uint8_t *org_ptr, uint32_t d Block key(device_id, size, org_ptr); BlockBin *bin = GetBlockBin(size); if (bin == nullptr) { + REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", + size, device_id); GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); return nullptr; } @@ -249,6 +258,8 @@ Block *CachingAllocator::SplitBlock(Block *block, size_t size, BlockBin &bin, ui Block *remaining = block; Block *new_block = new (std::nothrow) Block(device_id, size, &bin, block->ptr); if (new_block == nullptr) { + REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", + size, device_id); GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); return block; } @@ -293,11 +304,15 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) { Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id) { BlockBin *bin = GetBlockBin(size); if (bin == nullptr) { + REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", + size, device_id); GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); return ge::FAILED; } Block *block = new (std::nothrow) Block(device_id, size, bin, nullptr); if (block == nullptr) { + REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", + size, device_id); GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); return ge::FAILED; } @@ -316,7 +331,7 @@ size_t CachingAllocator::FreeCachedBlocks() { GELOGI("Free cached blocks"); std::lock_guard lock(mutex_); size_t 
free_cached_memory_size = 0; - for (uint32_t i = 0; i < kNumBins; ++i) { + for (uint32_t i = 0; i < kNumBins; i++) { auto pool = free_block_bins_[i]; if (pool == nullptr) { continue; @@ -324,7 +339,8 @@ size_t CachingAllocator::FreeCachedBlocks() { for (auto it = pool->begin(); it != pool->end();) { Block *block = *it; // free block memory that has not been split - if ((block != nullptr) && (block->ptr != nullptr) && (block->prev == nullptr) && (block->next == nullptr) && + if ((block != nullptr) && (block->ptr != nullptr) && + (block->prev == nullptr) && (block->next == nullptr) && (memory_allocator_->FreeMemory(block->ptr) == ge::SUCCESS)) { auto itcount = malloced_memory_.find(block->size); free_cached_memory_size += block->size; @@ -345,7 +361,7 @@ size_t CachingAllocator::FreeCachedBlocks() { } void CachingAllocator::FreeBlocks() { - GELOGI("Free blocks"); + GELOGI("Free blocks."); std::lock_guard lock(mutex_); // free allocated blocks and put to cache for (auto &it : allocated_blocks_) { @@ -355,10 +371,18 @@ void CachingAllocator::FreeBlocks() { (void) FreeCachedBlocks(); } +void CachingAllocator::TryFreeBlocks() { + GELOGI("Try free blocks."); + std::lock_guard lock(mutex_); + if (allocated_blocks_.empty()) { + (void) FreeCachedBlocks(); + } +} + void CachingAllocator::FreeBlockBins() { - GELOGI("Free block bins"); + GELOGI("Free block bins."); std::lock_guard lock(mutex_); - for (uint32_t i = 0; i < kNumBins; ++i) { + for (uint32_t i = 0; i < kNumBins; i++) { if (free_block_bins_[i] != nullptr) { delete free_block_bins_[i]; free_block_bins_[i] = nullptr; @@ -367,9 +391,9 @@ void CachingAllocator::FreeBlockBins() { } void PrintCount(std::map &count, const std::string &name, size_t total_size, size_t total_count) { - GELOGI("%6s total[size:%10zu count:%10zu]", name.c_str(), total_size, total_count); + GELOGI("%6s total[size:%10zu count:%10zu].", name.c_str(), total_size, total_count); for (auto &it : count) { - GELOGI(" |- block[size:%10zu count:%10zu]", 
it.first, it.second); + GELOGI(" |- block[size:%10zu count:%10zu].", it.first, it.second); } } @@ -383,20 +407,20 @@ void CachingAllocator::PrintStatics() { size_t total_free_count = 0; size_t total_malloc_size = 0; size_t total_malloc_count = 0; - std::map using_block; - std::map free_block; - std::map malloc_block; + std::map using_block_stat; + std::map free_block_stat; + std::map malloc_block_stat; do { std::lock_guard lock(mutex_); - for (uint32_t i = 0; i < kNumBins; ++i) { + for (uint32_t i = 0; i < kNumBins; i++) { auto pool = free_block_bins_[i]; if (pool == nullptr) { continue; } - for (auto it = pool->begin(); it != pool->end(); ++it) { + for (auto it = pool->begin(); it != pool->end(); it++) { if ((*it) != nullptr) { total_free_size += (*it)->size; - IncreaseCount(free_block, (*it)->size); + IncreaseCount(free_block_stat, (*it)->size); total_free_count++; } } @@ -405,7 +429,7 @@ void CachingAllocator::PrintStatics() { for (auto &it : allocated_blocks_) { if (it.second != nullptr) { total_using_size += it.second->size; - IncreaseCount(using_block, it.second->size); + IncreaseCount(using_block_stat, it.second->size); total_using_count++; } } @@ -413,12 +437,12 @@ void CachingAllocator::PrintStatics() { for (auto &it : malloced_memory_) { total_malloc_size += it.first * it.second; total_malloc_count += it.second; - malloc_block[it.first] = it.second; + malloc_block_stat[it.first] = it.second; } } while (0); - PrintCount(malloc_block, "Malloc", total_malloc_size, total_malloc_count); - PrintCount(using_block, "Using", total_using_size, total_using_count); - PrintCount(free_block, "Free", total_free_size, total_free_count); + PrintCount(malloc_block_stat, "Malloc", total_malloc_size, total_malloc_count); + PrintCount(using_block_stat, "Using", total_using_size, total_using_count); + PrintCount(free_block_stat, "Free", total_free_size, total_free_count); } } // namespace ge diff --git a/ge/graph/manager/graph_caching_allocator.h 
b/ge/graph/manager/graph_caching_allocator.h index 27563c2d..a9c3202a 100644 --- a/ge/graph/manager/graph_caching_allocator.h +++ b/ge/graph/manager/graph_caching_allocator.h @@ -94,6 +94,13 @@ class CachingAllocator { /// Status Free(uint8_t *memory_addr, uint32_t device_id = 0); + /// + /// @ingroup ge_graph + /// @brief try to free memory when no memory is referenced + /// @return void + /// + void TryFreeBlocks(); + private: /// diff --git a/ge/graph/manager/graph_context.cc b/ge/graph/manager/graph_context.cc index 6a5b2913..3a705ad9 100644 --- a/ge/graph/manager/graph_context.cc +++ b/ge/graph/manager/graph_context.cc @@ -44,6 +44,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) { Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { if (graph_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph_node is nullptr, check invalid"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "graphNode is NULL!"); return GE_GRAPH_PARAM_NULLPTR; } @@ -54,6 +55,7 @@ Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { if (compute_graph_ == nullptr) { std::shared_ptr graph = graph_node->GetGraph(); if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph in graph_node is nullptr, check invalid"); GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!"); return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; } @@ -70,11 +72,14 @@ Status GraphContext::Finalize() const { return SUCCESS; } Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTensor &returned_tensor) { if (var_data_name.empty()) { + REPORT_INNER_ERROR("E19999", "Param var_data_name is empty, check invalid"); GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "Variable data name is empty!"); return GE_GRAPH_EMPTY_STRING_NAME; } if (GetVarNodeTensorTable().empty()) { + REPORT_INNER_ERROR("E19999", "VarNodeTensorTable is empty, var_data_name:%s, check invalid", + var_data_name.c_str()); GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, 
"VarNodeTensorTable is empty!"); return GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE; } @@ -83,6 +88,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso returned_tensor.SetTensorDesc(var_record.second.GetTensorDesc()); auto ret = returned_tensor.SetData(var_record.second.GetData()); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s", + var_data_name.c_str()); GELOGE(ret, "Set Tensor data failed!"); return ret; } @@ -91,6 +98,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso } } + REPORT_INNER_ERROR("E19999", "VarRecord with data_name:%s does not exist, check invalid", + var_data_name.c_str()); GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "VarRecord with data_name %s does NOT exist!", var_data_name.c_str()); return GE_GRAPH_VARIABLE_DOES_NOT_EXIST; diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc index 5c97b12e..f2b4211d 100755 --- a/ge/graph/manager/graph_manager.cc +++ b/ge/graph/manager/graph_manager.cc @@ -55,11 +55,13 @@ #include "graph/passes/dimension_compute_pass.h" #include "graph/passes/flow_ctrl_pass.h" #include "graph/passes/fuse_data_nodes_with_common_input_pass.h" +#include "graph/passes/hccl_tailing_optimization_pass.h" #include "graph/passes/identity_pass.h" #include "graph/passes/input_output_connection_identify_pass.h" #include "graph/passes/iterator_op_pass.h" #include "graph/passes/link_gen_mask_nodes_pass.h" #include "graph/passes/mark_graph_unknown_status_pass.h" +#include "graph/passes/mark_node_unknown_shape_pass.h" #include "graph/passes/merge_pass.h" #include "graph/passes/merge_input_memcpy_pass.h" #include "graph/passes/merge_to_stream_merge_pass.h" @@ -93,6 +95,8 @@ #include "graph/passes/global_step_insert_pass.h" #include "graph/passes/memcpy_addr_async_pass.h" #include "graph/passes/hccl_continuous_memcpy_pass.h" +#include "graph/passes/parallel_group_pass.h" +#include 
"graph/passes/buffer_pool_memory_pass.h" #include "graph/build/label_allocator.h" #include "graph/utils/tensor_adapter.h" #include "inc/pass_manager.h" @@ -117,6 +121,10 @@ const char *const kAIcoreEngine = "AIcoreEngine"; const int32_t kDynamicDimsTypeIsGetNext = 0; const int32_t kDynamicDimsTypeIsData = 1; const char *const kGetNextName = "IteratorV2"; +const uint32_t kInitGraphCount = 1; +const uint32_t kNotAdded = 0; +const uint32_t kStartAdd = 1; +const uint32_t kDoneAdded = 2; bool IsTailingOptimization() { string is_tailing_optimization_option; @@ -136,6 +144,7 @@ ge::Status CheckFpCeilingMode() { auto ret = ge::GetContext().GetOption("ge.fpCeilingMode", mode); if (ret == ge::GRAPH_SUCCESS) { if (kValidFpCeilingMode.count(mode) == 0) { + REPORT_INNER_ERROR("E19999", "Option ge.fpCeilingMode is invalid, value:%s", mode.c_str()); GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "The fp_ceiling_mode %s is invalid, options are 0, 1, and 2.", mode.c_str()); return ge::GE_GRAPH_OPTIONS_INVALID; } @@ -164,12 +173,14 @@ Status GraphManager::Initialize(const std::map &options) { // malloc graph_run_listener_ = MakeShared(sync_run_mutex_, condition_); if (graph_run_listener_ == nullptr) { + REPORT_CALL_ERROR("E19999", "New GraphModelListener fail"); GELOGE(MEMALLOC_FAILED, "Make shared failed"); return MEMALLOC_FAILED; } // graph context graph_context_ = MakeShared(); if (graph_context_ == nullptr) { + REPORT_CALL_ERROR("E19999", "New GraphModelListener fail"); GELOGE(MEMALLOC_FAILED, "Make shared failed."); return MEMALLOC_FAILED; } @@ -195,6 +206,8 @@ Status GraphManager::Initialize(const std::map &options) { graph_map_.clear(); cache_helper_map_.clear(); + graph_id_to_add_graph_cond_.clear(); + graph_count_.clear(); init_flag_ = true; thread_run_flag_ = true; @@ -204,6 +217,20 @@ Status GraphManager::Initialize(const std::map &options) { return SUCCESS; } +Status GraphManager::UnloadModel(GeRootModelPtr ge_root_model, uint32_t graph_id) { + Status ret = SUCCESS; + for (size_t 
i = 0; i < ge_root_model->GetAllModelId().size(); ++i) { + uint32_t model_id = ge_root_model->GetAllModelId()[i]; + GELOGI("Unload model %u.", model_id); + ret = GraphLoader::UnloadModel(model_id); + if (ret != SUCCESS) { + GELOGW("[GraphManager] unload model failed, modelId=%u, graphId=%u.", model_id, graph_id); + return ret; + } + } + return ret; +} + Status GraphManager::Finalize() { if (!init_flag_) { GELOGW("GraphManager has not been initialized."); @@ -234,7 +261,6 @@ Status GraphManager::Finalize() { unload_model_ret = GE_GRAPH_GRAPH_IS_RUNNING; continue; } - // unload model auto ge_root_model = graph_node->GetGeRootModel(); if (ge_root_model != nullptr && ge_root_model->GetModelId() != INVALID_MODEL_ID && graph_node->GetLoadFlag()) { @@ -244,15 +270,14 @@ Status GraphManager::Finalize() { unload_model_ret = FAILED; continue; } - ret = GraphLoader::UnloadModel(ge_root_model->GetModelId()); + ret = UnloadModel(ge_root_model, iter->first); if (ret != SUCCESS) { - GELOGW("[GraphManager] unload model failed, modelId=%u, graphId=%u.", ge_root_model->GetModelId(), iter->first); + GELOGW("[GraphManager] unload model failed, graph_id=%u.", iter->first); unload_model_ret = ret; } rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { - GELOGW("[GraphManager] rtDeviceReset failed, modelId=%u, graphId=%u.", ge_root_model->GetModelId(), - iter->first); + GELOGW("[GraphManager] rtDeviceReset failed, graphId=%u.", iter->first); unload_model_ret = FAILED; continue; } @@ -267,6 +292,7 @@ Status GraphManager::Finalize() { } graph_map_.clear(); cache_helper_map_.clear(); + graph_count_.clear(); // graph context if (graph_context_ != nullptr) { @@ -291,6 +317,8 @@ Status GraphManager::InitDynamicParams(ComputeGraphPtr &compute_graph) { std::string op_type; auto ret = GetOriginalType(node, op_type); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "GetOriginalType from op:%s fail", + node->GetName().c_str()); GELOGE(FAILED, "Failed to get node %s 
original type.", node->GetName().c_str()); return FAILED; } @@ -317,30 +345,59 @@ Status GraphManager::InitDynamicParams(ComputeGraphPtr &compute_graph) { return SUCCESS; } -Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, - const std::map &options, - const OmgContext &omg_context) { - if (HasGraphNode(graph_id)) { - GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] graph exists, graph_id = %u.", graph_id); - return GE_GRAPH_GRAPH_ALREADY_EXIST; +void GraphManager::SetAddGraphCondition(GraphId graph_id, uint32_t cond) { + std::lock_guard lock(add_graph_cond_mutex_); + graph_id_to_add_graph_cond_[graph_id] = cond; + GELOGD("Graph [id:%u] has been added.", graph_id); +} + +uint32_t GraphManager::GetAddGraphCondition(GraphId graph_id) { + std::lock_guard lock(add_graph_cond_mutex_); + auto it = graph_id_to_add_graph_cond_.find(graph_id); + if (it != graph_id_to_add_graph_cond_.end()) { + return it->second; + } else { + GELOGD("Graph [id:%u] has not been added.", graph_id); + return kNotAdded; } +} - auto compute_graph = GraphUtils::GetComputeGraph(graph); - if (compute_graph != nullptr) { - compute_graph->SetGraphID(graph_id); - bool graph_has_been_added = false; - if (AttrUtils::GetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, graph_has_been_added) - && graph_has_been_added) { - GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, - "[GraphManager] same graph object can not be added again, graph_id = %u.", graph_id); - return GE_GRAPH_GRAPH_ALREADY_EXIST; - } - (void)AttrUtils::SetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, true); - compute_graph_ = compute_graph; +void GraphManager::RemoveAddGraphCondition(GraphId graph_id) { + std::lock_guard lock(add_graph_cond_mutex_); + auto it = graph_id_to_add_graph_cond_.find(graph_id); + if (it != graph_id_to_add_graph_cond_.end()) { + graph_id_to_add_graph_cond_.erase(it); + GELOGD("Successfully removed add_graph_cond of graph [id:%u].", graph_id); } else { - GELOGE(FAILED, "compute graph is 
null"); - return FAILED; + GELOGD("Graph [id:%u] has not been added. no need to remove.", graph_id); } +} + +Status GraphManager::CheckRepeatAdd(uint32_t graph_id, bool &is_added) { + uint32_t count = 0; + if (GetGraphCount(graph_id, count) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed, graph might have not been added.", graph_id); + return INTERNAL_ERROR; + } + // previous thread owns same graph_id has been in the middle of the AddGraph procession + if (count > 1 && GetAddGraphCondition(graph_id) == kStartAdd) { + std::unique_lock lock(add_graph_mutex_); + GELOGD("Waitting for build end of previous thread."); + while (GetAddGraphCondition(graph_id) != kDoneAdded) { + add_graph_cv_.wait(lock); + } + GraphNodePtr graph_node; + Status ret = GetGraphNode(graph_id, graph_node); + if (ret != SUCCESS) { + GELOGE(ret, "[AddGraph] GetGraphNode failed, graph_id = %u.", graph_id); + return ret; + } + is_added = true; + } + return SUCCESS; +} + +void GraphManager::SetSessionGraphId(ComputeGraphPtr compute_graph, uint32_t graph_id) { std::string session_graph_id; if (!AttrUtils::GetStr(*compute_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id) || session_graph_id.empty()) { session_graph_id = "-1_" + to_string(graph_id); @@ -352,17 +409,98 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, } GELOGD("Get graph session_graph_id attr failed, set session id to default value: [0]"); } +} + +Status GraphManager::NotifyWaittingGraph(uint32_t graph_id) { + uint32_t count = 0; + if (GetGraphCount(graph_id, count) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed, graph might have not been added.", graph_id); + return INTERNAL_ERROR; + } + GELOGD("Add graph finished, graph_id:%u", graph_id); + if (count > 1) { + GELOGD("Finish addgraph, graph_id:%u, graph_count:%u, start to notify.", graph_id, count); + add_graph_cv_.notify_all(); + } + return SUCCESS; +} +Status GraphManager::CreateGraphNode(uint32_t graph_id, 
const Graph &graph, + const std::map &options) { GraphNodePtr graph_node = MakeShared(graph_id); - GE_IF_BOOL_EXEC(graph_node == nullptr, GELOGE(FAILED, "GraphNode make shared failed"); + GE_IF_BOOL_EXEC(graph_node == nullptr, + REPORT_CALL_ERROR("E19999", "New GraphNode fail, graph_id:%u", + graph_id); + GELOGE(FAILED, "GraphNode make shared failed"); return FAILED); std::shared_ptr graph_ptr = MakeShared(graph); - GE_IF_BOOL_EXEC(graph_ptr == nullptr, GELOGE(FAILED, "GraphPtr make shared failed"); + GE_IF_BOOL_EXEC(graph_ptr == nullptr, + REPORT_CALL_ERROR("E19999", "New Graph fail, graph_id:%u", + graph_id); + GELOGE(FAILED, "GraphPtr make shared failed"); return FAILED); - + // update option about tuning graph + ParseOption(options, BUILD_MODE, options_.build_mode); + ParseOption(options, BUILD_STEP, options_.build_step); + ParseOption(options, TUNING_PATH, options_.tuning_path); graph_node->SetGraph(graph_ptr); graph_node->SetOptions(options); + graph_node->IncreaseLoadCount(); AddGraphNode(graph_id, graph_node); + return SUCCESS; +} + +Status GraphManager::SetStagesOptions(uint32_t graph_id, const GraphManagerOptions &options) { + CompilerStages &stages = GetCompilerStages(graph_id); + stages.preparer.SetOptions(options_); + Status status = stages.optimizer.SetOptions(options_); + if (status != SUCCESS) { + GELOGE(status, "Graph optimizer set options failed."); + return status; + } + stages.builder.SetOptions(options_); + return SUCCESS; +} + +Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, + const std::map &options, + const OmgContext &omg_context) { + IncreaseGraphCount(graph_id); + // validation for adding graphs of same graph_id in multi-thread secenario + // 1.previous thread owns same graph_id has finished the AddGraph procession + if (GetAddGraphCondition(graph_id) == kDoneAdded) { + GraphNodePtr graph_node; + if (GetGraphNode(graph_id, graph_node) != SUCCESS) { + GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "Graph not exist while done 
adding previously, graph_id = %u.", graph_id); + return GE_GRAPH_GRAPH_NOT_EXIST; + } + graph_node->IncreaseLoadCount(); + return SUCCESS; + } + // In multi-thread scenario, former thread owns same graph_id has been + // in the middle of the AddGraph procession while following threads have to wait until + // done adding graph of the former graph, avoiding repeatively adding same graph. + bool is_added = false; + if (CheckRepeatAdd(graph_id, is_added) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "CheckRepeatAdd for graph[id:%u] failed.", graph_id); + return INTERNAL_ERROR; + } + // The former graph (from different thread) owns same graph id has been successfully added. + if (is_added) { + return SUCCESS; + } + // Do add graph + SetAddGraphCondition(graph_id, kStartAdd); + auto compute_graph = GraphUtils::GetComputeGraph(graph); + GE_CHECK_NOTNULL(compute_graph); + compute_graph->SetGraphID(graph_id); + + SetSessionGraphId(compute_graph, graph_id); + + if (CreateGraphNode(graph_id, graph, options) != SUCCESS) { + GELOGE(FAILED, "Failed to create graph_node."); + return FAILED; + } AddLocalOmgContext(graph_id, omg_context); if (!options_.output_datatype.empty()) { @@ -373,16 +511,18 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, return GRAPH_PARAM_INVALID; } - CompilerStages &stages = GetCompilerStages(graph_id); - stages.preparer.SetOptions(options_); - Status status = stages.optimizer.SetOptions(options_); - if (status != SUCCESS) { - GELOGE(status, "Graph optimizer set options failed."); - return status; + if (SetStagesOptions(graph_id, options_) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Set stage options failed."); + return INTERNAL_ERROR; } - stages.builder.SetOptions(options_); var_acc_ctrl_.AddGraph(graph_id, compute_graph); + SetAddGraphCondition(graph_id, kDoneAdded); + // There are threads waitting for adding same graph + if (NotifyWaittingGraph(graph_id) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "NotifyWaittingGraph failed."); + return 
INTERNAL_ERROR; + } return SUCCESS; } @@ -390,6 +530,7 @@ Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &grap const std::map &options, const OmgContext &omg_context) { if (HasGraphNode(graph_id)) { + REPORT_INNER_ERROR("E19999", "graph_id:%u is exist, check invalid", graph_id); GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] graph exists, graph_id = %u.", graph_id); return GE_GRAPH_GRAPH_ALREADY_EXIST; } @@ -399,11 +540,15 @@ Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &grap bool graph_has_been_added = false; if (AttrUtils::GetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, graph_has_been_added) && graph_has_been_added) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s from graph:%u fail", + ATTR_NAME_GRAPH_HAS_BEEN_ADDED.c_str(), graph_id); GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] same graph object can not be added again, graph_id = %u.", graph_id); return GE_GRAPH_GRAPH_ALREADY_EXIST; } } else { + REPORT_INNER_ERROR("E19999", "compute_graph from graph:%u is nullptr, check invalid", + graph_id); GELOGE(FAILED, "compute graph is null"); return FAILED; } @@ -425,14 +570,22 @@ Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &grap GraphNodePtr graph_node = MakeShared(graph_id); if (graph_node == nullptr) { + REPORT_CALL_ERROR("E19999", "New GraphNode fail, graph_id:%u", + graph_id); GELOGE(FAILED, "GraphNode make shared failed"); return FAILED; } std::shared_ptr graph_ptr = GraphUtils::CreateGraphPtrFromComputeGraph(new_compute_graph); if (graph_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New Graph fail, graph_id:%u", + graph_id); GELOGE(FAILED, "GraphPtr make shared failed"); return FAILED; } + // update option about tuning graph + ParseOption(options, BUILD_MODE, options_.build_mode); + ParseOption(options, BUILD_STEP, options_.build_step); + ParseOption(options, TUNING_PATH, options_.tuning_path); graph_node->SetGraph(graph_ptr); 
graph_node->SetOptions(options); @@ -469,6 +622,8 @@ Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::Com Status ret_topo = compute_graph->TopologicalSorting(); if (ret_topo != SUCCESS) { + REPORT_CALL_ERROR("E19999", "TopologicalSorting fail, graph_id:%u", + compute_graph->GetGraphID()); GELOGE(ret_topo, "[GraphManager]: TopologicalSorting the merged graph failed."); return ret_topo; } @@ -504,11 +659,15 @@ Status GraphManager::CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_gr std::vector output_nodes; ComputeGraphPtr new_compute_graph = GraphUtils::CloneGraph(old_compute_graph, "", input_nodes, output_nodes); if (new_compute_graph == nullptr) { + REPORT_CALL_ERROR("E19999", "CloneGraph fail, graph_id:%u", + compute_graph->GetGraphID()); GELOGE(INTERNAL_ERROR, "Clone graph failed."); return INTERNAL_ERROR; } copy_graphs.emplace(old_compute_graph->GetName(), new_compute_graph); if (!AttrUtils::SetBool(old_compute_graph, ATTR_NAME_NEED_LX_FUSION, true)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s to graph:%u fail", + ATTR_NAME_NEED_LX_FUSION.c_str(), old_compute_graph->GetGraphID()); GELOGE(INTERNAL_ERROR, "Set attr lx_fusion to graph failed."); return INTERNAL_ERROR; } @@ -574,6 +733,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr for (size_t i = 0; i < vector_future.size(); ++i) { Status ret_status = vector_future[i].get(); if (ret_status != SUCCESS) { + REPORT_CALL_ERROR("E19999", "subgraph %zu optimize failed", i); GELOGE(ret_status, "subgraph %zu optimize failed", i); return ret_status; } @@ -584,6 +744,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr bool GraphManager::CheckAllFusionOptimizeSuccess(const ComputeGraphPtr &compute_graph, Graph2SubGraphInfoList &sub_graph_map) { if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Input param compute_graph is 
nullptr."); return false; } @@ -623,6 +784,8 @@ Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_ for (const auto &subgraph : root_subgraph_list) { auto iter = copy_graphs.find(subgraph->GetSubGraph()->GetName()); if (iter == copy_graphs.end()) { + REPORT_INNER_ERROR("E19999", "Can not find subgraph:%s in copy graphs, check invalid", + subgraph->GetSubGraph()->GetName().c_str()); GELOGE(FAILED, "Can not find subgraph:%s in copy graphs.", subgraph->GetSubGraph()->GetName().c_str()); return FAILED; } @@ -634,6 +797,8 @@ Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_ for (const auto &subgraph : subgraph_list) { auto iter = copy_graphs.find(subgraph->GetSubGraph()->GetName()); if (iter == copy_graphs.end()) { + REPORT_INNER_ERROR("E19999", "Can not find subgraph:%s in copy graphs, check invalid", + subgraph->GetSubGraph()->GetName().c_str()); GELOGE(FAILED, "Can not find subgraph:%s in copy graphs.", subgraph->GetSubGraph()->GetName().c_str()); return FAILED; } @@ -732,6 +897,8 @@ Status GraphManager::PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node, Status ret = compute_graph->TopologicalSorting(); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "TopologicalSorting fail, graph_id:%u", + compute_graph->GetGraphID()); GELOGE(ret, "Graph topological sort failed, ret:%d.", ret); return ret; } @@ -747,11 +914,15 @@ Status GraphManager::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode, uint rtError_t rt_ret = rtCtxCreate(&rt_context, mode, ge::GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxCreate faileded, session_id:%lu, graph_id:%u, mode:%d", + session_id, graph_id, mode); GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return FAILED; } rt_ret = rtCtxSetCurrent(rt_context); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, mode:%d", + session_id, graph_id, 
mode); GELOGE(FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return FAILED; } @@ -810,7 +981,9 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorGetName().c_str()); @@ -825,7 +998,7 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorGetName().c_str()); return ret; @@ -843,6 +1016,22 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorGetAllNodes()) { + OpDescPtr op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + GELOGD("Fuzz compile flag is %d.", GetLocalOmgContext().fuzz_compile_flag); + if (!AttrUtils::SetBool(op_desc, ATTR_NAME_FUZZ_BUILD, GetLocalOmgContext().fuzz_compile_flag)) { + GELOGE(FAILED, "[Set][ATTR_NAME_FUZZ_BUILD]Failed to set fuzz build attr to %s.", op_desc->GetName().c_str()); + return FAILED; + } + } + return SUCCESS; +} + Status GraphManager::SubexpressionMigration(ComputeGraphPtr &compute_graph) { PassManager pass_manager; GE_CHK_STATUS_RET(pass_manager.AddPass("SubexpressionMigrationPass", new (std::nothrow) SubexpressionMigrationPass)); @@ -866,6 +1055,8 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: if (IsGraphNeedBuild(graph_node)) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); if (graph_node->GetBuildFlag()) { + REPORT_INNER_ERROR("E19999", "Graph:%u has not build before, can't run directly, " + "check invalid", graph_node->GetGraphId()); GELOGE(PARAM_INVALID, "The graph %u need to re-build, you should remove it from GE " "first, then AddGraph again and rebuild it.", @@ -888,6 +1079,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: if (!graph_node->IsAsync()) { ret = LoadGraph(ge_root_model, graph_node); } else { + GE_CHECK_NOTNULL(ge_root_model); ret = LoadGraphAsync(ge_root_model, graph_node); } if (ret != SUCCESS) { @@ -902,6 +1094,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: if 
(!graph_node->IsAsync()) { ret = LoadGraph(ge_root_model_ptr, graph_node); } else { + GE_CHECK_NOTNULL(ge_root_model); ret = LoadGraphAsync(ge_root_model_ptr, graph_node); } if (ret != SUCCESS) { @@ -914,6 +1107,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: Status GraphManager::LoadGraph(const GeRootModelPtr &ge_root_model, const GraphNodePtr &graph_node) { GELOGI("[LoadGraph] run_graph_flag[%d], graph_id[%u]", options_.run_graph_flag, graph_node->GetGraphId()); if (options_.run_graph_flag && ge_root_model != nullptr) { + ge_root_model->SetTrainFlag(GetTrainFlag()); // synchronization run graph with model std::shared_ptr model_listener = GetModelListener(); ModelIdInfo model_id_info; @@ -1067,16 +1261,22 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorGetRunFlag()) { + REPORT_INNER_ERROR("E19999", "Graph is already running, can't be run again, graph_id:%u, " + "check invalid", graph_id); GELOGE(GE_GRAPH_ALREADY_RUNNING, "[RunGraph] graph already running, graph id = %u", graph_id); return GE_GRAPH_ALREADY_RUNNING; } @@ -1089,6 +1289,8 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorInitFlag()) { + REPORT_INNER_ERROR("E19999", "GELib is not init before, graph_id:%u, check invalid", + graph_id); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized"); return GE_CLI_GE_NOT_INITIALIZED; } @@ -1205,12 +1417,19 @@ Status GraphManager::BuildGraphForUnregisteredOp(const GraphId &graph_id, const OpsKernelInfoStorePtr kernel_info = instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(op_desc->GetOpKernelLibName()); if (kernel_info == nullptr) { + REPORT_INNER_ERROR("E19999", "GetOpsKernelInfoStore fail for op:%s(%s), kernel_lib_name:%s, graph_id:%u, " + "check invalid", op_desc->GetName().c_str(), op_desc->GetType().c_str(), + op_desc->GetOpKernelLibName().c_str(), graph_id); GELOGE(FAILED, "Get op kernel info store failed"); return FAILED; } ret = 
kernel_info->CompileOp(node_vec); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call CompileOp fail for op:%s(%s), kernel_lib_name:%s, graph_id:%u, " + "check invalid", op_desc->GetName().c_str(), op_desc->GetType().c_str(), + op_desc->GetOpKernelLibName().c_str(), graph_id); + GELOGE(FAILED, "Get op kernel info store failed"); GELOGE(ret, "Compile op failed, op = %s, graph_id = %u.", op_desc->GetName().c_str(), graph_id); return ret; } @@ -1234,16 +1453,22 @@ Status GraphManager::BuildGraph(const GraphId &graph_id, const std::vectorGetRunFlag()) { + REPORT_INNER_ERROR("E19999", "Graph is already running, can't be run again, graph_id:%u, " + "check invalid", graph_id); GELOGE(GE_GRAPH_ALREADY_RUNNING, "[BuildGraph] graph already running, graph id = %u", graph_node->GetGraphId()); return GE_GRAPH_ALREADY_RUNNING; } @@ -1308,54 +1533,29 @@ bool GraphManager::CheckModelLoad(const GeRootModelPtr &ge_root_model, bool load } Status GraphManager::RemoveGraph(const GraphId &graph_id) { + auto it = to_be_deleted_graphs_.find(graph_id); + if (it != to_be_deleted_graphs_.end()) { + to_be_deleted_graphs_.erase(it); + } GraphNodePtr graph_node = nullptr; Status ret = GetGraphNode(graph_id, graph_node); - if (ret != SUCCESS) { + if (ret != SUCCESS || graph_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid when GraphManager %s", + graph_id, __FUNCTION__); GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[GraphManager] Id %u does not exists.", graph_id); return GE_GRAPH_GRAPH_NOT_EXIST; } - - if ((graph_node == nullptr) || (graph_node->GetRunFlag())) { - GELOGE(GE_GRAPH_GRAPH_IS_RUNNING, "[GraphManager] Id %u is running, can't be deleted.", graph_id); - return GE_GRAPH_GRAPH_IS_RUNNING; + if (graph_node->GetRunFlag()) { + // only put graph into to-be-deleted list when exceptional scenario + to_be_deleted_graphs_.insert(graph_id); + GELOGI("[GraphManager] Trying to remove running graph[Id:%u], added into to_be_deleted_graphs_.", graph_id); + 
return SUCCESS; } std::lock_guard lock(unload_model_mutex_); Status middle_ret; rtError_t rt_ret; - const std::vector &all_sub_graph = graph_node->GetAllSubGraph(); - for (size_t i = 0; i < all_sub_graph.size(); ++i) { - // must free buffer firstly - middle_ret = all_sub_graph[i]->FreeInOutBuffer(); - if (middle_ret != SUCCESS) { - GELOGE(middle_ret, "[GraphManager] RemoveGraph free mem failed, graph_id=%u.", graph_id); - ret = middle_ret; - } - if (all_sub_graph[i]->GeModelIsValid() && all_sub_graph[i]->GetModelIdInfo().model_id != INVALID_MODEL_ID) { - // unload model - GELOGI("UnloadModel via new ome."); - rt_ret = rtSetDevice(GetContext().DeviceId()); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, modelId=%u, graphId=%u.", - all_sub_graph[i]->GetModelIdInfo().model_id, graph_id); - ret = FAILED; - continue; - } - middle_ret = GraphLoader::UnloadModel(all_sub_graph[i]->GetModelIdInfo().model_id); - if (middle_ret != SUCCESS) { - GELOGE(middle_ret, "[GraphManager:] unload model failed, modelId=%u, graph_id=%u.", - all_sub_graph[i]->GetModelIdInfo().model_id, graph_id); - ret = middle_ret; - } - rt_ret = rtDeviceReset(GetContext().DeviceId()); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "[GraphManager:] unload model failed, modelId=%u, graphId=%u.", - all_sub_graph[i]->GetModelIdInfo().model_id, graph_id); - ret = FAILED; - } - } - } var_acc_ctrl_.RemoveGraph(graph_id); RemoveGraphNode(graph_id); @@ -1363,28 +1563,35 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { auto ge_root_model = graph_node->GetGeRootModel(); if (CheckModelLoad(ge_root_model, graph_node->GetLoadFlag())) { - GELOGI("Unload model %u.", ge_root_model->GetModelId()); rt_ret = rtSetDevice(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, graph_id:%u", + GetContext().DeviceId(), graph_id); GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, modelId=%u, 
graphId=%u.", ge_root_model->GetModelId(), graph_id); return FAILED; } - middle_ret = GraphLoader::UnloadModel(ge_root_model->GetModelId()); + // same graph may be added for several times, different models were created separately, + // unload them respectively. + middle_ret = UnloadModel(ge_root_model, graph_id); if (middle_ret != SUCCESS) { - GELOGE(middle_ret, "[GraphManager:] unload model failed, modelId=%u, graph_id=%u.", ge_root_model->GetModelId(), - graph_id); + REPORT_INNER_ERROR("E19999", "UnloadModel for graph:%u failed, check unload detail in GraphLoader %s", + graph_id, __FUNCTION__); + GELOGE(middle_ret, "[GraphManager:] unload model failed, graph_id=%u.", graph_id); ret = middle_ret; } rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, modelId=%u, graphId=%u.", ge_root_model->GetModelId(), - graph_id); + REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, graph_id:%u, when GraphManager %s", + GetContext().DeviceId(), graph_id, __FUNCTION__); + GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, graphId=%u.", graph_id); ret = FAILED; } } RemoveCompilerStages(graph_id); + RemoveGraphCount(graph_id); + RemoveAddGraphCondition(graph_id); GE_CHK_STATUS_RET(ret, "[GraphManager:] Remove graph failed, graph_id=%u.", graph_id); GELOGI("[GraphManager] remove graph success, graph_id=%u.", graph_id); @@ -1466,6 +1673,10 @@ Status GraphManager::ParseOptions(const std::map &opti GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.compressFlag value is invalid, must be 0 or 1."); return GE_GRAPH_OPTIONS_INVALID); + // Set Build model and step + ParseOption(options, BUILD_MODE, options_.build_mode); + ParseOption(options, BUILD_STEP, options_.build_step); + ParseOption(options, BUILD_STEP, options_.tuning_path); // ge.graphType. 
options_.run_graph_flag = true; @@ -1514,10 +1725,6 @@ Status GraphManager::ParseOptions(const std::map &opti GELOGD("Dynamic dims params: input shape is %s, dynamic dims is %s, dynamic node type is %d", options_.input_shape.c_str(), options_.dynamic_dims.c_str(), options_.dynamic_node_type); - // Set Build model and step - ParseOption(options, BUILD_MODE, options_.build_mode); - ParseOption(options, BUILD_STEP, options_.build_step); - return SUCCESS; } @@ -1549,6 +1756,7 @@ void GraphManager::ParseOption(const std::map &options std::string &option) { auto iter = options.find(key); if (iter != options.end()) { + GELOGD("Set option %s from value %s to value%s", key.c_str(), option.c_str(), iter->second.c_str()); option = iter->second; } } @@ -1563,6 +1771,8 @@ Status GraphManager::ParseOption(const std::map &optio } else if (flag == "1") { option = true; } else { + REPORT_INNER_ERROR("E19999", "Option:%s value:%s must be 0 or 1, check invalid", + key.c_str(), flag.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:%s, its value %s is invalid, it must be 0 or 1.", key.c_str(), flag.c_str()); return GE_GRAPH_OPTIONS_INVALID; @@ -1579,6 +1789,8 @@ Status GraphManager::ParseOption(const std::map &optio if (iter != options.end()) { option = static_cast(std::strtol(iter->second.c_str(), &ptr, kDecimal)); if (ptr != nullptr && *ptr != '\0') { + REPORT_INNER_ERROR("E19999", "Option:%s value:%s must be int32_t type, check invalid", + key.c_str(), iter->second.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:%s, its value %s is invalid, must be int32_t type.", key.c_str(), iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; @@ -1622,6 +1834,8 @@ Status GraphManager::ParseOption(const std::map &optio // split engine and num by : size_t pos = engine_parallel.find(':'); if (pos == string::npos) { + REPORT_INNER_ERROR("E19999", "Option:%s, value:%s, engine and num must be connected by :, check invalid", + key.c_str(), engine_parallel.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, 
"engine and num must be connected by :, " "while your input is %s", @@ -1655,6 +1869,8 @@ Status GraphManager::ParseOption(const std::map &optio Status GraphManager::CheckEngineName(const std::string &engine_name, const std::string &key, const std::map &option) { if (engine_name.empty()) { + REPORT_INNER_ERROR("E19999", "Option:%s, param engine_name:%s is empty, check invalid", + key.c_str(), engine_name.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "engine name of %s is empty", key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } @@ -1665,6 +1881,8 @@ Status GraphManager::CheckEngineName(const std::string &engine_name, const std:: auto it_stream_repeat = option.find(engine_name); if (it_stream_repeat != option.end()) { + REPORT_INNER_ERROR("E19999", "Option:%s, param engine_name:%s is repeated, check invalid", + key.c_str(), engine_name.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "engine : %s of %s is repeated", engine_name.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } @@ -1673,11 +1891,15 @@ Status GraphManager::CheckEngineName(const std::string &engine_name, const std:: Status GraphManager::ParseParallelNum(const std::string ¶llel_num, const std::string &key, int &num) { if (parallel_num.empty()) { + REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is empty, check invalid", + key.c_str(), parallel_num.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num of %s is empty", key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } for (char c : parallel_num) { if (!isdigit(c)) { + REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is not digit, check invalid", + key.c_str(), parallel_num.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "%s input is invalid ", key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } @@ -1686,17 +1908,25 @@ Status GraphManager::ParseParallelNum(const std::string ¶llel_num, const std try { num = std::stoi(parallel_num); } catch (std::invalid_argument &) { + REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is 
invalid argument, check", + key.c_str(), parallel_num.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num : %s of %s is invalid argument", parallel_num.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } catch (std::out_of_range &) { + REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is out of range, check", + key.c_str(), parallel_num.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num : %s of %s is out of range", parallel_num.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } catch (...) { + REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s is invalid argument, check", + key.c_str(), parallel_num.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num : %s of %s is invalid argument", parallel_num.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } if (num < 1) { + REPORT_INNER_ERROR("E19999", "Option:%s, param parallel num:%s < 1, check invalid", + key.c_str(), parallel_num.c_str()); GELOGE(GE_GRAPH_OPTIONS_INVALID, "parallel num : %s of %s must bigger than 0", parallel_num.c_str(), key.c_str()); return GE_GRAPH_OPTIONS_INVALID; } @@ -1724,6 +1954,8 @@ Status GraphManager::GetGraphNode(const GraphId &graph_id, GraphNodePtr &out) { auto iter = graph_map_.find(graph_id); if (iter == graph_map_.end()) { out = nullptr; + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", + graph_id); GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[GraphManager] graph not exist, graph_id= %u.", graph_id); return GE_GRAPH_GRAPH_NOT_EXIST; } @@ -1744,6 +1976,7 @@ Status GraphManager::SummaryHandle(const GraphId &graph_id, std::vector> &whole_summary_output_indexes = GetCompilerStages(graph_id).optimizer.GetSummaryOutputIndexes(); if (whole_summary_output_indexes.find(graph_id) == whole_summary_output_indexes.end()) { + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in whole_summary_output_indexes, check invalid", graph_id); GELOGE(FAILED, "No Summary graph found in map."); return FAILED; } @@ -1789,6 +2022,8 @@ 
Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap } } if (netoutput == nullptr) { + REPORT_INNER_ERROR("E19999", "No netoutput node in graph:%u, check invalid", + graph_id); GELOGE(FAILED, "Netoutput is null."); return FAILED; } @@ -1796,6 +2031,9 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap std::string desc_name; auto out_anchor = in->GetPeerOutAnchor(); if (out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Peer anchor of op:%s(%s), in_index:%u is nullptr, graph_id:%u, check invalid", + netoutput->GetName().c_str(), netoutput->GetType().c_str(), + in->GetIdx(), graph_id); GELOGE(FAILED, "out_anchor is null."); return FAILED; } @@ -1803,6 +2041,8 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap // find the variable node in graph while (peer_node != nullptr && peer_node->GetType() != kVariable) { if (peer_node->GetAllInDataAnchors().size() != 1) { + REPORT_INNER_ERROR("E19999", "More than one prior nodes of peer_node:%s(%s) in checkpoint Graph:%u, " + "check invalid", peer_node->GetName().c_str(), peer_node->GetType().c_str(), graph_id); GELOGE(FAILED, "More than one prior nodes of peer_node %s in checkpoint Graph.", peer_node->GetName().c_str()); return FAILED; } @@ -1816,12 +2056,18 @@ Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGrap } } if (peer_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Peer anchor node of op:%s(%s), in_index:%u is nullptr, graph_id:%u, check invalid", + netoutput->GetName().c_str(), netoutput->GetType().c_str(), + in->GetIdx(), graph_id); GELOGE(FAILED, "No variable op found in one branch, checkpoint graph illegal."); return FAILED; } desc_name = peer_node->GetName(); GELOGI("[GraphManager] CheckpointHandle, descName=%s.", desc_name.c_str()); if (in->GetIdx() >= static_cast(outputs.size())) { + REPORT_INNER_ERROR("E19999", "in index:%u of op:%s(%s) is out of outputs.size:%zu range, graph_id:%u, 
" + "check invalid", in->GetIdx(), netoutput->GetName().c_str(), + netoutput->GetType().c_str(), outputs.size(), graph_id); GELOGE(FAILED, "variable index out of range."); return FAILED; } @@ -1868,6 +2114,8 @@ Status GraphManager::PushSummaryData2ME(const GraphId &graph_id, } return iter->second(graph_id, tmp_summary_data); } + REPORT_INNER_ERROR("E19999", "No summary callback found, graph_id:%u, check invalid", + graph_id); GELOGE(FAILED, "[GraphManager] PushSummaryData2ME failed, not found summary callback."); return FAILED; } @@ -1888,6 +2136,8 @@ Status GraphManager::PushSaveData2ME(const GraphId &graph_id, const std::mapsecond(graph_id, tmp_save_data); } + REPORT_INNER_ERROR("E19999", "No checkpoint callback found, graph_id:%u, check invalid", + graph_id); GELOGE(FAILED, "[GraphManager] PushSaveData2ME failed, not found checkpoint callback."); return FAILED; } @@ -1916,6 +2166,8 @@ bool GraphManager::CheckVariableForCheckpointGraph(NodePtr &node) { } auto out = node->GetOutDataAnchor(0); if (out == nullptr) { + REPORT_INNER_ERROR("E19999", "anchor index:0 of op:%s(%s) is nullptr, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(GE_GRAPH_PARAM_NULLPTR, "out is nullptr."); return false; } @@ -1948,6 +2200,7 @@ static inline bool CheckConstanOpForCheckpointGraph(NodePtr &node) { return node bool GraphManager::IsCheckpointGraph(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "[IsCheckpointGraph] computeGraph is nullptr."); return false; } @@ -2082,6 +2335,8 @@ Status GraphManager::RemoveIsolatedConstInThisGraph(ge::ComputeGraphPtr &compute if (n->GetOutAllNodes().empty() && n->GetInAllNodes().empty()) { // it is an isolated constant, just remove it if (GraphUtils::RemoveJustNode(compute_graph, n) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove constant op:%s(%s) failed", + n->GetName().c_str(), 
n->GetType().c_str()); GELOGE(FAILED, "remove constant %s failed.", n->GetName().c_str()); return FAILED; } @@ -2243,6 +2498,14 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { // Reason: Make sure that the var "global_step" can be partitioned to known sub graph and allocated memory GE_CHK_STATUS_RET( graph_pass.AddPass("OptimizeStage1_3::GlobalStepInsertPass", new (std::nothrow) GlobalStepInsertPass)) + + std::string hccl_tailing_optimize; + if (GetContext().GetOption("ge.exec.hccl_tailing_optimize", hccl_tailing_optimize) == SUCCESS && + hccl_tailing_optimize == "1") { + GELOGI("Add hccl tailing optimize stage"); + GE_CHK_STATUS_RET( + graph_pass.AddPass("OptimizeStage1_3::HcclTailingOptimizationPass", new (std::nothrow) HcclTailingOptimizationPass)) + } } GE_TIMESTAMP_START(graph_pass); ret = graph_pass.Run(compute_graph); @@ -2331,6 +2594,8 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) { new (std::nothrow) VariableRefDeleteOpPass)) GE_CHK_STATUS_RET(pass_for_control_attr_optimize.AddPass("OptimizeStage2::ControlAttrOptimize::CompileNodesPass", new (std::nothrow) CompileNodesPass)) + GE_CHK_STATUS_RET(pass_for_control_attr_optimize.AddPass( + "OptimizeStage2::AfterMergePasses::MarkNodeUnknownShapePass", new(std::nothrow) MarkNodeUnknownShapePass)) GE_CHK_STATUS_RET(pass_for_control_attr_optimize.AddPass( "OptimizeStage2::AfterMergePasses::MarkGraphUnknownStatusPass", new(std::nothrow) MarkGraphUnknownStatusPass)) GE_CHK_STATUS_RET( @@ -2373,6 +2638,18 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) { GE_CHK_STATUS_RET(memcpy_addr.Run(compute_graph), "Add memcpy_addr_async node failed."); GE_TIMESTAMP_END(AddMemcpyAddrAsyncNode, "MemcpyAddrAsyncPass::Run."); + // Process offset and dependency for buffer pool memory assigner. 
+ GE_TIMESTAMP_START(BufferPoolMemoryPass); + BufferPoolMemoryPass buffer_pool_mem_pass; + GE_CHK_STATUS_RET(buffer_pool_mem_pass.Run(compute_graph), "Failed to process for buffer pool allocator."); + GE_TIMESTAMP_END(BufferPoolMemoryPass, "BufferPoolMemoryPass::Run."); + + // Handle parallel group . + GE_TIMESTAMP_START(ParallelGroup); + ParallelGroupPass parallel_group_pass; + GE_CHK_STATUS_RET(parallel_group_pass.Run(compute_graph), "Handle parallel group failed."); + GE_TIMESTAMP_END(ParallelGroup, "ParallelGroupPass::Run."); + // After while sub graph handle, mark all node rw type auto result = GetCompilerStages(compute_graph->GetGraphID()).optimizer.HandleMemoryRWConflict(compute_graph); if (result != SUCCESS) { @@ -2401,6 +2678,7 @@ void GraphManager::ChangeConstTypeWhenTraining(const ComputeGraphPtr &compute_gr Status GraphManager::LoadGraphAsync(const GeRootModelPtr &ge_root_model, const GraphNodePtr &graph_node) { GELOGI("[LoadGraphAsync] run_graph_flag[%d], graph_id[%u]", options_.run_graph_flag, graph_node->GetGraphId()); if (options_.run_graph_flag && ge_root_model != nullptr) { + ge_root_model->SetTrainFlag(GetTrainFlag()); // synchronization run graph with model ModelIdInfo model_id_info; bool is_unknown_shape = false; @@ -2417,9 +2695,9 @@ Status GraphManager::LoadGraphAsync(const GeRootModelPtr &ge_root_model, const G } } GE_TIMESTAMP_START(LoadGraph); - GE_CHECK_NOTNULL(graph_node->graph_run_async_listener_); - Status ret = - GraphLoader::LoadModelOnline(model_id_info.model_id, ge_root_model, graph_node->graph_run_async_listener_); + auto listener = MakeShared(); + GE_CHECK_NOTNULL(listener); + Status ret = GraphLoader::LoadModelOnline(model_id_info.model_id, ge_root_model, listener); GE_TIMESTAMP_EVENT_END(LoadGraph, "GraphManager::LoadGraphAsync"); if (ret != SUCCESS) { GELOGE(ret, "[LoadGraphAsync] LoadGraphAsync Failed"); @@ -2433,6 +2711,52 @@ Status GraphManager::LoadGraphAsync(const GeRootModelPtr &ge_root_model, const G return SUCCESS; } 
+void GraphManager::ReleaseMemory(const GeModelPtr &ge_model, GraphNodePtr &graph_node, + const std::vector &model_ids, uint32_t graph_id, uint64_t session_id) { + rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, when GraphManager %s", + GetContext().DeviceId(), __FUNCTION__); + GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, graphId=%u.", graph_id); + return; + } + for (auto model_id : model_ids) { + uint64_t max_memory_size = 0; + Status result = GraphLoader::GetMaxUsedMemory(model_id, max_memory_size); + if (result != SUCCESS) { + continue; + } + GELOGI("CheckAndReleaseMemory try to UnloadGraph[%u], model[%u] which MaxUsedMemory[%lu].", graph_id, model_id, + max_memory_size); + if (model_ids.size() > 1) { + result = ge_model->GetSessionId(model_id, session_id); + if (result != SUCCESS) { + GELOGW("[GraphManager:] get session failed when dynamic memory, modelId=%u, graphId=%u.", model_id, + graph_id); + continue; + } + } + result = GraphLoader::DestroyAicpuKernel(session_id, model_id, 0); + if (result != SUCCESS) { + GELOGW("[GraphManager:] destroy aicpu kernel failed when dynamic memory, modelId=%u, graphId=%u.", model_id, + graph_id); + } + result = GraphLoader::UnloadModel(model_id); + if (result != SUCCESS) { + GELOGW("[GraphManager:] unload model failed, modelId=%u, graphId=%u.", model_id, graph_id); + } + GELOGI("CheckAndReleaseMemory UnloadGraph[%u], model[%u] success.", graph_id, model_id); + } + graph_node->SetLoadFlag(false); + rt_ret = rtDeviceReset(GetContext().DeviceId()); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, when GraphManager %s", + GetContext().DeviceId(), __FUNCTION__); + GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, graphId=%u.", graph_id); + return; + } +} + Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const GraphNodePtr 
&graph_node) { GELOGI("CheckAndReleaseMemory graph_id[%u]", graph_node->GetGraphId()); int64_t value = 0; @@ -2454,6 +2778,8 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra " Device[%u] free_memory_size[%ld]", graph_node->GetGraphId(), memory_size, weight_size, GetContext().DeviceId(), free_memory); if (ge::CheckInt64AddOverflow(memory_size, weight_size) != SUCCESS) { + REPORT_INNER_ERROR("E19999", "memory_size:%ld and weight_size:%ld will overflow after add, check invalid", + memory_size, weight_size); GELOGE(INTERNAL_ERROR, "The sum of Memory size and weight size exceeds INT64_MAX"); return INTERNAL_ERROR; } @@ -2476,6 +2802,7 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra continue; } auto model_id = model->GetModelId(); + auto model_ids = model->GetAllModelId(); // unload model not release bool is_unknown_shape = false; GE_CHK_STATUS_RET(model->CheckIsUnknownShape(is_unknown_shape)); @@ -2488,34 +2815,7 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra GELOGI("CheckAndReleaseMemory graph[%u] has not been loaded.", graph_id); continue; } - uint64_t max_memory_size = 0; - result = GraphLoader::GetMaxUsedMemory(model_id, max_memory_size); - if (result != SUCCESS) { - continue; - } - GELOGI("CheckAndReleaseMemory try to UnloadGraph[%u], model[%u] which MaxUsedMemory[%lu].", graph_id, model_id, - max_memory_size); - rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, modelId=%u, graphId=%u.", model_id, graph_id); - continue; - } - result = GraphLoader::DestroyAicpuKernel(session_id, model_id, 0); - if (result != SUCCESS) { - GELOGW("[GraphManager:] destroy aicpu kernel failed when dynamic memory, modelId=%u, graphId=%u.", model_id, - graph_id); - } - result = GraphLoader::UnloadModel(model_id); - if (result != SUCCESS) { - GELOGW("[GraphManager:] unload model failed, 
modelId=%u, graphId=%u.", model_id, graph_id); - } - rt_ret = rtDeviceReset(GetContext().DeviceId()); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, modelId=%u, graphId=%u.", model_id, graph_id); - continue; - } - it.second->SetLoadFlag(false); - GELOGI("CheckAndReleaseMemory UnloadGraph[%u], model[%u] success and set LoadFlag to false.", graph_id, model_id); + ReleaseMemory(ge_model, it.second, model_ids, graph_id, session_id); } return SUCCESS; @@ -2540,10 +2840,14 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager GE_DUMP(compute_graph_tmp, "OptimizeSubGraphBefore"); GE_CHECK_NOTNULL(compute_graph_tmp); if (!AttrUtils::SetInt(*compute_graph_tmp, ATTR_NAME_ROOT_GRAPH_ID, root_graph_id)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to graph:%u", ATTR_NAME_ROOT_GRAPH_ID.c_str(), + compute_graph_tmp->GetGraphID()); GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_ID for subgraph, graph_id: %u.", root_graph_id); return FAILED; } if (!AttrUtils::SetStr(*compute_graph_tmp, ATTR_NAME_ROOT_GRAPH_NAME, root_graph_name)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to graph:%u", ATTR_NAME_ROOT_GRAPH_NAME.c_str(), + compute_graph_tmp->GetGraphID()); GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_NAME for subgraph, \ root_graph_name: %s.", root_graph_name.c_str()); return FAILED; @@ -2563,6 +2867,7 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager compute_graph_tmp != nullptr ? 
compute_graph_tmp->GetName().c_str() : "", engine_name.c_str(), pthread_self()); } else { + REPORT_INNER_ERROR("E19999", "Param sub_graph_info_ptr or graph_manager is nullptr"); GELOGE(FAILED, "graph_manager or sub_graph_info_ptr is nullptr"); return FAILED; } @@ -2651,6 +2956,38 @@ void GraphManager::ConstructGeInput(const vector &inputs, vecto } } +Status GraphManager::CheckIncreBuildAndPreRun(GraphManager *graph_manager, const PreRunArgs &args, + GraphNodePtr &graph_node, GeRootModelPtr &ge_root_model) { + if (!graph_manager->IsGraphNeedBuild(graph_node)) { + ge_root_model = graph_node->GetGeRootModel(); + return SUCCESS; + } + if (graph_node->GetBuildFlag()) { + ReturnError(graph_manager, args.callback, PARAM_INVALID, + "The graph " + std::to_string(graph_node->GetGraphId()) + + " need to re-build, you should remove it" + " from GE first, then AddGraph again and rebuild it."); + graph_node->Unlock(); + return PARAM_INVALID; + } + // check need incre build. + GeModelPtr ge_model = nullptr; + if (graph_manager->IncreBuild(graph_node, ge_model) != SUCCESS) { + std::vector ge_inputs; + ConstructGeInput(args.input_tensor, ge_inputs); + Status ret = graph_manager->PreRun(graph_node, ge_inputs, ge_root_model, args.session_id); + // release rts generate context + RtContextUtil::GetInstance().DestroyRtContexts(args.session_id, graph_node->GetGraphId()); + if (ret != SUCCESS) { + ReturnError(graph_manager, args.callback, ret, "PreRun Failed."); + return ret; + } + } + graph_node->SetBuildFlag(true); + graph_manager->var_acc_ctrl_.SetGraphBuildEnd(graph_node->GetGraphId()); + return SUCCESS; +} + void GraphManager::PreRunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_PreRun")) != 0) { GELOGW("Set thread name failed."); @@ -2663,7 +3000,7 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { continue; } - GELOGI("A new loop start."); + GELOGI("[PreRunThread] A new loop start, graph_id:%u.", args.graph_id); 
ErrorManager::GetInstance().SetErrorContext(args.error_context); ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); @@ -2679,7 +3016,24 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { "[RunGraph] graph not exist, graph_id=" + std::to_string(args.graph_id)); return; } - + // more than one graph owns same graph_id + uint32_t count = 0; + if (graph_manager->GetGraphCount(args.graph_id, count) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed.", args.graph_id); + return; + } + // Avoid repeatively prerun for graphs owns same graph_id in online inference concurrency + if (count > 1 && graph_node->GetBuildFlag()) { + graph_node->Lock(); + GELOGD("Avoid repeatively prerun, graph_id:%u.", args.graph_id); + // In online inference concurrency senario, graph_node is allowed to be locked for 'count' times + graph_node->SetSemSize(count); + graph_manager->run_args_q_.Push(RunArgs( { graph_node, args.graph_id, args.session_id, args.error_context, + args.input_tensor, graph_node->GetGeRootModel(), GetThreadLocalContext(), args.callback })); + GELOGI("[PreRunThread] Loop end. Start to run with cached build model."); + continue; + } + // Cannot be put ahead of the repeatively prerun judgement graph_node->Lock(); if (graph_node->GetRunFlag()) { @@ -2711,46 +3065,24 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { // it will not execute graph preprocess, optimize, parition, build if the graph has built successful. 
GELOGI("Start for run graph async."); GeRootModelPtr ge_root_model = nullptr; - if (graph_manager->IsGraphNeedBuild(graph_node)) { - if (graph_node->GetBuildFlag()) { - ReturnError(graph_manager, args.callback, PARAM_INVALID, - "The graph " + std::to_string(graph_node->GetGraphId()) + - " need to re-build, you should remove it" - " from GE first, then AddGraph again and rebuild it."); + + ret = CheckIncreBuildAndPreRun(graph_manager, args, graph_node, ge_root_model); + if (ret != SUCCESS) { + graph_node->SetRunFlag(false); + if (!ge::Analyzer::GetInstance()->IsEnableNetAnalyzeDebug()) { + ReturnError(graph_manager, args.callback, ret, "CheckIncreBuildAndPreRun Failed, thread exit.."); graph_node->Unlock(); return; + } else { + ReturnError(graph_manager, graph_node, args.callback, ret, + "CheckIncreBuildAndPreRun Failed, keep geop continue!"); + graph_node->Unlock(); + continue; } - - // check need incre build. - GeModelPtr ge_model = nullptr; - if (graph_manager->IncreBuild(graph_node, ge_model) != SUCCESS) { - std::vector ge_inputs; - ConstructGeInput(args.input_tensor, ge_inputs); - ret = graph_manager->PreRun(graph_node, ge_inputs, ge_root_model, args.session_id); - // release rts generate context - RtContextUtil::GetInstance().DestroyRtContexts(args.session_id, graph_node->GetGraphId()); - if (ret != SUCCESS) { - graph_node->SetRunFlag(false); - if (!ge::Analyzer::GetInstance()->IsEnableNetAnalyzeDebug()) { - ReturnError(graph_manager, args.callback, ret, "PreRun Failed, thread exit.."); - graph_node->Unlock(); - return; - } else { - ReturnError(graph_manager, graph_node, args.callback, ret, "PreRun Failed, keep geop continue!"); - graph_node->Unlock(); - continue; - } - } - } - graph_node->SetBuildFlag(true); - graph_manager->var_acc_ctrl_.SetGraphBuildEnd(graph_node->GetGraphId()); - } else { - ge_root_model = graph_node->GetGeRootModel(); } - graph_manager->run_args_q_.Push(RunArgs( { graph_node, args.graph_id, args.session_id, args.error_context, 
args.input_tensor, ge_root_model, GetThreadLocalContext(), args.callback })); - GELOGI("Loop end."); + GELOGI("[PreRunThread] Loop end."); } } @@ -2776,10 +3108,16 @@ Status GraphManager::ParseInputsDimsForGetNexNosinkAndData(const vector } GeAttrValue::INT index = 0; if (!(AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, index))) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) fail", ATTR_NAME_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Get index from attr failed"); return PARAM_INVALID; } if (static_cast(index) > input_tensor.size()) { + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s) value:%ld > param input_tensor.size:%zu, " + "check invalid", ATTR_NAME_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + index, input_tensor.size()); GELOGE(PARAM_INVALID, "The count of input tensor should be equal to the count of data."); return PARAM_INVALID; } @@ -2847,16 +3185,13 @@ void GraphManager::RunThread(GraphManager *graph_manager) { continue; } - GELOGI("A new loop start."); + GELOGI("[RunThread] A new loop start, graph_id:%u.", args.graph_id); ErrorManager::GetInstance().SetErrorContext(args.error_context); GetContext().SetSessionId(args.session_id); GetThreadLocalContext() = args.context; graph_manager->UpdateLocalOmgContext(args.graph_id); - if (args.graph_node->graph_run_async_listener_ != nullptr) { - args.graph_node->graph_run_async_listener_->SetCallback(args.callback); - } Status ret; // parse inputs.dims to vector> dynamic_dims ret = graph_manager->ParseInputsDims(args.input_tensor); @@ -2866,8 +3201,10 @@ void GraphManager::RunThread(GraphManager *graph_manager) { return; } + args.graph_node->UpdateLoadFlag(); if (!args.graph_node->GetLoadFlag()) { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelLoad, ErrorMessage::kModelLoad); + args.ge_root_model->SetTrainFlag(graph_manager->GetTrainFlag()); ret = graph_manager->LoadGraphAsync(args.ge_root_model, 
args.graph_node); if (ret != SUCCESS || args.ge_root_model == nullptr) { StopQueue(graph_manager); @@ -2875,6 +3212,10 @@ void GraphManager::RunThread(GraphManager *graph_manager) { args.graph_node->Unlock(); return; } + // control the times of graph loading in multi-thread scenario + args.graph_node->DecreaseLoadCount(); + args.graph_node->IncreaseLoadRecord(); + args.graph_node->SetLoadFlag(true); GELOGI("LoadGraph[%u], model[%u] success and set LoadFlag to true.", args.graph_node->GetGraphId(), args.ge_root_model->GetModelId()); @@ -2890,7 +3231,7 @@ void GraphManager::RunThread(GraphManager *graph_manager) { } ret = graph_manager->graph_executor_.ExecuteGraphAsync(args.graph_id, args.graph_node->GetGeRootModel(), - args.input_tensor); + args.input_tensor, args.callback); args.graph_node->SetRunFlag(false); if (ret != SUCCESS) { ReturnError(graph_manager, args.callback, ret, "ExecuteGraphAsync failed, thread exit."); @@ -2927,6 +3268,8 @@ void GraphManager::ReturnError(GraphManager *graph_manager, GraphNodePtr &graph_ std::vector outputs; auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph()); if (graph_manager == nullptr || compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph_manager or compute_graph in graph_node is nullptr, " + "check invalid"); GELOGE(GRAPH_FAILED, "[Analyze Mode] compute graph is null!"); callback(GRAPH_FAILED, outputs); return; @@ -2946,6 +3289,9 @@ void GraphManager::ReturnError(GraphManager *graph_manager, GraphNodePtr &graph_ len = input_desc->GetShape().GetShapeSize(); } if (len < 0) { + REPORT_INNER_ERROR("E19999", "InputIndex:%zu ShapeSize:%ld of op:%s(%s) < 0, unknown shape is not support, " + "check invalid", i, len, + node->GetName().c_str(), node->GetType().c_str()); GELOGE(GRAPH_FAILED, "Analyze Mode does not support GEOP output unknown shape!"); callback(GRAPH_FAILED, outputs); return; @@ -2955,12 +3301,19 @@ void GraphManager::ReturnError(GraphManager *graph_manager, GraphNodePtr &graph_ } 
auto size = GetSizeByDataType(input_desc->GetDataType()); if (size <= 0) { + REPORT_INNER_ERROR("E19999", "data_type:%s of op:%s(%s) is not support, input_index:%zu check invalid", + ge::TypeUtils::DataTypeToSerialString(input_desc->GetDataType()).c_str(), + node->GetName().c_str(), node->GetType().c_str(), i); GELOGE(PARAM_INVALID, "Failed to get cube size, the data type %s is invalid", ge::TypeUtils::DataTypeToSerialString(input_desc->GetDataType()).c_str()); callback(GRAPH_FAILED, outputs); return; } if (CheckInt64MulOverflow(len, static_cast(size)) != true) { + REPORT_INNER_ERROR("E19999", "shape_size:%ld of op:%s(%s) will overflow after multiply by " + "size:%u of data_type:%s, input_index:%zu, check invalid", len, + node->GetName().c_str(), node->GetType().c_str(), size, + ge::TypeUtils::DataTypeToSerialString(input_desc->GetDataType()).c_str(), i); GELOGE(MEMALLOC_FAILED, "int64 multiply happens overflow! a:%ld b:%d", len, size); callback(GRAPH_FAILED, outputs); return; @@ -2983,11 +3336,15 @@ bool GraphManager::IsGraphNeedRebuild(uint32_t graph_id) { GraphNodePtr graph_node = nullptr; Status ret = GetGraphNode(graph_id, graph_node); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", + graph_id); GELOGE(ret, "[RunGraph] graph not exist, graph_id=%u.", graph_id); return true; } if (graph_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Graph node is nullptr in graph_map, graph_id:%u, check invalid", + graph_id); GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "[RunGraph] graph node is NULL, graphId=%u.", graph_id); return true; } @@ -3002,11 +3359,15 @@ const map *GraphManager::GetGraphOptions(uint32_t grap GraphNodePtr graph_node = nullptr; Status ret = GetGraphNode(graph_id, graph_node); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", + graph_id); GELOGE(ret, "[RunGraph] graph not exist, graph_id=%u.", graph_id); return nullptr; } if (!graph_node) { + 
REPORT_INNER_ERROR("E19999", "Graph node is nullptr in graph_map, graph_id:%u, check invalid", + graph_id); GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "[RunGraph] graph node is NULL, graph_id=%u.", graph_id); return nullptr; } @@ -3037,6 +3398,8 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra } bool dynamic_shape_partitioned = false; if (!AttrUtils::GetBool(*compute_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, dynamic_shape_partitioned)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s from graph:%u fail", + ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED.c_str(), compute_graph->GetGraphID()); GELOGE(FAILED, "failed get dynamic shape partitioned flag on partitioned graph."); return FAILED; } @@ -3094,6 +3457,8 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra if (AttrUtils::GetBool(compute_graph, ATTR_NAME_OFF_SUPERKERNEL_ATTR, off_superkernel)) { GELOGI("Compute graph %s get superkernel flag %d.", compute_graph->GetName().c_str(), off_superkernel); if (!AttrUtils::SetBool(merged_compute_graph, ATTR_NAME_OFF_SUPERKERNEL_ATTR, off_superkernel)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s to graph:%u fail", + ATTR_NAME_OFF_SUPERKERNEL_ATTR.c_str(), compute_graph->GetGraphID()); GELOGE(FAILED, "Compute graph %s set superkernel flag %d failed", merged_compute_graph->GetName().c_str(), off_superkernel); return FAILED; @@ -3103,6 +3468,8 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra GE_DUMP(merged_compute_graph, "mergedComputeGraph"); compute_graph = merged_compute_graph; if (!AttrUtils::SetBool(*compute_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, dynamic_shape_partitioned)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s to graph:%u fail", + ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED.c_str(), compute_graph->GetGraphID()); GELOGE(FAILED, "failed set dynamic shape partitioned flag on partitioned graph."); return FAILED; } @@ -3132,6 +3499,21 @@ Status 
GraphManager::ConvertGraphToFile(ComputeGraphPtr &compute_graph, GraphPar non_tuning_subgraphs.push_back(sub_graph_tmp); } } + // for function graphs to tune + for (auto &function_graph : compute_graph->GetAllSubgraphs()) { + auto subgraph_list = sub_graph_map[function_graph]; + for (const auto &sub_graph_info_ptr : subgraph_list) { + GE_CHECK_NOTNULL(sub_graph_info_ptr); + ComputeGraphPtr sub_graph_tmp = sub_graph_info_ptr->GetSubGraph(); + // need to tuning + if (sub_graph_info_ptr->GetEngineName() == kVectorEngine || + sub_graph_info_ptr->GetEngineName() == kAIcoreEngine) { + tuning_subgraphs.push_back(sub_graph_tmp); + } else { + non_tuning_subgraphs.push_back(sub_graph_tmp); + } + } + } return TuningUtils::ConvertGraphToFile(tuning_subgraphs, non_tuning_subgraphs, exe_flag, path); } @@ -3201,6 +3583,8 @@ Status GraphManager::SaveVariables(const Graph &graph, const std::vectorGetOwnerNode(); while (peer_node->GetType() != VARIABLE) { if (peer_node->GetAllInDataAnchors().size() != 1) { + REPORT_INNER_ERROR("E19999", "peer node:%s(%s) of netoutput has more than 1 input in checkpoint Graph, " + "check invalid", + peer_node->GetName().c_str(), peer_node->GetType().c_str()); GELOGE(FAILED, "peer_node [%s] has more than 1 input in checkpoint Graph.", peer_node->GetName().c_str()); return FAILED; } @@ -3252,12 +3639,17 @@ Status GraphManager::SaveCheckPointResult(const Graph &graph, const std::vector< } } if (peer_node->GetType() != VARIABLE) { + REPORT_INNER_ERROR("E19999", "peer node:%s(%s) of netoutput is not variable in checkpoint Graph, " + "check invalid", + peer_node->GetName().c_str(), peer_node->GetType().c_str()); GELOGE(FAILED, " peer_node %s is not variable in checkpoint Graph.", peer_node->GetName().c_str()); return FAILED; } auto var_name = peer_node->GetName(); GELOGI("[GraphManager] SaveVariables, varName is %s.", var_name.c_str()); if (in->GetIdx() >= static_cast(outputs.size())) { + REPORT_INNER_ERROR("E19999", "In index:%u of netoutput is out of 
outputs.size:%zu range in checkpoint Graph, " + "check invalid", in->GetIdx(), outputs.size()); GELOGE(FAILED, "variable index[%d] out of range[%zu].", in->GetIdx(), outputs.size()); return FAILED; } @@ -3291,4 +3683,49 @@ void GraphManager::RemoveCompilerStages(GraphId graph_id) { std::lock_guard lock(member_mutex_); compiler_stages_.erase(graph_id); } + +void GraphManager::IncreaseGraphCount(GraphId graph_id) { + std::lock_guard lock(graph_count_mutex_); + auto it = graph_count_.find(graph_id); + if (it == graph_count_.end()) { + graph_count_.insert({graph_id, kInitGraphCount}); + GELOGD("After increaseGraphCount, graph count of id[%u] is %u.", graph_id, graph_count_[graph_id]); + } else { + ++graph_count_[graph_id]; + GELOGD("After increaseGraphCount, graph count of id[%u] is %u.", graph_id, graph_count_[graph_id]); + } +} + +void GraphManager::RemoveGraphCount(GraphId graph_id) { + std::lock_guard lock(graph_count_mutex_); + auto it = graph_count_.find(graph_id); + if (it == graph_count_.end()) { + GELOGW("Graph of id: %u has not been added, count cannot be decreased.", graph_id); + } else { + GELOGD("RemoveGraphCount success, graph count of id[%u] is %u.", graph_id, graph_count_[graph_id]); + graph_count_.erase(it); + } +} + +void GraphManager::DecreaseGraphCount(GraphId graph_id) { + std::lock_guard lock(graph_count_mutex_); + auto it = graph_count_.find(graph_id); + if (it == graph_count_.end()) { + GELOGW("Graph of id: %u has not been added, count cannot be decreased.", graph_id); + } else { + --it->second; + GELOGD("After DecreaseGraphCount, graph count of id[%u] is %u.", graph_id, graph_count_[graph_id]); + } +} + +Status GraphManager::GetGraphCount(GraphId graph_id, uint32_t &count) { + std::lock_guard lock(graph_count_mutex_); + auto it = graph_count_.find(graph_id); + if (it == graph_count_.end()) { + GELOGW("Graph [id:%u] has not been added.", graph_id); + return FAILED; + } + count = it->second; + return SUCCESS; +} } // namespace ge diff --git 
a/ge/graph/manager/graph_manager.h b/ge/graph/manager/graph_manager.h index 661cf9d8..0533a0b6 100644 --- a/ge/graph/manager/graph_manager.h +++ b/ge/graph/manager/graph_manager.h @@ -184,6 +184,20 @@ class GraphManager { Status SaveCheckPointResult(const Graph &graph, const std::vector &outputs, map &var_results); + void RemoveGraphCount(GraphId graph_id); + + void IncreaseGraphCount(GraphId graph_id); + + void DecreaseGraphCount(GraphId graph_id); + + Status GetGraphCount(GraphId graph_id, uint32_t &count); + + void SetAddGraphCondition(GraphId graph_id, uint32_t cond); + + uint32_t GetAddGraphCondition(GraphId graph_id); + + void RemoveAddGraphCondition(GraphId graph_id); + private: struct CompilerStages { GraphPrepare preparer; @@ -358,6 +372,7 @@ class GraphManager { ComputeGraphPtr &compute_graph, GeRootModelPtr &ge_root_model, uint64_t session_id); + Status SetFuzzCompileFlag(ComputeGraphPtr &compute_graph); Status CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_graph, Graph2SubGraphInfoList &sub_graph_map, @@ -380,6 +395,24 @@ class GraphManager { CompilerStages &GetCompilerStages(GraphId graph_id); void RemoveCompilerStages(GraphId graph_id); + static Status CheckIncreBuildAndPreRun(GraphManager *graph_manager, const PreRunArgs &args, GraphNodePtr &graph_node, + GeRootModelPtr &ge_root_model); + + void ReleaseMemory(const GeModelPtr &ge_model, GraphNodePtr &graph_node, const std::vector &model_ids, + uint32_t graph_id, uint64_t session_id); + + Status CheckRepeatAdd(uint32_t graph_id, bool &is_added); + + Status NotifyWaittingGraph(uint32_t graph_id); + + Status CreateGraphNode(uint32_t graph_id, const Graph &graph, const std::map &options); + + Status SetStagesOptions(uint32_t graph_id, const GraphManagerOptions &options); + + Status UnloadModel(GeRootModelPtr ge_root_model, uint32_t graph_id); + + void SetSessionGraphId(ComputeGraphPtr compute_graph, uint32_t graph_id); + std::atomic_bool thread_run_flag_; BlockingQueue prerun_args_q_{}; 
BlockingQueue run_args_q_{}; @@ -415,6 +448,16 @@ class GraphManager { std::mutex member_mutex_; std::mutex unload_model_mutex_; + // avoid repeatively add same graph (owns same graph id) + std::mutex add_graph_mutex_; + std::mutex add_graph_cond_mutex_; + std::condition_variable add_graph_cv_; + + std::map graph_id_to_add_graph_cond_; + // use for multi-thread online-infer scenario + std::set to_be_deleted_graphs_; + std::map graph_count_; + std::mutex graph_count_mutex_; }; } // namespace ge diff --git a/ge/graph/manager/graph_manager_utils.cc b/ge/graph/manager/graph_manager_utils.cc index fe7e5b34..e9d72bd8 100644 --- a/ge/graph/manager/graph_manager_utils.cc +++ b/ge/graph/manager/graph_manager_utils.cc @@ -60,6 +60,15 @@ void GraphNode::Unlock() { sem_.Pop(unused); } +void GraphNode::IncreaseLoadCount() { + std::unique_lock lock(load_count_mu_); + if (load_record_ == kMaxLoadNum) { + GELOGW("Reach the maximum of load_count:%u", kMaxLoadNum); + return; + } + ++load_count_; +} + SubGraphInfo::SubGraphInfo() : subgraph_ptr_(nullptr), ge_model_ptr_(nullptr), malloc_flag_(false) {} SubGraphInfo::~SubGraphInfo() { @@ -84,6 +93,7 @@ Status SubGraphInfo::FreeInOutBuffer() { rtError_t rt_ret; rt_ret = rtFreeHost(*iter); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail"); GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); buffer_addr_.erase(buffer_addr_.begin(), iter); return GE_GRAPH_FREE_FAILED; @@ -119,6 +129,7 @@ Status GraphModelListener::OnComputeDone(uint32_t model_id, uint32_t task_id, ui uint32_t GraphModelListener::GetResultCode() const { if (!is_finished_) { + REPORT_CALL_ERROR("E19999", "Model not run finish"); GELOGE(INTERNAL_ERROR, "[GraphManager] model not run finish."); return INTERNAL_ERROR; } diff --git a/ge/graph/manager/graph_manager_utils.h b/ge/graph/manager/graph_manager_utils.h index de65c5cb..ffbc20cf 100644 --- a/ge/graph/manager/graph_manager_utils.h +++ 
b/ge/graph/manager/graph_manager_utils.h @@ -55,6 +55,7 @@ using ConstGraphPtr = std::shared_ptr; using GraphPtr = std::shared_ptr; const uint64_t INVALID_SESSION_ID = 0xffffffffffffffffULL; +const uint32_t kMaxLoadNum = 8; struct ModelIdInfo { uint32_t model_id{INVALID_MODEL_ID}; @@ -162,6 +163,8 @@ class GraphNode { bool GetBuildFlag() const { return build_flag_; } void SetBuildFlag(bool buildFlag) { build_flag_ = buildFlag; } bool GetLoadFlag() const { return load_flag_; } + // allow repeatively load graph owns same graph id + void UpdateLoadFlag() { load_flag_ = load_count_ == 0 || load_record_ >= kMaxLoadNum; } void SetLoadFlag(bool load_flag) { load_flag_ = load_flag; } void SetGeModel(const GeModelPtr &ge_model) { ge_model_ = ge_model; } GeModelPtr GetGeModel() const { return ge_model_; } @@ -172,6 +175,13 @@ class GraphNode { void Lock(); void Unlock(); + void SetSemSize(uint32_t size) { sem_.SetMaxSize(size); } + + uint32_t GetLoadCount() const { return load_count_; } + void IncreaseLoadCount(); + void DecreaseLoadCount() { --load_count_; } + void IncreaseLoadRecord() { ++load_record_; } + // run graph asynchronous listener std::shared_ptr graph_run_async_listener_; @@ -184,11 +194,17 @@ class GraphNode { GraphPtr graph_; ComputeGraphPtr compute_graph_; bool build_flag_; + // load_flag_ is true if more than 1 model were loaded bool load_flag_; bool async_; GeModelPtr ge_model_; GeRootModelPtr ge_root_model_; BlockingQueue sem_; + // consist with graph_count of same graph_id in graph_manager + uint32_t load_count_ = 0; + // total times of loading a graph with same graph_id. 
+ uint32_t load_record_ = 0; + std::mutex load_count_mu_; }; using GraphNodePtr = std::shared_ptr; @@ -249,6 +265,7 @@ struct GraphManagerOptions { std::string save_original_model; std::string build_mode; std::string build_step; + std::string tuning_path; std::string input_shape; std::string dynamic_dims; int32_t dynamic_node_type = -1; @@ -275,7 +292,8 @@ struct GraphManagerOptions { is_single_op(false), save_original_model("false"), build_mode(""), - build_step("") {} + build_step(""), + tuning_path(""){} }; } // namespace ge diff --git a/ge/graph/manager/graph_mem_allocator.cc b/ge/graph/manager/graph_mem_allocator.cc index 428b08ae..24e75356 100755 --- a/ge/graph/manager/graph_mem_allocator.cc +++ b/ge/graph/manager/graph_mem_allocator.cc @@ -49,6 +49,8 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, size_t memory_size uint8_t *memory_addr = nullptr; if (rtMalloc(reinterpret_cast(&memory_addr), memory_size, memory_type_) != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, purpose:%s, size:%zu, device_id:%u", + purpose.c_str(), memory_size, device_id); GELOGE(ge::INTERNAL_ERROR, "MemoryAllocator::MallocMemory device_id = %u," " size= %lu", @@ -66,6 +68,7 @@ Status MemoryAllocator::FreeMemory(uint8_t *memory_addr, uint32_t device_id) con GELOGI("MemoryAllocator::FreeMemory device_id = %u", device_id); auto rtRet = rtFree(memory_addr); if (rtRet != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFree fail, device_id:%u", device_id); GELOGE(rtRet, "MemoryAllocator::MallocMemory device_id = %u", device_id); return RT_ERROR_TO_GE_STATUS(rtRet); } @@ -84,6 +87,8 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, const string &memo uint8_t *memory_addr = MallocMemory(purpose, memory_size, device_id); if (memory_addr == nullptr) { + REPORT_CALL_ERROR("E19999", "Malloc Memory fail, purpose:%s, memory_key:%s, memory_size:%zu, device_id:%u", + purpose.c_str(), memory_key.c_str(), memory_size, device_id); 
GELOGE(ge::INTERNAL_ERROR, "MemoryAllocator::MallocMemory failed," " memory_key[%s], size = %lu.", @@ -120,6 +125,8 @@ Status MemoryAllocator::FreeMemory(const string &memory_key, uint32_t device_id) } if (FreeMemory(it->second.memory_addr_, device_id) != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Free Memory fail, memory_key:%s, device_id:%u", + memory_key.c_str(), device_id); GELOGE(ge::INTERNAL_ERROR, "MemoryAllocator::FreeMemory rtFree failed," " memory_key[%s]", @@ -169,6 +176,7 @@ Status MemManager::Initialize(const std::vector &memory_type) { memory_allocator_map_[index] = memory_allocator; GELOGI("Create MemoryAllocator memory type[%u] success.", index); } else { + REPORT_CALL_ERROR("E19999", "New MemoryAllocator fail, index:%u", index); GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc MemoryAllocator failed."); } } else { diff --git a/ge/graph/manager/graph_var_manager.cc b/ge/graph/manager/graph_var_manager.cc index d0292885..5d440f00 100755 --- a/ge/graph/manager/graph_var_manager.cc +++ b/ge/graph/manager/graph_var_manager.cc @@ -39,6 +39,8 @@ VarResource::~VarResource() { ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr, rtMemType_t &memory_type) { if (dev_ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "Param dev_ptr is nullptr, var_name:%s, session_id:%lu, " + "check invalid", var_name.c_str(), session_id_); GELOGE(FAILED, "[GetVarAddr] dev_ptr is null!"); return FAILED; } @@ -47,6 +49,9 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens auto iter = var_addr_mgr_map_.find(var_key); if (iter == var_addr_mgr_map_.end()) { + REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, " + "check invalid", var_key.c_str(), var_name.c_str(), + session_id_); GELOGE(FAILED, "VarResource::GetVarAddr failed, var_key %s", var_key.c_str()); return FAILED; } @@ -102,6 +107,9 @@ ge::Status VarResource::SaveVarAddr(const 
std::string &var_name, const ge::GeTen return SUCCESS; } + REPORT_INNER_ERROR("E19999", "var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu, " + "check invalid", var_key.c_str(), var_name.c_str(), + session_id_); GELOGE(FAILED, "VarResource::SaveVarAddr, var_key %s save addr conflict", var_key.c_str()); return FAILED; } @@ -136,6 +144,8 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O } if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Param op_desc is nullptr, var_name:%s, session_id:%lu, check invalid", + var_name.c_str(), session_id_); GELOGE(FAILED, "[RenewCurVarDesc] renew var desc fail! input opdesc is null!"); return FAILED; } @@ -152,6 +162,9 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O cur_var_tensor_desc_map_[var_name] = curr_desc; auto iter = var_addr_mgr_map_.find(key); if (iter == var_addr_mgr_map_.end()) { + REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s), " + "check invalid", key.c_str(), var_name.c_str(), + session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "[RenewCurVarDesc] can't find ele with key [%s]", key.c_str()); return FAILED; } @@ -271,11 +284,15 @@ Status HbmMemResource::AssignVarMem(const std::string &var_name, uint64_t size, uint64_t real_size = size; total_size_ = VarManager::Instance(session_id)->GetVarMemMaxSize(); if (total_size_ < var_mem_size_) { + REPORT_INNER_ERROR("E19999", "VarMemMaxSize:%lu < var_mem_size_:%lu, var_size:%lu, var_name:%s, check invalid" + "", total_size_, var_mem_size_, size, var_name.c_str()); GELOGE(PARAM_INVALID, "total_size_: %lu is smaller than var_mem_size_: %lu", total_size_, var_mem_size_); return PARAM_INVALID; } uint64_t free_size = total_size_ - var_mem_size_; if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) { + REPORT_INNER_ERROR("E19999", "free_size:%lu not enough, 
var_align_size:%lu, var_name:%s, check invalid", + free_size, size, var_name.c_str()); GELOGE(PARAM_INVALID, "Out of memory : current var size[%lu] exceeds total var size[%lu]", size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_); return PARAM_INVALID; @@ -299,6 +316,8 @@ Status HbmMemResource::AssignVarMem(const std::string &var_name, uint64_t size, Status RdmaMemResource::AssignVarMem(const std::string &var_name, uint64_t size, uint64_t session_id, size_t &address) { uint8_t *buffer = MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Malloc(size); if (buffer == nullptr) { + REPORT_CALL_ERROR("E19999", "malloc rdma memory fail, var_size:%lu, var_name:%s", + size, var_name.c_str()); GELOGE(MEMALLOC_FAILED, "Failed to malloc rdma memory for node %s, size = %lu", var_name.c_str(), size); return MEMALLOC_FAILED; } @@ -347,14 +366,18 @@ ge::Status VarManager::Init(const uint32_t &version, const uint64_t &session_id, const uint64_t &job_id) { std::lock_guard lock(mutex_); GELOGI("VarManager::Init, session id = %lu.", session_id); - version_ = version; - device_id_ = device_id; - session_id_ = session_id; - job_id_ = job_id; - var_resource_ = std::unique_ptr(new (std::nothrow) VarResource(session_id_)); if (var_resource_ == nullptr) { - GELOGW("VarManager has not been init."); - return ge::INTERNAL_ERROR; + version_ = version; + device_id_ = device_id; + session_id_ = session_id; + job_id_ = job_id; + var_resource_ = std::unique_ptr(new (std::nothrow) VarResource(session_id_)); + if (var_resource_ == nullptr) { + GELOGW("VarManager init failed session id = %lu.", session_id); + return ge::INTERNAL_ERROR; + } + } else { + GELOGW("VarManager::has been inited, session id = %lu.", session_id); } return SUCCESS; } @@ -444,6 +467,8 @@ int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { } if (mem_resource == nullptr) { + REPORT_INNER_ERROR("E19999", "Find no mem_resource in map, memory_type:%d, session_id:%lu", + memory_type, 
session_id_); GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); return 0; } @@ -457,6 +482,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { if (iter == mem_resource_map_.end()) { mem_resource = MemResource::BuildMemResourceFromType(memory_type); if (mem_resource == nullptr) { + REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", + memory_type, session_id_); GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); return ge::INTERNAL_ERROR; } else { @@ -467,6 +494,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { } if (mem_resource == nullptr) { + REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", + memory_type, session_id_); GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); return FAILED; } @@ -485,6 +514,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen size_t mem_offset = 0; ge::Status result = TensorUtils::GetSize(tensor_desc, tensor_desc_size); if (result != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get size from tensor fail, var_name:%s, memory_type:%d, session_id:%lu", + var_name.c_str(), memory_type, session_id_); GELOGE(result, "get size from TensorDesc failed"); return result; } @@ -494,6 +525,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen if (it == mem_resource_map_.end()) { mem_resource = MemResource::BuildMemResourceFromType(memory_type); if (mem_resource == nullptr) { + REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", + memory_type, session_id_); GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); return ge::INTERNAL_ERROR; } else { @@ -504,6 +537,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen } if (mem_resource == nullptr) { + REPORT_INNER_ERROR("E19999", "MemResource 
is invalid, memory_type:%d, session_id:%lu", + memory_type, session_id_); GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid, memory_type = %u.", memory_type); return ge::INTERNAL_ERROR; } @@ -513,6 +548,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen return ge::INTERNAL_ERROR; } if (var_resource_ == nullptr) { + REPORT_INNER_ERROR("E19999", "VarManager has not been init, memory_type:%d, session_id:%lu, " + "check invalid", memory_type, session_id_); GELOGW("VarManager has not been init."); return ge::INTERNAL_ERROR; } @@ -631,6 +668,9 @@ ge::Status VarManager::RenewCurVarDesc(const std::string &var_name, ge::OpDescPt GELOGD("VarManager::RenewCurVarDesc var_name = %s.", var_name.c_str()); if (var_resource_ == nullptr) { + REPORT_INNER_ERROR("E19999", "VarManager has not been init, op:%s(%s), session_id:%lu, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + session_id_); GELOGE(ge::INTERNAL_ERROR, "VarManager has not been init."); return ge::INTERNAL_ERROR; } @@ -782,6 +822,8 @@ Status VarManager::SetMemoryMallocSize(const map &options) { var_mem_logic_base_ = graph_mem_max_size_ + kGraphMemoryBuffer; if (var_mem_logic_base_ > kMaxMemorySize) { + REPORT_INNER_ERROR("E19999", "var_login_base:%zu can not exeed limit:%zu, session_id:%lu, check invalid", + var_mem_logic_base_, kMaxMemorySize, session_id_); GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kMemoryVarLogicBase : %zu can not exceed max memory size : %zu.", var_mem_logic_base_, kMaxMemorySize); return ge::GE_GRAPH_OPTIONS_INVALID; @@ -789,6 +831,8 @@ Status VarManager::SetMemoryMallocSize(const map &options) { use_max_mem_size_ = graph_mem_max_size_ + var_mem_max_size_; if (use_max_mem_size_ > kMaxMemorySize) { + REPORT_INNER_ERROR("E19999", "all mem_use size:%zu can not exeed limit:%zu, session_id:%lu, check invalid", + use_max_mem_size_, kMaxMemorySize, session_id_); GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kUseMaxMemorySize : %zu can not exceed max memory 
size : %zu.", use_max_mem_size_, kMaxMemorySize); return ge::GE_GRAPH_OPTIONS_INVALID; @@ -799,6 +843,8 @@ Status VarManager::SetMemoryMallocSize(const map &options) { Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { if (memory_size.empty()) { + REPORT_INNER_ERROR("E19999", "Param memory_size is empty, session_id:%lu, check invalid", + session_id_); GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input is empty."); return GE_GRAPH_OPTIONS_INVALID; } @@ -824,15 +870,23 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { for (char c : split) { if (!isdigit(c)) { + REPORT_INNER_ERROR("E19999", "Param memory_size:%s contains non digit, session_id:%lu, check invalid", + memory_size.c_str(), session_id_); GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input contains non digit."); return GE_GRAPH_OPTIONS_INVALID; } } uint64_t num = std::strtoul(split.c_str(), nullptr, 0); GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(result, static_cast(num)), + REPORT_INNER_ERROR("E19999", "Param memory_size:%s will overflow after multi all, session_id:%lu, " + "check invalid", memory_size.c_str(), + session_id_); GELOGE(FAILED, "Input memory size is out of range."); return FAILED); if ((num > kMaxMemorySize) || (result * static_cast(num) > kMaxMemorySize)) { + REPORT_INNER_ERROR("E19999", "Param memory_size:%s after multi will exceed limit:%lu, session_id:%lu, " + "check invalid", memory_size.c_str(), kMaxMemorySize, + session_id_); GELOGE(FAILED, "Input memory size can not exceed max memory size : %zu.", kMaxMemorySize); return FAILED; } @@ -936,6 +990,7 @@ VarManager *VarManagerPool::GetVarManager(uint64_t session_id) { VarManager *var_manager = new (std::nothrow) VarManager(session_id); if (var_manager == nullptr) { + REPORT_INNER_ERROR("E19999", "New VarManager fail, session_id:%lu", session_id); GELOGE(INTERNAL_ERROR, "VarManager::Instance find session by " "session_id[%lu] failed.", diff --git 
a/ge/graph/manager/host_mem_allocator.cc b/ge/graph/manager/host_mem_allocator.cc index ca2b5124..98f9a313 100644 --- a/ge/graph/manager/host_mem_allocator.cc +++ b/ge/graph/manager/host_mem_allocator.cc @@ -34,6 +34,7 @@ uint8_t *HostMemAllocator::Malloc(size_t size) { std::lock_guard lock(mutex_); std::shared_ptr aligned_ptr = MakeShared(size); if (aligned_ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "New AlignedPtr fail"); GELOGE(INTERNAL_ERROR, "make shared_ptr for AlignedPtr failed"); return nullptr; } @@ -44,6 +45,7 @@ uint8_t *HostMemAllocator::Malloc(size_t size) { Status HostMemAllocator::Free(const void *memory_addr) { if (memory_addr == nullptr) { + REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, check invalid"); GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); return GE_GRAPH_FREE_FAILED; } @@ -51,6 +53,7 @@ Status HostMemAllocator::Free(const void *memory_addr) { std::lock_guard lock(mutex_); auto it = allocated_blocks_.find(memory_addr); if (it == allocated_blocks_.end()) { + REPORT_INNER_ERROR("E19999", "Memory_addr is not alloc before, check invalid"); GELOGE(PARAM_INVALID, "Invalid memory pointer"); return PARAM_INVALID; } diff --git a/ge/graph/manager/host_mem_manager.cc b/ge/graph/manager/host_mem_manager.cc index 60a7586d..40a0d1b9 100644 --- a/ge/graph/manager/host_mem_manager.cc +++ b/ge/graph/manager/host_mem_manager.cc @@ -39,6 +39,8 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) { rtMallocHostSharedMemoryOut output_para; rtError_t rt_ret = rtMallocHostSharedMemory(&input_para, &output_para); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api(rtMallocHostSharedMemory) failed, devid:[%u].", device_id); return GE_GRAPH_MEMORY_ALLOC_FAILED; } @@ -59,6 +61,8 @@ Status SharedMemAllocator::DeAllocate(SharedMemInfo &mem_info) { mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address}; rtError_t rt_ret 
= rtFreeHostSharedMemory(&free_para); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X", + rt_ret); GELOGE(RT_FAILED, "Call rt api(rtFreeHostSharedMemory) failed, ret: 0x%X.", rt_ret); return RT_FAILED; } @@ -74,6 +78,7 @@ Status HostMemManager::Initialize() { std::lock_guard lock(mutex_); allocator_ = std::unique_ptr(new (std::nothrow) SharedMemAllocator()); if (allocator_ == nullptr) { + REPORT_CALL_ERROR("E19999", "New SharedMemAllocator fail"); GELOGE(GE_GRAPH_MALLOC_FAILED, "Shared memory allocator init failed!"); return GE_GRAPH_MALLOC_FAILED; } @@ -94,6 +99,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { std::lock_guard lock(mutex_); auto iter = var_memory_base_map_.find(mem_info.op_name); if (iter != var_memory_base_map_.end()) { + REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", + mem_info.op_name.c_str()); GELOGE(FAILED, "Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); return FAILED; } @@ -107,6 +114,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size) { std::lock_guard lock(mutex_); if (var_memory_base_map_.find(op_name) == var_memory_base_map_.end()) { + REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", + op_name.c_str()); GELOGE(INTERNAL_ERROR, "Find host base base_addr failed,node name:%s!", op_name.c_str()); return INTERNAL_ERROR; } diff --git a/ge/graph/manager/memory_api.cc b/ge/graph/manager/memory_api.cc index 0798eb51..415f8088 100644 --- a/ge/graph/manager/memory_api.cc +++ b/ge/graph/manager/memory_api.cc @@ -50,6 +50,8 @@ Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t path.append(file_name); string canonical_path = RealPath(path.c_str()); if (canonical_path.empty()) { + REPORT_INNER_ERROR("E19999", "canonical_path:%s is 
empty, check invalid", + canonical_path.c_str()); GELOGE(FAILED, "Failed to get realpath of %s", path.c_str()); return FAILED; } @@ -65,12 +67,16 @@ Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t auto hcom_remote_mem_register = (HcclResult(*)(const MemRegisterAddr *, uint32_t))dlsym(handle, "HcomRegRemoteAccessMem"); if (hcom_remote_mem_register == nullptr) { + REPORT_CALL_ERROR("E19999", "Symbol HcomRegRemoteAccessMem can't find in %s, check invalid", + canonical_path.c_str()); GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function."); return FAILED; } HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len); if (hccl_ret != HCCL_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,", + hccl_ret); GELOGE(HCCL_E_INTERNAL, "Rdma mem register failed, ret: 0x%X", hccl_ret); return HCCL_E_INTERNAL; } diff --git a/ge/graph/manager/rdma_pool_allocator.cc b/ge/graph/manager/rdma_pool_allocator.cc index ed243801..c19a2159 100644 --- a/ge/graph/manager/rdma_pool_allocator.cc +++ b/ge/graph/manager/rdma_pool_allocator.cc @@ -81,6 +81,7 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) { auto device_id = GetContext().DeviceId(); GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id); if (rdma_base_addr_ != nullptr) { + REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); GELOGE(GE_MULTI_INIT, "Rdma pool has been malloced"); return GE_MULTI_INIT; } @@ -100,6 +101,7 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) { // Init with a base block. 
auto *base_block = new (std::nothrow) Block(device_id, mem_size, rdma_base_addr_); if (base_block == nullptr) { + REPORT_CALL_ERROR("E19999", "New Block failed, device_id:%u", device_id); GELOGE(GE_GRAPH_MALLOC_FAILED, "Block malloc failed"); return GE_GRAPH_MALLOC_FAILED; } @@ -118,6 +120,8 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { block_bin_.erase(it); block->allocated = true; if (block->ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "Rdmapool memory address is nullptr, device_id:%u, check invalid", + device_id); GELOGE(INTERNAL_ERROR, "Rdmapool memory address is nullptr."); return nullptr; } @@ -150,6 +154,8 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { GELOGI("Free rdma memory, device id = %u", device_id); if (memory_addr == nullptr) { + REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid", + device_id); GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); return GE_GRAPH_FREE_FAILED; } @@ -157,6 +163,8 @@ Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { std::lock_guard lock(mutex_); auto it = allocated_blocks_.find(memory_addr); if (it == allocated_blocks_.end()) { + REPORT_INNER_ERROR("E19999", "Param memory_addr is not allocated before, device_id:%u, " + "check invalid", device_id); GELOGE(PARAM_INVALID, "Invalid memory pointer"); return PARAM_INVALID; } @@ -199,6 +207,7 @@ void RdmaPoolAllocator::MergeBlocks(Block *dst, Block *src) { Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) { if (rdma_base_addr_ == nullptr) { + REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Rdma base addr is nullptr."); return INTERNAL_ERROR; } diff --git a/ge/graph/manager/trans_var_data_utils.cc b/ge/graph/manager/trans_var_data_utils.cc index cd992d3d..7c96eb95 100644 --- 
a/ge/graph/manager/trans_var_data_utils.cc +++ b/ge/graph/manager/trans_var_data_utils.cc @@ -35,18 +35,24 @@ class RtContextSwitchGuard { RtContextSwitchGuard(rtCtxMode_t mode, uint32_t device_id) : last_(nullptr), current_(nullptr) { auto ret = rtCtxGetCurrent(&last_); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, device_id:%u, ret:0x%X,", + device_id, ret); GELOGE(RT_FAILED, "Failed to get current context from rt, error-code %d", ret); return; } ret = rtCtxCreate(¤t_, mode, static_cast(device_id)); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxCreate failed, device_id:%u, ret:0x%X,", + device_id, ret); GELOGE(RT_FAILED, "Failed to create new context for device %u, error-code %d", device_id, ret); return; } ret = rtCtxSetCurrent(current_); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X,", + device_id, ret); GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id); return; } @@ -72,6 +78,8 @@ class RtContextSwitchGuard { int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) { int64_t var_size = GetSizeByDataType(desc.GetDataType()); if (var_size <= 0) { + REPORT_INNER_ERROR("E19999", "Data type:%s in desc, it's size:%ld < 0, check invalid", + TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str(), var_size); GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s", TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); return -1; @@ -89,6 +97,8 @@ Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_res auto ret = rtMemcpy(var_addr, trans_result.length, reinterpret_cast(trans_result.data.get()), trans_result.length, RT_MEMCPY_HOST_TO_DEVICE); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), + var->GetType().c_str(), trans_result.length, ret); 
GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", trans_result.length); return RT_FAILED; } @@ -110,6 +120,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); if (var_addr == nullptr) { + REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu,", + RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); GELOGE(INTERNAL_ERROR, "Failed to copy var %s from device, cant not get " "var addr from logic addr %p", @@ -124,6 +136,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt std::unique_ptr var_host(new(std::nothrow) uint8_t[var_size_bytes]); if (var_host == nullptr) { + REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu,", + var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %ld", var_size_bytes); return OUT_OF_MEMORY; } @@ -131,6 +145,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt ret = rtMemcpy(reinterpret_cast(var_host.get()), var_size_bytes, reinterpret_cast(var_addr), var_size_bytes, RT_MEMCPY_DEVICE_TO_HOST); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", + var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); GELOGE(RT_FAILED, "Failed to copy var memory from device, var %s, size %ld," " rt-error-code %u", @@ -175,6 +191,12 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats TypeUtils::DataTypeToSerialString(data_type).c_str()); auto ret = formats::TransFormat({src_data, src_format, dst_format, src_shape, dst_shape, data_type}, tmp_result); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Trans format from %s to %s, 
shape %s to %s failed, data type:%s, ret:%u,", + TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(dst_format).c_str(), + formats::ShapeToString(src_shape).c_str(), + formats::ShapeToString(dst_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); GELOGE(INTERNAL_ERROR, "Failed to trans format from %s to %s, shape %s to %s, " "data type %s error code %u", @@ -195,6 +217,10 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats auto ret = formats::TransDataType({src_data, static_cast(src_data_size), src_data_type, dst_data_type}, tmp_result); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Trans data type from %s to %s failed, input shape %s, data size %ld, ret:%u", + TypeUtils::DataTypeToSerialString(src_data_type).c_str(), + TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), + formats::ShapeToString(input_shape).c_str(), src_data_size, ret); GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %ld, error code %u", TypeUtils::DataTypeToSerialString(src_data_type).c_str(), TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), @@ -202,6 +228,8 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats return ret; } } else { + REPORT_INNER_ERROR("E19999", "Trans var data failed, the trans type %s does not supported, check invalid", + trans_info.node_type.c_str()); GELOGE(UNSUPPORTED, "Failed to trans var data, the trans type %s does not supported", trans_info.node_type.c_str()); return UNSUPPORTED; @@ -236,6 +264,8 @@ Status ReAssignVarAddr(uint64_t session_id, uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); if (var_addr == nullptr) { + REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, var_name:%s, session_id:%lu,", + RT_MEMORY_HBM, var_name.c_str(), session_id); 
GELOGE(INTERNAL_ERROR, "Failed to convert var %s logic addr to real addr", var_name.c_str()); return INTERNAL_ERROR; } @@ -263,6 +293,8 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t // Sync var data from device std::unique_ptr var_data; if (trans_road.empty()) { + REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid", + session_id); GELOGE(INTERNAL_ERROR, "Failed to get trans_road, trans_road is empty."); return INTERNAL_ERROR; } @@ -314,6 +346,10 @@ Status TransTensor(uint8_t *var_data, const NodePtr &var_src, const NodePtr &var auto ret = formats::TransDataType( {var_data, static_cast(src_data_shape_size), src_data_datatype, dst_data_datatype}, result); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Trans data type from %s to %s failed, data size %ld, ret:%u", + TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), + TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), + src_data_shape_size, ret); GELOGE(INTERNAL_ERROR, "trans var data on host failed"); return ret; }); @@ -329,7 +365,10 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src, /// unlink edges between var_fp32 and "dst_node" (need fp16) of var_fp32, add edge between var_fp16 and dst_node. /// need copy value from var_fp32 to var_fp16. 
/// [opdesc of var_src and var_dst are checked before passed in, no need to check if they are nullptr] - GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, GELOGE(FAILED, "node var is nullptr"); return FAILED); + GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, + REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is empty, session_id:%lu, device_id:%u, " + "check invalid", session_id, device_id); + GELOGE(FAILED, "node var is nullptr"); return FAILED); // src_node output_desc (fp32) GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0); auto src_data_type = output_desc.GetDataType(); @@ -447,15 +486,21 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, } std::future f = executor.commit( - [](const ge::NodePtr &node, uint64_t session_id, rtContext_t ctx, uint32_t graph_id) -> Status { + [](const ge::NodePtr &node, uint64_t session_id, rtContext_t ctx, uint32_t graph_id, + const struct ErrorMessage::Context &error_context) -> Status { + ErrorManager::GetInstance().SetErrorContext(error_context); rtError_t rt_ret = rtCtxSetCurrent(ctx); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, ret:0x%X,", + session_id, graph_id, rt_ret); GELOGE(RT_FAILED, "Failed to set context, error_code is: 0x%X.", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } uint32_t allocated_graph_id = 0; Status ret = VarManager::Instance(session_id)->GetAllocatedGraphId(node->GetName(), allocated_graph_id); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get allocated GraphId failed, session_id:%lu, graph_id:%u, ret:0x%X,", + session_id, graph_id, ret); GELOGE(INTERNAL_ERROR, "var has not been allocated, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id); return INTERNAL_ERROR; @@ -480,7 +525,7 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, } return SUCCESS; }, - node, session_id, context, graph_id); + node, session_id, context, 
graph_id, ErrorManager::GetInstance().GetErrorContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); return FAILED; @@ -503,6 +548,8 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint64_t session_id, uint32_t device_id) { GELOGD("CopyVarData start: session_id:%lu.", session_id); if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, session_id:%lu, device_id:%u, check invalid", + session_id, device_id); GELOGE(FAILED, "compute_graph is nullptr"); return FAILED; } diff --git a/ge/graph/manager/util/debug.cc b/ge/graph/manager/util/debug.cc index 2c930d1f..65aa3192 100644 --- a/ge/graph/manager/util/debug.cc +++ b/ge/graph/manager/util/debug.cc @@ -63,12 +63,16 @@ Status Debug::DumpDevMem(const char *file, const void *addr, int64_t size) { uint8_t *host_addr = nullptr; rtError_t ret = rtMallocHost(reinterpret_cast(&host_addr), size); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret: 0x%X", + size, ret); GELOGE(FAILED, "Call rt api rtMallocHost failed, ret: 0x%X", ret); return FAILED; } GE_MAKE_GUARD_RTMEM(host_addr); ret = rtMemcpy(host_addr, size, addr, size, RT_MEMCPY_DEVICE_TO_HOST); if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", + size, ret); GELOGE(FAILED, "Call rt api rtMemcpy failed, ret: 0x%X", ret); return FAILED; } diff --git a/ge/graph/manager/util/hcom_util.cc b/ge/graph/manager/util/hcom_util.cc index 53dd9410..a30321f9 100644 --- a/ge/graph/manager/util/hcom_util.cc +++ b/ge/graph/manager/util/hcom_util.cc @@ -40,6 +40,9 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, if (op_desc->GetType() == HCOMRECEIVE) { bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type); if (ret == false) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) 
fail", + HCOM_ATTR_DATA_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "op:HcomReceive, op desc no attr: dtype."); return PARAM_INVALID; } @@ -51,6 +54,10 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, auto iter = kConstOpHcclDataType.find(static_cast(src_data_type)); if (iter == kConstOpHcclDataType.end()) { + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value data_type:%s, not support in kConstOpHcclDataType now, " + "check invalid", HCOM_ATTR_DATA_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomDataType cann't support! Current Davinci Data Type : %s", op_desc->GetName().c_str(), op_desc->GetType().c_str(), @@ -76,6 +83,8 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType int &count) { GE_CHECK_NOTNULL(op_desc); if (!IsHCOMOp(op_desc->GetType())) { + REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Hcom operator."); return PARAM_INVALID; } @@ -142,6 +151,8 @@ Status HcomOmeUtil::GetHorovodCount(const ge::ConstOpDescPtr &op_desc, std::vector &kernel_hccl_infos) { GE_CHECK_NOTNULL(op_desc); if (!IsHorovodOp(op_desc->GetType())) { + REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not horovod op, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Horovod operator."); return PARAM_INVALID; } @@ -213,7 +224,11 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl if (IsHCOMOp(op_desc->GetType())) { std::string hcom_op_type; - GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), return PARAM_INVALID, + 
GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + HCOM_ATTR_REDUCE_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s Get HCOM_ATTR_REDUCE_TYPE fail, not support!", op_desc->GetName().c_str(), op_desc->GetType().c_str()); @@ -226,6 +241,9 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl } else if (hcom_op_type == "sum") { op_type = HCCL_REDUCE_SUM; } else { + REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now, " + "check invalid", HCOM_ATTR_REDUCE_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), hcom_op_type.c_str()); GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str()); return PARAM_INVALID; } @@ -234,12 +252,18 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl if (IsHorovodOp(op_desc->GetType())) { int64_t horovod_op_type; GE_CHK_BOOL_EXEC(ge::AttrUtils::GetInt(op_desc, ATTR_HOROVOD_ATTR_REDUCE_TYPE, horovod_op_type), + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s Get ATTR_HOROVOD_ATTR_REDUCE_TYPE fail, not support!", op_desc->GetName().c_str(), op_desc->GetType().c_str()); auto iter = kHorovodRedOpToHcclRedOp.find(static_cast(horovod_op_type)); if (iter == kHorovodRedOpToHcclRedOp.end()) { + REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now, " + "check invalid", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomOpType cann't support! 
Current HcomOpType : %ld", op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); return PARAM_INVALID; @@ -252,7 +276,11 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl Status HcomOmeUtil::GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &root_id) { GE_CHECK_NOTNULL(op_desc); - GE_CHK_BOOL_EXEC(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_ROOT_RANK, root_id), return PARAM_INVALID, + GE_CHK_BOOL_EXEC(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_ROOT_RANK, root_id), + REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", + HCOM_ATTR_ROOT_RANK.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return PARAM_INVALID, "HcomOmeUtil::Node %s Optype: %s Get HCOM_ATTR_ROOT_INDEX fail, not support!", op_desc->GetName().c_str(), op_desc->GetType().c_str()); @@ -293,6 +321,9 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc, std::vector &kernel_hccl_infos) { GE_CHECK_NOTNULL(op_desc); if (IsHCOMOp(op_desc->GetType()) && kernel_hccl_infos.size() != 1) { + REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1, " + "check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Hcom scenario, the number of GETaskKernelHcclInfo is invalid."); return PARAM_INVALID; } @@ -302,6 +333,10 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc, return SUCCESS; } if (kernel_hccl_infos.empty() || op_desc->GetInputsSize() != kernel_hccl_infos.size()) { + REPORT_INNER_ERROR("E19999", "Param kernel_hccl_infos.size:%zu is empty or not equal to input_desc size:%zu " + "in op:%s(%s), check invalid", + kernel_hccl_infos.size(), op_desc->GetInputsSize(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Horovod scenario, the number of GETaskKernelHcclInfo is invalid."); return PARAM_INVALID; } diff 
--git a/ge/graph/optimize/graph_optimize.cc b/ge/graph/optimize/graph_optimize.cc index 8cca5b5d..1dc349a6 100644 --- a/ge/graph/optimize/graph_optimize.cc +++ b/ge/graph/optimize/graph_optimize.cc @@ -37,6 +37,7 @@ GraphOptimize::GraphOptimize() void AddNodeInputProperty(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[AddNodeInputProperty]: compute_graph is nullptr."); return; } @@ -78,6 +79,7 @@ void AddNodeInputProperty(ComputeGraphPtr &compute_graph) { Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std::string &engine_name) { if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeSubGraph]: compute_graph is nullptr."); return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; } @@ -87,6 +89,7 @@ Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Gelib not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GraphOptimzer: GE is not initialized"); return GE_CLI_GE_NOT_INITIALIZED; } @@ -105,6 +108,9 @@ Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { Status ret = (*iter)->OptimizeFusedGraphAfterGraphSlice(*(compute_graph)); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Call OptimizeFusedGraphAfterGraphSlice failed, ret:%d, engine_name:%s, " + "graph_name:%s", ret, engine_name.c_str(), + compute_graph->GetName().c_str()); GELOGE(ret, "[OptimizeSubGraph][OptimizeFusedGraphAfterGraphSlice]: graph optimize failed, ret:%d", ret); return ret; } @@ -115,6 +121,9 @@ Status 
GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { ret = (*iter)->OptimizeFusedGraph(*(compute_graph)); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Call OptimizeFusedGraph failed, ret:%d, engine_name:%s, " + "graph_name:%s", ret, engine_name.c_str(), + compute_graph->GetName().c_str()); GELOGE(ret, "[OptimizeSubGraph][OptimizeFusedGraph]: graph optimize failed, ret:%d", ret); return ret; } @@ -132,6 +141,7 @@ Status GraphOptimize::OptimizeOriginalGraph(ComputeGraphPtr &compute_graph) { return SUCCESS; } if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeOriginalGraph]: compute_graph is nullptr."); return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; } @@ -139,6 +149,7 @@ Status GraphOptimize::OptimizeOriginalGraph(ComputeGraphPtr &compute_graph) { Status ret = SUCCESS; std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Gelib not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "OptimizeOriginalGraph failed."); return GE_CLI_GE_NOT_INITIALIZED; } @@ -155,6 +166,9 @@ Status GraphOptimize::OptimizeOriginalGraph(ComputeGraphPtr &compute_graph) { } ret = (iter->second)->OptimizeOriginalGraph(*compute_graph); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Call OptimizeOriginalGraph failed, ret:%d, engine_name:%s, " + "graph_name:%s", ret, iter->first.c_str(), + compute_graph->GetName().c_str()); GELOGE(ret, "[OptimizeOriginalGraph]: graph optimize failed, ret:%d", ret); return ret; } @@ -174,6 +188,7 @@ Status GraphOptimize::OptimizeOriginalGraphJudgeInsert(ComputeGraphPtr &compute_ Status ret = SUCCESS; std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + 
REPORT_INNER_ERROR("E19999", "Gelib not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "OptimizeOriginalGraph failed."); return GE_CLI_GE_NOT_INITIALIZED; } @@ -191,6 +206,9 @@ Status GraphOptimize::OptimizeOriginalGraphJudgeInsert(ComputeGraphPtr &compute_ GELOGI("Begin to refine running format by engine %s", iter->first.c_str()); ret = (iter->second)->OptimizeOriginalGraphJudgeInsert(*compute_graph); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Call OptimizeOriginalGraphJudgeInsert failed, ret:%d, engine_name:%s, " + "graph_name:%s", ret, iter->first.c_str(), + compute_graph->GetName().c_str()); GELOGE(ret, "[OptimizeOriginalGraphJudgeInsert]: graph optimize failed, ret:%d", ret); return ret; } @@ -201,12 +219,14 @@ Status GraphOptimize::OptimizeOriginalGraphJudgeInsert(ComputeGraphPtr &compute_ Status GraphOptimize::OptimizeOriginalGraphForQuantize(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeOriginalGraph]: compute_graph is nullptr."); return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; } std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Gelib not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "OptimizeOriginalGraph failed."); return GE_CLI_GE_NOT_INITIALIZED; } @@ -224,6 +244,9 @@ Status GraphOptimize::OptimizeOriginalGraphForQuantize(ComputeGraphPtr &compute_ } ret = iter->second->OptimizeGraphPrepare(*compute_graph); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Call OptimizeGraphPrepare failed, ret:%d, engine_name:%s, " + "graph_name:%s", ret, iter->first.c_str(), + compute_graph->GetName().c_str()); GELOGE(ret, "[OptimizeOriginalGraphForQuantize]: graph optimize failed, ret:%u", ret); return ret; } @@ -234,12 +257,14 @@ Status 
GraphOptimize::OptimizeOriginalGraphForQuantize(ComputeGraphPtr &compute_ Status GraphOptimize::OptimizeGraphBeforeBuildForRts(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeGraphBeforeBuildForRts]: compute_graph is nullptr."); return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; } std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Gelib not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "OptimizeGraphBeforeBuildForRts failed."); return GE_CLI_GE_NOT_INITIALIZED; } @@ -258,6 +283,9 @@ Status GraphOptimize::OptimizeGraphBeforeBuildForRts(ComputeGraphPtr &compute_gr } ret = iter->second->OptimizeGraphBeforeBuild(*compute_graph); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Call OptimizeGraphBeforeBuild failed, ret:%d, engine_name:%s, " + "graph_name:%s", ret, iter->first.c_str(), + compute_graph->GetName().c_str()); GELOGE(ret, "[OptimizeGraphBeforeBuildForRts]: graph optimize failed, ret:%u", ret); return ret; } @@ -268,6 +296,8 @@ Status GraphOptimize::OptimizeGraphBeforeBuildForRts(ComputeGraphPtr &compute_gr Status GraphOptimize::SetOptions(const ge::GraphManagerOptions &options) { if (options.framework_type >= static_cast(domi::FrameworkType::FRAMEWORK_RESERVED)) { + REPORT_INNER_ERROR("E19999", "Param framework_type:%d in option check invalid", + options.framework_type); GELOGE(GE_GRAPH_OPTIONS_INVALID, "Optimize Type %d invalid.", options.framework_type); return GE_GRAPH_OPTIONS_INVALID; } @@ -342,12 +372,14 @@ Status GraphOptimize::IdentifyReference(ComputeGraphPtr &compute_graph) { } Status GraphOptimize::OptimizeWholeGraph(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); 
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeWholeGraph]: compute_graph is nullptr."); return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; } std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Gelib not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "OptimizeWholeGraph failed."); return GE_CLI_GE_NOT_INITIALIZED; } @@ -366,6 +398,9 @@ Status GraphOptimize::OptimizeWholeGraph(ComputeGraphPtr &compute_graph) { ret = iter.second->OptimizeWholeGraph(*compute_graph); GE_DUMP(compute_graph, "OptimizeWholeGraph" + iter.first); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Call OptimizeWholeGraph failed, ret:%d, engine_name:%s, " + "graph_name:%s", ret, iter.first.c_str(), + compute_graph->GetName().c_str()); GELOGE(ret, "[OptimizeWholeGraph]: graph optimize failed, ret:%u", ret); return ret; } diff --git a/ge/graph/partition/dynamic_shape_partition.cc b/ge/graph/partition/dynamic_shape_partition.cc index 5736e39a..bd95d0c5 100755 --- a/ge/graph/partition/dynamic_shape_partition.cc +++ b/ge/graph/partition/dynamic_shape_partition.cc @@ -48,50 +48,23 @@ namespace ge { using Cluster = DynamicShapePartitioner::Cluster; using ClusterPtr = std::shared_ptr; -static bool IsInExperimentalMode(const ComputeGraphPtr &root_graph) { +static bool IsSingleOpScene(const ComputeGraphPtr &root_graph) { for (const auto &node : root_graph->GetAllNodes()) { GE_CHECK_NOTNULL(node->GetOpDesc()); // not do partition in single op scene. 
bool is_singleop = false; (void)AttrUtils::GetBool(node->GetOpDesc(), ATTR_SINGLE_OP_SCENE, is_singleop); if (is_singleop) { - return false; - } - - for (const auto &input_desc : node->GetOpDesc()->GetAllInputsDesc()) { - auto type = input_desc.GetDataType(); - if (type == DT_STRING || type == DT_RESOURCE || type == DT_STRING_REF) { - if (std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION") == nullptr) { - return false; - } else { - GEEVENT("In dynamic shape scene, model contains data type:" - "DT_STRING/DT_RESOURCE/DT_STRING_REF may not be supported well " - "temporarily, please retry with \"unset EXPERIMENTAL_DYNAMIC_PARTITION\"."); - break; - } - } - } - for (const auto &output_desc : node->GetOpDesc()->GetAllOutputsDesc()) { - auto type = output_desc.GetDataType(); - if (type == DT_STRING || type == DT_RESOURCE || type == DT_STRING_REF) { - if (std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION") == nullptr) { - return false; - } else { - GEEVENT("In dynamic shape scene, model contains data type:" - "DT_STRING/DT_RESOURCE/DT_STRING_REF may not be supported well " - "temporarily, please retry with \"unset EXPERIMENTAL_DYNAMIC_PARTITION\"."); - break; - } - } + return true; } } - return true; + return false; } Status DynamicShapePartitioner::Partition() { REQUIRE_NOT_NULL(root_graph_, "Graph is nullptr."); - if (!IsInExperimentalMode(root_graph_)) { - GELOGD("Skip dynamic shape partition as not in experimental mode."); + if (IsSingleOpScene(root_graph_)) { + GELOGD("Skip dynamic shape partition as in single op scene."); REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, false), "Failed set dynamic shape partitioned flag on root graph."); return SUCCESS; diff --git a/ge/graph/passes/addn_pass.cc b/ge/graph/passes/addn_pass.cc index c8f820fc..3e2d3f06 100644 --- a/ge/graph/passes/addn_pass.cc +++ b/ge/graph/passes/addn_pass.cc @@ -26,12 +26,14 @@ const size_t kInputSizeSingle = 1; Status AddNPass::Run(NodePtr &node) { GELOGD("AddNPass running"); if 
(node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [node] must not be null."); return PARAM_INVALID; } if (node->GetType() == ADDN) { if (node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param op_desc of node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Param [node] op desc is null."); return PARAM_INVALID; } } diff --git a/ge/graph/passes/aicpu_constant_folding_pass.cc b/ge/graph/passes/aicpu_constant_folding_pass.cc index 0331e2e6..b5a989c8 100644 --- a/ge/graph/passes/aicpu_constant_folding_pass.cc +++ b/ge/graph/passes/aicpu_constant_folding_pass.cc @@ -122,6 +122,7 @@ bool AicpuConstantFoldingPass::CheckInput(const NodePtr &node, vector &weight_vec, vector &input_addrs) { if (weight_vec.empty()) { + REPORT_INNER_ERROR("E19999", "Param weight_vec is empty, check invalid"); GELOGE(FAILED, "Weight is null"); return FAILED; } @@ -132,6 +133,8 @@ Status AicpuConstantFoldingPass::GetInputAddrs(const vector &w rtError_t rt_ret = rtMemcpy(input_addr, weight->GetData().size(), weight->GetData().data(), weight->GetData().size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", + weight->GetData().size(), rt_ret); GELOGE(rt_ret, "rtMemcpy error"); GE_CHK_RT(rtFree(input_addr)); return FAILED; @@ -145,6 +148,8 @@ Status AicpuConstantFoldingPass::GetOutputAddrs(const OpDescPtr &node_desc, vector &output_addrs) { if (node_desc->GetOutputsSize() == 0) { + REPORT_INNER_ERROR("E19999", "Output desc size of op:%s(%s) is 0, check invalid", + node_desc->GetName().c_str(), node_desc->GetType().c_str()); GELOGE(FAILED, "Output size is 0 "); return FAILED; } @@ -171,6 +176,8 @@ Status AicpuConstantFoldingPass::GenerateDataPtrInfo(const vector &out if (result_summary.shape_data_size != 0) { rtError_t rt_ret = rtMalloc(&shape_data_addr,
result_summary.shape_data_size, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, size:%lu, ret = 0x%X", + result_summary.shape_data_size, rt_ret); GELOGE(rt_ret, "rtMalloc error"); GE_CHK_RT(rtFree(raw_data_addr)); return FAILED; @@ -200,6 +207,7 @@ Status AicpuConstantFoldingPass::GenerateDataPtrInfo(const vector &out Status AicpuConstantFoldingPass::UpdateWorkSpaceAddr(string &task_info, STR_FWK_OP_KERNEL &task) { // Update the workspace_addr if (task_info.empty()) { + REPORT_INNER_ERROR("E19999", "Param task_info is empty, check invalid"); GELOGE(FAILED, "task_info is empty "); return FAILED; } @@ -208,6 +216,8 @@ Status AicpuConstantFoldingPass::UpdateWorkSpaceAddr(string &task_info, STR_FWK_ rtError_t rt_ret = rtMemcpy(workspace_addr, task_info.size(), task_info.data(), task_info.size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", + task_info.size(), rt_ret); GELOGE(rt_ret, "rtMemcpy error"); GE_CHK_RT(rtFree(workspace_addr)); return FAILED; @@ -221,6 +231,7 @@ Status AicpuConstantFoldingPass::UpdateWorkSpaceAddr(string &task_info, STR_FWK_ Status AicpuConstantFoldingPass::UpdateInputAndOutputAddr(const vector &io_addrs, STR_FWK_OP_KERNEL &task) { auto addrs_size = sizeof(uint64_t) * (io_addrs.size()); if (addrs_size <= 0) { + REPORT_INNER_ERROR("E19999", "Param io_addrs size is 0, check invalid"); GELOGE(FAILED, "addrs_size is less than 1 "); return FAILED; } @@ -228,6 +239,8 @@ Status AicpuConstantFoldingPass::UpdateInputAndOutputAddr(const vector GE_CHK_RT_RET(rtMalloc(&input_output_addr, addrs_size, RT_MEMORY_HBM)); rtError_t rt_ret = rtMemcpy(input_output_addr, addrs_size, io_addrs.data(), addrs_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", + addrs_size, rt_ret); GELOGE(rt_ret, "rtMemcpy error"); 
GE_CHK_RT(rtFree(input_output_addr)); return FAILED; @@ -284,6 +297,8 @@ Status AicpuConstantFoldingPass::UpdateMemCopyAddr(string &task_info, const vect GE_CHK_RT_RET(rtMalloc(&input_addr_ptr, data_size, RT_MEMORY_HBM)); rtError_t rt_ret = rtMemcpy(input_addr_ptr, data_size, item.data(), data_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", + data_size, rt_ret); GELOGE(rt_ret, "rtMemcpy error"); GE_CHK_RT(rtFree(input_addr_ptr)); return FAILED; @@ -312,11 +327,14 @@ Status AicpuConstantFoldingPass::LaunchSingleOpRunTask(const NodePtr &node, cons void *task_buf = nullptr; auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "GeLib is not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized"); return GE_CLI_GE_NOT_INITIALIZED; } auto kernel_builder = OpsKernelBuilderManager::Instance().GetOpsKernelBuilder(kKernelLibName); if (kernel_builder == nullptr) { + REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed", + kKernelLibName); GELOGE(FAILED, "Get op kernel info store failed"); return FAILED; } @@ -367,11 +385,14 @@ Status AicpuConstantFoldingPass::LaunchMemCopyTask(const vector &data_ void *task_buf = nullptr; auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "GeLib is not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized"); return GE_CLI_GE_NOT_INITIALIZED; } auto kernel_builder = OpsKernelBuilderManager::Instance().GetOpsKernelBuilder(kKernelLibName); if (kernel_builder == nullptr) { + REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed", + kKernelLibName); GELOGE(FAILED, "Get op kernel info store failed"); return FAILED; } @@ -428,6 +449,8 @@ Status 
AicpuConstantFoldingPass::GenerateTaskForLaunch(STR_FWK_OP_KERNEL &aicpu_ rtError_t rt_ret = rtMemcpy(task_buf, sizeof(STR_FWK_OP_KERNEL), reinterpret_cast(&aicpu_task), sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret = 0x%X", + sizeof(STR_FWK_OP_KERNEL), rt_ret); GELOGE(rt_ret, "rtMemcpy error"); GE_CHK_RT(rtFree(task_buf)); return FAILED; @@ -457,41 +480,57 @@ Status AicpuConstantFoldingPass::KernelLaunch(void *task_buf) { rtError_t rt_ret = rtModelCreate(&model, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelCreate failed, ret = 0x%X", + rt_ret); GELOGE(rt_ret, "create model failed."); return FAILED; } rt_ret = rtStreamCreate(&stream, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamCreate failed, ret = 0x%X", + rt_ret); GELOGE(rt_ret, "create stream failed."); return FAILED; } rt_ret = rtModelBindStream(model, stream, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelBindStream failed, ret = 0x%X", + rt_ret); GELOGE(rt_ret, "rtModelBindStream failed."); return FAILED; } rt_ret = rtKernelLaunchEx(task_buf, sizeof(STR_FWK_OP_KERNEL), 0, stream); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtKernelLaunchEx failed, ret = 0x%X", + rt_ret); GELOGE(rt_ret, "rtKernelLaunchEx failed."); return FAILED; } rt_ret = rtModelLoadComplete(model); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtModelLoadComplete failed, ret = 0x%X", + rt_ret); GELOGE(rt_ret, "rtModelLoadComplete failed."); return FAILED; } rt_ret = rtStreamCreate(&stream_run, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamCreate failed, ret = 0x%X", + rt_ret); GELOGE(rt_ret, "create run stream failed."); return FAILED; } rt_ret = rtModelExecute(model, stream_run, 0); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call
rtModelExecute failed, ret = 0x%X", + rt_ret); GELOGE(rt_ret, "rtModelExecute failed."); return FAILED; } rt_ret = rtStreamSynchronize(stream_run); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtStreamSynchronize failed, ret = 0x%X", + rt_ret); GELOGE(rt_ret, "rtStreamSynchronize failed."); return FAILED; } @@ -501,6 +540,9 @@ Status AicpuConstantFoldingPass::KernelLaunch(void *task_buf) { Status AicpuConstantFoldingPass::GenerateGeTensor(const OpDescPtr &node_desc, const vector &data_vec, vector &outputs) { if ((node_desc->GetOutputsSize() * kDouble) != data_vec.size()) { + REPORT_INNER_ERROR("E19999", "Output desc size:%zu of op:%s(%s), after multi 2, not equal to data_vec.size:%zu, " + "check invalid", node_desc->GetOutputsSize(), + node_desc->GetName().c_str(), node_desc->GetType().c_str(), data_vec.size()); GELOGE(FAILED, "node[%s] something wrong with output size", node_desc->GetName().c_str()); return FAILED; } @@ -509,6 +551,7 @@ Status AicpuConstantFoldingPass::GenerateGeTensor(const OpDescPtr &node_desc, co auto output_tensor_desc = node_desc->GetOutputDesc(static_cast(i)); GeTensorPtr output_ptr = MakeShared(output_tensor_desc); if (output_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "node[%s] something wrong with construct GeTensor", node_desc->GetName().c_str()); return FAILED; } @@ -516,6 +559,8 @@ Status AicpuConstantFoldingPass::GenerateGeTensor(const OpDescPtr &node_desc, co uint64_t raw_data_size = raw_data_info.data_size; std::unique_ptr data_addr(new (std::nothrow) uint8_t[raw_data_size]()); if (data_addr == nullptr) { + REPORT_CALL_ERROR("E19999", "New Buffer failed, size:%lu", + raw_data_size); GELOGE(MEMALLOC_FAILED, "new data_addr failed"); return INTERNAL_ERROR; } @@ -539,6 +584,8 @@ Status AicpuConstantFoldingPass::GenerateGeTensor(const OpDescPtr &node_desc, co uint64_t dim_num = shape_data_size / sizeof(uint64_t); std::unique_ptr shape_addr(new (std::nothrow) 
int64_t[dim_num]()); if (shape_addr == nullptr) { + REPORT_CALL_ERROR("E19999", "New Buffer failed, size:%lu", + dim_num); GELOGE(MEMALLOC_FAILED, "new shape_addr failed"); return INTERNAL_ERROR; } @@ -584,17 +631,22 @@ bool AicpuConstantFoldingPass::IsSkipFold(const ge::NodePtr &node) { } auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + REPORT_INNER_ERROR("E19999", "GeLib is not init before, check invalid"); GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized"); return true; } OpsKernelInfoStorePtr kernel_info = instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(kKernelLibName); if (kernel_info == nullptr) { + REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed", + kKernelLibName); GELOGE(FAILED, "Get op kernel info store failed"); return true; } std::string check_result; kernel_info->opsFlagCheck(*node, check_result); if (check_result.empty()) { + REPORT_CALL_ERROR("E19999", "Call opsFlagCheck failed, ops kernel name:%s, op:%s(%s)", + kKernelLibName, node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Get op check_result failed"); return true; } diff --git a/ge/graph/passes/assert_pass.cc b/ge/graph/passes/assert_pass.cc index 79f75f53..20734d56 100644 --- a/ge/graph/passes/assert_pass.cc +++ b/ge/graph/passes/assert_pass.cc @@ -30,10 +30,12 @@ namespace ge { Status AssertPass::Run(NodePtr &node) { GELOGD("AssertPass running"); if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [node] must not be null."); return PARAM_INVALID; } if (node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param op_desc of node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [node] [opDesc] must not be null."); return PARAM_INVALID; } @@ -93,6 +95,8 @@ Status AssertPass::RemoveUnusedNode(std::vector &nodes_unused) { } if (IsolateAndDeleteNode(node, assert_io_map) != SUCCESS) { +
REPORT_INNER_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } } diff --git a/ge/graph/passes/assign_remove_pass.cc b/ge/graph/passes/assign_remove_pass.cc index 4faa04f6..43a95516 100644 --- a/ge/graph/passes/assign_remove_pass.cc +++ b/ge/graph/passes/assign_remove_pass.cc @@ -57,12 +57,18 @@ Status AssignRemovePass::OptimizedAssignNode(NodePtr &assign_node) { const auto &ref_in_anchor = assign_node->GetInDataAnchor(kAssignRefInputIndex); const auto &value_in_anchor = assign_node->GetInDataAnchor(kAssignValueInputIndex); if ((ref_in_anchor == nullptr) || (value_in_anchor == nullptr)) { + REPORT_INNER_ERROR("E19999", "Index %d or %d input anchor of node:%s(%s) is nullptr, check invalid", + kAssignRefInputIndex, kAssignValueInputIndex, + assign_node->GetName().c_str(), assign_node->GetType().c_str()); GELOGE(FAILED, "In data anchor is null, node:%s", assign_node->GetName().c_str()); return FAILED; } const auto &ref_peer_anchor = ref_in_anchor->GetPeerOutAnchor(); const auto &value_peer_anchor = value_in_anchor->GetPeerOutAnchor(); if ((ref_peer_anchor == nullptr) || (value_peer_anchor == nullptr)) { + REPORT_INNER_ERROR("E19999", "Index %d or %d input anchor of node:%s(%s), peer anchor is nullptr, check invalid", + kAssignRefInputIndex, kAssignValueInputIndex, + assign_node->GetName().c_str(), assign_node->GetType().c_str()); GELOGE(FAILED, "Peer data anchor is null, node:%s", assign_node->GetName().c_str()); return FAILED; } @@ -79,6 +85,8 @@ Status AssignRemovePass::OptimizedAssignNode(NodePtr &assign_node) { /// GELOGD("Optimization for assign_node %s start", assign_node->GetName().c_str()); if (IsolateAndDeleteNode(assign_node, {kAssignRefInputIndex}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + assign_node->GetName().c_str(), assign_node->GetType().c_str()); GELOGE(FAILED, "Isolate and delete assign_node %s failed.",
assign_node->GetName().c_str()); return FAILED; } @@ -86,16 +94,26 @@ Status AssignRemovePass::OptimizedAssignNode(NodePtr &assign_node) { const auto &ref_input = ref_peer_anchor->GetOwnerNode()->GetOpDesc(); const auto &value_input = value_peer_anchor->GetOwnerNode()->GetOpDesc(); if ((ref_input == nullptr) || (value_input == nullptr)) { + REPORT_INNER_ERROR("E19999", "Input index %d or %d of node:%s(%s), peer op is nullptr, check invalid", + kAssignRefInputIndex, kAssignValueInputIndex, + assign_node->GetName().c_str(), assign_node->GetType().c_str()); GELOGE(FAILED, "value input is null"); return FAILED; } // variable has and only has one input if (ref_input->UpdateInputDesc(0, value_input->GetOutputDesc(value_peer_anchor->GetIdx())) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Input index %d of node:%s(%s), update it's peer op input:0 desc failed", + kAssignRefInputIndex, assign_node->GetName().c_str(), assign_node->GetType().c_str()); GELOGE(FAILED, "Update input_desc for variable %s failed.", ref_input->GetName().c_str()); return FAILED; } if (GraphUtils::AddEdge(value_peer_anchor, ref_peer_anchor->GetOwnerNode()->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(out_index:%d) and op:%s(%s)(in_index:0) failed", + value_peer_anchor->GetOwnerNode()->GetName().c_str(), + value_peer_anchor->GetOwnerNode()->GetType().c_str(), value_peer_anchor->GetIdx(), + ref_peer_anchor->GetOwnerNode()->GetName().c_str(), + ref_peer_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(FAILED, "Add data edge %s->%s failed", value_input->GetName().c_str(), ref_input->GetName().c_str()); return FAILED; } @@ -104,6 +122,9 @@ Status AssignRemovePass::OptimizedAssignNode(NodePtr &assign_node) { value_input->GetName().c_str(), ref_input->GetName().c_str()); if (!AttrUtils::SetStr(value_input->MutableOutputDesc(value_peer_anchor->GetIdx()), ASSIGN_VAR_NAME, ref_input->GetName())) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to 
output:%d desc of node:%s(%s) failed", + ASSIGN_VAR_NAME.c_str(), value_peer_anchor->GetIdx(), + value_input->GetName().c_str(), value_input->GetType().c_str()); GELOGE(FAILED, "Set attr ASSIGN_VAR_NAME failed."); return FAILED; } @@ -136,6 +157,9 @@ Status AssignRemovePass::TransformAttr(NodePtr &node) { GELOGD("add attr ASSIGN_VAR_NAME on node %s, var_name=%s", in_node->GetName().c_str(), assign_var_name.c_str()); if (!AttrUtils::SetStr(in_node->GetOpDesc()->MutableOutputDesc(peer_data_anchor->GetIdx()), ASSIGN_VAR_NAME, assign_var_name)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to output:%d desc of node:%s(%s) failed", + ASSIGN_VAR_NAME.c_str(), peer_data_anchor->GetIdx(), + in_node->GetName().c_str(), in_node->GetType().c_str()); GELOGE(FAILED, "Set attr ASSIGN_VAR_NAME failed."); return FAILED; } diff --git a/ge/graph/passes/atomic_addr_clean_pass.cc b/ge/graph/passes/atomic_addr_clean_pass.cc index 7c6ed8ce..24b84fa0 100755 --- a/ge/graph/passes/atomic_addr_clean_pass.cc +++ b/ge/graph/passes/atomic_addr_clean_pass.cc @@ -93,7 +93,7 @@ bool AtomicAddrCleanPass::CheckAtomicFromOpsKernel(const NodePtr &node) { in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() != nullptr) { auto peer_in_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode(); if (peer_in_node->GetType() == DATA) { - GELOGI("Recognized atomic op %s from %s engine and input is DATA.", node->GetName().c_str(), + GELOGI("Recognized atomic op %s from %s engine and input is DATA.", node->GetName().c_str(), op_info.engine.c_str()); return false; } @@ -126,11 +126,11 @@ bool AtomicAddrCleanPass::IsOutputIndexPeerInputAtomic(const NodePtr &node, int6 bool AtomicAddrCleanPass::CheckSkipInsertInLoopGraph(const NodePtr &node) { OpDescPtr op_desc = node->GetOpDesc(); - std::map> node_workspace_offset; + std::map> atomic_workspace_index_size; bool has_atomic_input = op_desc->HasAttr(ATOMIC_ATTR_INPUT_INDEX); bool has_atomic_output = op_desc->HasAttr(ATOMIC_ATTR_OUTPUT_INDEX); - node_workspace_offset = 
op_desc->TryGetExtAttr(EXT_ATTR_ATOMIC_WORKSPACE_OFFSET, node_workspace_offset); - if (!has_atomic_input && has_atomic_output && node_workspace_offset.empty()) { + atomic_workspace_index_size = op_desc->TryGetExtAttr(EXT_ATTR_ATOMIC_WORKSPACE_INFO, atomic_workspace_index_size); + if (!has_atomic_input && has_atomic_output && atomic_workspace_index_size.empty()) { std::vector atomic_output_index; (void) ge::AttrUtils::GetListInt(op_desc, ATOMIC_ATTR_OUTPUT_INDEX, atomic_output_index); bool is_all_output_peer_also_atomic = true; @@ -222,6 +222,39 @@ Status AtomicAddrCleanPass::HandleNormalGraph(ComputeGraphPtr &graph, const vect } } } + return LinkToPotentialPrecedenceNode(graph, clean_addr_node); +} + +// Add control edges from atomic clean node to all potential precedence nodes which may execute before atomic clean +// node. We hope that atomic clean node can execute with the highest priority in the entire graph. Because of stream +// concurrency mechanism, only placing it at the head can not ensure that priority. Therefore, we need to add control +// edges from atomic clean node to the nodes that may be the first node on each stream. Generally, the first nodes on +// each stream are successors of Data/Variable, and Data/Variable won't generate task or execute, so we link to the +// successors of Data/Variable. 
+Status AtomicAddrCleanPass::LinkToPotentialPrecedenceNode(ComputeGraphPtr &graph, NodePtr &atomic_clean_node) { + GELOGD("Start to add control edges from %s to all second-nodes behind first-nodes which have no input.", + atomic_clean_node->GetName().c_str()); + auto out_ctrl_anchor = atomic_clean_node->GetOutControlAnchor(); + GE_CHECK_NOTNULL(out_ctrl_anchor); + + for (const auto &node : graph->GetDirectNode()) { + GE_CHECK_NOTNULL(node); + bool need_handle = (node->GetType() == DATA || node->GetType() == VARIABLE) && node->GetInAllNodes().empty(); + if (!need_handle) { + continue; + } + auto second_nodes = node->GetOutAllNodes(); + for (const auto &second_node : second_nodes) { + GE_CHECK_NOTNULL(second_node); + auto in_ctrl_anchor = second_node->GetInControlAnchor(); + GE_CHECK_NOTNULL(in_ctrl_anchor); + if (!out_ctrl_anchor->IsLinkedWith(in_ctrl_anchor)) { + GE_CHK_STATUS_RET(out_ctrl_anchor->LinkTo(in_ctrl_anchor)); + GELOGD("Add control edge from %s to %s.", atomic_clean_node->GetName().c_str(), second_node->GetName().c_str()); + } + } + } + return SUCCESS; } @@ -266,6 +299,7 @@ Status AtomicAddrCleanPass::HandleDispersedAtomicNodes(ComputeGraphPtr &graph, NodePtr AtomicAddrCleanPass::InsertAtomicAddrCleanNode(ComputeGraphPtr &graph) { OpDescPtr op_desc = MakeShared(NODE_NAME_ATOMIC_ADDR_CLEAN, ATOMICADDRCLEAN); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(INTERNAL_ERROR, "Make shared atomic addr clean op failed."); return nullptr; } @@ -292,10 +326,17 @@ NodePtr AtomicAddrCleanPass::InsertAtomicAddrCleanNode(ComputeGraphPtr &graph) { Status AtomicAddrCleanPass::LinkToAtomicNode(const NodePtr &atomic_node, NodePtr &atomic_clean_node) { GE_IF_BOOL_EXEC(atomic_node == nullptr || atomic_clean_node == nullptr, - DOMI_LOGE("param [atomic_node][atomic_clean_node] must not be null."); return PARAM_INVALID); + REPORT_INNER_ERROR("E19999", "Param atomic_node or atomic_clean_node is nullptr, " + "check invalid"); + 
DOMI_LOGE("param [atomic_node][atomic_clean_node] must not be null."); + return PARAM_INVALID); InControlAnchorPtr in_ctrl_anchor = atomic_node->GetInControlAnchor(); OutControlAnchorPtr out_ctrl_anchor = atomic_clean_node->GetOutControlAnchor(); if (in_ctrl_anchor == nullptr || out_ctrl_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "in_ctrl_anchor of op:%s(%s) or out_ctrl_anchor of op:%s(%s) is nullptr, " + "check invalid", + atomic_node->GetName().c_str(), atomic_node->GetType().c_str(), + atomic_clean_node->GetName().c_str(), atomic_clean_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get control anchor faild, dst node: %s.", atomic_node->GetName().c_str()); @@ -304,6 +345,11 @@ Status AtomicAddrCleanPass::LinkToAtomicNode(const NodePtr &atomic_node, NodePtr graphStatus status = GraphUtils::AddEdge(out_ctrl_anchor, in_ctrl_anchor); if (status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_ctrl_anchor->GetOwnerNode()->GetName().c_str(), + out_ctrl_anchor->GetOwnerNode()->GetType().c_str(), + in_ctrl_anchor->GetOwnerNode()->GetName().c_str(), + in_ctrl_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Graph add cleanAddrNode op out ctrl edge fail, dst node: %s.", atomic_node->GetName().c_str()); @@ -332,11 +378,11 @@ bool AtomicAddrCleanPass::IsAtomicOp(const NodePtr &node) { } // 2.Check atomic attr in node - std::map> node_workspace_offset; + std::map> atomic_workspace_index_size; bool has_atomic_input = op_desc->HasAttr(ATOMIC_ATTR_INPUT_INDEX); bool has_atomic_output = op_desc->HasAttr(ATOMIC_ATTR_OUTPUT_INDEX); - node_workspace_offset = op_desc->TryGetExtAttr(EXT_ATTR_ATOMIC_WORKSPACE_OFFSET, node_workspace_offset); - if (!has_atomic_input && !has_atomic_output && node_workspace_offset.empty()) { + atomic_workspace_index_size = op_desc->TryGetExtAttr(EXT_ATTR_ATOMIC_WORKSPACE_INFO, atomic_workspace_index_size); + if (!has_atomic_input && !has_atomic_output && 
atomic_workspace_index_size.empty()) { return false; } @@ -361,6 +407,7 @@ Status AtomicAddrCleanPass::CompileUnknownGraphOp(const vector &atomic_ std::unordered_map> node_vector_map; std::shared_ptr instance = ge::GELib::GetInstance(); if ((instance == nullptr) || !instance->InitFlag()) { + REPORT_INNER_ERROR("E19999", "GeLib is not init before, check invalid"); GELOGE(ge::GE_CLI_GE_NOT_INITIALIZED, "CompileSingleOp failed."); return ge::GE_CLI_GE_NOT_INITIALIZED; } @@ -373,6 +420,8 @@ Status AtomicAddrCleanPass::CompileUnknownGraphOp(const vector &atomic_ } string kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { + REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed", + kernel_lib_name.c_str()); GELOGE(ge::INTERNAL_ERROR, "Get atomic node:%s(%s) kernel lib failed.", atomic_node->GetName().c_str(), atomic_node->GetType().c_str()); return ge::INTERNAL_ERROR; @@ -393,6 +442,8 @@ Status AtomicAddrCleanPass::CompileUnknownGraphOp(const vector &atomic_ GELOGI("The atomic node size of compile op of %s is %zu", kernel_lib_name.c_str(), node_vector.size()); GE_TIMESTAMP_ADD(UnknownGraphCompileOp); if (ret != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call CompileOp failed, kernel_lib_name:%s, ret:%d", + kernel_lib_name.c_str(), ret); GELOGE(ret, "Compile atomic op failed, kernel lib name is %s", kernel_lib_name.c_str()); return ret; } diff --git a/ge/graph/passes/atomic_addr_clean_pass.h b/ge/graph/passes/atomic_addr_clean_pass.h index 8138d511..96147fa2 100755 --- a/ge/graph/passes/atomic_addr_clean_pass.h +++ b/ge/graph/passes/atomic_addr_clean_pass.h @@ -67,6 +67,14 @@ class AtomicAddrCleanPass : public GraphPass { */ Status LinkToAtomicNode(const NodePtr &atomic_node, NodePtr &atomic_clean_node); + /** + * Link atomic clean node to all potential precedence nodes which may execute before atomic clean node + * @param graph + * @param atomic_clean_node + * @return + */ + Status LinkToPotentialPrecedenceNode(ComputeGraphPtr 
&graph, NodePtr &atomic_clean_node); + /** * Check if this node is atomic op. * @param node diff --git a/ge/graph/passes/attach_stream_label_pass.cc b/ge/graph/passes/attach_stream_label_pass.cc index cd3509c7..d8c81e92 100644 --- a/ge/graph/passes/attach_stream_label_pass.cc +++ b/ge/graph/passes/attach_stream_label_pass.cc @@ -24,34 +24,31 @@ namespace ge { Status AttachStreamLabelPass::Run(ComputeGraphPtr graph) { GELOGD("AttachStreamLabelPass Enter."); - FindNodes(graph); - for (const auto &node : need_label_nodes_) { - GE_CHK_STATUS_RET(UpdateCondBranch(node), "Update cond branch failed, start node:%s.", node->GetName().c_str()); + std::vector need_label_nodes; + std::vector enter_nodes; + std::map branch_head_nodes; + FindNodes(graph, need_label_nodes, enter_nodes, branch_head_nodes); + for (const auto &node : need_label_nodes) { + GE_CHK_STATUS_RET(UpdateCondBranch(node, branch_head_nodes), "Update cond branch failed, start node:%s.", node->GetName().c_str()); } - GE_CHK_STATUS_RET(UpdateEnterNode(), "UpdateEnterNode failed."); + GE_CHK_STATUS_RET(UpdateEnterNode(enter_nodes), "UpdateEnterNode failed."); GELOGD("AttachStreamLabelPass Leave."); return SUCCESS; } -/// -/// @brief Clear Status, used for subgraph pass -/// @return -/// -Status AttachStreamLabelPass::ClearStatus() { - stream_switch_nodes_.clear(); - need_label_nodes_.clear(); - enter_nodes_.clear(); - branch_head_nodes_.clear(); - return SUCCESS; -} - /// /// @brief Find StreamSwitch / StreamMerge / Enter node /// @param [in] graph +/// @param [out] need_label_nodes +/// @param [out] enter_nodes +/// @param [out] branch_head_nodes /// @return void /// -void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) { +void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph, std::vector &need_label_nodes, + std::vector &enter_nodes, + std::map &branch_head_nodes) { + std::vector stream_switch_nodes; for (const NodePtr &node : graph->GetDirectNode()) { const auto &op_desc = 
node->GetOpDesc(); if (op_desc == nullptr) { @@ -59,29 +56,31 @@ void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) { } const std::string &type = op_desc->GetType(); if ((type == STREAMSWITCH) && op_desc->HasAttr(ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG)) { - stream_switch_nodes_.emplace_back(node); + stream_switch_nodes.emplace_back(node); } else if ((type == STREAMMERGE) && !op_desc->HasAttr(ATTR_NAME_NEXT_ITERATION)) { - need_label_nodes_.emplace_back(node); + need_label_nodes.emplace_back(node); } else if ((type == ENTER) || (type == REFENTER)) { - enter_nodes_.emplace_back(node); + enter_nodes.emplace_back(node); } } - for (const auto &node : stream_switch_nodes_) { + for (const auto &node : stream_switch_nodes) { for (const auto &out_ctrl_node : node->GetOutControlNodes()) { GELOGD("branch_head_node %s of stream_switch %s.", out_ctrl_node->GetName().c_str(), node->GetName().c_str()); - branch_head_nodes_[out_ctrl_node] = node; + branch_head_nodes[out_ctrl_node] = node; } - need_label_nodes_.emplace_back(node); + need_label_nodes.emplace_back(node); } } /// /// @brief update cond branch /// @param [in] node +/// @param [in] branch_head_nodes /// @return Status /// -Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) { +Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node, + const std::map &branch_head_nodes) { std::string stream_label; if (AttachFlag(node, stream_label) != SUCCESS) { GELOGE(FAILED, "Attach flag for node %s failed.", node->GetName().c_str()); @@ -103,8 +102,9 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) { const std::string &type = cur_node->GetType(); for (const auto &out_node : cur_node->GetOutAllNodes()) { const std::string &out_type = out_node->GetType(); + const auto &iter = branch_head_nodes.find(node); bool stop_flag = (end_type_set.count(out_type) > 0) || - ((branch_head_nodes_.count(out_node) > 0) && (branch_head_nodes_[out_node] != node)) || + ((iter != 
branch_head_nodes.end()) && (iter->second != node)) || (((type == ENTER) || (type == REFENTER)) && (out_type != STREAMACTIVE)); if (!stop_flag) { nodes.push(out_node); @@ -117,7 +117,13 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) { for (const NodePtr &tmp_node : branch_nodes) { GELOGD("Attach label %s to node: %s.", stream_label.c_str(), tmp_node->GetName().c_str()); - GE_CHK_STATUS_RET(SetStreamLabel(tmp_node, stream_label), "Set stream label failed."); + auto status = SetStreamLabel(tmp_node, stream_label); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + stream_label.c_str(), tmp_node->GetName().c_str(), tmp_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } } return SUCCESS; @@ -133,21 +139,38 @@ Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &strea const std::string &type = node->GetType(); if (type == STREAMSWITCH) { if (node->GetInDataNodes().empty()) { + REPORT_INNER_ERROR("E19999", "In data nodes is empty of op:%s(%s), check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "node %s has no input_data_node.", node->GetName().c_str()); return INTERNAL_ERROR; } stream_label = node->GetInDataNodes().at(0)->GetName(); - GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed."); bool value = false; OpDescPtr op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); - GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), return FAILED, + GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), + REPORT_CALL_ERROR("E19999", "Get Attr:%s of op:%s(%s) failed", + ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return FAILED, "StreamSwitch get attr TRUE_BRANCH_STREAM failed."); stream_label += (value ? 
"_t" : "_f"); - GE_CHK_STATUS_RET(SetActiveLabelList(node, {stream_label}), "set active_label_list failed."); + auto status = SetActiveLabelList(node, {stream_label}); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set active label list:%s to op:%s(%s) failed", + stream_label.c_str(), node->GetName().c_str(), node->GetType().c_str()); + GELOGE(status, "set active_label_list failed."); + return status; + } } else if (type == STREAMMERGE) { stream_label = node->GetName(); - GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed."); + auto status = SetStreamLabel(node, stream_label); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + stream_label.c_str(), node->GetName().c_str(), node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } } return SUCCESS; @@ -155,11 +178,12 @@ Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &strea /// /// @brief Update stream_label start with enter nodes +/// @param [in] enter_nodes /// @return Status /// -Status AttachStreamLabelPass::UpdateEnterNode() { +Status AttachStreamLabelPass::UpdateEnterNode(const std::vector &enter_nodes) { std::unordered_map> enter_active_map; - for (const auto &enter_node : enter_nodes_) { + for (const auto &enter_node : enter_nodes) { for (const auto &out_ctrl_node : enter_node->GetOutControlNodes()) { if (out_ctrl_node->GetType() != STREAMACTIVE) { continue; @@ -184,15 +208,18 @@ Status AttachStreamLabelPass::UpdateEnterNode() { bool get_attr = AttrUtils::GetListStr(active_node->GetOpDesc(), ATTR_NAME_ACTIVE_LABEL_LIST, active_label_list) && (active_label_list.size() == 1) && !active_label_list[0].empty(); if (!get_attr) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s of op:%s(%s) failed", + ATTR_NAME_ACTIVE_LABEL_LIST.c_str(), + active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get attr ATTR_NAME_ACTIVE_LABEL_LIST 
failed, node: %s.", active_node->GetName().c_str()); return INTERNAL_ERROR; } - std::stack enter_nodes; + std::stack nodes; for (const auto &enter_node : pair.second) { - enter_nodes.emplace(enter_node); + nodes.emplace(enter_node); } - if (UpdateLoopBranch(enter_nodes, active_label_list[0]) != SUCCESS) { + if (UpdateLoopBranch(nodes, active_label_list[0]) != SUCCESS) { GELOGE(FAILED, "Update stream_label for loop_branch failed."); return FAILED; } @@ -217,7 +244,13 @@ Status AttachStreamLabelPass::SetEnterLabel(const std::vector &enter_no } for (const auto &enter_node : enter_nodes) { - GE_CHK_STATUS_RET(SetStreamLabel(enter_node, stream_label), "Set stream label failed."); + auto status = SetStreamLabel(enter_node, stream_label); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + stream_label.c_str(), enter_node->GetName().c_str(), enter_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } } return SUCCESS; } @@ -246,7 +279,13 @@ Status AttachStreamLabelPass::UpdateLoopBranch(const std::stack &enter_ continue; } GELOGD("Attach label %s to node: %s.", stream_label.c_str(), out_node->GetName().c_str()); - GE_CHK_STATUS_RET(SetStreamLabel(out_node, stream_label), "Set stream label failed."); + auto status = SetStreamLabel(out_node, stream_label); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + stream_label.c_str(), out_node->GetName().c_str(), out_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } nodes.push(out_node); } } diff --git a/ge/graph/passes/attach_stream_label_pass.h b/ge/graph/passes/attach_stream_label_pass.h index ad71d58f..a1600a58 100755 --- a/ge/graph/passes/attach_stream_label_pass.h +++ b/ge/graph/passes/attach_stream_label_pass.h @@ -25,26 +25,25 @@ class AttachStreamLabelPass : public GraphPass { public: Status Run(ComputeGraphPtr graph); - /// - /// 
@brief Clear Status, used for subgraph pass - /// @return - /// - Status ClearStatus() override; - private: /// /// @brief Find StreamSwitch / StreamMerge / Enter node /// @param [in] graph + /// @param [out] need_label_nodes + /// @param [out] enter_nodes + /// @param [out] branch_head_nodes /// @return void /// - void FindNodes(const ComputeGraphPtr &graph); + void FindNodes(const ComputeGraphPtr &graph, std::vector &need_label_nodes, + std::vector &enter_nodes, std::map &branch_head_nodes); /// /// @brief update cond branch /// @param [in] node + /// @param [in] branch_head_nodes /// @return Status /// - Status UpdateCondBranch(const NodePtr &node); + Status UpdateCondBranch(const NodePtr &node, const std::map &branch_head_nodes); /// /// @brief attach flag @@ -64,9 +63,10 @@ class AttachStreamLabelPass : public GraphPass { /// /// @brief Update stream_label start with enter nodes + /// @param [in] enter_nodes /// @return Status /// - Status UpdateEnterNode(); + Status UpdateEnterNode(const std::vector &enter_nodes); /// /// @brief Set stream_label for enter_nodes @@ -75,11 +75,6 @@ class AttachStreamLabelPass : public GraphPass { /// @return Status /// static Status SetEnterLabel(const std::vector &enter_nodes, const NodePtr &active_node); - - std::vector stream_switch_nodes_; - std::vector need_label_nodes_; - std::vector enter_nodes_; - std::unordered_map branch_head_nodes_; }; } // namespace ge #endif // GE_GRAPH_PASSES_ATTACH_STREAM_LABEL_PASS_H_ diff --git a/ge/graph/passes/base_pass.cc b/ge/graph/passes/base_pass.cc index 3b854c18..0868b729 100755 --- a/ge/graph/passes/base_pass.cc +++ b/ge/graph/passes/base_pass.cc @@ -30,8 +30,15 @@ constexpr int kMaxRePassTimes = 10000; constexpr size_t kMaxOneInNodes = 1000; // Each iteration, we take about 0.3k memory on the stack, we should change the recursion to loop later constexpr int kMaxRecursiveDepth = 20; +struct DuringPassNodeSets { + std::unordered_set nodes_seen; + std::unordered_set nodes_deleted; + 
std::unordered_set nodes_re_pass; + std::unordered_set nodes_re_pass_immediately; + std::unordered_set nodes_last; +}; -void GetAllNodesNoInputEdge(const ComputeGraphPtr &graph, std::queue &input_edge_nodes, +void GetAllNodesNoInputEdge(const ComputeGraphPtr &graph, std::deque &input_edge_nodes, std::unordered_set &nodes_seen, std::unordered_set &nodes_last) { nodes_last.clear(); for (auto &node : graph->GetDirectNode()) { @@ -40,7 +47,7 @@ void GetAllNodesNoInputEdge(const ComputeGraphPtr &graph, std::queue &i } size_t in_nums = node->GetInNodes().size(); if (in_nums == 0) { - input_edge_nodes.push(node); + input_edge_nodes.push_back(node); nodes_seen.insert(node.get()); } else if (in_nums > kMaxOneInNodes) { nodes_last.insert(node); @@ -48,7 +55,7 @@ void GetAllNodesNoInputEdge(const ComputeGraphPtr &graph, std::queue &i } } -void AddNextIterNodes(const Node::Vistor &nodes, std::queue &nodes_to_pass, +void AddNextIterNodes(const Node::Vistor &nodes, std::deque &nodes_to_pass, std::unordered_set &nodes_seen, std::unordered_set &nodes_last) { for (auto &node : nodes) { if (node == nullptr) { @@ -60,13 +67,30 @@ void AddNextIterNodes(const Node::Vistor &nodes, std::queue &n bool all_in_nodes_seen = node->IsAllInNodesSeen(nodes_seen); if (all_in_nodes_seen && nodes_seen.insert(node.get()).second) { - nodes_to_pass.push(node); + nodes_to_pass.push_back(node); } } } -Status RunPasses(NodePtr &node, const NamesToPass &names_to_passes, std::unordered_set &nodes_re_pass, - std::unordered_set &nodes_deleted, std::unordered_set &nodes_seen) { +void PushToRePassIfSeen(NodePtr &node, const std::pair &name_to_pass, + std::unordered_set &nodes_seen, std::unordered_set &nodes_to_re_pass, + std::unordered_set &nodes_re_pass) { + for (const auto &node_to_re_pass : nodes_to_re_pass) { + if (node_to_re_pass == nullptr) { + GELOGW("Found null re-pass node when executing %s on node %s type %s", name_to_pass.first.c_str(), + node->GetName().c_str(), node->GetType().c_str()); + 
continue; + } + if (nodes_seen.count(node_to_re_pass.get()) > 0 || node_to_re_pass->IsAllInNodesSeen(nodes_seen)) { + GELOGD("The node %s will be re-pass.", node_to_re_pass->GetName().c_str()); + nodes_re_pass.insert(node_to_re_pass); + } else { + GELOGD("The node %s are not all seen, don't set repass this time", node_to_re_pass->GetName().c_str()); + } + } +} + +Status RunPasses(NodePtr &node, const NamesToPass &names_to_passes, DuringPassNodeSets &during_pass_node_set) { if (node == nullptr) { GELOGE(FAILED, "parameter is null."); return FAILED; @@ -90,22 +114,15 @@ Status RunPasses(NodePtr &node, const NamesToPass &names_to_passes, std::unorder } auto nodes_to_re_pass = name_to_pass.second->GetNodesNeedRePass(); - for (const auto &node_to_re_pass : nodes_to_re_pass) { - if (node_to_re_pass == nullptr) { - GELOGW("Found null re-pass node when executing %s on node %s type %s", name_to_pass.first.c_str(), - node->GetName().c_str(), node->GetType().c_str()); - continue; - } - if (nodes_seen.count(node_to_re_pass.get()) > 0 || node_to_re_pass->IsAllInNodesSeen(nodes_seen)) { - GELOGD("The node %s will be re-pass later", node_to_re_pass->GetName().c_str()); - nodes_re_pass.insert(node_to_re_pass); - } else { - GELOGD("The node %s are not all seen, don't set repass this time", node_to_re_pass->GetName().c_str()); - } - } + PushToRePassIfSeen(node, name_to_pass, during_pass_node_set.nodes_seen, nodes_to_re_pass, + during_pass_node_set.nodes_re_pass); + + auto nodes_to_re_pass_immediately = name_to_pass.second->GetNodesNeedRePassImmediately(); + PushToRePassIfSeen(node, name_to_pass, during_pass_node_set.nodes_seen, nodes_to_re_pass_immediately, + during_pass_node_set.nodes_re_pass_immediately); auto nodes_deleted_by_pass = name_to_pass.second->GetNodesDeleted(); - nodes_deleted.insert(nodes_deleted_by_pass.begin(), nodes_deleted_by_pass.end()); + during_pass_node_set.nodes_deleted.insert(nodes_deleted_by_pass.begin(), nodes_deleted_by_pass.end()); if 
(nodes_deleted_by_pass.count(node) > 0) { GELOGD("The node %s was deleted by pass %s, stop the remain passes", node->GetName().c_str(), name_to_pass.first.c_str()); @@ -181,36 +198,33 @@ Status GEPass::Run(const NamesToPass &names_to_passes) { Status GEPass::RunPassesOneGraph(const NamesToPass &names_to_passes) { GELOGD("Begin to run pass on graph, passes count %zu", names_to_passes.size()); - std::queue nodes; - std::unordered_set nodes_seen; - std::unordered_set nodes_deleted; - std::unordered_set nodes_re_pass; - std::unordered_set nodes_last; - GetAllNodesNoInputEdge(graph_, nodes, nodes_seen, nodes_last); + std::deque nodes; + DuringPassNodeSets during_pass_node_set; + GetAllNodesNoInputEdge(graph_, nodes, during_pass_node_set.nodes_seen, during_pass_node_set.nodes_last); GELOGD("Start points count %zu", nodes.size()); int re_pass_times = 0; do { - for (auto &node : nodes_re_pass) { - nodes.push(node); - nodes_seen.insert(node.get()); + for (auto &node : during_pass_node_set.nodes_re_pass) { + nodes.push_back(node); + during_pass_node_set.nodes_seen.insert(node.get()); } - nodes_re_pass.clear(); + during_pass_node_set.nodes_re_pass.clear(); while (!nodes.empty()) { NodePtr node = nodes.front(); - nodes.pop(); + nodes.pop_front(); - (void)nodes_re_pass.erase(node); + (void)during_pass_node_set.nodes_re_pass.erase(node); GE_IF_BOOL_EXEC(node == nullptr, GELOGW("node is null"); continue); - if (nodes_deleted.count(node) > 0) { + if (during_pass_node_set.nodes_deleted.count(node) > 0) { GELOGD("The node %s was deleted before, skip it.", node->GetName().c_str()); continue; } - AddNextIterNodes(node->GetOutNodes(), nodes, nodes_seen, nodes_last); + AddNextIterNodes(node->GetOutNodes(), nodes, during_pass_node_set.nodes_seen, during_pass_node_set.nodes_last); - auto ret = RunPasses(node, names_to_passes, nodes_re_pass, nodes_deleted, nodes_seen); + auto ret = RunPasses(node, names_to_passes, during_pass_node_set); if (ret != SUCCESS) { GELOGE(ret, "Failed to process 
passes on node %s type %s, error code: %u", node->GetName().c_str(), node->GetType().c_str(), ret); @@ -227,7 +241,7 @@ Status GEPass::RunPassesOneGraph(const NamesToPass &names_to_passes) { if (has_sub_graph) { GELOGD("There are subgraphs on node %s, run passes for for the second time", node->GetName().c_str()); SetFlagOption(kOptimizeAfterSubGraph, names_to_passes); - ret = RunPasses(node, names_to_passes, nodes_re_pass, nodes_deleted, nodes_seen); + ret = RunPasses(node, names_to_passes, during_pass_node_set); if (ret != SUCCESS) { GELOGE(ret, "Failed to process passes on node %s type %s, error code: %u", node->GetName().c_str(), node->GetType().c_str(), ret); @@ -239,16 +253,21 @@ Status GEPass::RunPassesOneGraph(const NamesToPass &names_to_passes) { // should be called each time at the begin of the iteration ClearOption(names_to_passes); } + for (const auto &node : during_pass_node_set.nodes_re_pass_immediately) { + GELOGD("The node %s will be re-pass immediately.", node->GetName().c_str()); + nodes.push_front(node); + } + during_pass_node_set.nodes_re_pass_immediately.clear(); } - for (auto &node : nodes_last) { - bool all_in_nodes_seen = node->IsAllInNodesSeen(nodes_seen); - if (all_in_nodes_seen && nodes_seen.insert(node.get()).second) { - nodes.push(node); + for (auto &node : during_pass_node_set.nodes_last) { + bool all_in_nodes_seen = node->IsAllInNodesSeen(during_pass_node_set.nodes_seen); + if (all_in_nodes_seen && during_pass_node_set.nodes_seen.insert(node.get()).second) { + nodes.push_back(node); } } - nodes_last.clear(); - } while ((!nodes_re_pass.empty() || !nodes.empty()) && ++re_pass_times < kMaxRePassTimes); + during_pass_node_set.nodes_last.clear(); + } while ((!during_pass_node_set.nodes_re_pass.empty() || !nodes.empty()) && ++re_pass_times < kMaxRePassTimes); if (re_pass_times == kMaxRePassTimes) { GELOGW("re_pass_times should not come to %d", kMaxRePassTimes); diff --git a/ge/graph/passes/base_pass.h b/ge/graph/passes/base_pass.h index 
bb41691d..a9f4f000 100644 --- a/ge/graph/passes/base_pass.h +++ b/ge/graph/passes/base_pass.h @@ -53,6 +53,8 @@ class BaseNodePass { std::unordered_set GetNodesNeedRePass() { return nodes_need_re_pass_; } + std::unordered_set GetNodesNeedRePassImmediately() { return nodes_need_re_pass_immediately_; } + std::unordered_set GetNodesDeleted() { return nodes_deleted_; } void SetOption(NodePassOption option, const std::string &value) { options_[option] = value; } @@ -62,6 +64,7 @@ class BaseNodePass { void init() { nodes_need_re_pass_.clear(); nodes_deleted_.clear(); + nodes_need_re_pass_immediately_.clear(); } protected: @@ -79,6 +82,14 @@ class BaseNodePass { /// void AddRePassNode(NodePtr &node) { nodes_need_re_pass_.insert(node); } + /// + /// Add a node to be optimized immediately again. If you add a new node to the graph, or + /// change a node connections, and you want to make sure the node will be + /// optimized by other passes, call this function. + /// @param node + /// + void AddImmediateRePassNode(NodePtr &node) { nodes_need_re_pass_immediately_.insert(node); } + /// /// Add a node and it's input/output data nodes to be optimized again. 
/// @param node @@ -109,6 +120,7 @@ class BaseNodePass { private: std::unordered_set nodes_need_re_pass_; + std::unordered_set nodes_need_re_pass_immediately_; std::unordered_set nodes_deleted_; std::map options_; }; diff --git a/ge/graph/passes/bitcast_pass.cc b/ge/graph/passes/bitcast_pass.cc index 8388b21a..b5166959 100644 --- a/ge/graph/passes/bitcast_pass.cc +++ b/ge/graph/passes/bitcast_pass.cc @@ -22,6 +22,7 @@ #include "graph/utils/type_utils.h" #include "framework/common/debug/log.h" #include "framework/common/ge_inner_error_codes.h" +#include "common/formats/utils/formats_trans_utils.h" namespace ge { namespace { @@ -31,6 +32,7 @@ const char *const kAttrNameType = "type"; Status BitcastPass::Run(NodePtr &node) { GELOGD("Bitcast running"); if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Param [node] must not be null."); return PARAM_INVALID; } @@ -41,6 +43,7 @@ Status BitcastPass::Run(NodePtr &node) { OpDescPtr op_desc = node->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Param op_desc of node is nullptr, check invalid"); return PARAM_INVALID; } ge::DataType dst_data_type; @@ -58,20 +61,30 @@ Status BitcastPass::Run(NodePtr &node) { Status BitcastPass::CheckDstDataType(const OpDescPtr op_desc, ge::DataType &dst_data_type) { if (!ge::AttrUtils::GetDataType(op_desc, kAttrNameType, dst_data_type)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s of op:%s(%s) failed", + kAttrNameType, op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Node failed to get attribute type."); return PARAM_INVALID; } if (dst_data_type >= ge::DT_UNDEFINED) { - GELOGE(PARAM_INVALID, "dst_data_type[%s] is not valid.", + REPORT_INNER_ERROR("E19999", "Param dst_data_type:%d check invalid, op:%s(%s)", + dst_data_type, op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(PARAM_INVALID, "dst_data_type[%s] is not valid.", 
TypeUtils::DataTypeToSerialString(dst_data_type).c_str()); return PARAM_INVALID; } if (op_desc->GetOutputDescPtr(0) == nullptr) { + REPORT_INNER_ERROR("E19999", "Index 0 output desc of op:%s(%s) not exist, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Bitcast node outputDesc is null."); return PARAM_INVALID; } if (op_desc->GetOutputDescPtr(0)->GetDataType() != dst_data_type) { + REPORT_INNER_ERROR("E19999", "Index 0 output desc of op:%s(%s), its data type:%s not equal to dst_data_type:%s, " + "check invalid", op_desc->GetName().c_str(), op_desc->GetType().c_str(), + TypeUtils::DataTypeToSerialString(op_desc->GetOutputDescPtr(0)->GetDataType()).c_str(), + TypeUtils::DataTypeToSerialString(dst_data_type).c_str()); GELOGE(PARAM_INVALID, "dst_data_type[%s] is not equal to output_data_type[%s].", TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), TypeUtils::DataTypeToSerialString(op_desc->GetOutputDescPtr(0)->GetDataType()).c_str()); @@ -84,6 +97,8 @@ Status BitcastPass::CheckOutputShape(const OpDescPtr op_desc, const ge::DataType const GeTensorDescPtr &input_tensor_desc = op_desc->MutableInputDesc(0); const GeTensorDescPtr &output_tensor_desc = op_desc->MutableOutputDesc(0); if (input_tensor_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Index 0 input desc of op:%s(%s) not exist, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "input_tensor_desc must not be null."); return PARAM_INVALID; } @@ -91,7 +106,10 @@ Status BitcastPass::CheckOutputShape(const OpDescPtr op_desc, const ge::DataType // get origin data_type and shape ge::DataType ori_data_type = input_tensor_desc->GetDataType(); if (ori_data_type >= ge::DT_UNDEFINED) { - GELOGE(PARAM_INVALID, "ori_data_type[%s] is not valid.", + REPORT_INNER_ERROR("E19999", "ori_data_type:%d of index 0 input desc in op:%s(%s), " + "check invalid", + ori_data_type, op_desc->GetName().c_str(), op_desc->GetType().c_str()); + 
GELOGE(PARAM_INVALID, "ori_data_type[%s] is not valid.", TypeUtils::DataTypeToSerialString(ori_data_type).c_str()); return PARAM_INVALID; } @@ -108,6 +126,10 @@ Status BitcastPass::CheckOutputShape(const OpDescPtr op_desc, const ge::DataType } if (dim_vec != output_tensor_desc->GetShape().GetDims()) { + REPORT_INNER_ERROR("E19999", "Shape:%s of index 0 output desc in op:%s(%s), different from expect shape:%s ," + "check invalid", + formats::JoinToString(output_tensor_desc->GetShape().GetDims()).c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str(), formats::JoinToString(dim_vec).c_str()); GELOGE(PARAM_INVALID, "out_put_shape is different from expectations."); return PARAM_INVALID; } @@ -118,6 +140,7 @@ Status BitcastPass::CheckOutputShape(const OpDescPtr op_desc, const ge::DataType Status BitcastPass::CalcAndUpdateShape(BitcastPass::kVecInt64 &dim_vec, ge::DataType ori_data_type, ge::DataType dst_data_type) { if (dim_vec.size() == 0) { + REPORT_INNER_ERROR("E19999", "Param dim_vec is empty, check invalid"); GELOGE(PARAM_INVALID, "Pre node shape size is zero."); return PARAM_INVALID; } @@ -128,6 +151,10 @@ Status BitcastPass::CalcAndUpdateShape(BitcastPass::kVecInt64 &dim_vec, ge::Data return SUCCESS; } else if (ori_data_size > dst_data_size) { if (ori_data_size % dst_data_size != 0) { + REPORT_INNER_ERROR("E19999", "size:%ld of ori_data_type:%s is not divisible by size:%ld of dst_data_type:%s ," + "check invalid", + ori_data_size, TypeUtils::DataTypeToSerialString(ori_data_type).c_str(), + dst_data_size, TypeUtils::DataTypeToSerialString(dst_data_type).c_str()); GELOGE(PARAM_INVALID, "ori_data_size is not divisible by dst_data_size."); return PARAM_INVALID; } @@ -135,11 +162,18 @@ Status BitcastPass::CalcAndUpdateShape(BitcastPass::kVecInt64 &dim_vec, ge::Data return SUCCESS; } else { if (dst_data_size % ori_data_size != 0) { + REPORT_INNER_ERROR("E19999", "size:%ld of dst_data_type:%s is not divisible by size:%ld of ori_data_type:%s ," + "check 
invalid", + dst_data_size, TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), + ori_data_size, TypeUtils::DataTypeToSerialString(ori_data_type).c_str()); GELOGE(PARAM_INVALID, "dst_data_size is not divisible by ori_data_size."); return PARAM_INVALID; } if (dim_vec[dim_vec.size() - 1] != (dst_data_size / ori_data_size)) { + REPORT_INNER_ERROR("E19999", "The last dim:%ld in param dim_vec is not equal to " + "dst_data_size:%ld / ori_data_size:%ld, check invalid", + dim_vec[dim_vec.size() - 1], dst_data_size, ori_data_size); GELOGE(PARAM_INVALID, "The last dim is not equal to dst_data_size / ori_data_size."); return PARAM_INVALID; } diff --git a/ge/graph/passes/buffer_pool_memory_pass.cc b/ge/graph/passes/buffer_pool_memory_pass.cc new file mode 100644 index 00000000..8a64da59 --- /dev/null +++ b/ge/graph/passes/buffer_pool_memory_pass.cc @@ -0,0 +1,574 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/passes/buffer_pool_memory_pass.h" + +#include +#include +#include "graph/common/omg_util.h" +#include "graph/utils/node_utils.h" +#include "graph/utils/tensor_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "common/math/math_util.h" + +namespace ge { +namespace { +const size_t kBufferPoolNodeInSize = 1; +const size_t kBufferPoolNodeOutSize = 1; +} // namespace + +Status BufferPoolMemoryPass::Run(ComputeGraphPtr graph) { + if (graph == nullptr) { + GELOGE(PARAM_INVALID, "[Check][Graph]Graph is nullptr"); + REPORT_INNER_ERROR("E19999", "Input graph is nullptr"); + return PARAM_INVALID; + } + // The cache prefetching scheme is developed for very large models, which gets the weight data in advance + // and allocates it to a special memory pool. When the large model is dynamic shape, it need to go through + // the executor flow and is not allocated memory statically. This is another development point, so we will + // skip the dynamic shape model processing here. 
+ if (graph->GetParentGraph() != nullptr || graph->GetGraphUnknownFlag()) { + return SUCCESS; + } + if (!IsBufferPoolMemEnable(graph)) { + GELOGD("[Check][Enable]Buffer pool memory is not enable, graph:%s.", graph->GetName().c_str()); + return SUCCESS; + } + Status ret = graph->TopologicalSorting(); + if (ret != SUCCESS) { + GELOGE(ret, "[TopologicalSort][Graph]Graph name:%s.", graph->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Failed to topological sort for graph:%s.", graph->GetName().c_str()); + return ret; + } + + ret = CopyOutForMultiUsedOutput(graph); + if (ret != SUCCESS) { + GELOGE(FAILED, "[Copy][Output]Graph:%s.", graph->GetName().c_str()); + return FAILED; + } + + ret = GetBufferPoolAndPeerCalcNodes(graph); + if (ret != SUCCESS) { + GELOGE(FAILED, "[Get][BufferPoolNode]Graph:%s.", graph->GetName().c_str()); + return FAILED; + } + if (calc_nodes_.empty()) { + GELOGE(FAILED, "[Check][BufferPoolNode]Graph:%s.", graph->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "All Buffer pool nodes are isolated nodes in graph:%s.", graph->GetName().c_str()); + return FAILED; + } + ret = AllocateAllBufferPoolSpace(); + if (ret != SUCCESS) { + GELOGE(FAILED, "[Alloc][BufferPoolMem]Graph:%s.", graph->GetName().c_str()); + return FAILED; + } + + ret = SetResultOfMemoryAndEvent(); + if (ret != SUCCESS) { + GELOGE(FAILED, "[Set][Result]Graph:%s.", graph->GetName().c_str()); + return FAILED; + } + ret = graph->TopologicalSorting(); + if (ret != SUCCESS) { + GELOGE(ret, "[TopologicalSort][Graph]Graph name:%s.", graph->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Failed to topological sort for graph:%s.", graph->GetName().c_str()); + return ret; + } + return SUCCESS; +} + +void BufferPoolMemoryPass::ClearQueue(std::queue> &q) { + while (!q.empty()) { + q.pop(); + } +} + +Status BufferPoolMemoryPass::IsBufferPoolMemEnable(const ComputeGraphPtr &graph) { + for (NodePtr &node : graph->GetAllNodes()) { + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { 
+ continue; + } + if (op_desc->HasAttr(ATTR_NAME_BUFFER_POOL_ID) && op_desc->HasAttr(ATTR_NAME_BUFFER_POOL_SIZE)) { + return true; + } + } + return false; +} + +Status BufferPoolMemoryPass::CheckBufferPoolSize(int64_t total_size, int64_t pool_id, int64_t buffer_pool_size, + std::unordered_map &calc_total_size) { + auto iter = calc_total_size.find(pool_id); + if (iter == calc_total_size.end()) { + calc_total_size[pool_id] = total_size; + } else { + FMK_INT64_ADDCHECK(calc_total_size[pool_id], total_size); + calc_total_size[pool_id] += total_size; + } + if (calc_total_size[pool_id] > buffer_pool_size) { + GELOGE(INTERNAL_ERROR, "[Check][Size]The memory required at the same time is greater than buffer pool size, " + "pool id:%ld, pool size:%ld, required size:%ld.", pool_id, buffer_pool_size, calc_total_size[pool_id]); + REPORT_INNER_ERROR("E19999", "The memory required at the same time is greater than buffer pool size, pool id:%ld," + " pool size:%ld, required size:%ld.", pool_id, buffer_pool_size, calc_total_size[pool_id]); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status BufferPoolMemoryPass::TryToFixNodeOrder(NodePtr &pre_node, NodePtr &curr_node, bool &not_change) { + auto pre_node_graph = pre_node->GetOwnerComputeGraph(); + auto curr_node_graph = curr_node->GetOwnerComputeGraph(); + std::string pre_node_stream_label; + (void) AttrUtils::GetStr(pre_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, pre_node_stream_label); + std::string curr_node_stream_label; + (void) AttrUtils::GetStr(curr_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, curr_node_stream_label); + not_change = true; + if ((pre_node_graph == curr_node_graph) && (pre_node_stream_label == curr_node_stream_label)) { + // Same subgraph, including simultaneously in the root graph.
+ auto ret = ge::GraphUtils::AddEdge(pre_node->GetOutControlAnchor(), curr_node->GetInControlAnchor()); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "[Add][Edge]Src:%s, dst:%s.", pre_node->GetName().c_str(), curr_node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Failed to add ctrl edge from %s to %s.", + pre_node->GetName().c_str(), curr_node->GetName().c_str()); + return INTERNAL_ERROR; + } + not_change = false; + } else if (pre_node_graph->GetParentGraph() == curr_node_graph->GetParentGraph() && + pre_node_graph->GetParentNode() != nullptr && curr_node_graph->GetParentNode() != nullptr) { + // Two nodes are located on different child graphs of different parent nodes. + auto pre_node_parent_op_desc = pre_node_graph->GetParentNode()->GetOpDesc(); + auto curr_node_parent_op_desc = curr_node_graph->GetParentNode()->GetOpDesc(); + GE_CHECK_NOTNULL(pre_node_parent_op_desc); + GE_CHECK_NOTNULL(curr_node_parent_op_desc); + // The parent node dependency is correct to ensure that the child node dependency, + // there is no need to add control edges. + if (pre_node_parent_op_desc->GetId() > curr_node_parent_op_desc->GetId()) { + GELOGE(INTERNAL_ERROR, "[Check][Dependency]Invalid dependency, pre node:%s, curr node:%s.", + pre_node->GetName().c_str(), curr_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Invalid dependency, pre node:%s, curr node:%s.", + pre_node->GetName().c_str(), curr_node->GetName().c_str()); + return INTERNAL_ERROR; + } + GELOGI("[Check][Dependency]The two nodes are located in sub graphs of different parent nodes and meet the " + "dependency relationship. 
pre:%s, curr:%s.", pre_node->GetName().c_str(), curr_node->GetName().c_str()); + } else { + GELOGE(INTERNAL_ERROR, "[Check][Dependency]Invalid dependency, pre node:%s, curr node:%s.", + pre_node->GetName().c_str(), curr_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Invalid dependency, pre node:%s, curr node:%s.", + pre_node->GetName().c_str(), curr_node->GetName().c_str()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status BufferPoolMemoryPass::InsertMemCpyNodeAfter(ComputeGraphPtr &graph, NodePtr &node) { + auto out_anchor = node->GetOutDataAnchor(kBufferPoolNodeOutIndex); + OpDescBuilder op_desc_builder(node->GetName() + "_memcpy_async", MEMCPYASYNC); + auto mem_copy_op = op_desc_builder.AddInput("x", node->GetOpDesc()->GetOutputDesc(kBufferPoolNodeOutIndex)) + .AddOutput("y", node->GetOpDesc()->GetOutputDesc(kBufferPoolNodeOutIndex)) + .Build(); + std::string batch_label; + bool get_attr = AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, batch_label); + if (get_attr && !batch_label.empty()) { + (void) AttrUtils::SetStr(mem_copy_op, ATTR_NAME_STREAM_LABEL, batch_label); + } + auto peer_in_anchors = out_anchor->GetPeerInDataAnchors(); + std::vector in_anchors(peer_in_anchors.begin(), peer_in_anchors.end()); + if (GraphUtils::InsertNodeAfter(out_anchor, in_anchors, graph->AddNode(mem_copy_op)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "[Insert][Node] Node:%s.", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Failed to insert mem copy node after %s.", node->GetName().c_str()); + return FAILED; + } + return SUCCESS; +} + +Status BufferPoolMemoryPass::CopyOutForMultiUsedOutput(ComputeGraphPtr &graph) { + bool changed = false; + for (NodePtr &node : graph->GetAllNodes()) { + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + bool use_buffer_pool = op_desc->HasAttr(ATTR_NAME_BUFFER_POOL_ID) && op_desc->HasAttr(ATTR_NAME_BUFFER_POOL_SIZE); + if (use_buffer_pool) { + if ((node->GetInDataNodes().size() == 
kBufferPoolNodeInSize) && + (node->GetOutDataNodes().size() == kBufferPoolNodeOutSize)) { + continue; + } else if ((node->GetAllInDataAnchors().size() == kBufferPoolNodeInSize) && + (node->GetAllOutDataAnchors().size() == kBufferPoolNodeOutSize)) { + // A prefetching output is used in multiple places. Copy one so that the prefetching node remains + // single input and single output. + if (InsertMemCpyNodeAfter(graph, node) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "[Insert][MemCpy]Node:%s.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to insert mem copy node after %s.", node->GetName().c_str()); + return INTERNAL_ERROR; + } + changed = true; + GELOGI("[Insert][Node]Insert mem copy node after %s.", node->GetName().c_str()); + } else { + GELOGE(PARAM_INVALID, "[Check][InputOutput]Only support single input and single output, " + "node:%s.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Only support single input and single output, node:%s.", node->GetName().c_str()); + return PARAM_INVALID; + } + } + } + if (changed) { + Status ret = graph->TopologicalSorting(); + if (ret != SUCCESS) { + GELOGE(ret, "[TopologicalSort][Graph]Graph name:%s.", graph->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Failed to topological sort for graph:%s.", graph->GetName().c_str()); + return ret; + } + } + return SUCCESS; +} + +Status BufferPoolMemoryPass::GetBufferPoolAndPeerCalcNodes(const ComputeGraphPtr &graph) { + std::unordered_map>> unique_calc_nodes; + for (const NodePtr &node : graph->GetAllNodes()) { + auto in_data_nodes = node->GetInAllNodes(); + for (NodePtr &in_node : in_data_nodes) { + int64_t buffer_pool_id = 0; + int64_t buffer_pool_size = 0; + bool get_attr = AttrUtils::GetInt(in_node->GetOpDesc(), ATTR_NAME_BUFFER_POOL_ID, buffer_pool_id); + get_attr = get_attr && (AttrUtils::GetInt(in_node->GetOpDesc(), ATTR_NAME_BUFFER_POOL_SIZE, buffer_pool_size)); + if (get_attr) { + std::string batch_label; + (void) 
AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label); + peer_buffer_node_item_[batch_label][node].emplace_back(BufferPoolNodeItem(in_node, 0, 0)); + buffer_node_to_calc_[batch_label][in_node] = node; + if (unique_calc_nodes[batch_label][buffer_pool_id].count(node) == 0) { + calc_nodes_[batch_label][buffer_pool_id].emplace_back(node); + unique_calc_nodes[batch_label][buffer_pool_id].insert(node); + } + GELOGI("[Get][BufferNode]Calc node:%s, pool node:%s.", node->GetName().c_str(), in_node->GetName().c_str()); + Status ret = SetBufferPoolSize(batch_label, buffer_pool_id, buffer_pool_size); + if (ret != SUCCESS) { + GELOGE(ret, "[Set][BufferPoolSize]Node:%s", in_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to set buffer pool size, something wrong with the info of node:%s", + in_node->GetName().c_str()); + return ret; + } + } + } + } + return SUCCESS; +} + +Status BufferPoolMemoryPass::SetBufferPoolSize(const std::string &batch_label, int64_t id, int64_t size) { + auto iter = buffer_pool_size_[batch_label].find(id); + if (iter != buffer_pool_size_[batch_label].end() && iter->second != size) { + GELOGE(PARAM_INVALID, "[Check][BufferPoolSize]Get different size with the same id, " + "id:%ld, original size:%ld, this size:%ld.", id, iter->second, size); + REPORT_INNER_ERROR("E19999", "Get different size with the same id, " + "id:%ld, original size:%ld, this size:%ld.", id, iter->second, size); + return PARAM_INVALID; + } + buffer_pool_size_[batch_label][id] = size; + return SUCCESS; +} + +Status BufferPoolMemoryPass::AllocateAllBufferPoolSpace() { + for (const auto &iter : calc_nodes_) { + std::string batch_label = iter.first; + Status ret = AllocateSpaceInBatch(calc_nodes_[batch_label], + buffer_pool_size_[batch_label], + buffer_node_to_calc_[batch_label], + peer_buffer_node_item_[batch_label]); + if (ret != SUCCESS) { + GELOGE(ret, "[Alloc][InBatch]Batch_label:%s.", batch_label.c_str()); + REPORT_INNER_ERROR("E19999", "Failed to 
allocate space in batch, batch_label:%s.", batch_label.c_str()); + return ret; + } + GELOGI("[Alloc][InBatch]Alloc space in batch successfully, batch label:%s.", batch_label.c_str()); + } + return SUCCESS; +} + +Status BufferPoolMemoryPass::AllocateSpaceInBatch( + const std::map> &calc_nodes, + const std::unordered_map &buffer_pool_size_map, + const std::unordered_map &buffer_node_to_calc, + std::unordered_map> &buffer_pool_nodes_item) { + for (const auto &calc_node_in_pool : calc_nodes) { + int64_t pool_id = calc_node_in_pool.first; + int64_t buffer_pool_size = buffer_pool_size_map.at(pool_id); + ClearQueue(mem_ctrl_event_); + ClearQueue(stream_ctrl_event_); + BufferPool buffer_pool(pool_id, buffer_pool_size, buffer_node_to_calc); + Status ret = AllocateSpaceInBufferPool(buffer_pool, + calc_node_in_pool.second, + buffer_pool_nodes_item); + if (ret != SUCCESS) { + GELOGE(ret, "[Alloc][InBufferPool]Pool id:%ld, pool size:%ld.", pool_id, buffer_pool_size); + REPORT_INNER_ERROR("E19999", "Failed to allocate space in buffer pool, id:%ld, pool size:%ld.", + pool_id, buffer_pool_size); + return ret; + } + GELOGI("[Alloc][InBufferPool]Alloc space in buffer pool successfully, pool id:%ld.", pool_id); + } + return SUCCESS; +} + +Status BufferPoolMemoryPass::AllocateSpaceInBufferPool( + const BufferPool &buffer_pool, + const std::vector &calc_nodes_in_pool, + std::unordered_map> &buffer_pool_nodes_item) { + int64_t pool_id = buffer_pool.pool_id; + int64_t buffer_pool_size = buffer_pool.pool_size; + int64_t next_start = 0; + NodePtr pre_buffer_pool_node = nullptr; + std::queue node_mem_range_in_pool; + node_mem_range_in_pool.push(BufferPoolMemoryPass::BufferPoolNodeItem(nullptr, 0, buffer_pool_size)); + for (auto &calc_node : calc_nodes_in_pool) { + auto &peer_buffer_node_item = buffer_pool_nodes_item[calc_node]; + std::unordered_map calc_total_size; + size_t input_buffer_node_num = 0; + for (auto &node_item : peer_buffer_node_item) { + auto peer_buffer_node = node_item.node; 
+ GE_CHECK_NOTNULL(peer_buffer_node); + int64_t total_size = 0; + ++input_buffer_node_num; + Status ret = GetMemorySize(peer_buffer_node, total_size); + if (ret != SUCCESS) { + GELOGE(ret, "[Get][MemSize]Node:%s, calc_node:%s.", + peer_buffer_node->GetName().c_str(), calc_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to get memory size, node:%s, calc_node:%s.", + peer_buffer_node->GetName().c_str(), calc_node->GetName().c_str()); + return ret; + } + ret = CheckBufferPoolSize(total_size, pool_id, buffer_pool_size, calc_total_size); + if (ret != SUCCESS) { + GELOGE(ret, "[Check][BufferPoolSize]Capacity is not enough for all data, calc_node:%s.", + calc_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Capacity is not enough for all data, calc_node:%s.", + calc_node->GetName().c_str()); + return ret; + } + BufferPoolNodeItem buffer_pool_node_item(peer_buffer_node, calc_node, pre_buffer_pool_node, total_size, + 0, 0, (input_buffer_node_num == peer_buffer_node_item.size())); + ret = AllocateSpaceForBufferPoolNode(next_start, buffer_pool, buffer_pool_node_item, node_mem_range_in_pool); + if (ret != SUCCESS) { + GELOGE(ret, "[Alloc][ForNode]Pool node:%s, calc_node:%s.", + peer_buffer_node->GetName().c_str(), calc_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to allocate space for buffer pool node:%s, calc_node:%s.", + peer_buffer_node->GetName().c_str(), calc_node->GetName().c_str()); + return ret; + } + pre_buffer_pool_node = peer_buffer_node; + } + } + return SUCCESS; +} + +Status BufferPoolMemoryPass::AllocateSpaceForBufferPoolNode(int64_t &next_start, + const BufferPool buffer_pool, + BufferPoolNodeItem &buffer_pool_node_item, + std::queue &node_mem_range_in_pool) { + // Get event id must be before FixTheTimingOfDependentNodes + uint32_t logic_event = logic_event_num_; + NodePtr buffer_node = buffer_pool_node_item.node; + NodePtr calc_node = buffer_pool_node_item.out_calc_node; + /// In the scenario where there are 
multiple PREFETCH operators in the inputs of the calculation operator, + /// the addition of events is optimized to only add events after the last PREFETCH operator. + /// w1 w2 w3 w4 w5 + /// | | | | | + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 xxx + /// \ / \ / \ / + /// \ / \ / \ / + /// \ / \ / \ / + /// node1 node2 node3 + /// | | | + /// | | | + /// --------------- other nodes ------------ + /// + /// The event id of the PREFETCH operator to the calculation operator needs to be generated before + /// FixTheTimingOfDependentNodes, because FixTheTimingOfDependentNodes may add a new id to stream_ctrl_event_, + /// and this id cannot be reused until the next PREFETCH operator in the sequence. + if (buffer_pool_node_item.is_last_input) { + logic_event = GenerateEventId(buffer_node->GetName(), stream_ctrl_event_); + node_event_multiplexing_[buffer_node].push_back(string("SendTo;" + calc_node->GetName() + + ";" + std::to_string(logic_event))); + mem_ctrl_event_.push(std::make_pair(calc_node->GetName(), logic_event)); + } + NodePtr dependent_calc_node = GetOffsetAndDependency(next_start, buffer_pool_node_item.total_size, + buffer_pool.pool_size, + buffer_pool.buffer_node_to_calc, + node_mem_range_in_pool); + if (dependent_calc_node != nullptr) { + Status ret = FixTheTimingOfDependentNodes(dependent_calc_node, buffer_node); + if (ret != SUCCESS) { + GELOGE(ret, "[Fix][Timing]Pool_id:%ld, pool node:%s, dependent node:%s.", + buffer_pool.pool_id, buffer_node->GetName().c_str(), dependent_calc_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to fix timing, pool_id:%ld, pool node:%s, dependent node:%s.", + buffer_pool.pool_id, buffer_node->GetName().c_str(), + dependent_calc_node->GetName().c_str()); + return ret; + } + } + + buffer_pool_node_item.offset_start = next_start; + buffer_node_logical_offset_[buffer_node].push_back(buffer_pool_node_item.total_size); + buffer_node_logical_offset_[buffer_node].push_back(next_start); + 
FMK_INT64_ADDCHECK(next_start, buffer_pool_node_item.total_size); + next_start += buffer_pool_node_item.total_size; + buffer_pool_node_item.offset_end = next_start; + node_mem_range_in_pool.push(buffer_pool_node_item); + if (buffer_pool_node_item.pre_buffer_pool_node != nullptr) { + bool not_change = true; + auto ret = TryToFixNodeOrder(buffer_pool_node_item.pre_buffer_pool_node, buffer_node, not_change); + if (ret != SUCCESS) { + GELOGE(ret, "[Fix][BufferPoolNodeOrder]Pre node:%s, curr node:%s.", + buffer_pool_node_item.pre_buffer_pool_node->GetName().c_str(), buffer_node->GetName().c_str()); + return ret; + } + } + GELOGI("[Alloc][ForNode]Buffer pool node %s send to %s, offset start:%ld, send event id:%u.", + buffer_node->GetName().c_str(), calc_node->GetName().c_str(), + buffer_pool_node_item.offset_start, logic_event); + return SUCCESS; +} + +/// When generating the event ID, determine whether the name of the queue head node is the same as the name of +/// the operator, in order to handle such scenarios: +/// w1 w2 w3 w4 w5 +/// | | | | | +/// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 +/// | | | | | +/// node1 node2 node3 node4 node5 +/// +/// Memory distribution: +/// +/// |____w1_____|__| +/// +/// |____w2_____|__| +/// +/// |____w3_____|__| +/// +/// |______w4______| +/// +/// |______w5______| +/// +/// In this scenario, prefetch2 depends on node1. If the dependency is handled by adding an event of node1 to prefetch2, +/// the id sent by prefetch2 will be the same as the id it receives.Although Runtime supports this through WaitReset, +/// we consider this a dangerous operation and avoid it. 
+uint32_t BufferPoolMemoryPass::GenerateEventId(const std::string &node_name, + std::queue> &event_queue) { + uint32_t logic_event = logic_event_num_; + if (!event_queue.empty()) { + auto item = event_queue.front(); + if (item.first != node_name) { + logic_event = item.second; + event_queue.pop(); + return logic_event; + } + } + ++logic_event_num_; + return logic_event; +} + +NodePtr BufferPoolMemoryPass::GetOffsetAndDependency(int64_t &next_start, + int64_t total_mem_size, + int64_t buffer_pool_size, + const std::unordered_map &buffer_node_to_calc, + std::queue &nodes_in_buffer) { + // The buffer pool can no longer fit this Tensor and needs to turn back. + if (next_start + total_mem_size > buffer_pool_size) { + next_start = 0; + if (!nodes_in_buffer.empty()) { + // Take up the rest of the space at the end, + nodes_in_buffer.back().offset_end = buffer_pool_size; + // Pop the first tensor memory in the previous round of the previous round. + nodes_in_buffer.pop(); + } + while (!nodes_in_buffer.empty()) { + auto node_item = nodes_in_buffer.front(); + // Go to the begin of previous round. + if (node_item.offset_start == 0) { + break; + } + nodes_in_buffer.pop(); + } + } + + while (!nodes_in_buffer.empty()) { + auto node_item = nodes_in_buffer.front(); + if (next_start + total_mem_size <= node_item.offset_end) { + auto pool_node = node_item.node; + if (pool_node == nullptr) { + return nullptr; + } + auto output_calc = buffer_node_to_calc.find(pool_node); + if (output_calc != buffer_node_to_calc.end()) { + return output_calc->second; + } + return nullptr; + } + nodes_in_buffer.pop(); + } + return nullptr; +} + +Status BufferPoolMemoryPass::FixTheTimingOfDependentNodes(NodePtr &dependent_calc_node, NodePtr &curr_pool_node) { + // The previous process ensures that all pointers are not null. 
+ bool not_change = false; + Status ret = TryToFixNodeOrder(dependent_calc_node, curr_pool_node, not_change); + if (ret != SUCCESS) { + GELOGE(ret, "[Fix][NodeOrder]Src:%s, dst:%s.", + dependent_calc_node->GetName().c_str(), curr_pool_node->GetName().c_str()); + return ret; + } + if (not_change) { + return SUCCESS; + } + uint32_t logic_event = GenerateEventId(dependent_calc_node->GetName(), mem_ctrl_event_); + node_event_multiplexing_[curr_pool_node].push_back(string("RecvFrom;" + dependent_calc_node->GetName() + + ";" + std::to_string(logic_event))); + stream_ctrl_event_.push(std::make_pair(curr_pool_node->GetName(), logic_event)); + GELOGI("[Fix][Timing]Add ctrl edge for buffer pool memory from %s to %s, buffer pool node recv event:%u.", + dependent_calc_node->GetName().c_str(), curr_pool_node->GetName().c_str(), logic_event); + return SUCCESS; +} + +Status BufferPoolMemoryPass::SetResultOfMemoryAndEvent() { + for (auto &iter : node_event_multiplexing_) { + auto node = iter.first; + GE_CHECK_NOTNULL(node); + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + bool ret = AttrUtils::SetListStr(op_desc, ATTR_NAME_EVENT_MULTIPLEXING, iter.second); + if (!ret) { + GELOGE(INTERNAL_ERROR, "[Set][Attr]Node:%s.", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Failed to set event reuse info, node:%s.", node->GetName().c_str()); + return INTERNAL_ERROR; + } + auto offset_iter = buffer_node_logical_offset_.find(node); + if (offset_iter == buffer_node_logical_offset_.end()) { + GELOGE(INTERNAL_ERROR, "[Get][LogicalOffset]Node:%s.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to get logical offset and size, node:%s.", node->GetName().c_str()); + return INTERNAL_ERROR; + } + ret = AttrUtils::SetListInt(op_desc, ATTR_NAME_BUFFER_POOL_NODE_SIZE_AND_OFFSET, offset_iter->second); + if (!ret) { + GELOGE(INTERNAL_ERROR, "[Set][Attr]Node:%s.", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Failed to set node memory offset and size, 
node:%s.", node->GetName().c_str()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} +} // namespace ge diff --git a/ge/graph/passes/buffer_pool_memory_pass.h b/ge/graph/passes/buffer_pool_memory_pass.h new file mode 100644 index 00000000..e3d1c159 --- /dev/null +++ b/ge/graph/passes/buffer_pool_memory_pass.h @@ -0,0 +1,136 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_PASSES_BUFFER_POOL_MEMORY_PASS_H_ +#define GE_GRAPH_PASSES_BUFFER_POOL_MEMORY_PASS_H_ + +#include +#include "graph/graph.h" +#include "inc/graph_pass.h" + +namespace ge { +class BufferPoolMemoryPass : public GraphPass { + public: + explicit BufferPoolMemoryPass() : logic_event_num_(0) {} + + ~BufferPoolMemoryPass() override = default; + + struct BufferPool { + int64_t pool_id = 0; + int64_t pool_size = 0; + std::unordered_map buffer_node_to_calc; + BufferPool(int64_t id, int64_t size, const std::unordered_map &node_map) + : pool_id(id), pool_size(size), buffer_node_to_calc(node_map) {} + }; + + struct BufferPoolNodeItem { + NodePtr node = nullptr; + NodePtr out_calc_node = nullptr; + NodePtr pre_buffer_pool_node = nullptr; + int64_t total_size = 0; + int64_t offset_start = 0; + int64_t offset_end = 0; + bool is_last_input = true; + BufferPoolNodeItem(const NodePtr &buffer_n, const NodePtr &calc_n, const NodePtr &pre_buffer_n, + int64_t size, int64_t start, int64_t end, bool last) + : 
node(std::move(buffer_n)),
+          out_calc_node(std::move(calc_n)),
+          pre_buffer_pool_node(std::move(pre_buffer_n)),
+          total_size(size),
+          offset_start(start),
+          offset_end(end),
+          is_last_input(last) {}
+
+    BufferPoolNodeItem(const NodePtr &buffer_n, int64_t start, int64_t end)
+        : node(std::move(buffer_n)),
+          out_calc_node(nullptr),
+          pre_buffer_pool_node(nullptr),
+          total_size(0),
+          offset_start(start),
+          offset_end(end),
+          is_last_input(true) {}
+  };
+
+  Status Run(ComputeGraphPtr graph) override;
+
+ private:
+  static void ClearQueue(std::queue<std::pair<std::string, uint32_t>> &q);
+
+  static bool IsBufferPoolMemEnable(const ComputeGraphPtr &graph);
+
+  static Status CheckBufferPoolSize(int64_t total_size, int64_t pool_id, int64_t buffer_pool_size,
+                                    std::unordered_map<int64_t, int64_t> &calc_total_size);
+
+  static Status TryToFixNodeOrder(NodePtr &pre_node, NodePtr &curr_node, bool &not_change);
+
+  Status InsertMemCpyNodeAfter(ComputeGraphPtr &graph, NodePtr &node);
+
+  Status CopyOutForMultiUsedOutput(ComputeGraphPtr &graph);
+
+  Status GetBufferPoolAndPeerCalcNodes(const ComputeGraphPtr &graph);
+
+  Status SetBufferPoolSize(const std::string &batch_label, int64_t id, int64_t size);
+
+  Status AllocateAllBufferPoolSpace();
+
+  Status AllocateSpaceInBatch(const std::map<int64_t, std::vector<NodePtr>> &calc_nodes,
+                              const std::unordered_map<int64_t, int64_t> &buffer_pool_size_map,
+                              const std::unordered_map<NodePtr, NodePtr> &buffer_node_to_calc,
+                              std::unordered_map<NodePtr, std::vector<BufferPoolNodeItem>> &buffer_pool_nodes_item);
+
+  Status AllocateSpaceInBufferPool(const BufferPool &buffer_pool,
+                                   const std::vector<NodePtr> &calc_nodes_in_pool,
+                                   std::unordered_map<NodePtr, std::vector<BufferPoolNodeItem>> &buffer_pool_nodes_item);
+
+  Status AllocateSpaceForBufferPoolNode(int64_t &next_start,
+                                        const BufferPool buffer_pool,
+                                        BufferPoolNodeItem &buffer_pool_node_item,
+                                        std::queue<BufferPoolNodeItem> &node_mem_range_in_pool);
+
+  NodePtr GetOffsetAndDependency(int64_t &next_start,
+                                 int64_t total_mem_size,
+                                 int64_t buffer_pool_size,
+                                 const std::unordered_map<NodePtr, NodePtr> &buffer_node_to_calc,
+                                 std::queue<BufferPoolNodeItem> &nodes_in_buffer);
+
+  Status FixTheTimingOfDependentNodes(NodePtr 
&dependent_calc_node, NodePtr &curr_pool_node); + + uint32_t GenerateEventId(const std::string &node_name, std::queue> &event_queue); + + Status SetResultOfMemoryAndEvent(); + + // Use map to ensure that each visit is in the order of batch label and pool id + std::map>> calc_nodes_; + + std::unordered_map> buffer_node_to_calc_; + + std::unordered_map>> peer_buffer_node_item_; + + std::unordered_map> buffer_pool_size_; + + uint32_t logic_event_num_; + + std::queue> mem_ctrl_event_; + + std::queue> stream_ctrl_event_; + + std::unordered_map> node_event_multiplexing_; + + std::unordered_map> buffer_node_logical_offset_; +}; +} // namespace ge + +#endif // GE_GRAPH_PASSES_BUFFER_POOL_MEMORY_PASS_H_ diff --git a/ge/graph/passes/cast_remove_pass.cc b/ge/graph/passes/cast_remove_pass.cc index 62c92866..7e2bb7bb 100644 --- a/ge/graph/passes/cast_remove_pass.cc +++ b/ge/graph/passes/cast_remove_pass.cc @@ -25,11 +25,13 @@ namespace ge { Status CastRemovePass::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Param [node] must not be null."); return PARAM_INVALID; } OpDescPtr op_desc = node->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Param op_desc of node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "OpDesc of param [node] must not be null."); return PARAM_INVALID; } @@ -46,6 +48,7 @@ Status CastRemovePass::Run(NodePtr &node) { } OpDescPtr end_op_desc = end_node->GetOpDesc(); if (end_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "op_desc of end_node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "OpDesc of end node must not be null."); return PARAM_INVALID; } @@ -99,6 +102,8 @@ Status CastRemovePass::RemoveCast(DataType &type, std::vector &nodes_to GELOGI("CastRemovePass, remove Cast %s.", node->GetName().c_str()); cast_name = node->GetName(); if (IsolateAndDeleteNode(node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete 
node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "IsolateAndDeleteNode %s failed.", node->GetName().c_str()); return FAILED; } @@ -114,6 +119,7 @@ Status CastRemovePass::RemoveCast(DataType &type, std::vector &nodes_to } OpDescPtr op_desc = node->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Find nullptr op_desc in node, check invalid"); GELOGE(FAILED, "OpDesc must not be null."); return FAILED; } @@ -123,6 +129,9 @@ Status CastRemovePass::RemoveCast(DataType &type, std::vector &nodes_to op_desc->SetName(new_node_name); // add attr to changed TransData, then will be rebuild if (!AttrUtils::SetBool(op_desc, ATTR_NEED_COMPILE, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s of op:%s(%s) failed", + ATTR_NEED_COMPILE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Set ATTR_NEED_COMPILE Attr fail."); return FAILED; } diff --git a/ge/graph/passes/cast_translate_pass.cc b/ge/graph/passes/cast_translate_pass.cc index 2e95c19f..37e9bc83 100644 --- a/ge/graph/passes/cast_translate_pass.cc +++ b/ge/graph/passes/cast_translate_pass.cc @@ -223,6 +223,8 @@ Status CastTranslatePass::Run(NodePtr &node) { continue; } if (IsolateAndDeleteNode(out_data_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + out_data_node->GetName().c_str(), out_data_node->GetType().c_str()); return FAILED; } } @@ -247,14 +249,14 @@ Status CastTranslatePass::FuseDstNTranslates(NodePtr &node) { GE_CHECK_NOTNULL(out_data_node); AddRePassNodesWithInOut(out_data_node); // Has checked nodes only has one in data anchor one out data anchor - GE_CHK_STATUS_RET(NodeUtils::MoveOutputEdges(out_data_node, base_node), "move out put edge failed"); + GE_CHK_GRAPH_STATUS_RET(NodeUtils::MoveOutputEdges(out_data_node, base_node), "move out put edge failed"); // Relink in control anchor, delete in data anchor auto in_ctr_anchor = out_data_node->GetInControlAnchor(); 
GE_CHECK_NOTNULL(in_ctr_anchor); for (const auto &peer_anchor : in_ctr_anchor->GetPeerOutControlAnchors()) { GE_CHECK_NOTNULL(base_node->GetInControlAnchor()); - GE_CHK_STATUS_RET(base_node->GetInControlAnchor()->LinkFrom(peer_anchor), "link from peer anchor failed"); + GE_CHK_GRAPH_STATUS_RET(base_node->GetInControlAnchor()->LinkFrom(peer_anchor), "link from peer anchor failed"); } in_ctr_anchor->UnlinkAll(); out_data_node->GetAllInDataAnchors().at(0)->UnlinkAll(); @@ -262,6 +264,8 @@ Status CastTranslatePass::FuseDstNTranslates(NodePtr &node) { ComputeGraphPtr graph = out_data_node->GetOwnerComputeGraph(); GE_CHECK_NOTNULL(graph); if (GraphUtils::RemoveNodeWithoutRelink(graph, out_data_node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + out_data_node->GetName().c_str(), out_data_node->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "[%s] RemoveNodeWithoutRelink failed.", out_data_node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/common_subexpression_elimination_pass.cc b/ge/graph/passes/common_subexpression_elimination_pass.cc index 3587b03e..a95d0077 100644 --- a/ge/graph/passes/common_subexpression_elimination_pass.cc +++ b/ge/graph/passes/common_subexpression_elimination_pass.cc @@ -106,6 +106,9 @@ Status CommonSubexpressionEliminationPass::Run(ComputeGraphPtr graph) { ret = GraphUtils::ReplaceNodeAnchors(iter->second, node, {}, output_map); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Replace node:%s(%s)'s anchor by node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + iter->second->GetName().c_str(), iter->second->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to replace node %s by node %s error node %u", node->GetName().c_str(), iter->second->GetName().c_str(), ret); return INTERNAL_ERROR; @@ -115,6 +118,8 @@ Status CommonSubexpressionEliminationPass::Run(ComputeGraphPtr graph) { ret = GraphUtils::RemoveNodeWithoutRelink(graph, 
node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to remove node %s from graph", node->GetName().c_str()); return INTERNAL_ERROR; } diff --git a/ge/graph/passes/compile_nodes_pass.cc b/ge/graph/passes/compile_nodes_pass.cc index 7de7fd48..76330dc6 100755 --- a/ge/graph/passes/compile_nodes_pass.cc +++ b/ge/graph/passes/compile_nodes_pass.cc @@ -41,6 +41,7 @@ graphStatus CompileNodesPass::Run(ComputeGraphPtr graph) { } std::shared_ptr instance = ge::GELib::GetInstance(); if (instance == nullptr || !instance->InitFlag()) { + REPORT_INNER_ERROR("E19999", "Gelib not init before, check invalid"); GELOGE(ge::GE_CLI_GE_NOT_INITIALIZED, "Run CompileNodesPass failed."); return ge::GE_CLI_GE_NOT_INITIALIZED; } @@ -99,6 +100,8 @@ graphStatus CompileNodesPass::GetSupportedKernel(const NodePtr &node, const std: (void)instance->DNNEngineManagerObj().GetDNNEngineName(node); kernel_lib_name = op_desc->GetOpKernelLibName(); if (kernel_lib_name.empty()) { + REPORT_INNER_ERROR("E19999", "kernel_lib_name in op:%s(%s) is empty, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(GRAPH_FAILED, "Get node:%s, type:%s kernel lib failed.", node->GetName().c_str(), op_desc->GetType().c_str()); return GRAPH_FAILED; @@ -106,11 +109,16 @@ graphStatus CompileNodesPass::GetSupportedKernel(const NodePtr &node, const std: } OpsKernelInfoStorePtr kernel_info = instance->OpsKernelManagerObj().GetOpsKernelInfoStore(kernel_lib_name); if (kernel_info == nullptr) { + REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed for op:%s(%s)", + kernel_lib_name.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(ge::GE_GRAPH_PARAM_NULLPTR, "Get op %s ops kernel info store failed", node->GetName().c_str()); return ge::GE_GRAPH_PARAM_NULLPTR; } + + std::map 
unsupported_reasons; + std::string unsupported_reason; // begin accuracy supported check - if (!CheckAccuracySupport(kernel_info, instance, node)) { + if (!CheckAccuracySupport(kernel_info, instance, node, unsupported_reason)) { // if check accuracy support failed , try to go to other engine. GELOGD("Check Accuracy Supported return not support, node name is %s. Try to go to other engine.", op_desc->GetName().c_str()); @@ -123,13 +131,25 @@ graphStatus CompileNodesPass::GetSupportedKernel(const NodePtr &node, const std: continue; } OpsKernelInfoStorePtr tmp_kernel_info = it->second; - if (CheckAccuracySupport(tmp_kernel_info, instance, node)) { + if (CheckAccuracySupport(tmp_kernel_info, instance, node, unsupported_reason)) { kernel_lib_name = tmp_kernel_name; GELOGD("Find kernel lib %s support node:%s, type:%s , get kernel lib success.", tmp_kernel_name.c_str(), node->GetName().c_str(), op_desc->GetType().c_str()); return GRAPH_SUCCESS; + } else { + unsupported_reasons.emplace(tmp_kernel_name, unsupported_reason); } } + for (const auto &it : unsupported_reasons) { + REPORT_INPUT_ERROR("E13002", std::vector({"optype", "opskernel", "reason"}), + std::vector({op_desc->GetType(), it.first, it.second})); + GELOGE(GE_GRAPH_ASSIGN_ENGINE_FAILED, + "CheckAccuracySupport:Op type %s of ops kernel %s is unsupported, reason:%s", + op_desc->GetType().c_str(), it.first.c_str(), it.second.c_str()); + } + + REPORT_INPUT_ERROR("E13003", std::vector({"opname", "optype"}), + std::vector({op_desc->GetName(), op_desc->GetType()})); GELOGE(GRAPH_FAILED, "Cannot find kernel lib support node:%s, type:%s , get kernel lib failed.", node->GetName().c_str(), op_desc->GetType().c_str()); return GRAPH_FAILED; @@ -137,10 +157,10 @@ graphStatus CompileNodesPass::GetSupportedKernel(const NodePtr &node, const std: return GRAPH_SUCCESS; } -bool CompileNodesPass::CheckAccuracySupport(const OpsKernelInfoStorePtr &kernel_info, - const std::shared_ptr instance, const NodePtr &node) { - string reason; - 
if (!(kernel_info->CheckAccuracySupported(node, reason, true))) { +bool CompileNodesPass::CheckAccuracySupport( + const OpsKernelInfoStorePtr &kernel_info, const std::shared_ptr instance, + const NodePtr &node, string& unsupported_reason) { + if (!(kernel_info->CheckAccuracySupported(node, unsupported_reason, true))) { return false; } return true; @@ -153,6 +173,8 @@ graphStatus CompileNodesPass::CompileNodes(const std::shared_ptr instance for (auto &kernel_nodes : kernel_to_compile_nodes) { kernel_info = instance->OpsKernelManagerObj().GetOpsKernelInfoStore(kernel_nodes.first); if (kernel_info == nullptr) { + REPORT_INNER_ERROR("E19999", "Find ops kernel by name:%s failed", + kernel_nodes.first.c_str()); GELOGE(ge::GE_GRAPH_PARAM_NULLPTR, "Get op %s ops kernel info store failed", kernel_nodes.first.c_str()); return ge::GE_GRAPH_PARAM_NULLPTR; } @@ -168,6 +190,8 @@ graphStatus CompileNodesPass::CompileNodes(const std::shared_ptr instance } auto ret = kernel_info->CompileOp(kernel_nodes.second); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Call CompileOp failed, kernel_lib_name:%s, ret:%d", + kernel_nodes.first.c_str(), ret); GELOGE(ret, "Compile op failed, kernel name is %s", kernel_nodes.first.c_str()); return GRAPH_FAILED; } diff --git a/ge/graph/passes/compile_nodes_pass.h b/ge/graph/passes/compile_nodes_pass.h index e9a77e07..11a0f4fa 100644 --- a/ge/graph/passes/compile_nodes_pass.h +++ b/ge/graph/passes/compile_nodes_pass.h @@ -39,7 +39,7 @@ class CompileNodesPass : public GraphPass { private: graphStatus GetSupportedKernel(const NodePtr &node, const std::shared_ptr instance, string &kernel_lib_name); bool CheckAccuracySupport(const OpsKernelInfoStorePtr &kernel_info, const std::shared_ptr instance, - const NodePtr &node); + const NodePtr &node, string& unsupported_reason); graphStatus CompileNodes(const std::shared_ptr instance, std::unordered_map> &kernel_to_compile_nodes); }; diff --git a/ge/graph/passes/cond_pass.cc 
b/ge/graph/passes/cond_pass.cc index 06a209ed..c274df49 100644 --- a/ge/graph/passes/cond_pass.cc +++ b/ge/graph/passes/cond_pass.cc @@ -75,6 +75,10 @@ Status CondPass::Run(NodePtr &node) { case DT_INT32: break; default: + REPORT_INNER_ERROR("E19999", + "data_type:%d of index:%d input tensor in op:%s(%s) check invalid", + cond_tensor.GetDataType(), cond_in_anchor->GetIdx(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "UpdateInputDesc for node %s failed.", op_desc->GetName().c_str()); return FAILED; } @@ -85,6 +89,8 @@ Status CondPass::Run(NodePtr &node) { cond_tensor.SetShape(GeShape()); cond_tensor.SetOriginShape(GeShape()); if (op_desc->UpdateInputDesc(cond_in_anchor->GetIdx(), cond_tensor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input desc of op:%s(%s) failed, index:%d", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), cond_in_anchor->GetIdx()); GELOGE(FAILED, "UpdateInputDesc for node %s failed.", op_desc->GetName().c_str()); return FAILED; } @@ -158,6 +164,9 @@ Status CondPass::GetCondInfoForWhile(const NodePtr &node, ComputeGraphPtr &graph std::map subgraph_names_to_index = op_desc->GetSubgraphNameIndexes(); auto iter = subgraph_names_to_index.find(ATTR_NAME_WHILE_COND); if (iter == subgraph_names_to_index.end()) { + REPORT_INNER_ERROR("E19999", "subgraph name:%s not exist in SubgraphNameIndexes map of op:%s(%s), " + "check invalid", ATTR_NAME_WHILE_COND.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Get cond_graph index failed, while_node:%s.", node->GetName().c_str()); return FAILED; } @@ -170,6 +179,8 @@ Status CondPass::GetCondInfoForWhile(const NodePtr &node, ComputeGraphPtr &graph // cond_graph has and only has one output uint32_t output_num = net_output_node->GetAllInDataAnchorsSize(); if (output_num != 1) { + REPORT_INNER_ERROR("E19999", "Input data anchor num:%u of op:%s(%s) not equal to 1, check invalid", + output_num, op_desc->GetName().c_str(), 
op_desc->GetType().c_str()); GELOGE(FAILED, "output size of cond_graph is invalid, expect 1 but %u exactly, while_node:%s.", output_num, node->GetName().c_str()); return FAILED; @@ -233,6 +244,12 @@ Status CondPass::HandleScalarCond(const ComputeGraphPtr &graph, const OutDataAnc } if (GraphUtils::InsertNodeAfter(peer_out_anchor, { cond_in_anchor }, cast_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Insert Cast node %s(%s) between %s(%s)->%s(%s) failed", + cast_node->GetName().c_str(), cast_node->GetType().c_str(), + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), + cond_in_anchor->GetOwnerNode()->GetName().c_str(), + cond_in_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(FAILED, "Insert Cast node %s between %s->%s failed.", cast_node->GetName().c_str(), peer_out_anchor->GetOwnerNode()->GetName().c_str(), cond_in_anchor->GetOwnerNode()->GetName().c_str()); @@ -268,17 +285,27 @@ Status CondPass::InsertNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr OpDescBuilder op_desc_builder(in_data_anchor->GetOwnerNode()->GetName() + "_" + type, type); OpDescPtr op_desc = op_desc_builder.AddInput("x", in_tensor).AddOutput("y", out_tensor).Build(); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Create op_desc:%s(%s) failed", + (in_data_anchor->GetOwnerNode()->GetName() + "_" + type).c_str(), type.c_str()); GELOGE(FAILED, "Create op_desc failed."); return FAILED; } NodePtr new_node = graph->AddNode(op_desc); if (new_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "Create %s node failed.", type.c_str()); return FAILED; } AddRePassNode(new_node); if (GraphUtils::InsertNodeAfter(peer_out_anchor, { in_data_anchor }, new_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Insert node %s(%s) between %s(%s)->%s(%s) failed", + new_node->GetName().c_str(), 
new_node->GetType().c_str(), + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), + in_data_anchor->GetOwnerNode()->GetName().c_str(), + in_data_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(FAILED, "Insert %s node %s between %s->%s failed.", type.c_str(), new_node->GetName().c_str(), peer_out_anchor->GetOwnerNode()->GetName().c_str(), in_data_anchor->GetOwnerNode()->GetName().c_str()); @@ -310,6 +337,8 @@ NodePtr CondPass::AddCastNode(const ComputeGraphPtr &graph, const std::string &n OpDescBuilder op_desc_builder(name, CAST); OpDescPtr cast_desc = op_desc_builder.AddInput("x", in_tensor).AddOutput("y", out_tensor).Build(); if (cast_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Create op_desc:%s(%s) failed", + name.c_str(), CAST); GELOGE(FAILED, "Create cast op_desc failed, name: %s.", name.c_str()); return nullptr; } @@ -317,12 +346,18 @@ NodePtr CondPass::AddCastNode(const ComputeGraphPtr &graph, const std::string &n AttrUtils::SetInt(cast_desc, CAST_ATTR_DSTT, dst) && AttrUtils::SetInt(cast_desc, CAST_ATTR_DST_TYPE, dst) && AttrUtils::SetBool(cast_desc, CAST_ATTR_TRUNCATE, false))) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s,%s,%s,%s to node:%s(%s) not all success", + CAST_ATTR_SRCT.c_str(), CAST_ATTR_DSTT.c_str(), + CAST_ATTR_DST_TYPE.c_str(), CAST_ATTR_TRUNCATE.c_str(), + cast_desc->GetName().c_str(), cast_desc->GetType().c_str()); GELOGE(FAILED, "Set CAST_ATTR failed, node: %s.", name.c_str()); return nullptr; } NodePtr cast_node = graph->AddNode(cast_desc); if (cast_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + cast_desc->GetName().c_str(), cast_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "Add cast node failed, name: %s.", name.c_str()); return nullptr; } diff --git a/ge/graph/passes/cond_remove_pass.cc b/ge/graph/passes/cond_remove_pass.cc index 5fc41714..74568c2f 100644 --- a/ge/graph/passes/cond_remove_pass.cc +++ 
b/ge/graph/passes/cond_remove_pass.cc @@ -85,6 +85,11 @@ Status CondRemovePass::RemoveDeadCondLink(const int32_t index, const NodePtr &no const auto &in_anchor = node->GetInDataAnchor(index); const auto &peerout_anchor = in_anchor->GetPeerOutAnchor(); if (GraphUtils::RemoveEdge(peerout_anchor, in_anchor) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(out_index:%d) and op:%s(%s)(in_index:%d) failed", + peerout_anchor->GetOwnerNode()->GetName().c_str(), + peerout_anchor->GetOwnerNode()->GetType().c_str(), peerout_anchor->GetIdx(), + in_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetOwnerNode()->GetType().c_str(), + in_anchor->GetIdx()); GELOGE(FAILED, "Remove edge from node %s index %d to node %s index %d.", peerout_anchor->GetOwnerNode()->GetName().c_str(), peerout_anchor->GetIdx(), in_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetIdx()); @@ -98,6 +103,8 @@ Status CondRemovePass::GetCaseChosenBranch(const NodePtr &node, const uint32_t c uint32_t subgraph_names_size = static_cast(node->GetOpDesc()->GetSubgraphInstanceNames().size()); uint32_t cond_index_new = cond_index; if (subgraph_names_size == 0) { + REPORT_INNER_ERROR("E19999", "subgraph size of op:%s(%s) is 0, check invavlid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Node %s has none subgraph.", node->GetName().c_str()); return ge::FAILED; } @@ -107,6 +114,8 @@ Status CondRemovePass::GetCaseChosenBranch(const NodePtr &node, const uint32_t c } const auto &chosen_branch_name = node->GetOpDesc()->GetSubgraphInstanceName(cond_index_new); if (chosen_branch_name.empty()) { + REPORT_INNER_ERROR("E19999", "Get subgraph name from op:%s(%s) by index:%u failed", + node->GetName().c_str(), node->GetType().c_str(), cond_index_new); GELOGE(FAILED, "Node %s has no subgraph, index is %u.", node->GetName().c_str(), cond_index_new); return ge::FAILED; } @@ -121,6 +130,8 @@ Status CondRemovePass::GetIfChosenBranch(const NodePtr &node, const uint32_t con 
uint32_t subgraph_names_size = static_cast(node->GetOpDesc()->GetSubgraphInstanceNames().size()); uint32_t cond_index_new = 0; if (subgraph_names_size == 0) { + REPORT_INNER_ERROR("E19999", "subgraph size of op:%s(%s) is 0, check invavlid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Node %s has none subgraph.", node->GetName().c_str()); return ge::FAILED; } @@ -130,11 +141,16 @@ Status CondRemovePass::GetIfChosenBranch(const NodePtr &node, const uint32_t con } const auto &chosen_branch_name = node->GetOpDesc()->GetSubgraphInstanceName(cond_index_new); if (chosen_branch_name.empty()) { + REPORT_INNER_ERROR("E19999", "Get subgraph name from op:%s(%s) by index:%u failed", + node->GetName().c_str(), node->GetType().c_str(), cond_index_new); GELOGE(FAILED, "Node %s has no subgraph, index is %u.", node->GetName().c_str(), cond_index_new); return ge::FAILED; } auto chosen_graph = GraphUtils::FindRootGraph(node->GetOwnerComputeGraph())->GetSubgraph(chosen_branch_name); if (chosen_graph == nullptr) { + REPORT_INNER_ERROR("E19999", + "Find subgraph by name:%s from node:%s(%s)'s root_graph failed", + chosen_branch_name.c_str(), node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Can not find branch %s in node %s's parent graph %s.", chosen_branch_name.c_str(), node->GetName().c_str(), node->GetOwnerComputeGraph()->GetName().c_str()); return ge::FAILED; @@ -242,6 +258,11 @@ Status CondRemovePass::ReplaceIfCaseNodeWithPartitioncall(const NodePtr &node, c for (const auto &peerout_anchor : input_anchor->GetPeerAnchors()) { if (GraphUtils::AddEdge(peerout_anchor, partitioncall_node->GetInAnchor( input_anchor->GetIdx() - kConditionIndexNum)) != ge::GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(out_index:%d) and op:%s(%s)(in_index:%d) failed", + peerout_anchor->GetOwnerNode()->GetName().c_str(), + peerout_anchor->GetOwnerNode()->GetType().c_str(), peerout_anchor->GetIdx(), + partitioncall_node->GetName().c_str(), 
+ partitioncall_node->GetType().c_str(), input_anchor->GetIdx()); GELOGE(FAILED, "Add edge failed, from node:%s idx:%d to node:%s idx:%d, input num:%zu, output num:%zu", peerout_anchor->GetOwnerNode()->GetName().c_str(), peerout_anchor->GetIdx(), partitioncall_node->GetName().c_str(), input_anchor->GetIdx(), input_desc_size, @@ -255,6 +276,10 @@ Status CondRemovePass::ReplaceIfCaseNodeWithPartitioncall(const NodePtr &node, c for (const auto &output_anchor : node->GetAllOutAnchors()) { for (const auto &peerin_anchor : output_anchor->GetPeerAnchors()) { if (GraphUtils::RemoveEdge(node->GetOutAnchor(output_anchor->GetIdx()), peerin_anchor) != ge::GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(out_index:%d) and op:%s(%s)(in_index:%d) failed", + node->GetName().c_str(), node->GetType().c_str(), output_anchor->GetIdx(), + peerin_anchor->GetOwnerNode()->GetName().c_str(), + peerin_anchor->GetOwnerNode()->GetType().c_str(), peerin_anchor->GetIdx()); GELOGE(FAILED, "Remove edge failed, from node:%s idx:%d to node:%s idx:%d, input num:%zu, output num:%zu", node->GetName().c_str(), output_anchor->GetIdx(), peerin_anchor->GetOwnerNode()->GetName().c_str(), peerin_anchor->GetIdx(), input_desc_size, output_desc_size); @@ -262,6 +287,11 @@ Status CondRemovePass::ReplaceIfCaseNodeWithPartitioncall(const NodePtr &node, c } if (GraphUtils::AddEdge(partitioncall_node->GetOutAnchor(output_anchor->GetIdx()), peerin_anchor) != ge::GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(out_index:%d) and op:%s(%s)(in_index:%d) failed", + partitioncall_node->GetName().c_str(), + partitioncall_node->GetType().c_str(), output_anchor->GetIdx(), + peerin_anchor->GetOwnerNode()->GetName().c_str(), + peerin_anchor->GetOwnerNode()->GetType().c_str(), peerin_anchor->GetIdx()); GELOGE(FAILED, "Add edge failed, from node:%s idx:%d to node:%s idx:%d, input num:%zu, output num:%zu", partitioncall_node->GetName().c_str(), output_anchor->GetIdx(), 
peerin_anchor->GetOwnerNode()->GetName().c_str(), peerin_anchor->GetIdx(), input_desc_size, diff --git a/ge/graph/passes/constant_folding_pass.cc b/ge/graph/passes/constant_folding_pass.cc index 66e076af..db2ef494 100644 --- a/ge/graph/passes/constant_folding_pass.cc +++ b/ge/graph/passes/constant_folding_pass.cc @@ -108,6 +108,8 @@ Status ConstantFoldingPass::Run(ge::NodePtr &node) { node->GetType().c_str()); return SUCCESS; } + REPORT_CALL_ERROR("E19999", "Calculate for node %s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Calculate for node %s failed in constant folding", node->GetName().c_str()); return ret; } @@ -125,6 +127,8 @@ Status ConstantFoldingPass::Run(ge::NodePtr &node) { } if (outputs.empty()) { + REPORT_INNER_ERROR("E19999", "After calculate for node %s(%s), output weight is empty, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to constant folding on node %s," " no output weight", diff --git a/ge/graph/passes/constant_fuse_same_pass.cc b/ge/graph/passes/constant_fuse_same_pass.cc index eb8b3470..8cb8c091 100644 --- a/ge/graph/passes/constant_fuse_same_pass.cc +++ b/ge/graph/passes/constant_fuse_same_pass.cc @@ -57,6 +57,7 @@ void GetOutDataNodeToIndexMap(NodePtr &node, std::map & Status ConstantFuseSamePass::Run(ge::ComputeGraphPtr graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "Compute graph is null."); return GE_GRAPH_PARAM_NULLPTR; } @@ -159,6 +160,11 @@ Status ConstantFuseSamePass::MoveOutDataEdges(NodePtr &src_node, NodePtr &dst_no } auto ret = dst_out_data_anchor->LinkTo(it->second); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:0 link to op:%s(%s) in index:%d failed", + dst_node->GetName().c_str(), dst_node->GetType().c_str(), + it->second->GetOwnerNode()->GetName().c_str(), it->second->GetOwnerNode()->GetType().c_str(), + 
it->second->GetIdx()); GELOGE(FAILED, "Failed to move out data edge from %s to %s", src_node->GetName().c_str(), dst_node->GetName().c_str()); return FAILED; @@ -185,6 +191,8 @@ Status ConstantFuseSamePass::FuseConstNodes(ComputeGraphPtr &graph, return FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "[%s] RemoveNodeWithoutRelink failed.", node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/control_trigger_pass.cc b/ge/graph/passes/control_trigger_pass.cc index e179c64e..9125a48f 100644 --- a/ge/graph/passes/control_trigger_pass.cc +++ b/ge/graph/passes/control_trigger_pass.cc @@ -70,6 +70,12 @@ Status ControlTriggerPass::HandleDynamicCtrlEdges(ComputeGraphPtr &graph, NodePt NodePtr constant = (branch_flag ? iter2->second.second : iter2->second.first); if ((GraphUtils::RemoveEdge(in_ctrl_node->GetOutControlAnchor(), node->GetInControlAnchor()) != GRAPH_SUCCESS) || (GraphUtils::AddEdge(in_ctrl_node->GetOutControlAnchor(), constant->GetInControlAnchor()) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s), then " + "add control edge between op:%s(%s) and op:%s(%s) failed", + in_ctrl_node->GetName().c_str(), in_ctrl_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str(), + in_ctrl_node->GetName().c_str(), in_ctrl_node->GetType().c_str(), + constant->GetName().c_str(), constant->GetType().c_str()); GELOGE(FAILED, "Replace ctrl edge fail, %s->%s, %s->%s.", in_ctrl_node->GetName().c_str(), node->GetName().c_str(), in_ctrl_node->GetName().c_str(), constant->GetName().c_str()); return FAILED; @@ -185,6 +191,7 @@ ControlNodeType ControlTriggerPass::TransferNodeType(const NodePtr &node, uint32 } else if ((type == MERGE) || (type == REFMERGE)) { OpDescPtr merge_desc = 
node->GetOpDesc(); if (merge_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "op_desc in merge node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "FindPredInput fail, merge_desc is null, merge_node: %s.", node->GetName().c_str()); return kInvalidType; } @@ -264,14 +271,23 @@ Status ControlTriggerPass::InsertOppositeBranch(ComputeGraphPtr &graph, NodePtr } if (GraphUtils::AddEdge(in_ctrl_node->GetOutControlAnchor(), orig_const->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + in_ctrl_node->GetName().c_str(), in_ctrl_node->GetType().c_str(), + orig_const->GetName().c_str(), orig_const->GetType().c_str()); GELOGE(FAILED, "Add in ctrl edge fail, %s->%s.", in_ctrl_node->GetName().c_str(), orig_const->GetName().c_str()); return FAILED; } if (GraphUtils::AddEdge(switch_node->GetOutDataAnchor(new_idx), identity_node->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%u) and op:%s(%s)(index:0) failed", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), new_idx, + identity_node->GetName().c_str(), identity_node->GetType().c_str()); GELOGE(FAILED, "Add in data edge fail, %s->%s.", switch_desc->GetName().c_str(), identity_node->GetName().c_str()); return FAILED; } if (GraphUtils::AddEdge(identity_node->GetOutControlAnchor(), new_const->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + identity_node->GetName().c_str(), identity_node->GetType().c_str(), + new_const->GetName().c_str(), new_const->GetType().c_str()); GELOGE(FAILED, "Add in ctrl edge fail, %s->%s.", identity_node->GetName().c_str(), new_const->GetName().c_str()); return FAILED; } @@ -282,6 +298,7 @@ Status ControlTriggerPass::InsertOppositeBranch(ComputeGraphPtr &graph, NodePtr control_trigger_map_[node] = {pred_const}; } else { if (!iter->second.insert(pred_const).second) { 
+ REPORT_INNER_ERROR("E19999", "Insert to control_trigger_map_ failed"); GELOGE(FAILED, "control_trigger_map_ insert failed."); return FAILED; } @@ -303,12 +320,15 @@ NodePtr ControlTriggerPass::InsertMergeNode(ComputeGraphPtr &graph, NodePtr &nod const std::string name = node->GetName() + "_" + MERGE; OpDescPtr op_desc = MakeShared(name, MERGE); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create Merge op %s: create op_desc fail.", name.c_str()); return nullptr; } if ((op_desc->AddInputDesc(data_desc) != GRAPH_SUCCESS) || (op_desc->AddInputDesc(data_desc) != GRAPH_SUCCESS) || (op_desc->AddOutputDesc(data_desc) != GRAPH_SUCCESS) || (op_desc->AddOutputDesc(data_desc) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Add input or ouput desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Merge op %s: add input/output desc fail.", name.c_str()); return nullptr; } @@ -316,12 +336,20 @@ NodePtr ControlTriggerPass::InsertMergeNode(ComputeGraphPtr &graph, NodePtr &nod GELOGI("Create Merge op:%s.", name.c_str()); NodePtr merge_node = graph->AddNode(op_desc); if (merge_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Create Merge op %s fail.", name.c_str()); return nullptr; } if ((GraphUtils::RemoveEdge(in_ctrl_node->GetOutControlAnchor(), node->GetInControlAnchor()) != GRAPH_SUCCESS) || (GraphUtils::AddEdge(merge_node->GetOutControlAnchor(), node->GetInControlAnchor()) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s), then " + "add control edge between op:%s(%s) and op:%s(%s) failed", + in_ctrl_node->GetName().c_str(), in_ctrl_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str(), + merge_node->GetName().c_str(), 
merge_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Replace ctrl edge fail, %s->%s, %s->%s", in_ctrl_node->GetName().c_str(), node->GetName().c_str(), merge_node->GetName().c_str(), node->GetName().c_str()); return nullptr; @@ -343,6 +371,7 @@ NodePtr ControlTriggerPass::InsertConstNode(ComputeGraphPtr &graph, NodePtr &mer const std::string name = merge_node->GetName() + "_" + CONSTANT + (flag ? "_t" : "_f"); OpDescPtr op_desc = MakeShared(name, CONSTANT); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create Const op %s: create op_desc fail.", name.c_str()); return nullptr; } @@ -350,15 +379,20 @@ NodePtr ControlTriggerPass::InsertConstNode(ComputeGraphPtr &graph, NodePtr &mer int32_t value = 0; GeTensorPtr const_value = MakeShared(data_desc, reinterpret_cast(&value), sizeof(int32_t)); if (const_value == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "Create tensor fail."); return nullptr; } if (!AttrUtils::SetTensor(op_desc, ATTR_NAME_WEIGHTS, const_value)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Const op %s: set attr ATTR_NAME_WEIGHTS fail.", name.c_str()); return nullptr; } if (op_desc->AddOutputDesc(data_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ouput desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Const op %s: add output desc fail.", name.c_str()); return nullptr; } @@ -366,12 +400,17 @@ NodePtr ControlTriggerPass::InsertConstNode(ComputeGraphPtr &graph, NodePtr &mer GELOGI("Create Const op: %s", name.c_str()); NodePtr const_node = graph->AddNode(op_desc); if (const_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), 
op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Create Const op %s fail.", name.c_str()); return nullptr; } uint32_t out_idx = (flag ? SWITCH_TRUE_OUTPUT : SWITCH_FALSE_OUTPUT); if (GraphUtils::AddEdge(const_node->GetOutDataAnchor(0), merge_node->GetInDataAnchor(out_idx)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:%u) failed", + const_node->GetName().c_str(), const_node->GetType().c_str(), + merge_node->GetName().c_str(), merge_node->GetType().c_str(), out_idx); GELOGE(FAILED, "Add in data edge fail, %s->%s", const_node->GetName().c_str(), merge_node->GetName().c_str()); return nullptr; } @@ -390,11 +429,14 @@ NodePtr ControlTriggerPass::InsertIdentityNode(ComputeGraphPtr &graph, const std const GeTensorDesc &data_desc) { OpDescPtr op_desc = MakeShared(name, IDENTITY); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create Identity op %s: create op_desc fail.", name.c_str()); return nullptr; } if ((op_desc->AddInputDesc(data_desc) != GRAPH_SUCCESS) || (op_desc->AddOutputDesc(data_desc) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Add input or output desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Identity op %s: add input/output desc fail.", name.c_str()); return nullptr; } @@ -402,6 +444,8 @@ NodePtr ControlTriggerPass::InsertIdentityNode(ComputeGraphPtr &graph, const std GELOGI("Create Identity op:%s.", name.c_str()); NodePtr identity_node = graph->AddNode(op_desc); if (identity_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Create Identity op %s fail.", name.c_str()); return nullptr; } @@ -418,17 +462,24 @@ NodePtr ControlTriggerPass::InsertIdentityNode(ComputeGraphPtr &graph, const std /// Status 
ControlTriggerPass::FindPredInput(const NodePtr &switch_node) { if (switch_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param switch_node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "switch_node is null"); return INTERNAL_ERROR; } InDataAnchorPtr in_cond_anchor = switch_node->GetInDataAnchor(SWITCH_PRED_INPUT); if (in_cond_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%d in anchor of switch_node:%s(%s) is nullptr, check invalid", + SWITCH_PRED_INPUT, + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "in_cond_anchor is nullptr, node: %s.", switch_node->GetName().c_str()); return INTERNAL_ERROR; } OutDataAnchorPtr pred_cond_anchor = in_cond_anchor->GetPeerOutAnchor(); if (pred_cond_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%d in anchor of switch_node:%s(%s), it's peer anchor is nullptr, " + "check invalid", SWITCH_PRED_INPUT, + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "pred_cond_anchor is nullptr, node: %s.", switch_node->GetName().c_str()); return INTERNAL_ERROR; } diff --git a/ge/graph/passes/ctrl_edge_transfer_pass.cc b/ge/graph/passes/ctrl_edge_transfer_pass.cc index a538a10c..598d2e14 100755 --- a/ge/graph/passes/ctrl_edge_transfer_pass.cc +++ b/ge/graph/passes/ctrl_edge_transfer_pass.cc @@ -64,13 +64,13 @@ Status CtrlEdgeTransferPass::Run(ge::ComputeGraphPtr graph) { for (auto &in_control_node : n->GetInControlNodes()) { GE_CHECK_NOTNULL(in_control_node); - GE_CHK_STATUS_RET(ge::GraphUtils::RemoveEdge(in_control_node->GetOutControlAnchor(), + GE_CHK_GRAPH_STATUS_RET(ge::GraphUtils::RemoveEdge(in_control_node->GetOutControlAnchor(), n->GetInControlAnchor()), "remove edge failed"); for (auto &out_node : n->GetOutNodes()) { if (out_node == nullptr) { continue; } - GE_CHK_STATUS_RET(ge::GraphUtils::AddEdge(in_control_node->GetOutControlAnchor(), + GE_CHK_GRAPH_STATUS_RET(ge::GraphUtils::AddEdge(in_control_node->GetOutControlAnchor(), 
out_node->GetInControlAnchor()), "add edge failed."); } } diff --git a/ge/graph/passes/data_pass.cc b/ge/graph/passes/data_pass.cc index 5bbd2fb1..cb94b161 100644 --- a/ge/graph/passes/data_pass.cc +++ b/ge/graph/passes/data_pass.cc @@ -30,6 +30,8 @@ Status MappingSubgraphInput(const ComputeGraphPtr &graph, const std::functionGetOpDesc(), "index", index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", "index", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Failed to get index from data[%s]", node->GetName().c_str()); return FAILED; } @@ -38,6 +40,8 @@ Status MappingSubgraphInput(const ComputeGraphPtr &graph, const std::functionGetName().c_str(), index, parent_index); if (!AttrUtils::SetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Failed to set parent index for node %s", node->GetName().c_str()); return FAILED; } @@ -65,6 +69,9 @@ Status MappingSubgraphOutput(const ComputeGraphPtr &graph, const std::functionMutableInputDesc(index); GE_CHECK_NOTNULL(tensor); if (!AttrUtils::SetInt(tensor, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to tensor of op:%s(%s) input:%zu failed", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), + index); GELOGE(FAILED, "Failed to set parent index for graph %s", graph->GetName().c_str()); return FAILED; } @@ -140,12 +147,16 @@ Status DataPass::PostParseSubgraph(const ComputeGraphPtr &graph, const string &i auto post_func_it = subgraph_handle.find(parent_node->GetType()); if (post_func_it == subgraph_handle.end()) { + REPORT_INNER_ERROR("E19999", "The subgraph post func for node %s type %s is null, check invalid", + parent_node->GetName().c_str(), parent_node->GetType().c_str()); GELOGE(FAILED, "The subgraph post func 
for node %s type %s is null.", parent_node->GetName().c_str(), parent_node->GetType().c_str()); return FAILED; } if (post_func_it->second(ir_name, graph) != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Post process subgraph %s on node %s type %s failed", + graph->GetName().c_str(), parent_node->GetName().c_str(), parent_node->GetType().c_str()); GELOGE(FAILED, "Failed to post process subgraph %s on node %s type %s", graph->GetName().c_str(), parent_node->GetName().c_str(), parent_node->GetType().c_str()); return FAILED; diff --git a/ge/graph/passes/dimension_adjust_pass.cc b/ge/graph/passes/dimension_adjust_pass.cc index 9677fa5f..dbea8dc9 100755 --- a/ge/graph/passes/dimension_adjust_pass.cc +++ b/ge/graph/passes/dimension_adjust_pass.cc @@ -29,12 +29,14 @@ const int kRemoveInputIndex = 1; Status DimensionAdjustPass::Run(ge::NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "node is nullptr."); return PARAM_INVALID; } OpDescPtr op_desc_ptr = node->GetOpDesc(); if (op_desc_ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "Param op_desc of node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "GetOpDesc return nullptr."); return PARAM_INVALID; } @@ -42,6 +44,8 @@ Status DimensionAdjustPass::Run(ge::NodePtr &node) { string type; Status ret = GetOriginalType(node, type); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get OriginalType of op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(ret, "DimensionAdjustPass get originnal type fail."); return ret; } @@ -69,15 +73,31 @@ Status DimensionAdjustPass::Run(ge::NodePtr &node) { if (ret == NOT_CHANGED) { return SUCCESS; } + REPORT_CALL_ERROR("E19999", "kernel compute for op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(ret, "DimensionAdjustPass compute failed"); return ret; } + // Need to handle axis_input of node like ExpandDims if (node->GetAllInDataAnchors().size() > 
static_cast(kRemoveInputIndex)) { + auto axis_node_out_anchor = node->GetInDataAnchor(kRemoveInputIndex)->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(axis_node_out_anchor); + auto axis_node = axis_node_out_anchor->GetOwnerNode(); + // 1.Copy control dependency of axis node ret = PassUtils::UnlinkNodeWithControlCopy(node, kRemoveInputIndex); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Unlink op:%s(%s) data input:%u with control edge copy failed", + node->GetName().c_str(), node->GetType().c_str(), kRemoveInputIndex); GELOGE(ret, "DimensionAdjustPass unlink node with control copy fail."); return ret; } + // 2.Remove const axis node without any output + if ((axis_node->GetType() == CONSTANT || axis_node->GetType() == CONSTANTOP) && + axis_node->GetOutDataNodesSize() == 0) { + ret = IsolateAndDeleteNode(axis_node, {}); + GE_CHK_GRAPH_STATUS_RET(ret, "Fail to remove node %s.", axis_node->GetName().c_str()); + GELOGI("Remove useless axis input const %s", axis_node->GetName().c_str()); + } } ret = DealWithInNodes(node); @@ -111,12 +131,12 @@ Status DimensionAdjustPass::DealWithInNodes(NodePtr &node) { GE_CHECK_NOTNULL(identity); GELOGI("Create new identity node[%s] after node %s[type: %s] success.", identity->GetName().c_str(), in_node->GetName().c_str(), in_node->GetType().c_str()); - GE_CHK_STATUS_RET(GraphUtils::AddEdge(in_node_anchor, identity->GetInDataAnchor(0))) + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(in_node_anchor, identity->GetInDataAnchor(0))) GE_CHECK_NOTNULL(identity->GetOutControlAnchor()); if (identity->GetOutControlAnchor()->IsLinkedWith(node->GetInControlAnchor())) { continue; } - GE_CHK_STATUS_RET(GraphUtils::AddEdge(identity->GetOutControlAnchor(), node->GetInControlAnchor())) + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(identity->GetOutControlAnchor(), node->GetInControlAnchor())) } } @@ -126,12 +146,14 @@ Status DimensionAdjustPass::DealWithInNodes(NodePtr &node) { NodePtr DimensionAdjustPass::AddIdentityNodeToGraph(const string &name, 
const GeTensorDesc &tensor, ComputeGraphPtr &graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Comput graph ptr is null in creating identity node."); return nullptr; } OpDescPtr desc = MakeShared("", ""); if (desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(MEMALLOC_FAILED, "Failed to create op desc."); return nullptr; } @@ -141,6 +163,8 @@ NodePtr DimensionAdjustPass::AddIdentityNodeToGraph(const string &name, const Ge auto ret = desc->AddInputDesc(tensor); auto ret2 = desc->AddOutputDesc(tensor); if ((ret != GRAPH_SUCCESS) || (ret2 != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Add input or output desc to op:%s(%s) failed", + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add input/output desc in creating identity."); return nullptr; } diff --git a/ge/graph/passes/dimension_compute_pass.cc b/ge/graph/passes/dimension_compute_pass.cc index dfa2d404..cfd978b6 100755 --- a/ge/graph/passes/dimension_compute_pass.cc +++ b/ge/graph/passes/dimension_compute_pass.cc @@ -39,12 +39,16 @@ Status DimensionComputePass::Run(ge::NodePtr &node) { if (ret == NOT_CHANGED) { return SUCCESS; } else { + REPORT_CALL_ERROR("E19999", "kernel compute for op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(ret, "DimensionComputePass Compute failed"); return ret; } } if (outputs.empty()) { + REPORT_INNER_ERROR("E19999", "After compute for node %s(%s), output weight is empty, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to compute dims for node %s," " no output weight", diff --git a/ge/graph/passes/dropout_pass.cc b/ge/graph/passes/dropout_pass.cc index 09c297a6..11be74f0 100644 --- a/ge/graph/passes/dropout_pass.cc +++ b/ge/graph/passes/dropout_pass.cc @@ -31,10 +31,12 @@ namespace ge { Status DropOutPass::Run(NodePtr &node) { GELOGD("DropOutPass running"); if
(node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return FAILED; } if (node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param op_desc of node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [opDesc] must not be null."); return PARAM_INVALID; } diff --git a/ge/graph/passes/end_of_sequence_add_control_pass.cc b/ge/graph/passes/end_of_sequence_add_control_pass.cc index d6503d0d..361d4a46 100755 --- a/ge/graph/passes/end_of_sequence_add_control_pass.cc +++ b/ge/graph/passes/end_of_sequence_add_control_pass.cc @@ -26,6 +26,7 @@ namespace ge { Status EndOfSequenceAddControlPass::Run(ComputeGraphPtr graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [graph] must not be null."); return PARAM_INVALID; } @@ -82,6 +83,10 @@ Status EndOfSequenceAddControlPass::AddControlEdge(NodePtr &end_of_sequence, std } Status status = GraphUtils::AddEdge(out_ctrl_anchor, in_ctrl_anchor); if (status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Add control edge between op:%s(%s) and op:%s(%s) failed", + end_of_sequence->GetName().c_str(), end_of_sequence->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Graph add EndOfSequence op out ctrl edge fail, dst node: %s.", node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/enter_pass.cc b/ge/graph/passes/enter_pass.cc index 066c97cf..cde3d6d9 100644 --- a/ge/graph/passes/enter_pass.cc +++ b/ge/graph/passes/enter_pass.cc @@ -37,6 +37,7 @@ Status EnterPass::Run(NodePtr &node) { // enter node has only one input if (node->GetInDataNodes().empty()) { + REPORT_INNER_ERROR("E19999", "Param node in data nodes is empty, check invalid"); GELOGE(PARAM_INVALID, "enter_node %s has no input", node->GetName().c_str()); return PARAM_INVALID; } @@ -58,6 +59,9 @@ Status EnterPass::Run(NodePtr &node) { } 
GELOGI("Remove control edge from %s to %s.", node->GetName().c_str(), out_ctrl_node->GetName().c_str()); if (GraphUtils::RemoveEdge(node->GetOutControlAnchor(), out_ctrl_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + out_ctrl_node->GetName().c_str(), out_ctrl_node->GetType().c_str()); GELOGE(FAILED, "Remove Enter ctrl output fail, %s->%s", node->GetName().c_str(), out_ctrl_node->GetName().c_str()); return FAILED; @@ -89,14 +93,14 @@ Status EnterPass::OptimizeEnterWithOnlyDataOut(NodePtr &node, NodePtr &in_node) } GE_CHECK_NOTNULL(in_node->GetOutDataAnchor(0)); - GE_CHK_STATUS_RET(in_node->GetOutDataAnchor(0)->Unlink(node->GetInDataAnchor(0))) + GE_CHK_GRAPH_STATUS_RET(in_node->GetOutDataAnchor(0)->Unlink(node->GetInDataAnchor(0))) const auto &out_data_anchor = node->GetOutDataAnchor(0); GE_CHECK_NOTNULL(out_data_anchor); for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { - GE_CHK_STATUS_RET(out_data_anchor->Unlink(peer_in_data_anchor)) - GE_CHK_STATUS_RET(in_node->GetOutDataAnchor(0)->LinkTo(peer_in_data_anchor)) + GE_CHK_GRAPH_STATUS_RET(out_data_anchor->Unlink(peer_in_data_anchor)) + GE_CHK_GRAPH_STATUS_RET(in_node->GetOutDataAnchor(0)->LinkTo(peer_in_data_anchor)) } - GE_CHK_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(node->GetOwnerComputeGraph(), node)) + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(node->GetOwnerComputeGraph(), node)) AddNodeDeleted(node); AddRePassNodesWithInOut(in_node); @@ -136,11 +140,11 @@ Status EnterPass::UnlinkCtrlEdgeBeforeConst(NodePtr &node) { } GELOGI("Unlink control edge from %s to %s.", node->GetName().c_str(), out_ctrl_node->GetName().c_str()); - GE_CHK_STATUS_RET(out_ctrl_anchor->Unlink(out_ctrl_node->GetInControlAnchor())) + GE_CHK_GRAPH_STATUS_RET(out_ctrl_anchor->Unlink(out_ctrl_node->GetInControlAnchor())) for (auto 
&out_node_of_const : out_nodes_of_const) { if (!out_ctrl_anchor->IsLinkedWith(out_node_of_const->GetInControlAnchor())) { GELOGI("Link control edge from %s to %s.", node->GetName().c_str(), out_node_of_const->GetName().c_str()); - GE_CHK_STATUS_RET(out_ctrl_anchor->LinkTo(out_node_of_const->GetInControlAnchor())) + GE_CHK_GRAPH_STATUS_RET(out_ctrl_anchor->LinkTo(out_node_of_const->GetInControlAnchor())) } } } diff --git a/ge/graph/passes/flow_ctrl_pass.cc b/ge/graph/passes/flow_ctrl_pass.cc index 435130b3..0072224b 100755 --- a/ge/graph/passes/flow_ctrl_pass.cc +++ b/ge/graph/passes/flow_ctrl_pass.cc @@ -115,6 +115,7 @@ NodePtr FlowCtrlPass::InsertOp(ComputeGraphPtr &compute_graph, const string &nod const std::vector &output_list) { OpDescPtr op_desc = MakeShared(node_name, node_type); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Make OpDesc failed, name:%s, type:%s.", node_name.c_str(), node_type.c_str()); return nullptr; } @@ -122,6 +123,8 @@ NodePtr FlowCtrlPass::InsertOp(ComputeGraphPtr &compute_graph, const string &nod for (auto &input_desc : input_list) { graphStatus graph_status = op_desc->AddInputDesc(input_desc); if (graph_status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Add node:%s intput desc failed, error=%u.", node_name.c_str(), graph_status); return nullptr; } @@ -130,14 +133,21 @@ NodePtr FlowCtrlPass::InsertOp(ComputeGraphPtr &compute_graph, const string &nod for (auto &output_desc : output_list) { graphStatus graph_status = op_desc->AddOutputDesc(output_desc); if (graph_status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Add node:%s output desc failed, error=%u.", node_name.c_str(), graph_status); return nullptr; } } - GE_IF_BOOL_EXEC(compute_graph == nullptr, 
DOMI_LOGE("compute_graph is nullptr"); return nullptr); + GE_IF_BOOL_EXEC(compute_graph == nullptr, + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); + DOMI_LOGE("compute_graph is nullptr"); + return nullptr); NodePtr node = compute_graph->AddNode(op_desc); if (node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), compute_graph->GetName().c_str()); GELOGE(FAILED, "add node failed, name:%s, type:%s.", node_name.c_str(), node_type.c_str()); return nullptr; } @@ -149,9 +159,15 @@ NodePtr FlowCtrlPass::InsertOp(ComputeGraphPtr &compute_graph, const string &nod NodePtr FlowCtrlPass::InsertStreamSwitchOp(ComputeGraphPtr &compute_graph, const string &switch_name, const NodePtr &loop_cond, const NodePtr &iter_per_loop) { GE_IF_BOOL_EXEC(loop_cond == nullptr || loop_cond->GetOpDesc() == nullptr, - GELOGE(FAILED, "loop_cond is null"); return nullptr); + REPORT_INNER_ERROR("E19999", "Param loop_cond or its op_desc is nullptr, " + "check invalid"); + GELOGE(FAILED, "loop_cond is null"); + return nullptr); GE_IF_BOOL_EXEC(iter_per_loop == nullptr || iter_per_loop->GetOpDesc() == nullptr, - GELOGE(FAILED, "iter_per_loop is nullptr"); return nullptr); + REPORT_INNER_ERROR("E19999", "Param iter_per_loop or its op_desc is nullptr, " + "check invalid"); + GELOGE(FAILED, "iter_per_loop is nullptr"); + return nullptr); std::vector input_desc_list = {loop_cond->GetOpDesc()->GetOutputDesc(0), iter_per_loop->GetOpDesc()->GetOutputDesc(0)}; std::vector output_desc_list; @@ -164,6 +180,9 @@ NodePtr FlowCtrlPass::InsertStreamSwitchOp(ComputeGraphPtr &compute_graph, const // set input 0 graphStatus add_ret = GraphUtils::AddEdge(loop_cond->GetOutDataAnchor(0), stream_switch->GetInDataAnchor(0)); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:0) failed", + loop_cond->GetName().c_str(), 
loop_cond->GetType().c_str(), + stream_switch->GetName().c_str(), stream_switch->GetType().c_str()); GELOGE(FAILED, "Add loop_cond_node to switch_node:%s edge failed, ret = %u.", switch_name.c_str(), add_ret); return nullptr; } @@ -171,6 +190,9 @@ NodePtr FlowCtrlPass::InsertStreamSwitchOp(ComputeGraphPtr &compute_graph, const // set input 1 add_ret = GraphUtils::AddEdge(iter_per_loop->GetOutDataAnchor(0), stream_switch->GetInDataAnchor(1)); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:1) failed", + iter_per_loop->GetName().c_str(), iter_per_loop->GetType().c_str(), + stream_switch->GetName().c_str(), stream_switch->GetType().c_str()); GELOGE(FAILED, "Add iter_per_loop_node to switch_node:%s edge failed, ret = %u.", switch_name.c_str(), add_ret); return nullptr; } @@ -178,13 +200,19 @@ NodePtr FlowCtrlPass::InsertStreamSwitchOp(ComputeGraphPtr &compute_graph, const // stream switch op need switch cond by attr. GE_IF_BOOL_EXEC(!AttrUtils::SetInt(stream_switch->GetOpDesc(), ATTR_NAME_STREAM_SWITCH_COND, static_cast(RT_LESS)), - DOMI_LOGE("set ATTR_NAME_STREAM_SWITCH_COND failed"); return nullptr); + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_STREAM_SWITCH_COND.c_str(), + stream_switch->GetName().c_str(), stream_switch->GetType().c_str()); + DOMI_LOGE("set ATTR_NAME_STREAM_SWITCH_COND failed"); return nullptr); return stream_switch; } NodePtr FlowCtrlPass::AddVariableNode(ComputeGraphPtr &compute_graph, const string &name) { - GE_IF_BOOL_EXEC(compute_graph == nullptr, DOMI_LOGE("compute_graph is nullptr"); return nullptr); + GE_IF_BOOL_EXEC(compute_graph == nullptr, + REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, check invalid"); + DOMI_LOGE("compute_graph is nullptr"); + return nullptr); NodePtr exist_node = compute_graph->FindNode(name); if (exist_node != nullptr) { GELOGD("Node %s already exist, no need add.", name.c_str()); @@ -193,10 +221,14 
@@ NodePtr FlowCtrlPass::AddVariableNode(ComputeGraphPtr &compute_graph, const stri // fetch and set tensor desc GeTensorDesc tensor_desc; if (ge::VarManager::Instance(compute_graph->GetSessionID()) == nullptr) { + REPORT_INNER_ERROR("E19999", "Get VarManager by session_id:%lu failed", + compute_graph->GetSessionID()); return nullptr; } Status ret = ge::VarManager::Instance(compute_graph->GetSessionID())->GetCurVarDesc(name, tensor_desc); if (ret != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Get var tensor from VarManager by name:%s failed, session_id:%lu", + name.c_str(), compute_graph->GetSessionID()); GELOGE(FAILED, "Get var desc fail, name:%s", name.c_str()); return nullptr; } @@ -238,6 +270,9 @@ Status FlowCtrlPass::AddGlobalStepVariableNode(ComputeGraphPtr &compute_graph) { // add ctrl edges graphStatus add_ret = GraphUtils::AddEdge(global_step->GetOutControlAnchor(), output_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + global_step->GetName().c_str(), global_step->GetType().c_str(), + output_node->GetName().c_str(), output_node->GetType().c_str()); GELOGE(FAILED, "Add global_step to netoutput edge failed, add_ret=%u.", add_ret); return FAILED; } @@ -249,6 +284,8 @@ NodePtr FlowCtrlPass::InsertAssignOp(ge::ComputeGraphPtr &compute_graph, const s const string &node_name, const NodePtr &ref_node, const NodePtr &value_node) { GE_IF_BOOL_EXEC(ref_node == nullptr || value_node == nullptr || ref_node->GetOpDesc() == nullptr || value_node->GetOpDesc() == nullptr, + REPORT_INNER_ERROR("E19999", "Param ref_node or value_node or their op_desc has nullptr, " + "check invalid"); GELOGE(FAILED, "ref node or value node is null"); return nullptr); GeTensorDesc ref_tensor_desc = ref_node->GetOpDesc()->GetOutputDesc(0); @@ -263,12 +300,18 @@ NodePtr FlowCtrlPass::InsertAssignOp(ge::ComputeGraphPtr &compute_graph, const s // assign node input 0 = ref_node graphStatus 
add_ret = GraphUtils::AddEdge(ref_node->GetOutDataAnchor(0), assign_node->GetInDataAnchor(0)); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:0) failed", + ref_node->GetName().c_str(), ref_node->GetType().c_str(), + assign_node->GetName().c_str(), assign_node->GetType().c_str()); GELOGE(FAILED, "Add ref_node to %s edge failed, add_ret=%u.", node_name.c_str(), add_ret); return nullptr; } // assign input 1 = value_node add_ret = GraphUtils::AddEdge(value_node->GetOutDataAnchor(0), assign_node->GetInDataAnchor(1)); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:1) failed", + value_node->GetName().c_str(), value_node->GetType().c_str(), + assign_node->GetName().c_str(), assign_node->GetType().c_str()); GELOGE(FAILED, "Add value_node to %s edge failed, add_ret=%u.", node_name.c_str(), add_ret); return nullptr; } @@ -298,10 +341,23 @@ Status FlowCtrlPass::CreateIterCtrlTrueBranch(ComputeGraphPtr &compute_graph, co string active_name = switch_node->GetName() + "_StreamActive"; // add attr for stream assign model to break branch. 
- GE_CHK_STATUS_RET(SetStreamLabel(assign_add_node_in_fpbp_loop_, active_name), "set stream label failed"); + auto status = SetStreamLabel(assign_add_node_in_fpbp_loop_, active_name); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + active_name.c_str(), assign_add_node_in_fpbp_loop_->GetName().c_str(), + assign_add_node_in_fpbp_loop_->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } // used for stream assign to find true branch - GE_CHK_STATUS_RET(SetActiveLabelList(switch_node, { active_name }), "set active label list failed"); + status = SetActiveLabelList(switch_node, { active_name }); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set active label list:%s to op:%s(%s) failed", + active_name.c_str(), switch_node->GetName().c_str(), switch_node->GetType().c_str()); + GELOGE(status, "set active_label_list failed."); + return status; + } // 2. Insert active node NodePtr active_node = InsertOp(compute_graph, STREAMACTIVE, active_name, {}, {}); @@ -309,14 +365,28 @@ Status FlowCtrlPass::CreateIterCtrlTrueBranch(ComputeGraphPtr &compute_graph, co GELOGE(FAILED, "Insert stream active node:%s for IterCtrlTrueStream failed.", active_name.c_str()); return FAILED; } - GE_CHK_STATUS_RET(SetStreamLabel(active_node, active_name), "set stream label failed"); + status = SetStreamLabel(active_node, active_name); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + active_name.c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } GE_IF_BOOL_EXEC(!AttrUtils::SetBool(active_node->GetOpDesc(), ATTR_NAME_IS_LOOP_ACTIVE, true), - DOMI_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); return FAILED); + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_IS_LOOP_ACTIVE.c_str(), + active_node->GetName().c_str(), 
active_node->GetType().c_str()); + DOMI_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); + return FAILED); // add ctrl edges graphStatus add_ret = GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), assign_add_node_in_fpbp_loop_->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), + assign_add_node_in_fpbp_loop_->GetName().c_str(), + assign_add_node_in_fpbp_loop_->GetType().c_str()); GELOGE(FAILED, "Add switch_node to assign_add_node ctrl edge failed, add_ret=%u.", add_ret); return FAILED; } @@ -324,6 +394,10 @@ Status FlowCtrlPass::CreateIterCtrlTrueBranch(ComputeGraphPtr &compute_graph, co add_ret = GraphUtils::AddEdge(assign_add_node_in_fpbp_loop_->GetOutControlAnchor(), active_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + assign_add_node_in_fpbp_loop_->GetName().c_str(), + assign_add_node_in_fpbp_loop_->GetType().c_str(), + active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(FAILED, "Add assign_add_node to active_node ctrl edge failed, add_ret=%u.", add_ret); return FAILED; } @@ -351,10 +425,19 @@ Status FlowCtrlPass::CreateIterCtrlFalseBranch(ComputeGraphPtr &compute_graph, c return FAILED; } - GE_CHK_STATUS_RET(SetStreamLabel(assign_node, switch_node->GetName()), "set stream label failed."); + auto status = SetStreamLabel(assign_node, switch_node->GetName()); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + switch_node->GetName().c_str(), assign_node->GetName().c_str(), assign_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } graphStatus add_ret = GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), assign_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + 
REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), + assign_node->GetName().c_str(), assign_node->GetType().c_str()); GELOGE(FAILED, "Add switch_node to assign_node ctrl edge failed, add_ret=%u.", add_ret); return FAILED; } @@ -368,15 +451,30 @@ Status FlowCtrlPass::CreateIterCtrlFalseBranch(ComputeGraphPtr &compute_graph, c GELOGE(FAILED, "Insert stream active node:%s for IterCtrlTrueStream failed.", active_name.c_str()); return FAILED; } - GE_CHK_STATUS_RET(SetStreamLabel(active_node, switch_node->GetName()), "set stream label failed"); + status = SetStreamLabel(active_node, switch_node->GetName()); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + switch_node->GetName().c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } GE_CHK_STATUS_RET(SetSwitchBranchNodeLabel(active_node, switch_node->GetName()), "set switch branch node label failed."); string model_exit_name = switch_node->GetName() + "_ModelExit"; - GE_CHK_STATUS_RET(SetActiveLabelList(active_node, { model_exit_name }), "set active label list failed"); + status = SetActiveLabelList(active_node, { model_exit_name }); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set active label list:%s to op:%s(%s) failed", + model_exit_name.c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); + GELOGE(status, "set active_label_list failed."); + return status; + } add_ret = GraphUtils::AddEdge(assign_node->GetOutControlAnchor(), active_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + assign_node->GetName().c_str(), assign_node->GetType().c_str(), + active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(FAILED, "Add assign_node 
to active_node ctrl edge failed, add_ret=%u.", add_ret); return FAILED; } @@ -387,10 +485,20 @@ Status FlowCtrlPass::CreateIterCtrlFalseBranch(ComputeGraphPtr &compute_graph, c GELOGE(FAILED, "Insert model_exit node:%s for IterCtrlTrueStream failed.", model_exit_name.c_str()); return FAILED; } - GE_CHK_STATUS_RET(SetStreamLabel(model_exit_node, model_exit_name), "set stream label failed"); + status = SetStreamLabel(model_exit_node, model_exit_name); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + model_exit_name.c_str(), model_exit_node->GetName().c_str(), + model_exit_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } add_ret = GraphUtils::AddEdge(active_node->GetOutControlAnchor(), model_exit_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + active_node->GetName().c_str(), active_node->GetType().c_str(), + model_exit_node->GetName().c_str(), model_exit_node->GetType().c_str()); GELOGE(FAILED, "Add active_node to model_exit_node ctrl edge failed, add_ret=%u.", add_ret); return FAILED; } @@ -433,10 +541,19 @@ Status FlowCtrlPass::AddFpBpIteratorCtrl(ComputeGraphPtr &compute_graph, NodePtr GELOGE(FAILED, "InsertStreamSwitchOp:%s failed.", switch_name.c_str()); return FAILED; } - GE_CHK_STATUS_RET(SetStreamLabel(switch_node, switch_name), "set stream label failed"); + auto status = SetStreamLabel(switch_node, switch_name); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream label:%s to op:%s(%s) failed", + switch_name.c_str(), switch_node->GetName().c_str(), switch_node->GetType().c_str()); + GELOGE(status, "set stream label failed."); + return status; + } graphStatus add_ret = GraphUtils::AddEdge(pre_node->GetOutControlAnchor(), switch_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge
between op:%s(%s) and op:%s(%s) failed", + pre_node->GetName().c_str(), pre_node->GetType().c_str(), + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(FAILED, "Add pre node:%s to switch_node:%s ctrl edge failed, ret = %u.", pre_node_name.c_str(), switch_name.c_str(), add_ret); return FAILED; @@ -477,9 +594,14 @@ Status FlowCtrlPass::AddSpecialNodeIteratorCtrl(ComputeGraphPtr &compute_graph, * itersPerLoop loopCond */ GE_IF_BOOL_EXEC(loop_after_node == nullptr || compute_graph == nullptr, - DOMI_LOGE("loop after node or compute graph is null."); return FAILED); + REPORT_INNER_ERROR("E19999", "Param loop_after_node or compute_graph is nullptr, " + "check invalid"); + DOMI_LOGE("loop after node or compute graph is null."); + return FAILED); InDataAnchorPtr in_anchor = loop_after_node->GetInDataAnchor(0); if (in_anchor == nullptr || in_anchor->GetPeerOutAnchor() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param loop_after_node:%s(%s) no in data node, check invalid", + loop_after_node->GetName().c_str(), loop_after_node->GetType().c_str()); GELOGE(FAILED, "Find %s in data anchor failed.", loop_after_node->GetName().c_str()); return FAILED; } @@ -488,17 +610,26 @@ Status FlowCtrlPass::AddSpecialNodeIteratorCtrl(ComputeGraphPtr &compute_graph, // 1. 
Get variables NodePtr loop_cond_node = compute_graph->FindNode(NODE_NAME_FLOWCTRL_LOOP_COND); if (loop_cond_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s not found in graph:%s, check invalid", + NODE_NAME_FLOWCTRL_LOOP_COND.c_str(), compute_graph->GetName().c_str()); GELOGE(FAILED, "Find node :%s failed.", NODE_NAME_FLOWCTRL_LOOP_COND.c_str()); return FAILED; } NodePtr iter_per_loop_node = compute_graph->FindNode(NODE_NAME_FLOWCTRL_LOOP_PER_ITER); if (iter_per_loop_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s not found in graph:%s, check invalid", + NODE_NAME_FLOWCTRL_LOOP_PER_ITER.c_str(), compute_graph->GetName().c_str()); GELOGE(FAILED, "Find node :%s failed.", NODE_NAME_FLOWCTRL_LOOP_PER_ITER.c_str()); return FAILED; } // 2. Add StreamSwitch and edges to switch_node. - GE_IF_BOOL_EXEC(loop_pre_node == nullptr, DOMI_LOGE("loop pre node is null."); return FAILED); + GE_IF_BOOL_EXEC(loop_pre_node == nullptr, + REPORT_INNER_ERROR("E19999", + "Param loop_pre_node is nullptr, " + "check invalid"); + DOMI_LOGE("loop pre node is null."); + return FAILED); string switch_name = loop_pre_node->GetName() + "_" + NODE_NAME_STREAM_SWITCH; NodePtr switch_node = InsertStreamSwitchOp(compute_graph, switch_name, loop_cond_node, iter_per_loop_node); if (switch_node == nullptr) { @@ -506,16 +637,28 @@ Status FlowCtrlPass::AddSpecialNodeIteratorCtrl(ComputeGraphPtr &compute_graph, return FAILED; } - GE_CHK_STATUS_RET(SetStreamLabel(switch_node, switch_name), "set stream label failed."); + auto status = SetStreamLabel(switch_node, switch_name); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream label:%s to op:%s(%s) failed", + switch_name.c_str(), switch_node->GetName().c_str(), switch_node->GetType().c_str()); + GELOGE(status, "set stream label failed."); + return status; + } graphStatus add_ret =
GraphUtils::AddEdge(loop_pre_node->GetOutControlAnchor(), switch_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + loop_pre_node->GetName().c_str(), loop_pre_node->GetType().c_str(), + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(FAILED, "Add loop_pre_node:%s to switch_node:%s ctrl edge failed, ret = %u.", loop_pre_node->GetName().c_str(), switch_name.c_str(), add_ret); return FAILED; } add_ret = GraphUtils::AddEdge(loop_after_node->GetOutControlAnchor(), switch_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + loop_after_node->GetName().c_str(), loop_after_node->GetType().c_str(), + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(FAILED, "Add node:%s to switch_node:%s ctrl edge failed, ret = %u.", loop_after_node->GetName().c_str(), switch_name.c_str(), add_ret); return FAILED; @@ -529,22 +672,47 @@ Status FlowCtrlPass::AddSpecialNodeIteratorCtrl(ComputeGraphPtr &compute_graph, return FAILED; } - GE_CHK_STATUS_RET(SetStreamLabel(active_node, active_name), "set stream label failed."); + status = SetStreamLabel(active_node, active_name); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream label:%s to op:%s(%s) failed", + active_name.c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); + GELOGE(status, "set stream label failed."); + return status; + } GE_IF_BOOL_EXEC(!AttrUtils::SetBool(active_node->GetOpDesc(), ATTR_NAME_IS_LOOP_ACTIVE, true), - DOMI_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); return FAILED); + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_IS_LOOP_ACTIVE.c_str(), + active_node->GetName().c_str(), active_node->GetType().c_str()); + DOMI_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); + return FAILED); add_ret = 
GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), active_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), + active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(FAILED, "Add switch_node:%s to active_node:%s ctrl edge failed, ret = %u.", switch_name.c_str(), active_name.c_str(), add_ret); return FAILED; } // used for stream assign to find true branch - GE_CHK_STATUS_RET(SetActiveLabelList(switch_node, { active_name }), "set active label list failed."); + status = SetActiveLabelList(switch_node, { active_name }); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set active label list:%s to op:%s(%s) failed", + active_name.c_str(), switch_node->GetName().c_str(), switch_node->GetType().c_str()); + GELOGE(status, "set active_label_list failed."); + return status; + } // used for stream assign to find active stream - GE_CHK_STATUS_RET(SetActiveLabelList(active_node, { loop_pre_node->GetName() }), "set active label list failed"); + status = SetActiveLabelList(active_node, { loop_pre_node->GetName() }); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set active label list:%s to op:%s(%s) failed", + loop_pre_node->GetName().c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); + GELOGE(status, "set active_label_list failed."); + return status; + } active_nodes_in_iter_loop_.push_back(active_node); return SUCCESS; } diff --git a/ge/graph/passes/folding_pass.cc b/ge/graph/passes/folding_pass.cc index 227a0f61..d4558ac7 100755 --- a/ge/graph/passes/folding_pass.cc +++ b/ge/graph/passes/folding_pass.cc @@ -35,6 +35,7 @@ namespace ge { namespace folding_pass { shared_ptr GetKernelByType(const NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); 
return nullptr; } @@ -42,6 +43,9 @@ shared_ptr GetKernelByType(const NodePtr &node) { string type = node->GetType(); if (type == FRAMEWORKOP) { if (!ge::AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", + ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE.c_str(), + node->GetName().c_str(), node->GetType().c_str()); return nullptr; } } @@ -79,6 +83,7 @@ IndexsToAnchors GetIndexAndPeerInDataAnchors(NodePtr &node) { NodePtr AddConstNodeToGraph(GeTensorPtr &tensor, ComputeGraphPtr &graph) { auto const_desc = OpDescUtils::CreateConstOp(tensor); if (const_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Create Const op failed"); GELOGE(OUT_OF_MEMORY, "Failed to get const desc from tensor"); return nullptr; } @@ -90,12 +95,14 @@ NodePtr AddConstNodeToGraph(GeTensorPtr &tensor, ComputeGraphPtr &graph) { NodePtr AddIdentityNodeToGraph(const std::string &name, const GeTensorDesc &tensor, ComputeGraphPtr &graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Compute graph ptr is null in creating identity node."); return nullptr; } OpDescPtr desc = MakeShared("", ""); if (desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(MEMALLOC_FAILED, "Failed to create op desc."); return nullptr; } @@ -105,6 +112,8 @@ NodePtr AddIdentityNodeToGraph(const std::string &name, const GeTensorDesc &tens auto ret = desc->AddInputDesc(tensor); auto ret2 = desc->AddOutputDesc(tensor); if ((ret != GRAPH_SUCCESS) || (ret2 != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Add input or output desc to op:%s(%s) failed", + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add input/output desc in creating Identity."); return nullptr; } @@ -137,6 +146,8 @@ Status FoldingPass::Folding(NodePtr &node, vector &outputs) { auto in_data_nodes = node->GetInDataNodes(); std::unordered_set 
in_data_nodes_set(in_data_nodes.begin(), in_data_nodes.end()); if (IsolateAndDeleteNode(node, {}) != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Isolate and delete node:%s(%s) faild", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to isolate and delete node %s, type %s.", node->GetName().c_str(), node->GetType().c_str()); return INTERNAL_ERROR; @@ -149,6 +160,8 @@ Status FoldingPass::Folding(NodePtr &node, vector &outputs) { continue; } if (IsolateAndDeleteNode(pre_node, {}) != SUCCESS) { + REPORT_INNER_ERROR("E19999", "Isolate and delete node:%s(%s) faild", + pre_node->GetName().c_str(), pre_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to isolate and delete in data node %s, type %s.", pre_node->GetName().c_str(), pre_node->GetType().c_str()); return INTERNAL_ERROR; @@ -177,6 +190,10 @@ Status FoldingPass::DealWithInNodes(NodePtr &node) { GELOGI("The in_node name is %s, and node type is %s.", in_node->GetName().c_str(), in_node->GetType().c_str()); auto ret = in_node_anchor->Unlink(in_data_anchor); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d unlink from op:%s(%s) in index:%d failed", + in_node->GetName().c_str(), in_node->GetType().c_str(), in_node_anchor->GetIdx(), + node->GetName().c_str(), node->GetType().c_str(), in_data_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to unlink anchor between const node %s to constant-folding-node %s, type %s.", in_node->GetName().c_str(), node->GetName().c_str(), node->GetType().c_str()); return INTERNAL_ERROR; @@ -192,6 +209,9 @@ Status FoldingPass::DealWithInNodes(NodePtr &node) { } ret = GraphUtils::AddEdge(in_node_anchor, identity->GetInDataAnchor(0)); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(inde:0) failed", + in_node->GetName().c_str(), in_node->GetType().c_str(), in_node_anchor->GetIdx(), + identity->GetName().c_str(), identity->GetType().c_str()); 
GELOGE(INTERNAL_ERROR, "Failed to add edge, from node %s to node %s.", in_node->GetName().c_str(), identity->GetName().c_str()); return INTERNAL_ERROR; @@ -199,6 +219,9 @@ Status FoldingPass::DealWithInNodes(NodePtr &node) { GELOGI("Create new identity node success."); ret = GraphUtils::AddEdge(identity->GetOutControlAnchor(), node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + identity->GetName().c_str(), identity->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add edge, from node %s to node %s.", in_node->GetName().c_str(), node->GetName().c_str()); return INTERNAL_ERROR; @@ -212,6 +235,7 @@ Status FoldingPass::DealWithInNodes(NodePtr &node) { Status FoldingPass::AddConstNode(NodePtr &node, IndexsToAnchors indexes_to_anchors, std::vector &v_weight) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "node is null"); return FAILED; } @@ -219,6 +243,8 @@ Status FoldingPass::AddConstNode(NodePtr &node, IndexsToAnchors indexes_to_ancho for (auto &index_to_anchors : indexes_to_anchors) { auto index = static_cast(index_to_anchors.first); if (index >= v_weight.size()) { + REPORT_INNER_ERROR("E19999", "Index:%lu in param index_to_anchors >= param v_weight.size:%zu, " + "check invalid", index, v_weight.size()); GELOGE(INTERNAL_ERROR, "Failed to constant fold on node %s type %s, " "the out nodes num %lu calculated is less than the node out anchor index %zu", @@ -227,6 +253,8 @@ Status FoldingPass::AddConstNode(NodePtr &node, IndexsToAnchors indexes_to_ancho } GeTensorPtr weight = v_weight[index]; if (weight == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%lu in param v_weight is nullptr check invalid", + index); GELOGE(INTERNAL_ERROR, "Failed to constant fold on node %s type %s, the %lust node calculated is null", node->GetName().c_str(), 
node->GetType().c_str(), index); return INTERNAL_ERROR; @@ -243,6 +271,8 @@ Status FoldingPass::AddConstNode(NodePtr &node, IndexsToAnchors indexes_to_ancho // add new const to re-pass node for (auto &in_anchor : index_to_anchors.second) { if (in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%lu in param index_to_anchors has nullptr member in_anchor, " + "check invalid", index); GELOGE(INTERNAL_ERROR, "In anchor is nullptr."); return INTERNAL_ERROR; } @@ -254,6 +284,9 @@ Status FoldingPass::AddConstNode(NodePtr &node, IndexsToAnchors indexes_to_ancho } Status ret = GraphUtils::AddEdge(node->GetOutControlAnchor(), const_node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + const_node->GetName().c_str(), const_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add control edge, from node %s to const node %s.", node->GetName().c_str(), const_node->GetName().c_str()); return INTERNAL_ERROR; @@ -263,6 +296,9 @@ Status FoldingPass::AddConstNode(NodePtr &node, IndexsToAnchors indexes_to_ancho if (AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) { GE_CHECK_NOTNULL(const_node->GetOpDesc()); if (!AttrUtils::SetStr(const_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_STREAM_LABEL.c_str(), + const_node->GetName().c_str(), const_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to set stream label on dynamic const node %s, with stream label:%s.", const_node->GetName().c_str(), stream_label.c_str()); return INTERNAL_ERROR; @@ -279,6 +315,8 @@ Status FoldingPass::RemoveNodeKeepingCtrlEdges(NodePtr &node) { GE_IF_BOOL_EXEC(node == nullptr, GELOGE(PARAM_INVALID, "node is null"); return PARAM_INVALID); auto ret = GraphUtils::IsolateNode(node, {}); if (ret != GRAPH_SUCCESS) { + 
REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) in graph failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to isolate the folding-node %s type %s", node->GetName().c_str(), node->GetType().c_str()); return INTERNAL_ERROR; @@ -287,6 +325,8 @@ Status FoldingPass::RemoveNodeKeepingCtrlEdges(NodePtr &node) { auto graph = node->GetOwnerComputeGraph(); ret = GraphUtils::RemoveNodeWithoutRelink(graph, node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to remove node %s from graph", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -297,6 +337,7 @@ Status FoldingPass::RemoveNodeKeepingCtrlEdges(NodePtr &node) { Status FoldingPass::ConnectNodeToInAnchor(InDataAnchorPtr &in_anchor, NodePtr &node, int node_index) { // the origin edge must be removed before add if (in_anchor == nullptr || node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node or in_anchor is nullptr, check invalid"); GELOGE(PARAM_INVALID, "in anchor or node is null"); return PARAM_INVALID; } @@ -309,6 +350,8 @@ Status FoldingPass::ConnectNodeToInAnchor(InDataAnchorPtr &in_anchor, NodePtr &n auto new_out_anchor = node->GetOutDataAnchor(node_index); if (new_out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param out index:%d data anchor of node:%s(%s) is nullptr, check invalid", + node_index, node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add node to in anchor," " the index %d for node %s, type %s is invalid", @@ -316,6 +359,10 @@ Status FoldingPass::ConnectNodeToInAnchor(InDataAnchorPtr &in_anchor, NodePtr &n return INTERNAL_ERROR; } if (GraphUtils::AddEdge(new_out_anchor, in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + node->GetName().c_str(), 
node->GetType().c_str(), node_index, + in_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetOwnerNode()->GetType().c_str(), + in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to add edge between anchors," " new node %s, type %s", diff --git a/ge/graph/passes/for_pass.cc b/ge/graph/passes/for_pass.cc index 3b7a0886..0b6377dc 100644 --- a/ge/graph/passes/for_pass.cc +++ b/ge/graph/passes/for_pass.cc @@ -64,12 +64,16 @@ Status ForPass::Run(NodePtr &node) { ComputeGraphPtr cond_graph = BuildCondGraph(while_info); if ((cond_graph == nullptr) || (root_graph->AddSubgraph(cond_graph) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Build cond graph failed or add cond subgraph to root_graph:%s failed", + root_graph->GetName().c_str()); GELOGE(FAILED, "Add while_cond_graph failed, node:%s.", node->GetName().c_str()); return FAILED; } ComputeGraphPtr body_graph = BuildBodyGraph(while_info); if ((body_graph == nullptr) || (root_graph->AddSubgraph(body_graph) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Build body graph failed or add body subgraph to root_graph:%s failed", + root_graph->GetName().c_str()); GELOGE(FAILED, "Add while_body_graph failed, node:%s.", node->GetName().c_str()); return FAILED; } @@ -99,6 +103,10 @@ Status ForPass::BuildForInfo(const ComputeGraphPtr &root_graph, const NodePtr &n OutDataAnchorPtr limit = FindInputWithIndex(node, FOR_LIMIT_INPUT); OutDataAnchorPtr delta = FindInputWithIndex(node, FOR_DELTA_INPUT); if ((start == nullptr) || (limit == nullptr) || (delta == nullptr)) { + REPORT_INNER_ERROR("E19999", "FOR_START_INPUT index:%d or FOR_LIMIT_INPUT index:%d or FOR_DELTA_INPUT index:%d " + "in data anchor of op:%s(%s) lack, check invalid", + FOR_START_INPUT, FOR_LIMIT_INPUT, FOR_DELTA_INPUT, + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "BuildForInfo for %s failed: start/limit/delta is NULL.", node->GetName().c_str()); return FAILED; } @@ -118,11 +126,15 @@ Status ForPass::BuildForInfo(const 
ComputeGraphPtr &root_graph, const NodePtr &n // For node has and only has one sub_graph std::string for_body_name = op_desc->GetSubgraphInstanceName(0); if (for_body_name.empty()) { + REPORT_INNER_ERROR("E19999", "Get subgraph name from op:%s(%s) by index 0 failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "BuildForInfo for %s failed: sub_graph_name is empty.", node->GetName().c_str()); return FAILED; } ComputeGraphPtr for_body = root_graph->GetSubgraph(for_body_name); if (for_body == nullptr) { + REPORT_INNER_ERROR("E19999", "Get subgraph from graph:%s by name:%s failed", + root_graph->GetName().c_str(), for_body_name.c_str()); GELOGE(FAILED, "BuildForInfo for %s failed: for_body_graph is NULL.", node->GetName().c_str()); return FAILED; } @@ -222,6 +234,8 @@ Status ForPass::TranWhileInfo(const ComputeGraphPtr &graph, const ForInfo &for_i std::string i_name = for_name + "_i"; NodePtr i_node = graph->AddNode(CreateConstDesc(i_name, 0)); if (i_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(Const) to graph:%s failed", + i_name.c_str(), graph->GetName().c_str()); GELOGE(FAILED, "TranWhileInfo failed: create i_node failed."); return FAILED; } @@ -232,6 +246,9 @@ Status ForPass::TranWhileInfo(const ComputeGraphPtr &graph, const ForInfo &for_i // Const node has and only has one output, Identity node has and only has one input if ((identity_node == nullptr) || (GraphUtils::AddEdge(i_node->GetOutDataAnchor(0), identity_node->GetInDataAnchor(0)) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:0) failed", + i_node->GetName().c_str(), i_node->GetType().c_str(), + identity_node->GetName().c_str(), identity_node->GetType().c_str()); GELOGE(FAILED, "TranWhileInfo failed: Add data-edge %s:0->%s:0 failed.", i_name.c_str(), identity_name.c_str()); return FAILED; } @@ -240,6 +257,8 @@ Status ForPass::TranWhileInfo(const ComputeGraphPtr &graph, const ForInfo &for_i // Identity 
node has and only has one output OutDataAnchorPtr i_input = identity_node->GetOutDataAnchor(0); if (i_input == nullptr) { + REPORT_INNER_ERROR("E19999", "Out data anchor index:0 in op:%s(%s) is nullptr, check invalid", + identity_node->GetName().c_str(), identity_node->GetType().c_str()); GELOGE(FAILED, "TranWhileInfo failed: i_input is NULL."); return FAILED; } @@ -272,6 +291,7 @@ Status ForPass::TranWhileInfo(const ComputeGraphPtr &graph, const ForInfo &for_i OpDescPtr ForPass::CreateConstDesc(const std::string &name, int32_t value) { OpDescPtr const_op_desc = MakeShared(name, CONSTANT); if (const_op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, const:%s.", name.c_str()); return nullptr; } @@ -279,16 +299,21 @@ OpDescPtr ForPass::CreateConstDesc(const std::string &name, int32_t value) { GeTensorDesc data_desc(GeShape(), FORMAT_ND, DT_INT32); GeTensorPtr const_value = MakeShared(data_desc, reinterpret_cast(&value), sizeof(int32_t)); if (const_value == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "Create tensor failed, const:%s.", name.c_str()); return nullptr; } if (!AttrUtils::SetTensor(const_op_desc, ATTR_NAME_WEIGHTS, const_value)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + const_op_desc->GetName().c_str(), const_op_desc->GetType().c_str()); GELOGE(FAILED, "Set ATTR_NAME_WEIGHTS failed, const:%s.", name.c_str()); return nullptr; } if (const_op_desc->AddOutputDesc("y", data_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ouput desc to op:%s(%s) failed, name:y", + const_op_desc->GetName().c_str(), const_op_desc->GetType().c_str()); GELOGE(FAILED, "Add output desc failed, const:%s.", name.c_str()); return nullptr; } @@ -334,6 +359,7 @@ Status ForPass::CreateLoopInput(const ComputeGraphPtr &graph, const ForInfo &for graphStatus error_code = GRAPH_SUCCESS; std::string error_msg; if 
((graph_builder.Build(error_code, error_msg) == nullptr) || (error_code != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Add loop input node to graph:%s failed", graph->GetName().c_str()); GELOGE(FAILED, "Create loop_count node failed: error_code:%u, error_msg:%s.", error_code, error_msg.c_str()); return FAILED; } @@ -346,6 +372,7 @@ Status ForPass::CreateLoopInput(const ComputeGraphPtr &graph, const ForInfo &for NodePtr abs_delta_node = graph_builder.GetNode(abs_name_0); NodePtr loop_count_node = graph_builder.GetNode(abs_name_1); if ((abs_delta_node == nullptr) || (loop_count_node == nullptr)) { + REPORT_CALL_ERROR("E19999", "Add loop input node to graph:%s failed", graph->GetName().c_str()); GELOGE(FAILED, "Create loop node failed: node is NULL."); return FAILED; } @@ -431,11 +458,15 @@ Status ForPass::InsertWhileNode(const ComputeGraphPtr &graph, const std::string OpDescBuilder op_desc_builder(name, WHILE); OpDescPtr op_desc = op_desc_builder.AddDynamicInput("input", arg_num).AddDynamicOutput("output", arg_num).Build(); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Add dynamic input or output to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Create while op_desc failed, name:%s.", name.c_str()); return FAILED; } NodePtr while_node = graph->AddNode(op_desc); if (while_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "Create while node failed, name:%s.", name.c_str()); return FAILED; } @@ -553,6 +584,7 @@ ComputeGraphPtr ForPass::BuildCondGraph(WhileInfo &while_info) { std::string error_msg; ComputeGraphPtr cond_graph = graph_builder.Build(error_code, error_msg); if (cond_graph == nullptr) { + REPORT_CALL_ERROR("E19999", "Build graph:%s failed", cond_name.c_str()); GELOGE(FAILED, "Build cond_graph failed: error_code:%u, error_msg:%s.", error_code, error_msg.c_str()); 
return nullptr; } @@ -667,6 +699,8 @@ OpDescPtr ForPass::CreateSubgraphOpDesc(const std::string &name, uint32_t input_ OpDescPtr op_desc = op_desc_builder.Build(); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Build op_desc:%s(%s) failed", + name.c_str(), PARTITIONEDCALL); GELOGE(FAILED, "Create op_desc for subgraph node failed, name:%s.", name.c_str()); return nullptr; } diff --git a/ge/graph/passes/fuse_data_nodes_with_common_input_pass.cc b/ge/graph/passes/fuse_data_nodes_with_common_input_pass.cc index ab8fc39b..1f062813 100644 --- a/ge/graph/passes/fuse_data_nodes_with_common_input_pass.cc +++ b/ge/graph/passes/fuse_data_nodes_with_common_input_pass.cc @@ -34,6 +34,7 @@ using std::string; namespace ge { Status FuseDataNodesWithCommonInputPass::Run(ge::ComputeGraphPtr graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "Compute graph is null."); return GE_GRAPH_PARAM_NULLPTR; } @@ -101,12 +102,20 @@ Status FuseDataNodesWithCommonInputPass::FuseDataNodes( first_node->GetName().c_str(), subgraph->GetName().c_str()); // the data node which can be fused has none input(both data and control in) if (GraphUtils::MoveOutCtrlEdges(node, first_node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move out control edge from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + first_node->GetName().c_str(), first_node->GetType().c_str()); return FAILED; } if (GraphUtils::ReplaceNodeDataAnchors(first_node, node, {}, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Replace data edge from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + first_node->GetName().c_str(), first_node->GetType().c_str()); return FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(subgraph, node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), 
node->GetType().c_str(), subgraph->GetName().c_str()); GELOGE(FAILED, "[%s] RemoveNodeWithoutRelink failed.", node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/get_original_format_pass.cc b/ge/graph/passes/get_original_format_pass.cc index e743f190..4b78ae49 100644 --- a/ge/graph/passes/get_original_format_pass.cc +++ b/ge/graph/passes/get_original_format_pass.cc @@ -51,6 +51,9 @@ Status GetOriginalFormatPass::SetOriginalFormat(const ge::ComputeGraphPtr &graph GE_CHECK_NOTNULL(node_ptr); GE_IF_BOOL_EXEC(!AttrUtils::SetInt(node_ptr->GetOpDesc(), ATTR_NAME_INFERRED_FORMAT, DOMI_TENSOR_RESERVED), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_INFERRED_FORMAT.c_str(), + node_ptr->GetName().c_str(), node_ptr->GetType().c_str()); GELOGE(FAILED, "set ATTR_NAME_INFERRED_FORMAT failed"); return FAILED); } @@ -64,9 +67,15 @@ Status GetOriginalFormatPass::SetOriginalFormat(const ge::ComputeGraphPtr &graph GELOGI("Data node: %s,format :%d", node_ptr->GetName().c_str(), GetLocalOmgContext().format); ori_format = static_cast(GetLocalOmgContext().format); GE_IF_BOOL_EXEC(!AttrUtils::SetInt(desc_ptr, ATTR_NAME_FORMAT, ori_format), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_FORMAT.c_str(), + desc_ptr->GetName().c_str(), desc_ptr->GetType().c_str()); GELOGE(FAILED, "set ATTR_NAME_FORMAT failed"); return FAILED); GE_IF_BOOL_EXEC(!AttrUtils::SetInt(desc_ptr, ATTR_NAME_INFERRED_FORMAT, ori_format), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_INFERRED_FORMAT.c_str(), + desc_ptr->GetName().c_str(), desc_ptr->GetType().c_str()); GELOGE(FAILED, "set ATTR_NAME_INFERRED_FORMAT failed"); return FAILED); continue; @@ -130,6 +139,9 @@ Status GetOriginalFormatPass::SetOriginalFormat(const ge::ComputeGraphPtr &graph if (ignore_pred_format) { GE_IF_BOOL_EXEC(!AttrUtils::SetBool(tmp_op_ptr, ATTR_NAME_IGNORE_PRED_FORMAT, true), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) 
failed", + ATTR_NAME_IGNORE_PRED_FORMAT.c_str(), + tmp_op_ptr->GetName().c_str(), tmp_op_ptr->GetType().c_str()); GELOGE(FAILED, "remove edge failed"); return FAILED); } @@ -137,9 +149,15 @@ Status GetOriginalFormatPass::SetOriginalFormat(const ge::ComputeGraphPtr &graph // Do not reset ATTR_NAME_FORMAT if it is set in the OpParser. if (!tmp_op_ptr->HasAttr(ATTR_NAME_FORMAT)) { GE_IF_BOOL_EXEC(!AttrUtils::SetInt(tmp_op_ptr, ATTR_NAME_FORMAT, ori_format), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_FORMAT.c_str(), + tmp_op_ptr->GetName().c_str(), tmp_op_ptr->GetType().c_str()); GELOGE(FAILED, "set ATTR_NAME_FORMAT failed"); return FAILED); GE_IF_BOOL_EXEC(!AttrUtils::SetInt(tmp_op_ptr, ATTR_NAME_INFERRED_FORMAT, ori_format), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_INFERRED_FORMAT.c_str(), + tmp_op_ptr->GetName().c_str(), tmp_op_ptr->GetType().c_str()); GELOGE(FAILED, "set ATTR_NAME_INFERRED_FORMAT failed"); return FAILED); } else { @@ -147,6 +165,9 @@ Status GetOriginalFormatPass::SetOriginalFormat(const ge::ComputeGraphPtr &graph GE_RETURN_WITH_LOG_IF_FALSE(AttrUtils::GetInt(tmp_op_ptr, ATTR_NAME_FORMAT, existingFormat), "Get existing_format attr failed"); if (!AttrUtils::SetInt(tmp_op_ptr, ATTR_NAME_INFERRED_FORMAT, existingFormat)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_INFERRED_FORMAT.c_str(), + tmp_op_ptr->GetName().c_str(), tmp_op_ptr->GetType().c_str()); GELOGE(FAILED, "set ATTR_NAME_INFERRED_FORMAT failed"); return FAILED; } diff --git a/ge/graph/passes/global_step_insert_pass.cc b/ge/graph/passes/global_step_insert_pass.cc index 4431fc3d..9fc1d066 100755 --- a/ge/graph/passes/global_step_insert_pass.cc +++ b/ge/graph/passes/global_step_insert_pass.cc @@ -34,11 +34,16 @@ NodePtr GlobalStepInsertPass::InsertOp(ComputeGraphPtr &compute_graph, const std::vector &input_list, const std::vector &output_list) { OpDescPtr op_desc = MakeShared(node_name, 
node_type); - GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(FAILED,"Make OpDesc failed"); return nullptr); + GE_IF_BOOL_EXEC(op_desc == nullptr, + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); + GELOGE(FAILED,"Make OpDesc failed"); + return nullptr); for (auto &input_desc : input_list) { graphStatus graph_status = op_desc->AddInputDesc(input_desc); if (graph_status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Add node:%s intput desc failed, error=%u.", node_name.c_str(), graph_status); return nullptr; } @@ -47,6 +52,8 @@ NodePtr GlobalStepInsertPass::InsertOp(ComputeGraphPtr &compute_graph, for (auto &output_desc : output_list) { graphStatus graph_status = op_desc->AddOutputDesc(output_desc); if (graph_status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Add node:%s output desc failed, error=%u.", node_name.c_str(), graph_status); return nullptr; } @@ -55,6 +62,8 @@ NodePtr GlobalStepInsertPass::InsertOp(ComputeGraphPtr &compute_graph, GE_IF_BOOL_EXEC(compute_graph == nullptr, GELOGE(FAILED,"compute_graph is nullptr"); return nullptr); NodePtr node = compute_graph->AddNode(op_desc); GE_IF_BOOL_EXEC(node == nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), compute_graph->GetName().c_str()); GELOGE(FAILED, "add node failed, name:%s, type:%s.", node_name.c_str(), node_type.c_str()); return nullptr); @@ -93,6 +102,9 @@ Status GlobalStepInsertPass::Run(ComputeGraphPtr compute_graph) { // add ctrl edges graphStatus add_ret = GraphUtils::AddEdge(global_step->GetOutControlAnchor(), output_node->GetInControlAnchor()); if (add_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + 
global_step->GetName().c_str(), global_step->GetType().c_str(), + output_node->GetName().c_str(), output_node->GetType().c_str()); GELOGE(FAILED, "Add global_step to netoutput edge failed, add_ret=%u.", add_ret); return FAILED; } diff --git a/ge/graph/passes/guarantee_const_pass.cc b/ge/graph/passes/guarantee_const_pass.cc index a2d8f262..f6567fce 100644 --- a/ge/graph/passes/guarantee_const_pass.cc +++ b/ge/graph/passes/guarantee_const_pass.cc @@ -24,6 +24,7 @@ #include "graph/common/omg_util.h" #include "graph/utils/attr_utils.h" #include "graph/utils/graph_utils.h" +#include "graph/utils/type_utils.h" namespace ge { namespace { @@ -35,6 +36,8 @@ Status GuaranteeConstPass::Run(NodePtr &node) { string type; Status status_ret = GetOriginalType(node, type); if (status_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get original type for node:%s failed", + node->GetName().c_str()); GELOGE(status_ret, "GuaranteeConstPass get original type fail."); return status_ret; } @@ -42,6 +45,9 @@ Status GuaranteeConstPass::Run(NodePtr &node) { return SUCCESS; } if (node->GetOpDesc()->GetAllInputsDesc().size() != kGuaranteeConstInputsSize) { + REPORT_CALL_ERROR("E19999", "Num:%zu of input desc node:%s(%s) not equal to %u, " + "check invalid", node->GetOpDesc()->GetAllInputsDesc().size(), + node->GetName().c_str(), node->GetType().c_str(), kGuaranteeConstInputsSize); GELOGE(PARAM_INVALID, "input size error. Input size:%zu", node->GetOpDesc()->GetAllInputsDesc().size()); return PARAM_INVALID; } @@ -51,6 +57,11 @@ Status GuaranteeConstPass::Run(NodePtr &node) { // Input tensor cannot be a resource variable handle. 
const DataType &input_dtype = in_desc->GetDataType(); if (input_dtype == DT_RESOURCE) { + REPORT_CALL_ERROR("E19999", + "Data type:%s of op:%s(%s) input0 tensor not equal to %s, check invalid", + TypeUtils::DataTypeToSerialString(input_dtype).c_str(), + node->GetName().c_str(), node->GetType().c_str(), + TypeUtils::DataTypeToSerialString(DT_RESOURCE).c_str()); GELOGE(FAILED, "Input tensor cannot be a resource variable handle in [%s].", node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/hccl_continuous_memcpy_pass.cc b/ge/graph/passes/hccl_continuous_memcpy_pass.cc index 1931baf0..790661bc 100644 --- a/ge/graph/passes/hccl_continuous_memcpy_pass.cc +++ b/ge/graph/passes/hccl_continuous_memcpy_pass.cc @@ -36,6 +36,8 @@ Status HcclContinuousMemcpyPass::Run(ge::ComputeGraphPtr graph) { for (const auto &node : graph->GetDirectNode()) { auto op_desc = node->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Node with nullptr op_desc exist in Param graph:%s, check invalid", + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "node has no op_desc, node_name : %s.", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -76,6 +78,9 @@ Status HcclContinuousMemcpyPass::ContinuousInputProcess(const ComputeGraphPtr &g } auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); if (src_out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) input:%d anchor, peer anchor is nullptr, check invalid", + node->GetName().c_str(), node->GetType().c_str(), + hccl_in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -115,6 +120,9 @@ Status HcclContinuousMemcpyPass::P2pmemInputProcess(const ComputeGraphPtr &graph } auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); if (src_out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) input:%u anchor, peer anchor is nullptr, check invalid", + node->GetName().c_str(), 
node->GetType().c_str(), + index); GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -146,6 +154,7 @@ NodePtr HcclContinuousMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &grap NodePtr pre_node = out_data_anchor->GetOwnerNode(); OpDescPtr pre_op_desc = pre_node->GetOpDesc(); if (pre_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "OpDesc of pre node is invalid."); return nullptr; } @@ -154,6 +163,7 @@ NodePtr HcclContinuousMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &grap node_name = CheckDuplicateName(node_name); OpDescPtr op_desc = MakeShared(node_name.c_str(), IDENTITY); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(INTERNAL_ERROR, "Create Identity op: MakeShared op_desc fail."); return nullptr; } @@ -161,12 +171,16 @@ NodePtr HcclContinuousMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &grap graphStatus ret = op_desc->AddInputDesc("x", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Identity op: add input desc fail."); return nullptr; } ret = op_desc->AddOutputDesc("y", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Identity op: add output desc fail."); return nullptr; } @@ -175,6 +189,8 @@ NodePtr HcclContinuousMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &grap NodePtr memcpy_node = graph->AddNode(op_desc); if (memcpy_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), 
graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Insert Identity node fail."); return nullptr; } @@ -247,6 +263,13 @@ Status HcclContinuousMemcpyPass::InsertIdentityBeforeHccl(const ComputeGraphPtr Status ret1 = src_out_anchor->Unlink(hccl_in_anchor); if (ret1 != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d unlink from op:%s(%s) in index:%d failed", + src_out_anchor->GetOwnerNode()->GetName().c_str(), + src_out_anchor->GetOwnerNode()->GetType().c_str(), src_out_anchor->GetIdx(), + hccl_in_anchor->GetOwnerNode()->GetName().c_str(), + hccl_in_anchor->GetOwnerNode()->GetType().c_str(), + hccl_in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "The op %s Unlink anchor %s fail.", src_out_anchor->GetOwnerNode()->GetName().c_str(), hccl_in_anchor->GetOwnerNode()->GetName().c_str()); return FAILED; @@ -255,6 +278,13 @@ Status HcclContinuousMemcpyPass::InsertIdentityBeforeHccl(const ComputeGraphPtr GE_CHECK_NOTNULL(out_data_anchor_0); ret1 = out_data_anchor_0->LinkTo(hccl_in_anchor); if (ret1 != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%d failed", + out_data_anchor_0->GetOwnerNode()->GetName().c_str(), + out_data_anchor_0->GetOwnerNode()->GetType().c_str(), out_data_anchor_0->GetIdx(), + hccl_in_anchor->GetOwnerNode()->GetName().c_str(), + hccl_in_anchor->GetOwnerNode()->GetType().c_str(), + hccl_in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", memcpy_node->GetName().c_str(), hccl_in_anchor->GetOwnerNode()->GetName().c_str()); return FAILED; @@ -262,6 +292,12 @@ Status HcclContinuousMemcpyPass::InsertIdentityBeforeHccl(const ComputeGraphPtr Status ret = src_out_anchor->LinkTo(memcpy_node->GetInDataAnchor(kAnchorNum)); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%u failed", + src_out_anchor->GetOwnerNode()->GetName().c_str(), + src_out_anchor->GetOwnerNode()->GetType().c_str(), src_out_anchor->GetIdx(), + 
memcpy_node->GetName().c_str(), memcpy_node->GetType().c_str(), + kAnchorNum); GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", src_out_anchor->GetOwnerNode()->GetName().c_str(), memcpy_node->GetName().c_str()); return FAILED; @@ -307,6 +343,12 @@ Status HcclContinuousMemcpyPass::InsertAssignAfterBroadcastIfNeed(const ComputeG Status ret = hccl_out_anchor->LinkTo(assign_node->GetInDataAnchor(kAnchorAssignValueIndex)); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%u failed", + hccl_out_anchor->GetOwnerNode()->GetName().c_str(), + hccl_out_anchor->GetOwnerNode()->GetType().c_str(), hccl_out_anchor->GetIdx(), + assign_node->GetName().c_str(), assign_node->GetType().c_str(), + kAnchorAssignValueIndex); GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", hccl_out_anchor->GetOwnerNode()->GetName().c_str(), assign_node->GetName().c_str()); return FAILED; @@ -314,6 +356,12 @@ Status HcclContinuousMemcpyPass::InsertAssignAfterBroadcastIfNeed(const ComputeG ret = var_out_anchor->LinkTo(assign_node->GetInDataAnchor(kAnchorAssignRefIndex)); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%u failed", + var_out_anchor->GetOwnerNode()->GetName().c_str(), + var_out_anchor->GetOwnerNode()->GetType().c_str(), var_out_anchor->GetIdx(), + assign_node->GetName().c_str(), assign_node->GetType().c_str(), + kAnchorAssignRefIndex); GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", var_out_anchor->GetOwnerNode()->GetName().c_str(), assign_node->GetName().c_str()); return FAILED; @@ -328,7 +376,12 @@ Status HcclContinuousMemcpyPass::InsertAssignAfterBroadcastIfNeed(const ComputeG continue; } ret = assign_out_control_anchor->LinkTo(in_data_anchor->GetOwnerNode()->GetInControlAnchor()); - if (ret != SUCCESS) { + if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s) link control to op:%s(%s) failed", + 
assign_out_control_anchor->GetOwnerNode()->GetName().c_str(), + assign_out_control_anchor->GetOwnerNode()->GetType().c_str(), + in_data_anchor->GetOwnerNode()->GetName().c_str(), + in_data_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(INTERNAL_ERROR, "The op %s link control anchor %s fail.", assign_out_control_anchor->GetOwnerNode()->GetName().c_str(), in_data_anchor->GetOwnerNode()->GetName().c_str()); @@ -342,6 +395,11 @@ Status HcclContinuousMemcpyPass::InsertAssignAfterBroadcastIfNeed(const ComputeG } ret = assign_out_control_anchor->LinkTo(in_control_anchor); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s) link control to op:%s(%s) failed", + assign_out_control_anchor->GetOwnerNode()->GetName().c_str(), + assign_out_control_anchor->GetOwnerNode()->GetType().c_str(), + in_control_anchor->GetOwnerNode()->GetName().c_str(), + in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(INTERNAL_ERROR, "The op %s link control anchor %s fail.", assign_out_control_anchor->GetOwnerNode()->GetName().c_str(), in_control_anchor->GetOwnerNode()->GetName().c_str()); @@ -363,6 +421,7 @@ NodePtr HcclContinuousMemcpyPass::CreateAssignNode(const ComputeGraphPtr &graph, NodePtr pre_node = out_data_anchor->GetOwnerNode(); OpDescPtr pre_op_desc = pre_node->GetOpDesc(); if (pre_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "OpDesc of pre node is invalid."); return nullptr; } @@ -371,36 +430,48 @@ NodePtr HcclContinuousMemcpyPass::CreateAssignNode(const ComputeGraphPtr &graph, node_name = CheckDuplicateName(node_name); OpDescPtr op_desc = MakeShared(node_name.c_str(), ASSIGN); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(INTERNAL_ERROR, "Create Assign op: MakeShared op_desc fail."); return nullptr; } GELOGI("Create Assign op:%s.", op_desc->GetName().c_str()); if (!AttrUtils::SetBool(op_desc, ATTR_NEED_COMPILE, true)) { + 
REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NEED_COMPILE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Set ATTR_NEED_COMPILE Attr for node:%s fail.", op_desc->GetName().c_str()); return nullptr; } graphStatus ret = op_desc->AddInputDesc("ref", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, name:ref", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Assign op: add ref input desc fail."); return nullptr; } ret = op_desc->AddInputDesc("value", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, name:value", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Assign op: add value input desc fail."); return nullptr; } ret = op_desc->AddOutputDesc("ref", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed, name:ref", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Assign op: add output desc fail."); return nullptr; } NodePtr assign_node = graph->AddNode(op_desc); if (assign_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Insert Identity node fail."); return nullptr; } diff --git a/ge/graph/passes/hccl_memcpy_pass.cc b/ge/graph/passes/hccl_memcpy_pass.cc index 537920b7..2d2f8220 100755 --- a/ge/graph/passes/hccl_memcpy_pass.cc +++ b/ge/graph/passes/hccl_memcpy_pass.cc @@ -38,6 +38,8 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { for (const auto &node : graph->GetDirectNode()) { auto op_desc = node->GetOpDesc(); if (op_desc == 
nullptr) { + REPORT_INNER_ERROR("E19999", "Node with nullptr op_desc exist in Param graph:%s, check invalid", + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "node has no op_desc, node_name : %s.", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -64,6 +66,8 @@ Status HcclMemcpyPass::MutableInputProcess(const ComputeGraphPtr &graph, const N } if (!AttrUtils::GetBool(op_desc, kInputMutable, node_input_mutable)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", kInputMutable, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "node:%s get attr:_input_mutable failed.", node->GetName().c_str()); return FAILED; } @@ -116,6 +120,7 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O NodePtr pre_node = out_data_anchor->GetOwnerNode(); OpDescPtr pre_op_desc = pre_node->GetOpDesc(); if (pre_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "OpDesc of pre node is invalid."); return nullptr; } @@ -124,6 +129,7 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O node_name = CheckDuplicateName(node_name); OpDescPtr op_desc = MakeShared(node_name.c_str(), IDENTITY); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(INTERNAL_ERROR, "Create Identity op: MakeShared op_desc fail."); return nullptr; } @@ -131,12 +137,16 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O graphStatus ret = op_desc->AddInputDesc("x", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, name:x", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Identity op: add input desc fail."); return nullptr; } ret = op_desc->AddOutputDesc("y", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) 
{ + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed, name:y", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Identity op: add output desc fail."); return nullptr; } @@ -145,6 +155,8 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O NodePtr memcpy_node = graph->AddNode(op_desc); if (memcpy_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Insert Identity node fail."); return nullptr; } @@ -215,6 +227,12 @@ Status HcclMemcpyPass::InsertIdentityBeforeHccl(const ComputeGraphPtr &graph, co Status ret1 = src_out_anchor->Unlink(hccl_in_anchor); if (ret1 != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d unlink from op:%s(%s) in index:%d failed", + src_out_anchor->GetOwnerNode()->GetName().c_str(), + src_out_anchor->GetOwnerNode()->GetType().c_str(), src_out_anchor->GetIdx(), + hccl_in_anchor->GetOwnerNode()->GetName().c_str(), + hccl_in_anchor->GetOwnerNode()->GetType().c_str(), hccl_in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "The op %s Unlink anchor %s fail.", src_out_anchor->GetOwnerNode()->GetName().c_str(), hccl_in_anchor->GetOwnerNode()->GetName().c_str()); return FAILED; @@ -223,6 +241,13 @@ Status HcclMemcpyPass::InsertIdentityBeforeHccl(const ComputeGraphPtr &graph, co GE_CHECK_NOTNULL(out_data_anchor_0); ret1 = out_data_anchor_0->LinkTo(hccl_in_anchor); if (ret1 != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%d failed", + out_data_anchor_0->GetOwnerNode()->GetName().c_str(), + out_data_anchor_0->GetOwnerNode()->GetType().c_str(), out_data_anchor_0->GetIdx(), + hccl_in_anchor->GetOwnerNode()->GetName().c_str(), + hccl_in_anchor->GetOwnerNode()->GetType().c_str(), + hccl_in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", 
memcpy_node->GetName().c_str(), hccl_in_anchor->GetOwnerNode()->GetName().c_str()); return FAILED; @@ -230,6 +255,12 @@ Status HcclMemcpyPass::InsertIdentityBeforeHccl(const ComputeGraphPtr &graph, co Status ret = src_out_anchor->LinkTo(memcpy_node->GetInDataAnchor(kAnchorNum)); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%u failed", + src_out_anchor->GetOwnerNode()->GetName().c_str(), + src_out_anchor->GetOwnerNode()->GetType().c_str(), src_out_anchor->GetIdx(), + memcpy_node->GetName().c_str(), memcpy_node->GetType().c_str(), + kAnchorNum); GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", src_out_anchor->GetOwnerNode()->GetName().c_str(), memcpy_node->GetName().c_str()); return FAILED; @@ -275,6 +306,12 @@ Status HcclMemcpyPass::InsertAssignAfterBroadcastIfNeed(const ComputeGraphPtr &g Status ret = hccl_out_anchor->LinkTo(assign_node->GetInDataAnchor(kAnchorAssignValueIndex)); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%u failed", + hccl_out_anchor->GetOwnerNode()->GetName().c_str(), + hccl_out_anchor->GetOwnerNode()->GetType().c_str(), hccl_out_anchor->GetIdx(), + assign_node->GetName().c_str(), assign_node->GetType().c_str(), + kAnchorAssignValueIndex); GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", hccl_out_anchor->GetOwnerNode()->GetName().c_str(), assign_node->GetName().c_str()); return FAILED; @@ -282,6 +319,12 @@ Status HcclMemcpyPass::InsertAssignAfterBroadcastIfNeed(const ComputeGraphPtr &g ret = var_out_anchor->LinkTo(assign_node->GetInDataAnchor(kAnchorAssignRefIndex)); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%u failed", + var_out_anchor->GetOwnerNode()->GetName().c_str(), + var_out_anchor->GetOwnerNode()->GetType().c_str(), var_out_anchor->GetIdx(), + assign_node->GetName().c_str(), assign_node->GetType().c_str(), + kAnchorAssignRefIndex); 
GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", var_out_anchor->GetOwnerNode()->GetName().c_str(), assign_node->GetName().c_str()); return FAILED; @@ -296,7 +339,14 @@ Status HcclMemcpyPass::InsertAssignAfterBroadcastIfNeed(const ComputeGraphPtr &g continue; } ret = assign_out_control_anchor->LinkTo(in_data_anchor->GetOwnerNode()->GetInControlAnchor()); - if (ret != SUCCESS) { + if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d link to op:%s(%s) in index:%d failed", + assign_out_control_anchor->GetOwnerNode()->GetName().c_str(), + assign_out_control_anchor->GetOwnerNode()->GetType().c_str(), assign_out_control_anchor->GetIdx(), + in_data_anchor->GetOwnerNode()->GetName().c_str(), + in_data_anchor->GetOwnerNode()->GetType().c_str(), + in_data_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "The op %s link control anchor %s fail.", assign_out_control_anchor->GetOwnerNode()->GetName().c_str(), in_data_anchor->GetOwnerNode()->GetName().c_str()); @@ -310,6 +360,11 @@ Status HcclMemcpyPass::InsertAssignAfterBroadcastIfNeed(const ComputeGraphPtr &g } ret = assign_out_control_anchor->LinkTo(in_control_anchor); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s) link control to op:%s(%s) failed", + assign_out_control_anchor->GetOwnerNode()->GetName().c_str(), + assign_out_control_anchor->GetOwnerNode()->GetType().c_str(), + in_control_anchor->GetOwnerNode()->GetName().c_str(), + in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(INTERNAL_ERROR, "The op %s link control anchor %s fail.", assign_out_control_anchor->GetOwnerNode()->GetName().c_str(), in_control_anchor->GetOwnerNode()->GetName().c_str()); @@ -330,6 +385,7 @@ NodePtr HcclMemcpyPass::CreateAssignNode(const ComputeGraphPtr &graph, const Out NodePtr pre_node = out_data_anchor->GetOwnerNode(); OpDescPtr pre_op_desc = pre_node->GetOpDesc(); if (pre_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); 
GELOGE(INTERNAL_ERROR, "OpDesc of pre node is invalid."); return nullptr; } @@ -338,6 +394,7 @@ NodePtr HcclMemcpyPass::CreateAssignNode(const ComputeGraphPtr &graph, const Out node_name = CheckDuplicateName(node_name); OpDescPtr op_desc = MakeShared(node_name.c_str(), ASSIGN); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(INTERNAL_ERROR, "Create Assign op: MakeShared op_desc fail."); return nullptr; } @@ -345,24 +402,32 @@ NodePtr HcclMemcpyPass::CreateAssignNode(const ComputeGraphPtr &graph, const Out graphStatus ret = op_desc->AddInputDesc("ref", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, name:ref", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Assign op: add ref input desc fail."); return nullptr; } ret = op_desc->AddInputDesc("value", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, name:value", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Assign op: add value input desc fail."); return nullptr; } ret = op_desc->AddOutputDesc("ref", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed, name:ref", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Create Assign op: add output desc fail."); return nullptr; } NodePtr assign_node = graph->AddNode(op_desc); if (assign_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Insert Identity node fail."); return nullptr; } diff --git a/ge/graph/passes/hccl_tailing_optimization_pass.cc 
b/ge/graph/passes/hccl_tailing_optimization_pass.cc new file mode 100644 index 00000000..a1bdb2d1 --- /dev/null +++ b/ge/graph/passes/hccl_tailing_optimization_pass.cc @@ -0,0 +1,72 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "hccl_tailing_optimization_pass.h" +#include "graph/common/transop_util.h" + +namespace ge { +Status HcclTailingOptimizationPass::Run(ComputeGraphPtr graph) { + for (const auto &node : graph->GetDirectNode()) { + GE_CHECK_NOTNULL(node); + if (node->GetType() != HCOMALLREDUCE) { + continue; + } + for (auto &out_node : node->GetOutDataNodes()) { + if (!TransOpUtil::IsTransOp(out_node)) { + continue; + } + + GE_CHK_STATUS_RET_NOLOG(CopyControlEdgesForTransOp(out_node)); + } + } + return SUCCESS; +} +Status HcclTailingOptimizationPass::CopyControlEdgesForTransOp(NodePtr &first_trans_op) { + auto dst_in_ctrl_anchor = first_trans_op->GetInControlAnchor(); + GE_CHECK_NOTNULL(dst_in_ctrl_anchor); + std::set src_out_ctrl_anchors; + std::vector trans_op_nodes{first_trans_op}; + + while (!trans_op_nodes.empty()) { + auto trans_op_node = trans_op_nodes.back(); + trans_op_nodes.pop_back(); + + for (auto &next_node : trans_op_node->GetOutDataNodes()) { + auto in_ctrl_anchor = next_node->GetInControlAnchor(); + GE_CHECK_NOTNULL(in_ctrl_anchor); + + auto peer_out_ctrl_anchors = in_ctrl_anchor->GetPeerOutControlAnchors(); + + for (auto src_ctrl_anchor : peer_out_ctrl_anchors) { 
+ GE_CHECK_NOTNULL(src_ctrl_anchor->GetOwnerNode()); + src_out_ctrl_anchors.emplace(src_ctrl_anchor); + } + if (TransOpUtil::IsTransOp(next_node)) { + trans_op_nodes.emplace_back(next_node); + } + } + } + + for (auto &src_out_ctrl_anchor : src_out_ctrl_anchors) { + if (!src_out_ctrl_anchor->IsLinkedWith(dst_in_ctrl_anchor)) { + GE_CHK_GRAPH_STATUS_RET( + GraphUtils::AddEdge(src_out_ctrl_anchor, dst_in_ctrl_anchor), "Failed to add edge between %s->%s", + src_out_ctrl_anchor->GetOwnerNode()->GetName().c_str(), first_trans_op->GetName().c_str()); + } + } + + return SUCCESS; +} +} // namespace ge diff --git a/ge/graph/passes/hccl_tailing_optimization_pass.h b/ge/graph/passes/hccl_tailing_optimization_pass.h new file mode 100644 index 00000000..3b6ccaea --- /dev/null +++ b/ge/graph/passes/hccl_tailing_optimization_pass.h @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_GRAPH_PASSES_HCCL_TAILING_OPTIMIZATION_PASS_H_ +#define GE_GRAPH_PASSES_HCCL_TAILING_OPTIMIZATION_PASS_H_ + +#include + +#include "inc/graph_pass.h" + +namespace ge { +class HcclTailingOptimizationPass : public GraphPass { + public: + Status Run(ComputeGraphPtr graph) override; + + private: + Status CopyControlEdgesForTransOp(NodePtr &first_trans_op); +}; +} // namespace ge + +#endif // GE_GRAPH_PASSES_HCCL_TAILING_OPTIMIZATION_PASS_H_ diff --git a/ge/graph/passes/identity_pass.cc b/ge/graph/passes/identity_pass.cc index 5a54e391..461b126a 100755 --- a/ge/graph/passes/identity_pass.cc +++ b/ge/graph/passes/identity_pass.cc @@ -99,6 +99,8 @@ Status IdentityPass::Run(NodePtr &node) { string type; Status status_ret = GetOriginalType(node, type); if (status_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get original type for node:%s failed", + node->GetName().c_str()); GELOGE(status_ret, "Identity pass get original type fail."); return status_ret; } @@ -118,6 +120,9 @@ Status IdentityPass::Run(NodePtr &node) { } size_t n = node->GetOpDesc()->GetOutputsSize(); if (node->GetOpDesc()->GetInputsSize() != n) { + REPORT_CALL_ERROR("E19999", "Num:%zu of input desc node:%s(%s) not equal to it's output desc num:%zu, " + "check invalid", node->GetOpDesc()->GetInputsSize(), + node->GetName().c_str(), node->GetType().c_str(), n); GELOGE(PARAM_INVALID, "Identity input / output size must be equal. 
in size:%lu, out size:%lu", node->GetOpDesc()->GetInputsSize(), n); return PARAM_INVALID; diff --git a/ge/graph/passes/infershape_pass.cc b/ge/graph/passes/infershape_pass.cc index 7b8f7b50..728f5512 100755 --- a/ge/graph/passes/infershape_pass.cc +++ b/ge/graph/passes/infershape_pass.cc @@ -22,9 +22,60 @@ #include "framework/common/util.h" #include "graph/shape_refiner.h" #include "graph/utils/graph_utils.h" +#include "utils/tensor_utils.h" +#include "utils/type_utils.h" namespace ge { + +void SerialShapeRange(const GeTensorDescPtr &desc, std::string &desc_str) { + desc_str += "["; + std::vector> shape_range; + (void)desc->GetShapeRange(shape_range); + for (const auto &pair : shape_range) { + desc_str += "{"; + desc_str += std::to_string(pair.first) + "," + std::to_string(pair.second); + desc_str += "},"; + } + desc_str += "]"; + shape_range.clear(); + (void)desc->GetOriginShapeRange(shape_range); + for (const auto &pair : shape_range) { + desc_str += ",{"; + desc_str += std::to_string(pair.first) + "," + std::to_string(pair.second); + desc_str += "},"; + } +} + +std::string GetInTensorInfoWithString(const ge::NodePtr &node) { + ge::OpDescPtr op_desc = node->GetOpDesc(); + std::stringstream ss; + ss << "{"; + int32_t in_idx = 0; + for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) { + if (input_desc == nullptr) { + in_idx++; + continue; + } + if (in_idx > 0) { + ss << " "; + } + ss << "input_" << in_idx << " " << "tensor: ["; + ss << "(shape:[" << input_desc->MutableShape().ToString() << "]),"; + ss << "(format:" << TypeUtils::FormatToSerialString(input_desc->GetFormat()) << "),"; + ss << "(dtype:" << TypeUtils::DataTypeToSerialString(input_desc->GetDataType()) << "),"; + ss << "(origin_shape:" << input_desc->GetOriginShape().ToString() << "),"; + ss << "(origin_format:" << TypeUtils::FormatToSerialString(input_desc->GetOriginFormat()) << "),"; + ss << "(origin_dtype:" << TypeUtils::DataTypeToSerialString(input_desc->GetOriginDataType()) << "),"; + 
string range_str; + SerialShapeRange(input_desc, range_str); + ss << "(shape_range:" << range_str << ")]"; + in_idx++; + } + return ss.str(); +} + Status InferShapePass::Run(NodePtr &node) { + // kOptimizeAfterSubGraph exist means after subgraph auto ret = ShapeRefiner::InferShapeAndType(node, !OptionExists(kOptimizeAfterSubGraph)); if (ret != GRAPH_SUCCESS) { // select INFERSHAPE failed info @@ -38,9 +89,25 @@ Status InferShapePass::Run(NodePtr &node) { (void)Analyzer::GetInstance()->SaveAnalyzerDataToFile(root_graph->GetSessionID(), root_graph->GetGraphID()); + REPORT_CALL_ERROR("E19999", "Call InferShapeAndType for node:%s(%s) failed, input_tensor:%s", + node->GetName().c_str(), node->GetType().c_str(), GetInTensorInfoWithString(node).c_str()); GELOGE(GE_GRAPH_INFERSHAPE_FAILED, "infershape failed. node: %s", node->GetName().c_str()); return GE_GRAPH_INFERSHAPE_FAILED; } + bool need_repass = false; + auto has_attr = AttrUtils::GetBool(node->GetOpDesc(), "_need_infer_again", need_repass); + if (has_attr) { + if (!OptionExists(kOptimizeAfterSubGraph)) { + return SUCCESS; + } + if (need_repass) { + AddImmediateRePassNode(node); + GELOGD("Node %s need repass immediately.", node->GetName().c_str()); + } else { + // clear attr on while + node->GetOpDesc()->DelAttr("_need_infer_again"); + } + } return SUCCESS; } } // namespace ge diff --git a/ge/graph/passes/inplace_support_check_pass.cc b/ge/graph/passes/inplace_support_check_pass.cc index 44a0b3ef..9f0d76d0 100644 --- a/ge/graph/passes/inplace_support_check_pass.cc +++ b/ge/graph/passes/inplace_support_check_pass.cc @@ -69,6 +69,9 @@ Status InplaceSupportCheckPass::Run(NodePtr &node) { GELOGD("add attr INPLACE_SUPPORT_INPUT_INDEX on node %s, input_idx=%d", node->GetName().c_str(), inplace_input_idx); if (!AttrUtils::SetInt(node->GetOpDesc()->MutableOutputDesc(kInplaceSupportOutputIndex), INPLACE_SUPPORT_INPUT_INDEX, inplace_input_idx)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to output:%u tensor of op:%s(%s) 
failed", + INPLACE_SUPPORT_INPUT_INDEX.c_str(), kInplaceSupportOutputIndex, + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Set attr INPLACE_SUPPORT_INPUT_INDEX on node %s failed.", node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/input_output_connection_identify_pass.cc b/ge/graph/passes/input_output_connection_identify_pass.cc index 0d198dfb..6e2b3049 100644 --- a/ge/graph/passes/input_output_connection_identify_pass.cc +++ b/ge/graph/passes/input_output_connection_identify_pass.cc @@ -42,11 +42,14 @@ inline bool IsDataOp(const std::string &node_type) { Status InputOutputConnectionIdentifyPass::Run(ComputeGraphPtr graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Input param graph is null, skip identification of nodes that connect to input and output."); return PARAM_INVALID; } if (graph->GetParentGraph() != nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph's parent graph is nullptr, " + "check invalid"); GELOGD("Current graph %s is a subgraph, skip identification of nodes that connect to input and output.", graph->GetName().c_str()); return SUCCESS; @@ -54,11 +57,15 @@ Status InputOutputConnectionIdentifyPass::Run(ComputeGraphPtr graph) { GELOGD("Start to identify nodes that connect to input and output."); if (graph->TopologicalSorting() != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Topological Sorting graph:%s failed", + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Graph topological sort failed."); return INTERNAL_ERROR; } if (GraphUtils::GetRefMapping(graph, symbol_to_anchors_, anchor_to_symbol_) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get ref mapping from graph:%s failed", + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Get ref-mapping for graph %s failed.", graph->GetName().c_str()); return INTERNAL_ERROR; } @@ -125,6 +132,8 @@ Status InputOutputConnectionIdentifyPass::UpdateNodeIdxMap(const string 
&symbol_ map> &connect_output_node_idx) { auto symbol_iter = symbol_to_anchors_.find(symbol_string); if (symbol_iter == symbol_to_anchors_.end()) { + REPORT_CALL_ERROR("E19999", "Can't find symbol:%s in symbol_to_anchors map, check invalid", + symbol_string.c_str()); GELOGE(PARAM_INVALID, "Input param symbol string: %s is invalid.", symbol_string.c_str()); return PARAM_INVALID; } @@ -171,6 +180,9 @@ Status InputOutputConnectionIdentifyPass::SetNodeAttrOfConnectingInputOutput( GE_CHECK_NOTNULL(iter.first); if (iter.first->GetOpDesc() != nullptr) { if (!AttrUtils::SetListInt(iter.first->GetOpDesc(), ATTR_NAME_NODE_CONNECT_INPUT, iter.second)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_NODE_CONNECT_INPUT.c_str(), + iter.first->GetName().c_str(), iter.first->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to set attr %s for node %s.", ATTR_NAME_NODE_CONNECT_INPUT.c_str(), iter.first->GetName().c_str()); return INTERNAL_ERROR; @@ -182,6 +194,9 @@ Status InputOutputConnectionIdentifyPass::SetNodeAttrOfConnectingInputOutput( GE_CHECK_NOTNULL(iter.first); if (iter.first->GetOpDesc() != nullptr) { if (!AttrUtils::SetListInt(iter.first->GetOpDesc(), ATTR_NAME_NODE_CONNECT_OUTPUT, iter.second)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_NODE_CONNECT_OUTPUT.c_str(), + iter.first->GetName().c_str(), iter.first->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to set attr %s for node %s.", ATTR_NAME_NODE_CONNECT_OUTPUT.c_str(), iter.first->GetName().c_str()); return INTERNAL_ERROR; diff --git a/ge/graph/passes/iterator_op_pass.cc b/ge/graph/passes/iterator_op_pass.cc index 1ec2bba9..d0cb434b 100644 --- a/ge/graph/passes/iterator_op_pass.cc +++ b/ge/graph/passes/iterator_op_pass.cc @@ -55,20 +55,36 @@ Status IteratorOpPass::Run(ge::ComputeGraphPtr graph) { if (type == "IteratorV2" || type == "Iterator" || op_type == kGetNext) { ge::NodePtr memcpy_async_node = InsertMemcpyAsyncNode(node, graph); 
GE_CHECK_NOTNULL(memcpy_async_node); - GE_CHK_STATUS_RET(SetCycleEvent(memcpy_async_node), "Set cycle event fail, node:%s", - memcpy_async_node->GetName().c_str()); + auto status = SetCycleEvent(memcpy_async_node); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set cycle event to op:%s(%s) failed", + memcpy_async_node->GetName().c_str(), memcpy_async_node->GetType().c_str()); + GELOGE(status, "Set cycle event failed."); + return status; + } - GE_CHK_STATUS_RET(SetStreamLabel(memcpy_async_node, memcpy_async_node->GetName()), - "Set stream label fail, node:%s", node->GetName().c_str()); + status = SetStreamLabel(memcpy_async_node, memcpy_async_node->GetName()); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream label:%s to op:%s(%s) failed", + memcpy_async_node->GetName().c_str(), memcpy_async_node->GetName().c_str(), + memcpy_async_node->GetType().c_str()); + GELOGE(status, "set stream label failed."); + return status; + } - GE_CHK_STATUS_RET(SetStreamLabel(node, node->GetName()), "Set stream label fail, node:%s", - node->GetName().c_str()); + status = SetStreamLabel(node, node->GetName()); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream label:%s to op:%s(%s) failed", + node->GetName().c_str(), node->GetName().c_str(), node->GetType().c_str()); + GELOGE(status, "set stream label failed."); + return status; + } GELOGI("Set independent loop for iterator node success"); int64_t loop_per_iter = 0; ge::GeTensorDesc ge_tensor_desc; - Status status = VarManager::Instance(graph->GetSessionID())->GetCurVarDesc(NODE_NAME_FLOWCTRL_LOOP_PER_ITER, + status = VarManager::Instance(graph->GetSessionID())->GetCurVarDesc(NODE_NAME_FLOWCTRL_LOOP_PER_ITER, ge_tensor_desc); GE_IF_BOOL_EXEC(status != SUCCESS, GELOGW("Fail to Get var_desc of NODE_NAME_FLOWCTRL_LOOP_PER_ITER failed."); continue); @@ -92,8 +108,14 @@ Status IteratorOpPass::Run(ge::ComputeGraphPtr graph) { if (loop_per_iter == kMaxIterationsPerLoop) { ge::NodePtr 
end_of_sequence_node = InsertEndOfSequenceNode(node, memcpy_async_node, graph); GE_CHECK_NOTNULL(end_of_sequence_node); - GE_CHK_STATUS_RET(SetStreamLabel(end_of_sequence_node, end_of_sequence_node->GetName()), - "Set stream label fail, node:%s", node->GetName().c_str()); + status = SetStreamLabel(end_of_sequence_node, end_of_sequence_node->GetName()); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream label:%s to op:%s(%s) failed", + end_of_sequence_node->GetName().c_str(), end_of_sequence_node->GetName().c_str(), + end_of_sequence_node->GetType().c_str()); + GELOGE(status, "set stream label failed."); + return status; + } GELOGI("Insert EndOfSequence node success."); } } @@ -110,8 +132,13 @@ Status IteratorOpPass::GetVariableValue(uint64_t session_id, const ge::GeTensorD GE_CHECK_NOTNULL(var_mem_base); // offset + logic_base uint8_t *dev_ptr = nullptr; - GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &dev_ptr), - "Get variable %s address failed.", var_name.c_str()); + auto status = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &dev_ptr); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get Var add by name:%s failed, session_id:%lu", + var_name.c_str(), session_id); + GELOGE(status, "Get variable %s address failed.", var_name.c_str()); + return status; + } int64_t offset = static_cast(reinterpret_cast(dev_ptr)); // logic_base_addr auto logic_var_base = VarManager::Instance(session_id)->GetVarMemLogicBase(); @@ -144,7 +171,11 @@ ge::NodePtr IteratorOpPass::InsertEndOfSequenceNode(const ge::NodePtr &pre_node, auto out_anchor = pre_node->GetOutDataAnchor(0); ge::graphStatus status; status = GraphUtils::AddEdge(out_anchor, end_of_seq_node->GetInDataAnchor(0)); - GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:0) failed", + 
pre_node->GetName().c_str(), pre_node->GetType().c_str(), + end_of_seq_node->GetName().c_str(), end_of_seq_node->GetType().c_str()); + return nullptr, "Graph add EndOfSequence op input edge fail, dst node: %s.", end_of_seq_node->GetName().c_str()); // EOS(control) --> subsequent of memcpy @@ -157,7 +188,11 @@ ge::NodePtr IteratorOpPass::InsertEndOfSequenceNode(const ge::NodePtr &pre_node, continue; } status = GraphUtils::AddEdge(out_ctrl_anchor, in_ctrl_anchor); - GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + end_of_seq_node->GetName().c_str(), end_of_seq_node->GetType().c_str(), + out_node->GetName().c_str(), out_node->GetType().c_str()); + return nullptr, "Graph add EndOfSequence op out ctrl edge fail, dst node: %s.", out_node->GetName().c_str()); GELOGI("Graph add EndOfSequence op out ctrl edge, dst node: %s.", @@ -175,19 +210,27 @@ ge::NodePtr IteratorOpPass::InsertEndOfSequenceNode(const ge::NodePtr &pre_node, /// ge::OpDescPtr IteratorOpPass::CreateEndOfSequenceOp(const ge::NodePtr &pre_node) { GELOGI("Start to create endOfSequence op."); - GE_CHK_BOOL_EXEC(pre_node != nullptr, return nullptr, "Input param invalid."); + GE_CHK_BOOL_EXEC(pre_node != nullptr, + REPORT_INNER_ERROR("E19999", "Param pre_node is nullptr, check invalid"); + return nullptr, "Input param invalid."); string node_name = pre_node->GetName() + "_EndOfSequence"; ge::OpDescPtr op_desc = MakeShared(node_name, ENDOFSEQUENCE); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "MakeShared fail."); return op_desc; } ge::OpDescPtr pre_node_op_desc = pre_node->GetOpDesc(); - GE_CHK_BOOL_EXEC(pre_node_op_desc != nullptr, return nullptr, "OpDesc of pre_node is invalid."); + GE_CHK_BOOL_EXEC(pre_node_op_desc != nullptr, + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); + return nullptr, 
"OpDesc of pre_node is invalid."); GELOGI("Create EndOfSequence op:%s.", op_desc->GetName().c_str()); - GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(pre_node_op_desc->GetOutputDesc(0)) == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(pre_node_op_desc->GetOutputDesc(0)) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return nullptr, "Create EndOfSequence op:add input desc fail."); return op_desc; } @@ -205,7 +248,11 @@ ge::NodePtr IteratorOpPass::InsertMemcpyAsyncNode(const ge::NodePtr &pre_node, c ge::OpDescPtr memcpy_async_op_desc = CreateMemcpyAsyncOp(pre_node); GE_CHK_BOOL_EXEC(memcpy_async_op_desc != nullptr, GELOGW("Create memcpyAsync op fail."); return nullptr); ge::NodePtr memcpy_async_node = graph->AddNode(memcpy_async_op_desc); - GE_CHK_BOOL_EXEC(memcpy_async_node != nullptr, return nullptr, "Insert mencpy node fail."); + GE_CHK_BOOL_EXEC(memcpy_async_node != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + memcpy_async_op_desc->GetName().c_str(), memcpy_async_op_desc->GetType().c_str(), + graph->GetName().c_str()); + return nullptr, "Insert mencpy node fail."); // Data out for (auto &out_anchor : pre_node->GetAllOutDataAnchors()) { @@ -217,9 +264,24 @@ ge::NodePtr IteratorOpPass::InsertMemcpyAsyncNode(const ge::NodePtr &pre_node, c for (auto &peer_in_anchor : out_anchor->GetPeerInDataAnchors()) { GE_IF_BOOL_EXEC(peer_in_anchor == nullptr, GELOGW("peer_in_anchor is nullptr"); return nullptr); status = GraphUtils::RemoveEdge(out_anchor, peer_in_anchor); - GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, return nullptr, "Remove edge failed, index:%d.", out_anchor->GetIdx()); + GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, + REPORT_CALL_ERROR( + "E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + pre_node->GetName().c_str(), pre_node->GetType().c_str(), out_anchor->GetIdx(), + 
peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str(), + peer_in_anchor->GetIdx()); + return nullptr, "Remove edge failed, index:%d.", out_anchor->GetIdx()); status = GraphUtils::AddEdge(memcpy_async_node->GetOutDataAnchor(out_anchor->GetIdx()), peer_in_anchor); - GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, + REPORT_CALL_ERROR( + "E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + memcpy_async_node->GetName().c_str(), memcpy_async_node->GetType().c_str(), + out_anchor->GetIdx(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str(), + peer_in_anchor->GetIdx()); + return nullptr, "Graph add memcpyAsync op out edge fail, src index:%d, dst index:%d, dst node: %s.", out_anchor->GetIdx(), peer_in_anchor->GetIdx(), peer_in_anchor->GetOwnerNode()->GetName().c_str()); @@ -227,7 +289,13 @@ ge::NodePtr IteratorOpPass::InsertMemcpyAsyncNode(const ge::NodePtr &pre_node, c peer_in_anchor->GetIdx(), peer_in_anchor->GetOwnerNode()->GetName().c_str()); } status = GraphUtils::AddEdge(out_anchor, memcpy_async_node->GetInDataAnchor(out_anchor->GetIdx())); - GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, return nullptr, "Graph add memcpyAsync op in edge fail, index:%d.", + GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, + REPORT_CALL_ERROR( + "E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + pre_node->GetName().c_str(), pre_node->GetType().c_str(), out_anchor->GetIdx(), + memcpy_async_node->GetName().c_str(), memcpy_async_node->GetType().c_str(), + out_anchor->GetIdx()); + return nullptr, "Graph add memcpyAsync op in edge fail, index:%d.", out_anchor->GetIdx()); } // Control out @@ -235,10 +303,22 @@ ge::NodePtr IteratorOpPass::InsertMemcpyAsyncNode(const ge::NodePtr &pre_node, c GE_IF_BOOL_EXEC(out_ctrl_anchor != nullptr, for (auto &peer_in_ctrl_anchor : 
out_ctrl_anchor->GetPeerInControlAnchors()) { ge::graphStatus status = GraphUtils::RemoveEdge(out_ctrl_anchor, peer_in_ctrl_anchor); - GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, return nullptr, "Remove edge failed, dst node: %s.", + GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, + REPORT_CALL_ERROR( + "E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + pre_node->GetName().c_str(), pre_node->GetType().c_str(), + peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_ctrl_anchor->GetOwnerNode()->GetType().c_str()); + return nullptr, "Remove edge failed, dst node: %s.", peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str()); status = GraphUtils::AddEdge(memcpy_async_node->GetOutControlAnchor(), peer_in_ctrl_anchor); - GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(status == GRAPH_SUCCESS, + REPORT_CALL_ERROR( + "E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + memcpy_async_node->GetName().c_str(), memcpy_async_node->GetType().c_str(), + peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_ctrl_anchor->GetOwnerNode()->GetType().c_str()); + return nullptr, "Graph add memcpyAsync op out ctrl edge fail, dst node: %s.", peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str()); GELOGI("Graph add memcpyAsync op out ctrl edge, dst node: %s.", @@ -261,20 +341,29 @@ ge::OpDescPtr IteratorOpPass::CreateMemcpyAsyncOp(const ge::NodePtr &pre_node) { string node_name = pre_node->GetName() + "_MemcpyAsync"; ge::OpDescPtr op_desc = MakeShared(node_name.c_str(), MEMCPYASYNC); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "MakeShared fail."); return op_desc; } GELOGI("Create memcpyAsync op:%s.", op_desc->GetName().c_str()); ge::OpDescPtr pre_node_op_desc = pre_node->GetOpDesc(); - GE_CHK_BOOL_EXEC(pre_node_op_desc != nullptr, return nullptr, "OpDesc of pre_node is invalid."); + GE_CHK_BOOL_EXEC(pre_node_op_desc != nullptr, + 
REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); + return nullptr, "OpDesc of pre_node is invalid."); size_t out_size = pre_node_op_desc->GetOutputsSize(); GELOGI("Create memcpyAsync op, pre_node out_size: %zu.", out_size); for (size_t i = 0; i < out_size; i++) { - GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(pre_node_op_desc->GetOutputDesc(i)) == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(pre_node_op_desc->GetOutputDesc(i)) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + pre_node_op_desc->GetName().c_str(), pre_node_op_desc->GetType().c_str()); + return nullptr, "Create memcpyAsync op:add input desc fail."); - GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(pre_node_op_desc->GetOutputDesc(i)) == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(pre_node_op_desc->GetOutputDesc(i)) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + pre_node_op_desc->GetName().c_str(), pre_node_op_desc->GetType().c_str()); + return nullptr, "Create memcpyAsync op:add output desc fail."); } diff --git a/ge/graph/passes/link_gen_mask_nodes_pass.cc b/ge/graph/passes/link_gen_mask_nodes_pass.cc index 9bd991aa..14f5dfc3 100755 --- a/ge/graph/passes/link_gen_mask_nodes_pass.cc +++ b/ge/graph/passes/link_gen_mask_nodes_pass.cc @@ -70,6 +70,9 @@ Status LinkGenMaskNodesPass::Run(ComputeGraphPtr graph) { graphStatus status_link_to = src_anchor->LinkTo(dest_anchor); if (status_link_to != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s) link control to op:%s(%s) failed", + src_node->GetName().c_str(), src_node->GetType().c_str(), + dest_node->GetName().c_str(), dest_node->GetType().c_str()); GELOGE(FAILED, "Link from %s to %s failed.", src_node->GetName().c_str(), dest_node->GetName().c_str()); return FAILED; } @@ -93,7 +96,7 @@ bool LinkGenMaskNodesPass::AreAllInputsConst(const NodePtr &node) const { void 
LinkGenMaskNodesPass::GetAllGenMaskNodes(ComputeGraphPtr graph, vector &gen_mask_nodes) const { set nodes_set; for (const NodePtr &node : graph->GetDirectNode()) { - if (node->GetType() != DROPOUTDOMASK) { + if (node->GetType() != DROPOUTDOMASK && node->GetType() != DROPOUTDOMASKV3 && node->GetType() != DROPOUTDOMASKV3D) { continue; } diff --git a/ge/graph/passes/mark_agnostic_pass.cc b/ge/graph/passes/mark_agnostic_pass.cc index 80b4bd7a..4c489bbe 100644 --- a/ge/graph/passes/mark_agnostic_pass.cc +++ b/ge/graph/passes/mark_agnostic_pass.cc @@ -132,7 +132,13 @@ Status MarkAgnosticPass::SetContinuousAttr(const NodePtr &node, const std::vecto (void)AttrUtils::SetBool(op_desc, ATTR_NAME_REFRESH_CONTINUOUS_FLAG, true); for (auto index : indexes) { auto out = op_desc->MutableOutputDesc(index); - GE_CHECK_NOTNULL(out); + if (out == nullptr) { + REPORT_INNER_ERROR("E19999", "Op:%s(%s) output:%u desc is nullptr, check invalid", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), index); + GELOGE(FAILED, "[Check][Param]Op:%s(%s) output:%u desc is nullptr", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), index); + return FAILED; + } // This attr is for out's dtype and format continuous with it's peer input (void)AttrUtils::SetInt(out, ATTR_NAME_FORMAT_CONTINUOUS, 1); } diff --git a/ge/graph/passes/mark_graph_unknown_status_pass.cc b/ge/graph/passes/mark_graph_unknown_status_pass.cc index ae0919fe..bf69480a 100644 --- a/ge/graph/passes/mark_graph_unknown_status_pass.cc +++ b/ge/graph/passes/mark_graph_unknown_status_pass.cc @@ -28,7 +28,7 @@ Status MarkGraphUnknownStatusPass::Run(ComputeGraphPtr graph) { bool is_unknown_shape = false; bool forced_unknown = false; for (const auto &node : graph->GetDirectNode()) { - GE_CHK_STATUS_RET(ge::NodeUtils::GetNodeUnknownShapeStatus(*node, is_unknown_shape), + GE_CHK_GRAPH_STATUS_RET(ge::NodeUtils::GetNodeUnknownShapeStatus(*node, is_unknown_shape), "Get node[%s] shape status failed!", node->GetName().c_str()); if 
(is_unknown_shape) { break; diff --git a/ge/graph/passes/mark_node_unknown_shape_pass.cc b/ge/graph/passes/mark_node_unknown_shape_pass.cc new file mode 100644 index 00000000..c040e846 --- /dev/null +++ b/ge/graph/passes/mark_node_unknown_shape_pass.cc @@ -0,0 +1,99 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/passes/mark_node_unknown_shape_pass.h" +#include "graph/utils/node_utils.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/common/local_context.h" + +namespace ge { +namespace { +const char *const kEngineNameAiCore = "AIcoreEngine"; +const char *const kNeedRefreshShape = "_need_generate"; +const char *const kOriginalNode = "_original_node"; +const int32_t kDynamicState = -2; +} + +Status MarkNodeUnknownShapePass::Run(ComputeGraphPtr graph) { + GE_CHECK_NOTNULL(graph); + if (!GetLocalOmgContext().fuzz_compile_flag) { + return SUCCESS; + } + if (IsAllAicoreSupportDyn(graph)) { + if (UpdateNodeShapeToUnknown(graph) != SUCCESS) { + GELOGE(FAILED, "[Update][Node_Shape]Failed to update node shape to unknown."); + return FAILED; + } + } + return SUCCESS; +} + +bool MarkNodeUnknownShapePass::IsAllAicoreSupportDyn(ComputeGraphPtr &graph) { + bool is_all_aicore_support_dyn = false; + for (const auto &node : graph->GetAllNodes()) { + if (node->GetOpDesc() == nullptr) { + continue; + } + if (node->GetOpDesc()->GetOpKernelLibName() != kEngineNameAiCore) { + GELOGD("Kernel 
of %s is %s.", node->GetName().c_str(), node->GetOpDesc()->GetOpKernelLibName().c_str()); + continue; + } + NodePtr original_node = nullptr; + original_node = node->GetOpDesc()->TryGetExtAttr(kOriginalNode, original_node); + if ((original_node == nullptr && AttrUtils::HasAttr(node->GetOpDesc(), ATTR_NAME_FUZZ_BUILD_RES_ATTRS)) || + (original_node != nullptr && AttrUtils::HasAttr(node->GetOpDesc(), ATTR_NAME_FUZZ_BUILD_RES_ATTRS) && + !AttrUtils::HasAttr(original_node->GetOpDesc(), kNeedRefreshShape))) { + GELOGD("%s has set ATTR_NAME_FUZZ_BUILD_RES_ATTRS.", node->GetName().c_str()); + is_all_aicore_support_dyn = true; + } else { + GELOGD("%s has not set ATTR_NAME_FUZZ_BUILD_RES_ATTRS.", node->GetName().c_str()); + is_all_aicore_support_dyn = false; + break; + } + } + return is_all_aicore_support_dyn; +} + +Status MarkNodeUnknownShapePass::UpdateNodeShapeToUnknown(ComputeGraphPtr &graph) { + GELOGD("Need to update node shape to dynamic when get fuzz build result."); + for (const auto &node : graph->GetAllNodes()) { + if (NodeUtils::IsConst(*node) || node->GetType() == VARIABLE) { + continue; + } + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { + auto src_node = NodeUtils::GetInDataNodeByIndex(*node, static_cast(i)); + if (src_node != nullptr && (NodeUtils::IsConst(*src_node) || src_node->GetType() == VARIABLE)) { + continue; + } + GELOGD("Update input shape for %s.", node->GetName().c_str()); + auto input_desc = op_desc->MutableInputDesc(static_cast(i)); + if (input_desc != nullptr) { + input_desc->SetShape(GeShape({kDynamicState})); + } + } + + for (auto &output_desc : op_desc->GetAllOutputsDescPtr()) { + if (output_desc != nullptr) { + GELOGD("Update output shape for %s.", node->GetName().c_str()); + output_desc->SetShape(GeShape({kDynamicState})); + } + } + } + return SUCCESS; +} +} // namespace ge \ No newline at end of file diff --git a/ge/graph/passes/mark_node_unknown_shape_pass.h 
b/ge/graph/passes/mark_node_unknown_shape_pass.h new file mode 100644 index 00000000..b78b7826 --- /dev/null +++ b/ge/graph/passes/mark_node_unknown_shape_pass.h @@ -0,0 +1,32 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_PASSES_MARK_NODE_UNKNOWN_SHAPE_PASS_H_ +#define GE_GRAPH_PASSES_MARK_NODE_UNKNOWN_SHAPE_PASS_H_ +#include "graph/graph.h" +#include "inc/graph_pass.h" + +namespace ge { +class MarkNodeUnknownShapePass : public GraphPass { +public: + Status Run(ComputeGraphPtr graph); + +private: + bool IsAllAicoreSupportDyn(ComputeGraphPtr &graph); + Status UpdateNodeShapeToUnknown(ComputeGraphPtr &graph); +}; +} // namespace ge +#endif // GE_GRAPH_PASSES_MARK_NODE_UNKNOWN_SHAPE_PASS_H_ diff --git a/ge/graph/passes/memcpy_addr_async_pass.cc b/ge/graph/passes/memcpy_addr_async_pass.cc index b930f7cb..aff89f35 100755 --- a/ge/graph/passes/memcpy_addr_async_pass.cc +++ b/ge/graph/passes/memcpy_addr_async_pass.cc @@ -25,6 +25,14 @@ namespace ge { Status MemcpyAddrAsyncPass::Run(ComputeGraphPtr graph) { GE_CHECK_NOTNULL(graph); + for (const auto &node : graph->GetAllNodes()) { + if (node->GetType() == STREAMSWITCH) { + auto sub_graph = node->GetOwnerComputeGraph(); + if (sub_graph != nullptr && !sub_graph->GetGraphUnknownFlag()) { + GE_CHK_STATUS_RET(AddMemcpyAsyncNode(node), "Add memcpyasync node failed in known subgraph."); + } + } + } if (graph->GetGraphUnknownFlag()) { 
GELOGD("Graph[%s] is unknown graph, skip.", graph->GetName().c_str()); return SUCCESS; @@ -33,6 +41,8 @@ Status MemcpyAddrAsyncPass::Run(ComputeGraphPtr graph) { int64_t value = 0; rtError_t rt_ret = rtGetRtCapability(FEATURE_TYPE_MEMCPY, MEMCPY_INFO_SUPPORT_ZEROCOPY, &value); if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtGetRtCapability failed, ret = 0x%X", + rt_ret); GELOGE(RT_FAILED, "rtGetRtCapability failed, error=0x%x.", rt_ret); return RT_FAILED; } @@ -63,6 +73,28 @@ Status MemcpyAddrAsyncPass::Run(ComputeGraphPtr graph) { return SUCCESS; } +Status MemcpyAddrAsyncPass::AddMemcpyAsyncNode(const NodePtr &node) { + GE_CHECK_NOTNULL(node); + GELOGI("Start add memcpyasync node in front of node %s", node->GetName().c_str()); + known_sub_graph_ = true; + auto sub_graph = node->GetOwnerComputeGraph(); + for (InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { + OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + auto memcpy_async_node = CreateMemcpyAddrAsyncNode(sub_graph, peer_out_anchor, node); + if (memcpy_async_node == nullptr) { + GELOGE(INTERNAL_ERROR, "Create memcpyasync node failed."); + return INTERNAL_ERROR; + } + Status ret = InsertMemcpyAddrAsyncNode(peer_out_anchor, in_data_anchor, memcpy_async_node); + if (ret != SUCCESS) { + GELOGE(ret, "Insert memcpyasync node failed."); + return ret; + } + } + return SUCCESS; +} + Status MemcpyAddrAsyncPass::AddMemcpyAddrAsyncNode(const ComputeGraphPtr &graph, const NodePtr &node) { GELOGI("Start AddMemcpyAddrAsyncNode for %s.", node->GetName().c_str()); for (InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { @@ -85,6 +117,9 @@ Status MemcpyAddrAsyncPass::AddMemcpyAddrAsyncNode(const ComputeGraphPtr &graph, } else { uint32_t parent_index = 0; if (!AttrUtils::GetInt(in_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) 
failed", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), + in_node->GetName().c_str(), in_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to get parent index of %s", in_node->GetName().c_str()); return INTERNAL_ERROR; } @@ -147,6 +182,9 @@ void MemcpyAddrAsyncPass::FindUserDataForNonDynamic(const ge::NodePtr &parent_no InDataAnchorPtr in_data_anchor = parent_node->GetInDataAnchor(parent_index); OutDataAnchorPtr out_anchor = in_data_anchor->GetPeerOutAnchor(); GE_IF_BOOL_EXEC(out_anchor == nullptr, + REPORT_INNER_ERROR("E19999", "Index:%u in data node of op:%s(%s) not exist, check invalid", + parent_index, + parent_node->GetName().c_str(), parent_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Cannot find out_anchor of %s.", parent_node->GetName().c_str()); return); NodePtr in_node = out_anchor->GetOwnerNode(); @@ -160,6 +198,9 @@ void MemcpyAddrAsyncPass::FindUserDataForNonDynamic(const ge::NodePtr &parent_no in_anchor_for_known_ = in_data_anchor; NodePtr pre_in_node = in_node->GetOwnerComputeGraph()->GetParentNode(); if (!AttrUtils::GetInt(in_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), + in_node->GetName().c_str(), in_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to refresh parent index of %s", in_node->GetName().c_str()); return; } @@ -176,6 +217,9 @@ void MemcpyAddrAsyncPass::FindUserDataForNonDynamic(const ge::NodePtr &parent_no } } else if (in_node->GetType() == IF || in_node->GetType() == WHILE || in_node->GetType() == CASE) { if (!AttrUtils::GetInt(parent_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s to op:%s(%s) failed", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), + parent_node->GetName().c_str(), parent_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to refresh parent index of %s", in_node->GetName().c_str()); return; } @@ -207,18 +251,32 @@ NodePtr 
MemcpyAddrAsyncPass::CreateMemcpyAddrAsyncNode(const ComputeGraphPtr &gr GELOGD("Start CreateMemcpyAddrAsyncNode."); static uint32_t new_node_index = 0; OpDescPtr pre_op_desc = out_data_anchor->GetOwnerNode()->GetOpDesc(); - GE_CHK_BOOL_EXEC(pre_op_desc != nullptr, return nullptr, "Op_desc of pre node is invalid."); - std::string node_name = pre_op_desc->GetName() + "_" + MEMCPYADDRASYNC + "_" + std::to_string(new_node_index++); + GE_CHK_BOOL_EXEC(pre_op_desc != nullptr, + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); + return nullptr, "Op_desc of pre node is invalid."); - OpDescPtr op_desc = MakeShared(node_name, MEMCPYADDRASYNC); - GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); + OpDescPtr op_desc = nullptr; + if (known_sub_graph_) { // insert memcpyasync node when known sub graph + string node_name = pre_op_desc->GetName() + "_" + MEMCPYASYNC + "_" + std::to_string(new_node_index++); + op_desc = MakeShared(node_name, MEMCPYASYNC); + } else { + string node_name = pre_op_desc->GetName() + "_" + MEMCPYADDRASYNC + "_" + std::to_string(new_node_index++); + op_desc = MakeShared(node_name, MEMCPYADDRASYNC); + } + GE_CHECK_NOTNULL_EXEC(op_desc, + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); + return nullptr); if (op_desc->AddInputDesc(pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + pre_op_desc->GetName().c_str(), pre_op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Add memcpy_addr_async input desc failed."); return nullptr; } if (op_desc->AddOutputDesc(pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + pre_op_desc->GetName().c_str(), pre_op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Add memcpy_addr_async output desc failed."); return nullptr; } @@ -239,11 +297,17 @@ NodePtr MemcpyAddrAsyncPass::CreateMemcpyAddrAsyncNode(const 
ComputeGraphPtr &gr (void)ge::AttrUtils::GetBool(out_of_user_data->GetOpDesc(), ATTR_NAME_NODE_CONNECT_INPUT, labeled_input); if (labeled_input) { if (!ge::AttrUtils::SetBool(out_of_user_data->GetOpDesc(), ATTR_NAME_NODE_CONNECT_INPUT, false)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_NODE_CONNECT_INPUT.c_str(), + out_of_user_data->GetName().c_str(), out_of_user_data->GetType().c_str()); GELOGE(FAILED, "Failed to unset attr %s for node %s.", ATTR_NAME_NODE_CONNECT_INPUT.c_str(), out_of_user_data->GetName().c_str()); return nullptr; } if (!ge::AttrUtils::SetBool(op_desc, ATTR_NAME_NODE_CONNECT_INPUT, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_NODE_CONNECT_INPUT.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Failed to set attr %s for node %s.", ATTR_NAME_NODE_CONNECT_INPUT.c_str(), op_desc->GetName().c_str()); return nullptr; @@ -251,7 +315,11 @@ NodePtr MemcpyAddrAsyncPass::CreateMemcpyAddrAsyncNode(const ComputeGraphPtr &gr } NodePtr memcpy_addr_async_node = graph->AddNode(op_desc); - GE_CHECK_NOTNULL_EXEC(memcpy_addr_async_node, return nullptr); + GE_CHECK_NOTNULL_EXEC(memcpy_addr_async_node, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + graph->GetName().c_str()); + return nullptr); return memcpy_addr_async_node; } @@ -260,16 +328,29 @@ Status MemcpyAddrAsyncPass::InsertMemcpyAddrAsyncNode(const OutDataAnchorPtr &ou const InDataAnchorPtr &in_anchor, const NodePtr &node) { // insert memcpy_addr of each user_data and out_of_user_data if (GraphUtils::RemoveEdge(out_anchor, in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), out_anchor->GetOwnerNode()->GetType().c_str(), + out_anchor->GetIdx(), + in_anchor->GetOwnerNode()->GetName().c_str(), 
in_anchor->GetOwnerNode()->GetType().c_str(), + in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Remove edge of %s and %s failed.", out_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str()); return INTERNAL_ERROR; } if (GraphUtils::AddEdge(out_anchor, node->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:0) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), out_anchor->GetOwnerNode()->GetType().c_str(), + out_anchor->GetIdx(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Add edge of %s and %s failed.", out_anchor->GetOwnerNode()->GetName().c_str(), node->GetName().c_str()); return INTERNAL_ERROR; } if (GraphUtils::AddEdge(node->GetOutDataAnchor(0), in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:0) and op:%s(%s)(index:%d) failed", + node->GetName().c_str(), node->GetType().c_str(), + in_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetOwnerNode()->GetType().c_str(), + in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Add edge of %s and %s failed.", node->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str()); return INTERNAL_ERROR; diff --git a/ge/graph/passes/memcpy_addr_async_pass.h b/ge/graph/passes/memcpy_addr_async_pass.h index 0f22d10b..a4190828 100755 --- a/ge/graph/passes/memcpy_addr_async_pass.h +++ b/ge/graph/passes/memcpy_addr_async_pass.h @@ -27,6 +27,7 @@ class MemcpyAddrAsyncPass : public GraphPass { private: Status AddMemcpyAddrAsyncNode(const ComputeGraphPtr &graph, const NodePtr &node); + Status AddMemcpyAsyncNode(const NodePtr &node); void FindUserData(const NodePtr &node, uint32_t &parent_index); void FindUserDataForKnown(const NodePtr &parent_node, uint32_t &parent_index); void FindUserDataForNonDynamic(const ge::NodePtr &parent_node, uint32_t &parent_index); @@ -48,6 +49,7 @@ class MemcpyAddrAsyncPass : public GraphPass { 
OutDataAnchorPtr peer_out_anchor_for_known_; InDataAnchorPtr in_anchor_for_known_; bool find_user_data_for_known_ = false; + bool known_sub_graph_ = false; }; } // namespace ge #endif // GE_GRAPH_PASSES_MEMCPY_ADDR_ASYNC_PASS_H_ diff --git a/ge/graph/passes/merge_input_memcpy_pass.cc b/ge/graph/passes/merge_input_memcpy_pass.cc index e8bf0377..99f8712b 100644 --- a/ge/graph/passes/merge_input_memcpy_pass.cc +++ b/ge/graph/passes/merge_input_memcpy_pass.cc @@ -23,7 +23,9 @@ namespace ge { Status MergeInputMemcpyPass::Run(ComputeGraphPtr graph) { GELOGD("MergeInputMemcpyPass Enter"); for (const auto &node : graph->GetDirectNode()) { - if ((node->GetType() != MERGE) && (node->GetType() != REFMERGE)) { + std::string type; + GE_CHK_STATUS_RET(GetOriginalType(node, type), "Get node type failed."); + if ((type != MERGE) && (type != REFMERGE)) { continue; } GE_CHECK_NOTNULL(node->GetOpDesc()); @@ -55,7 +57,8 @@ Status MergeInputMemcpyPass::AddMemcpyAsyncNodes(const ComputeGraphPtr &graph, c const std::string &memcpy_name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()); NodePtr memcpy_node = CreateMemcpyAsyncNode(graph, memcpy_name, peer_out_anchor, multi_batch_flag); GE_CHK_BOOL_EXEC(memcpy_node != nullptr, return FAILED, "Create MemcpyAsync node failed."); - GE_CHK_STATUS(GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor), "MemcpyAsync node remove edge failed."); + GE_CHK_STATUS(GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor), + "MemcpyAsync node remove edge failed."); GE_CHK_STATUS(GraphUtils::AddEdge(peer_out_anchor, memcpy_node->GetInDataAnchor(0)), "MemcpyAsync node add edge failed."); GE_CHK_STATUS(GraphUtils::AddEdge(memcpy_node->GetOutDataAnchor(0), in_data_anchor), @@ -88,11 +91,14 @@ NodePtr MergeInputMemcpyPass::CreateMemcpyAsyncNode(const ComputeGraphPtr &graph } GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input to 
op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return nullptr, "Create MemcpyAsync op: add input desc failed."); GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return nullptr, "Create MemcpyAsync op: add output desc failed."); return graph->AddNode(op_desc); } } // namespace ge - diff --git a/ge/graph/passes/merge_pass.cc b/ge/graph/passes/merge_pass.cc index 26d82820..3206efa9 100644 --- a/ge/graph/passes/merge_pass.cc +++ b/ge/graph/passes/merge_pass.cc @@ -35,6 +35,7 @@ const size_t kCaseOneInput = 1; Status MergePass::Run(NodePtr &node) { GELOGD("MergePass running"); if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [node] must not be null."); return PARAM_INVALID; } @@ -46,6 +47,8 @@ Status MergePass::Run(NodePtr &node) { } if (node->GetAllOutDataAnchors().empty()) { + REPORT_INNER_ERROR("E19999", "Param node:%s(%s) all data anchor size is 0, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(PARAM_INVALID, "[%s] Merge node output anchor is empty", node->GetName().c_str()); return PARAM_INVALID; } @@ -79,6 +82,8 @@ Status MergePass::Run(NodePtr &node) { auto in_node = in_data_nodes.at(0); if (IsMergeInputNeedOptimized(in_node)) { if (IsolateAndDeleteNode(in_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + in_node->GetName().c_str(), in_node->GetType().c_str()); GELOGE(FAILED, "Isolate and delete node %s failed.", in_node->GetName().c_str()); return FAILED; } @@ -115,6 +120,8 @@ Status MergePass::ChangeIndexToConstant(NodePtr &node, int &value_index) { GE_CHECK_NOTNULL(node); ComputeGraphPtr graph = node->GetOwnerComputeGraph(); if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Owner graph of 
node:%s(%s) is nullptr, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "[%s] The owner graph must not be null.", node->GetName().c_str()); return FAILED; } @@ -125,11 +132,17 @@ Status MergePass::ChangeIndexToConstant(NodePtr &node, int &value_index) { } NodePtr const_node = graph->AddNode(constant_op_desc); if (const_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + constant_op_desc->GetName().c_str(), constant_op_desc->GetType().c_str(), + graph->GetName().c_str()); return FAILED; } // Change peer in anchors from value_index to new Constant node if (GraphUtils::ReplaceNodeAnchors(const_node, node, {}, {1}) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Replace node:%s(%s) by node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + const_node->GetName().c_str(), const_node->GetType().c_str()); GELOGE(FAILED, "[%s] ReplaceNodeAnchors failed.", node->GetName().c_str()); return FAILED; } @@ -137,6 +150,9 @@ Status MergePass::ChangeIndexToConstant(NodePtr &node, int &value_index) { GE_CHECK_NOTNULL(out_control_anchor); // Add control anchor between Merge and Constant if (out_control_anchor->LinkTo(const_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s) link control to op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + const_node->GetName().c_str(), const_node->GetType().c_str()); return FAILED; } @@ -148,6 +164,7 @@ Status MergePass::CreateConstByValue(NodePtr &node, int value_index, OpDescPtr & // 1. create Constant OpDesc op_desc = MakeShared(constant_name, CONSTANT); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "[%s] Make shared of Constant op desc failed.", constant_name.c_str()); return FAILED; } @@ -155,6 +172,7 @@ Status MergePass::CreateConstByValue(NodePtr &node, int value_index, OpDescPtr & // 2. 
get OpDesc of output number one of Merge(value_index) OpDescPtr original_op_desc = node->GetOpDesc(); if (original_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(FAILED, "[%s] Op desc must not be null.", constant_name.c_str()); return FAILED; } @@ -165,15 +183,19 @@ Status MergePass::CreateConstByValue(NodePtr &node, int value_index, OpDescPtr & GeTensorPtr const_tensor_ptr = MakeShared(original_out_tensor_desc, reinterpret_cast(&value_index), sizeof(int)); if (const_tensor_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "[%s] Make shared of Constant tensor failed.", constant_name.c_str()); return FAILED; } GE_IF_BOOL_EXEC(!AttrUtils::SetTensor(op_desc, ATTR_NAME_WEIGHTS, const_tensor_ptr), - GELOGE(FAILED, "get ATTR_NAME_WEIGHTS failed"); return FAILED); + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_NAME_WEIGHTS.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(FAILED, "get ATTR_NAME_WEIGHTS failed"); return FAILED); // 4. 
set Constant output desc - GE_CHK_STATUS_RET(op_desc->AddOutputDesc(original_out_tensor_desc), "add out put desc failed"); + GE_CHK_GRAPH_STATUS_RET(op_desc->AddOutputDesc(original_out_tensor_desc), "add out put desc failed"); return SUCCESS; } diff --git a/ge/graph/passes/merge_to_stream_merge_pass.cc b/ge/graph/passes/merge_to_stream_merge_pass.cc index c1a57a61..8866831b 100644 --- a/ge/graph/passes/merge_to_stream_merge_pass.cc +++ b/ge/graph/passes/merge_to_stream_merge_pass.cc @@ -25,7 +25,9 @@ Status MergeToStreamMergePass::Run(ComputeGraphPtr graph) { bypass_nodes_.clear(); for (const auto &node : graph->GetDirectNode()) { - if ((node->GetType() != MERGE) && (node->GetType() != REFMERGE)) { + std::string type; + GE_CHK_STATUS_RET(GetOriginalType(node, type), "Get node type failed."); + if ((type != MERGE) && (type != REFMERGE)) { continue; } @@ -33,14 +35,24 @@ Status MergeToStreamMergePass::Run(ComputeGraphPtr graph) { GE_CHECK_NOTNULL(merge_op_desc); if (merge_op_desc->HasAttr(ATTR_INSERT_BY_MBATCH)) { GE_CHK_STATUS_RET(AddActiveNodes(graph, node), "Merge add active node failed."); - GE_CHK_STATUS_RET(SetStreamLabel(node, node->GetName()), "Set stream label failed"); + auto status = SetStreamLabel(node, node->GetName()); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + node->GetName().c_str(), node->GetName().c_str(), node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } } else { GE_CHK_STATUS_RET(ReplaceMergeNode(graph, node), "Add StreamMerge node failed."); } } for (const auto &node : bypass_nodes_) { - GE_CHK_BOOL_EXEC(GraphUtils::RemoveNodeWithoutRelink(graph, node) == GRAPH_SUCCESS, return FAILED, + GE_CHK_BOOL_EXEC(GraphUtils::RemoveNodeWithoutRelink(graph, node) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), + node->GetType().c_str(), graph->GetName().c_str()); + return 
FAILED, "Remove merge node failed."); } @@ -62,28 +74,40 @@ Status MergeToStreamMergePass::ReplaceMergeNode(const ComputeGraphPtr &graph, co GELOGI("Create StreamMerge Op, name=%s.", node_name.c_str()); OpDescPtr op_desc = MakeShared(node_name, STREAMMERGE); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "Create op_desc failed, StreamMerge:%s.", node_name.c_str()); return FAILED; } for (const InDataAnchorPtr &in_anchor : merge_node->GetAllInDataAnchors()) { GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(merge_op_desc->GetInputDesc(in_anchor->GetIdx())) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return FAILED, "Create StreamMerge op: add input desc failed."); } for (const OutDataAnchorPtr &out_anchor : merge_node->GetAllOutDataAnchors()) { GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(merge_op_desc->GetOutputDesc(out_anchor->GetIdx())) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add ouput desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return FAILED, "Create StreamMerge op: add output desc failed."); } NodePtr stream_merge = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(stream_merge != nullptr, return FAILED, "Insert StreamMerge node failed."); + GE_CHK_BOOL_EXEC(stream_merge != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + graph->GetName().c_str()); + return FAILED, "Insert StreamMerge node failed."); GE_CHK_STATUS_RET(MoveEdges(merge_node, stream_merge), "Move edges failed."); bypass_nodes_.insert(merge_node); if (merge_op_desc->HasAttr(ATTR_NAME_NEXT_ITERATION)) { std::string next_iteration_name; GE_IF_BOOL_EXEC(!AttrUtils::GetStr(merge_op_desc, ATTR_NAME_NEXT_ITERATION, next_iteration_name), + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", + ATTR_NAME_NEXT_ITERATION.c_str(), + 
merge_op_desc->GetName().c_str(), merge_op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get ATTR_NAME_NEXT_ITERATION failed"); return INTERNAL_ERROR); GE_CHK_STATUS_RET(SetNextIteration(stream_merge, next_iteration_name), "Set next iteration failed"); @@ -99,7 +123,9 @@ Status MergeToStreamMergePass::ReplaceMergeNode(const ComputeGraphPtr &graph, co /// @return Status /// Status MergeToStreamMergePass::AddActiveNodes(const ComputeGraphPtr &graph, const NodePtr &node) { - GE_CHK_BOOL_EXEC(node != nullptr, return FAILED, "Param of pre node is null."); + GE_CHK_BOOL_EXEC(node != nullptr, + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); + return FAILED, "Param of pre node is null."); for (const InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); @@ -132,13 +158,20 @@ NodePtr MergeToStreamMergePass::CreateActiveNode(const ComputeGraphPtr &graph, c GELOGI("Create StreamActive op:%s.", node_name.c_str()); OpDescPtr op_desc = MakeShared(node_name, STREAMACTIVE); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "Create op_desc failed, StreamActive:%s.", node_name.c_str()); return nullptr; } NodePtr active_node = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(active_node != nullptr, return nullptr, "Create StreamActive node failed."); + GE_CHK_BOOL_EXEC(active_node != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); + return nullptr, "Create StreamActive node failed."); GE_IF_BOOL_EXEC(GraphUtils::AddEdge(node->GetOutControlAnchor(), active_node->GetInControlAnchor()) != SUCCESS, + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + active_node->GetName().c_str(), 
active_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add edge failed"); return nullptr); GE_IF_BOOL_EXEC(SetSwitchBranchNodeLabel(active_node, node_name) != SUCCESS, @@ -159,14 +192,16 @@ Status MergeToStreamMergePass::MoveEdges(const NodePtr &old_node, const NodePtr OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); - GE_CHK_STATUS(GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor), "Merge remove in data edge failed."); + GE_CHK_STATUS(GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor), + "Merge remove in data edge failed."); GE_CHK_STATUS(GraphUtils::AddEdge(peer_out_anchor, new_node->GetInDataAnchor(in_data_anchor->GetIdx())), "StreamMerge add in data edge failed."); } for (const OutDataAnchorPtr &out_data_anchor : old_node->GetAllOutDataAnchors()) { for (const InDataAnchorPtr &peer_in_anchor : out_data_anchor->GetPeerInDataAnchors()) { - GE_CHK_STATUS(GraphUtils::RemoveEdge(out_data_anchor, peer_in_anchor), "Merge remove out data edge failed."); + GE_CHK_STATUS(GraphUtils::RemoveEdge(out_data_anchor, peer_in_anchor), + "Merge remove out data edge failed."); GE_CHK_STATUS(GraphUtils::AddEdge(new_node->GetOutDataAnchor(out_data_anchor->GetIdx()), peer_in_anchor), "StreamMerge add out data edge failed."); } diff --git a/ge/graph/passes/multi_batch_clone_pass.cc b/ge/graph/passes/multi_batch_clone_pass.cc index a33e1f40..9e1fe80a 100755 --- a/ge/graph/passes/multi_batch_clone_pass.cc +++ b/ge/graph/passes/multi_batch_clone_pass.cc @@ -52,7 +52,9 @@ inline bool IsGetNextType(const NodePtr &node) { } Status MultiBatchClonePass::Run(ComputeGraphPtr graph) { - GE_IF_BOOL_EXEC(graph == nullptr, GELOGE(FAILED, "Original graph is nullptr"); return FAILED); + GE_IF_BOOL_EXEC(graph == nullptr, + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); + GELOGE(FAILED, "Original graph is nullptr"); return FAILED); if (graph->GetParentGraph() != nullptr) { GELOGD("Subgraph 
%s skip the MultiBatchClonePass", graph->GetName().c_str()); return SUCCESS; @@ -99,7 +101,9 @@ Status MultiBatchClonePass::Run(ComputeGraphPtr graph) { (void)AttrUtils::GetStr(graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id_); ComputeGraphPtr branch = MakeShared(graph->GetName()); - GE_IF_BOOL_EXEC(branch == nullptr, GELOGE(OUT_OF_MEMORY, "Create multi batch graph failed"); return OUT_OF_MEMORY); + GE_IF_BOOL_EXEC(branch == nullptr, + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); + GELOGE(OUT_OF_MEMORY, "Create multi batch graph failed"); return OUT_OF_MEMORY); (void)AttrUtils::SetStr(branch, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id_); graph->InValid(); // Will modify, need topological again. @@ -140,6 +144,8 @@ Status MultiBatchClonePass::CollectIoNodes(const ComputeGraphPtr &graph) { } if (all_data_nodes_.empty() || all_output_nodes_.size() != 1) { + REPORT_INNER_ERROR("E19999", "Data node num is 0 or output node num != 1, graph:%s, check invalid", + graph->GetName().c_str()); GELOGE(FAILED, "data nodes: %zu, output nodes: %zu", all_data_nodes_.size(), all_output_nodes_.size()); return FAILED; } @@ -280,6 +286,7 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) { op_builder.AddInput("branch_index").AddDynamicInput("input", input_num).AddDynamicOutput("output", output_num); const OpDescPtr op_desc = op_builder.Build(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Build op:%s(%s) failed", kMultiBatchCaseNode.c_str(), CASE); GELOGE(OUT_OF_MEMORY, "Create multi-batch case desc failed"); return OUT_OF_MEMORY; } @@ -287,12 +294,16 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) { op_desc->RegisterSubgraphIrName("branches", kDynamic); case_node_ = graph->AddNode(op_desc); if (case_node_ == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Create 
multi-batch case node failed"); return OUT_OF_MEMORY; } uint32_t batch_num = static_cast(batch_shapes_.size()); if (!AttrUtils::SetInt(op_desc, ATTR_NAME_BATCH_NUM, batch_num)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_BATCH_NUM.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Set attr ATTR_NAME_BATCH_NUM failed, Case: %s.", op_desc->GetName().c_str()); return FAILED; } @@ -300,6 +311,8 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) { for (uint32_t i = 0; i < batch_num; i++) { const std::string &attr_name = ATTR_NAME_PRED_VALUE + "_" + std::to_string(i); if (!AttrUtils::SetListInt(op_desc, attr_name, batch_shapes_[i])) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", attr_name.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Set attr ATTR_NAME_PRED_VALUE failed, Case: %s.", op_desc->GetName().c_str()); return FAILED; } @@ -310,11 +323,15 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) { data_name_order.push_back(item.first); } if (!AttrUtils::SetListStr(op_desc, ATTR_USER_DESIGNEATE_SHAPE_ORDER, data_name_order)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_USER_DESIGNEATE_SHAPE_ORDER.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Failed to add user designate shape order attr on case node %s", op_desc->GetName().c_str()); return FAILED; } if (!AttrUtils::SetBool(op_desc, ATTR_INSERT_BY_MBATCH, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_INSERT_BY_MBATCH.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add insert attr on case node %s", op_desc->GetName().c_str()); return INTERNAL_ERROR; } @@ -338,16 +355,21 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) { Status MultiBatchClonePass::CreateIndexDataNode(const 
ComputeGraphPtr &graph, NodePtr &shape_node) { const OpDescPtr data_desc = MakeShared(kMultiBatchDataNode, DATA); if (data_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Create multi-batch data node failed"); return FAILED; } GeTensorDesc data_tensor(GeShape({static_cast(batch_shapes_[0].size())}), FORMAT_ND, DT_INT32); if (data_desc->AddInputDesc(data_tensor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + data_desc->GetName().c_str(), data_desc->GetType().c_str()); GELOGE(FAILED, "Add input desc failed"); return FAILED; } if (data_desc->AddOutputDesc(data_tensor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ouput desc to op:%s(%s) failed", + data_desc->GetName().c_str(), data_desc->GetType().c_str()); GELOGE(FAILED, "Add output desc failed"); return FAILED; } @@ -359,6 +381,8 @@ Status MultiBatchClonePass::CreateIndexDataNode(const ComputeGraphPtr &graph, No shape_node = graph->AddNode(data_desc); if (shape_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + data_desc->GetName().c_str(), data_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Create multi-batch data node failed"); return OUT_OF_MEMORY; } @@ -376,6 +400,7 @@ Status MultiBatchClonePass::CreateIndexDataNode(const ComputeGraphPtr &graph, No Status MultiBatchClonePass::CreateIndexConstNode(const ComputeGraphPtr &graph, NodePtr &node) { const OpDescPtr const_desc = MakeShared(kMultiBatchConstNode, CONSTANT); if (const_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Create multi-batch const node failed"); return FAILED; } @@ -395,17 +420,23 @@ Status MultiBatchClonePass::CreateIndexConstNode(const ComputeGraphPtr &graph, N GeTensor tensor(const_tensor); (void)tensor.SetData(reinterpret_cast(addr.get()), count * sizeof(int32_t)); if (!AttrUtils::SetTensor(const_desc, ATTR_NAME_WEIGHTS, tensor)) { + 
REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + const_desc->GetName().c_str(), const_desc->GetType().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to init tensor value for const %s", const_desc->GetName().c_str()); return FAILED; } if (const_desc->AddOutputDesc(const_tensor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ouput desc to op:%s(%s) failed", + const_desc->GetName().c_str(), const_desc->GetType().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to add output desc for const node %s", const_desc->GetName().c_str()); return FAILED; } node = graph->AddNode(const_desc); if (node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + const_desc->GetName().c_str(), const_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Create multi-batch const node failed"); return OUT_OF_MEMORY; } @@ -438,11 +469,14 @@ Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) { const OpDescPtr op_desc = op_builder.Build(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Build op:%s(%s) failed", kMultiBatchMapIndexNode.c_str(), "MapIndex"); GELOGE(OUT_OF_MEMORY, "Create multi-batch index desc failed"); return FAILED; } NodePtr index_node = graph->AddNode(op_desc); if (index_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Create multi-batch index node failed"); return OUT_OF_MEMORY; } @@ -450,16 +484,25 @@ Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) { GE_CHK_STATUS_RET(AddAttrForGetDynamicDims(shape_node_), "Failed to add attr for %s.", shape_node_->GetName().c_str()); if (GraphUtils::AddEdge(shape_node_->GetOutDataAnchor(0), index_node->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:0) failed", + 
shape_node_->GetName().c_str(), shape_node_->GetType().c_str(), + index_node->GetName().c_str(), index_node->GetType().c_str()); GELOGE(FAILED, "Failed to add edge between node:%s to MapIndex:%s", shape_node_->GetName().c_str(), index_node->GetName().c_str()); return FAILED; } if (GraphUtils::AddEdge(const_node->GetOutDataAnchor(0), index_node->GetInDataAnchor(1)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:1) failed", + const_node->GetName().c_str(), const_node->GetType().c_str(), + index_node->GetName().c_str(), index_node->GetType().c_str()); GELOGE(FAILED, "Failed to add edge between node:%s to MapIndex:%s", const_node->GetName().c_str(), index_node->GetName().c_str()); return FAILED; } if (GraphUtils::AddEdge(index_node->GetOutDataAnchor(0), case_node_->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:0) failed", + index_node->GetName().c_str(), index_node->GetType().c_str(), + case_node_->GetName().c_str(), case_node_->GetType().c_str()); GELOGE(FAILED, "Failed to add edge between MapIndex:%s to Case:%s", index_node->GetName().c_str(), case_node_->GetName().c_str()); return FAILED; @@ -471,6 +514,7 @@ Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) { Status MultiBatchClonePass::CreateGetDynamicDimsNode(const ComputeGraphPtr &graph, NodePtr &shape_node) { const OpDescPtr data_desc = MakeShared(kMultiBatchGetDynamicDimsNode, GETDYNAMICDIMS); if (data_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Create multi-batch get dynamic dims node failed"); return OUT_OF_MEMORY; } @@ -484,24 +528,35 @@ Status MultiBatchClonePass::CreateGetDynamicDimsNode(const ComputeGraphPtr &grap tensor_desc.SetFormat(FORMAT_ND); tensor_desc.SetDataType(DT_INT32); auto ret = data_desc->AddInputDesc(tensor_desc); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, 
"Failed to add input desc for created data"); - return FAILED); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + data_desc->GetName().c_str(), data_desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data"); + return FAILED); continue; } GeTensorDesc tensor_desc(GeShape({static_cast(input_shape_dims)}), FORMAT_ND, DT_INT32); auto ret = data_desc->AddInputDesc(tensor_desc); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data"); - return FAILED); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + data_desc->GetName().c_str(), data_desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data"); + return FAILED); } GeTensorDesc tensor_desc(GeShape({static_cast(batch_shapes_.at(0).size())}), FORMAT_ND, DT_INT32); auto ret = data_desc->AddOutputDesc(tensor_desc); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add output desc for created data"); - return FAILED); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + data_desc->GetName().c_str(), data_desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to add output desc for created data"); + return FAILED); (void)AttrUtils::SetBool(data_desc, ATTR_INSERT_BY_MBATCH, true); shape_node = graph->AddNode(data_desc); if (shape_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + data_desc->GetName().c_str(), data_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Create multi-batch dynamic dims node failed"); return OUT_OF_MEMORY; } @@ -515,6 +570,8 @@ Status MultiBatchClonePass::AddAttrForGetDynamicDims(const NodePtr &shape_node) } GELOGD("Add attr for :%s, type is %s:", shape_node->GetName().c_str(), shape_node->GetType().c_str()); if 
(!AttrUtils::SetInt(shape_node->GetOpDesc(), ATTR_GETNEXT_SINK_DATA_COUNT, data_count_from_getnext_)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_GETNEXT_SINK_DATA_COUNT.c_str(), + shape_node->GetName().c_str(), shape_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_GETNEXT_SINK_DATA_COUNT failed"); return INTERNAL_ERROR; } @@ -531,6 +588,8 @@ Status MultiBatchClonePass::AddAttrForGetDynamicDims(const NodePtr &shape_node) } } if (!AttrUtils::SetListInt(shape_node->GetOpDesc(), ATTR_GETNEXT_SINK_SHAPE_INFO, shape_info)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_GETNEXT_SINK_SHAPE_INFO.c_str(), + shape_node->GetName().c_str(), shape_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_GETNEXT_SINK_SHAPE_INFO failed"); return INTERNAL_ERROR; } @@ -547,9 +606,13 @@ Status MultiBatchClonePass::LinkGetNextToGetDynamicDims(const NodePtr &getnext_n shape_node->GetName().c_str(), input_index); auto out_data_anchor = getnext_node->GetOutDataAnchor(out_index); auto ret = GraphUtils::AddEdge(out_data_anchor, shape_node->GetInDataAnchor(input_index)); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link getnext %s to getdynamicdims %s", - getnext_node->GetName().c_str(), shape_node->GetName().c_str()); - return INTERNAL_ERROR); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%zu) and op:%s(%s)(index:%zu) failed", + getnext_node->GetName().c_str(), getnext_node->GetType().c_str(), out_index, + shape_node->GetName().c_str(), shape_node->GetType().c_str(), input_index); + GELOGE(INTERNAL_ERROR, "Failed to link getnext %s to getdynamicdims %s", + getnext_node->GetName().c_str(), shape_node->GetName().c_str()); + return INTERNAL_ERROR); } return SUCCESS; } @@ -557,6 +620,8 @@ Status MultiBatchClonePass::LinkGetNextToGetDynamicDims(const NodePtr &getnext_n Status MultiBatchClonePass::LinkGetDynamicDimsToNetOutput(const NodePtr 
&output_node) { if (!GetLocalOmgContext().dynamic_node_type.empty()) { if (!AttrUtils::SetStr(output_node->GetOpDesc(), ATTR_ALL_GEARS_INFO, GetLocalOmgContext().dynamic_dims)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ALL_GEARS_INFO.c_str(), + output_node->GetName().c_str(), output_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to set all gears info attr on netoutput %s.", output_node->GetName().c_str()); return INTERNAL_ERROR; } @@ -565,15 +630,23 @@ Status MultiBatchClonePass::LinkGetDynamicDimsToNetOutput(const NodePtr &output_ GELOGD("Start link %s to %s.", shape_node_->GetName().c_str(), output_node->GetName().c_str()); size_t input_index = output_node->GetAllInDataAnchors().size(); if (NodeUtils::AppendInputAnchor(output_node, input_index + 1) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Append input anchor to op:%s(%s) failed, size:%zu", + output_node->GetName().c_str(), output_node->GetType().c_str(), input_index + 1); GELOGE(INTERNAL_ERROR, "Append input anchor of %s of %zu failed.", output_node->GetName().c_str(), input_index); return INTERNAL_ERROR; } auto ret = GraphUtils::AddEdge(shape_node_->GetOutDataAnchor(kDataOutIndex), output_node->GetInDataAnchor(input_index)); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link netoutput %s to getdynamicdims %s", - output_node->GetName().c_str(), shape_node_->GetName().c_str()); - return INTERNAL_ERROR); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%zu) failed", + shape_node_->GetName().c_str(), shape_node_->GetType().c_str(), kDataOutIndex, + output_node->GetName().c_str(), output_node->GetType().c_str(), input_index); + GELOGE(INTERNAL_ERROR, "Failed to link netoutput %s to getdynamicdims %s", + output_node->GetName().c_str(), shape_node_->GetName().c_str()); + return INTERNAL_ERROR); if (!AttrUtils::SetBool(output_node->GetOpDesc(), 
ATTR_GETNEXT_SINK_DYNMAIC, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_GETNEXT_SINK_DYNMAIC.c_str(), + output_node->GetName().c_str(), output_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to set getnext sink dynamic attr on netoutput %s.", output_node->GetName().c_str()); return INTERNAL_ERROR; @@ -598,17 +671,25 @@ Status MultiBatchClonePass::CreateInputNode(const ComputeGraphPtr &graph) { const auto &node = all_data_nodes_[i]; const OpDescPtr op_desc = AttrUtils::CopyOpDesc(node->GetOpDesc()); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Copy op_desc from op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(OUT_OF_MEMORY, "Create multi-batch Data node failed, name: %s", node->GetName().c_str()); return FAILED; } if (GraphUtils::CopyTensorAttrs(op_desc, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy tensor attr from op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } op_desc->SetName(node->GetName()); const NodePtr &data = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(data != nullptr, return FAILED, "Add node[%s] to graph failed", op_desc->GetName().c_str()); + GE_CHK_BOOL_EXEC(data != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + graph->GetName().c_str()); + return FAILED, "Add node[%s] to graph failed", op_desc->GetName().c_str()); if (IsGetNextType(node)) { getnext_node = data; input_index_of_getnext = case_input_index; @@ -617,6 +698,9 @@ Status MultiBatchClonePass::CreateInputNode(const ComputeGraphPtr &graph) { } else { if (GraphUtils::AddEdge(data->GetOutDataAnchor(0), case_node_->GetInDataAnchor(case_input_index)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:%zu) failed", + data->GetName().c_str(), data->GetType().c_str(), + case_node_->GetName().c_str(), 
case_node_->GetType().c_str(), case_input_index); GELOGE(FAILED, "Failed to add edge between Data:%s to Case:%s", data->GetName().c_str(), case_node_->GetName().c_str()); return FAILED; @@ -651,6 +735,9 @@ Status MultiBatchClonePass::LinkEdgeForGetNext(const NodePtr &getnext_node, size for (size_t out_index = 0; out_index < data_count_from_getnext_; ++out_index, ++case_input_index) { if (GraphUtils::AddEdge(getnext_node->GetOutDataAnchor(out_index), case_node_->GetInDataAnchor(case_input_index)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%zu) and op:%s(%s)(index:%zu) failed", + getnext_node->GetName().c_str(), getnext_node->GetType().c_str(), out_index, + case_node_->GetName().c_str(), case_node_->GetType().c_str(), case_input_index); GELOGE(FAILED, "Failed to add data edge between %zu Data:%s to %zu Case:%s", out_index, getnext_node->GetName().c_str(), case_input_index, case_node_->GetName().c_str()); return FAILED; @@ -681,18 +768,29 @@ Status MultiBatchClonePass::CreateConstNode(const ComputeGraphPtr &graph) { const auto &node = all_const_nodes_[i]; const OpDescPtr op_desc = AttrUtils::CopyOpDesc(node->GetOpDesc()); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Copy op_desc from op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(OUT_OF_MEMORY, "Create multi-batch Const node failed, name: %s", node->GetName().c_str()); return FAILED; } op_desc->SetName(node->GetName()); if (GraphUtils::CopyTensorAttrs(op_desc, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy tensor attr from op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } const NodePtr &data = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(data != nullptr, return FAILED, "Add node[%s] to graph failed", op_desc->GetName().c_str()); + GE_CHK_BOOL_EXEC(data != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), 
op_desc->GetType().c_str(), + graph->GetName().c_str()); + return FAILED, "Add node[%s] to graph failed", op_desc->GetName().c_str()); if (GraphUtils::AddEdge(data->GetOutDataAnchor(0), case_node_->GetInDataAnchor(arg_index + i)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:%zu) failed", + data->GetName().c_str(), data->GetType().c_str(), + case_node_->GetName().c_str(), case_node_->GetType().c_str(), arg_index + i); GELOGE(FAILED, "Failed to add edge between Const:%s to Case:%s", data->GetName().c_str(), case_node_->GetName().c_str()); return FAILED; @@ -741,22 +839,33 @@ Status MultiBatchClonePass::CreateOutputNode(const ComputeGraphPtr &graph) { const auto &output = all_output_nodes_[0]; const OpDescPtr op_desc = AttrUtils::CopyOpDesc(output->GetOpDesc()); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Copy op_desc from op:%s(%s) failed", + output->GetName().c_str(), output->GetType().c_str()); GELOGE(OUT_OF_MEMORY, "Create multi-batch output node failed"); return FAILED; } if (GraphUtils::CopyTensorAttrs(op_desc, output) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy tensor attr from op:%s(%s) failed", + output->GetName().c_str(), output->GetType().c_str()); return FAILED; } op_desc->SetName(output->GetName()); const NodePtr &node = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(node != nullptr, return FAILED, "Add node[%s] to graph failed", op_desc->GetName().c_str()); + GE_CHK_BOOL_EXEC(node != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + graph->GetName().c_str()); + return FAILED, "Add node[%s] to graph failed", op_desc->GetName().c_str()); for (size_t i = 0; i < case_node_->GetAllOutDataAnchorsSize(); ++i) { const auto it = direct_output_.find(i); if (it == direct_output_.end()) { if (GraphUtils::AddEdge(case_node_->GetOutDataAnchor(i), node->GetInDataAnchor(i)) != GRAPH_SUCCESS) { + 
REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%zu) and op:%s(%s)(index:%zu) failed", + case_node_->GetName().c_str(), case_node_->GetType().c_str(), i, + node->GetName().c_str(), node->GetType().c_str(), i); GELOGE(FAILED, "Failed to add edge between Case:%s to NetOutput:%s", case_node_->GetName().c_str(), node->GetName().c_str()); return FAILED; @@ -764,10 +873,14 @@ Status MultiBatchClonePass::CreateOutputNode(const ComputeGraphPtr &graph) { } else { const auto data_node = graph->FindNode(it->second); if (data_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Find node:%s from graph:%s failed", it->second.c_str(), graph->GetName().c_str()); GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "Data node:%s not found", it->second.c_str()); return GE_GRAPH_GRAPH_NODE_NULL; } if (GraphUtils::AddEdge(data_node->GetOutDataAnchor(kDataOutIndex), node->GetInDataAnchor(i)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%zu) failed", + data_node->GetName().c_str(), data_node->GetType().c_str(), kDataOutIndex, + node->GetName().c_str(), node->GetType().c_str(), i); GELOGE(FAILED, "Failed to add edge between Data:%s to NetOutput:%s", data_node->GetName().c_str(), node->GetName().c_str()); return FAILED; @@ -857,6 +970,8 @@ Status MultiBatchClonePass::SetMaxShapeToData(const NodePtr &node, size_t out_an int64_t size = 1; for (auto dim : data_to_dynamic_info_.at(data_name).at(i)) { if (INT64_MAX / dim < size) { + REPORT_INNER_ERROR("E19999", "The shape %s size will overflow after multi", + formats::ShapeToString(data_to_dynamic_info_.at(data_name).at(i)).c_str()); GELOGE(PARAM_INVALID, "The shape %s size overflow", formats::ShapeToString(data_to_dynamic_info_.at(data_name).at(i)).c_str()); return PARAM_INVALID; @@ -890,11 +1005,15 @@ Status MultiBatchClonePass::SetShapeToData(const std::vector &shapes, c } if (NodeUtils::UpdateOutputShape(*data, out_anchor_index, data_shape) != GRAPH_SUCCESS) { + 
REPORT_CALL_ERROR("E19999", "Update ouput desc shape to op:%s(%s) failed, index:%zu", + data->GetName().c_str(), data->GetType().c_str(), out_anchor_index); GELOGE(INTERNAL_ERROR, "Failed to update output shape for data %s", data->GetName().c_str()); return INTERNAL_ERROR; } if (!IsGetNextType(data)) { if (NodeUtils::UpdateInputShape(*data, kDataInIndex, data_shape) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input desc shape to op:%s(%s) failed, index:%u", + data->GetName().c_str(), data->GetType().c_str(), kDataInIndex); GELOGE(INTERNAL_ERROR, "Failed to update input shape for data %s", data->GetName().c_str()); return INTERNAL_ERROR; } @@ -920,6 +1039,8 @@ Status MultiBatchClonePass::UpdateShapeOfShapeNode(const NodePtr &node, size_t o GeShape output_shape(output_dims); output_desc.SetShape(output_shape); if (node->GetOpDesc()->UpdateOutputDesc(shape_index, output_desc) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update ouput desc to op:%s(%s) failed, index:%zu", + node->GetName().c_str(), node->GetType().c_str(), shape_index); GELOGE(FAILED, "Update output desc fail."); return FAILED; } @@ -936,12 +1057,16 @@ Status MultiBatchClonePass::UpdateShapeOfShapeNode(const NodePtr &node, size_t o Status MultiBatchClonePass::UpdateSubgraphData(const NodePtr &data, size_t batch_index) { int node_index = -1; if (!AttrUtils::GetInt(data->GetOpDesc(), ATTR_NAME_INDEX, node_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_INDEX.c_str(), + data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Failed to get index from data[%s]", data->GetName().c_str()); return FAILED; } int parent_index = node_index + 1; if (!AttrUtils::SetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Failed to set parent index for node %s", 
data->GetName().c_str()); return FAILED; } @@ -958,6 +1083,8 @@ Status MultiBatchClonePass::UpdateSubgraphData(const NodePtr &data, size_t batch auto data_name = data->GetName(); size_t pos = data_name.find(kMultiBatchNodePostfix); if (pos == string::npos) { + REPORT_INNER_ERROR("E19999", "Cannot find key string [%s] of multi-batch in name of virtual input node:%s(%s)", + kMultiBatchNodePostfix.c_str(), data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Cannot find key string [%s] of multi-batch in name of virtual input node, node name: %s.", kMultiBatchNodePostfix.c_str(), data_name.c_str()); return FAILED; @@ -980,18 +1107,26 @@ Status MultiBatchClonePass::CreateOriGraph(const ComputeGraphPtr &graph) { auto out_data_anchor = node->GetOutDataAnchor(out_index); GE_IF_BOOL_EXEC(out_data_anchor == nullptr, continue); NodePtr data_node = CreateDataNode(graph, out_data_anchor, data_index); - GE_IF_BOOL_EXEC(data_node == nullptr, GELOGE(INTERNAL_ERROR, "Create %d data node failed.", - out_data_anchor->GetIdx()); return INTERNAL_ERROR); + GE_IF_BOOL_EXEC(data_node == nullptr, + REPORT_CALL_ERROR("E19999", "Create data node in graph:%s failed", graph->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "Create %d data node failed.", out_data_anchor->GetIdx()); + return INTERNAL_ERROR); for (auto &in_anchor : out_data_anchor->GetPeerInDataAnchors()) { GE_IF_BOOL_EXEC(in_anchor == nullptr, continue); NodePtr dst_node = in_anchor->GetOwnerNode(); if (GraphUtils::RemoveEdge(out_data_anchor, in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%zu) and op:%s(%s)(index:%d) failed", + node->GetName().c_str(), node->GetType().c_str(), out_index, + dst_node->GetName().c_str(), dst_node->GetType().c_str(), in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to remove edge between %s to %s", node->GetName().c_str(), dst_node->GetName().c_str()); return INTERNAL_ERROR; } if (GraphUtils::AddEdge(data_node->GetOutDataAnchor(0), 
dst_node->GetInDataAnchor(in_anchor->GetIdx())) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:%d) failed", + data_node->GetName().c_str(), data_node->GetType().c_str(), + dst_node->GetName().c_str(), dst_node->GetType().c_str(), in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to add edge between %s to %s", data_node->GetName().c_str(), dst_node->GetName().c_str()); return INTERNAL_ERROR; @@ -999,6 +1134,8 @@ Status MultiBatchClonePass::CreateOriGraph(const ComputeGraphPtr &graph) { } } if (graph->RemoveNode(node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) from graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(GRAPH_FAILED, "Remove node %s failed!", node->GetName().c_str()); return GRAPH_FAILED; } @@ -1014,6 +1151,7 @@ NodePtr MultiBatchClonePass::CreateDataNode(const ComputeGraphPtr &graph, const std::string node_name = out_data_anchor->GetOwnerNode()->GetName() + "_" + std::to_string(out_anchor_index); OpDescPtr op_desc = MakeShared(node_name, DATA); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Create data node failed."); return nullptr; } @@ -1021,14 +1159,19 @@ NodePtr MultiBatchClonePass::CreateDataNode(const ComputeGraphPtr &graph, const OpDescPtr getnext_op_desc = out_data_anchor->GetOwnerNode()->GetOpDesc(); if (getnext_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Param out_data_anchor's owner node is nullptr, check invalid"); GELOGE(OUT_OF_MEMORY, "Op desc of %s is nullptr.", out_data_anchor->GetOwnerNode()->GetName().c_str()); return nullptr; } if (op_desc->AddInputDesc(getnext_op_desc->GetOutputDesc(out_anchor_index)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Add %s input desc failed.", op_desc->GetName().c_str()); 
return nullptr; } if (op_desc->AddOutputDesc(getnext_op_desc->GetOutputDesc(out_anchor_index)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + getnext_op_desc->GetName().c_str(), getnext_op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Add %s output desc failed.", op_desc->GetName().c_str()); return nullptr; } @@ -1052,7 +1195,9 @@ Status MultiBatchClonePass::CreateSubgraphs(const ComputeGraphPtr &graph, const std::vector output_nodes; const std::string postfix = kMultiBatchNodePostfix + std::to_string(i); ComputeGraphPtr subgraph = (i == 0) ? branch : GraphUtils::CloneGraph(branch, postfix, input_nodes, output_nodes); - GE_IF_BOOL_EXEC(subgraph == nullptr, GELOGE(FAILED, "Create multi-batch case node failed"); return FAILED); + GE_IF_BOOL_EXEC(subgraph == nullptr, + REPORT_CALL_ERROR("E19999", "Clone graph from graph:%s failed", branch->GetName().c_str()); + GELOGE(FAILED, "Create multi-batch case node failed"); return FAILED); subgraph->SetName("Batch_" + std::to_string(i)); subgraph->SetParentNode(case_node_); subgraph->SetParentGraph(graph); @@ -1095,6 +1240,9 @@ Status MultiBatchClonePass::UpdateSubgraphOutput() { GeTensorDescPtr tensor = op_desc->MutableInputDesc(index); GE_CHECK_NOTNULL(tensor); if (!AttrUtils::SetInt(tensor, ATTR_NAME_PARENT_NODE_INDEX, index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to input:%zu tensor of op:%s(%s) failed", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), index, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Failed to set parent index for node %s", output_node->GetName().c_str()); return FAILED; } @@ -1138,9 +1286,9 @@ Status MultiBatchClonePass::PruneDirectOutput(const ComputeGraphPtr &graph) { return SUCCESS; } - GE_CHK_STATUS_RET(NodeUtils::RemoveOutputAnchor(case_node_, output_num - unused_num), "Remove output failed"); + GE_CHK_GRAPH_STATUS_RET(NodeUtils::RemoveOutputAnchor(case_node_, output_num - unused_num), "Remove output failed"); for (const 
auto &item : all_branch_output_) { - GE_CHK_STATUS_RET(NodeUtils::RemoveInputAnchor(item.second, output_num - unused_num), "Remove input failed"); + GE_CHK_GRAPH_STATUS_RET(NodeUtils::RemoveInputAnchor(item.second, output_num - unused_num), "Remove input failed"); } return SUCCESS; diff --git a/ge/graph/passes/multi_batch_pass.cc b/ge/graph/passes/multi_batch_pass.cc index 74f7e30e..eafe982c 100644 --- a/ge/graph/passes/multi_batch_pass.cc +++ b/ge/graph/passes/multi_batch_pass.cc @@ -21,6 +21,7 @@ #include "common/ge/ge_util.h" #include "graph/common/omg_util.h" #include "graph/utils/type_utils.h" +#include "common/formats/utils/formats_trans_utils.h" namespace ge { Status MultiBatchPass::Run(ComputeGraphPtr graph) { @@ -72,6 +73,8 @@ Status MultiBatchPass::Run(ComputeGraphPtr graph) { for (const NodePtr &node : bypass_nodes_) { if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "Remove SwitchN nodes %s failed.", node->GetName().c_str()); return FAILED; } @@ -139,11 +142,15 @@ Status MultiBatchPass::FindPredValue(const ComputeGraphPtr &graph, OutDataAnchor const auto &in_data_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT); if (in_data_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%u data anchor of node:%s(%s) is nullptr, check invalid", + SWITCH_PRED_INPUT, node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "FindPredInput failed, in_data_anchor is null, node:%s.", node->GetName().c_str()); return FAILED; } const auto &pred_input = in_data_anchor->GetPeerOutAnchor(); if (pred_input == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%u data anchor of node:%s(%s), its peer anchor is nullptr, check invalid", + SWITCH_PRED_INPUT, node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "FindPredInput failed, pred_input is null, 
node:%s.", node->GetName().c_str()); return FAILED; } @@ -151,6 +158,8 @@ Status MultiBatchPass::FindPredValue(const ComputeGraphPtr &graph, OutDataAnchor if (pred_value == nullptr) { pred_value = pred_input; } else if (pred_value != pred_input) { + REPORT_INNER_ERROR("E19999", "Multi pred_value of case node exist in graph:%s, check invalid", + graph->GetName().c_str()); GELOGE(FAILED, "Multi pred_value node exist."); return FAILED; } @@ -163,6 +172,7 @@ Status MultiBatchPass::FindPredValue(const ComputeGraphPtr &graph, OutDataAnchor } if (pred_value == nullptr) { + REPORT_INNER_ERROR("E19999", "Find Pred Input of case node in graph:%s failed", graph->GetName().c_str()); GELOGE(FAILED, "FindPredInput failed, pred_value is null."); return FAILED; } @@ -179,14 +189,22 @@ Status MultiBatchPass::GetDynamicType() { for (const auto &switch_n : switch_n_nodes_) { int32_t dynamic_type = static_cast(FIXED); if (!AttrUtils::GetInt(switch_n->GetOpDesc(), ATTR_DYNAMIC_TYPE, dynamic_type)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_DYNAMIC_TYPE.c_str(), + switch_n->GetName().c_str(), switch_n->GetType().c_str()); GELOGE(FAILED, "Get attr ATTR_DYNAMIC_TYPE of node: %s failed.", switch_n->GetName().c_str()); return FAILED; } if (dynamic_type == static_cast(FIXED)) { + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value:%d check invalid", ATTR_DYNAMIC_TYPE.c_str(), + switch_n->GetName().c_str(), switch_n->GetType().c_str(), dynamic_type); GELOGE(FAILED, "Attr ATTR_DYNAMIC_TYPE shouldn't be 0."); return FAILED; } if (dynamic_type_ != static_cast(FIXED) && dynamic_type_ != dynamic_type) { + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value:%d not same as attr value:%d in node before, " + "check invalid", + ATTR_DYNAMIC_TYPE.c_str(), switch_n->GetName().c_str(), switch_n->GetType().c_str(), + dynamic_type, dynamic_type_); GELOGE(FAILED, "Attr ATTR_DYNAMIC_TYPE of all switch_n node should be same, while one is %d and another is %d.", 
dynamic_type, dynamic_type_); return FAILED; @@ -194,6 +212,7 @@ Status MultiBatchPass::GetDynamicType() { dynamic_type_ = dynamic_type; } if (dynamic_type_ == static_cast(FIXED)) { + REPORT_INNER_ERROR("E19999", "Find Attr:%s in all switcnn node failed", ATTR_DYNAMIC_TYPE.c_str()); GELOGE(FAILED, "Attr ATTR_DYNAMIC_TYPE shouldn't be 0."); return FAILED; } @@ -211,6 +230,8 @@ Status MultiBatchPass::GetUserDesignateShape() { for (const auto &switch_n : switch_n_nodes_) { std::vector cur_data_name_order; if (!AttrUtils::GetListStr(switch_n->GetOpDesc(), ATTR_USER_DESIGNEATE_SHAPE_ORDER, cur_data_name_order)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_USER_DESIGNEATE_SHAPE_ORDER.c_str(), + switch_n->GetName().c_str(), switch_n->GetType().c_str()); GELOGE(FAILED, "Get attr ATTR_USER_DESIGNEATE_SHAPE_ORDER of node: %s failed.", switch_n->GetName().c_str()); return FAILED; } @@ -219,6 +240,11 @@ Status MultiBatchPass::GetUserDesignateShape() { first_check = false; } else { if (data_name_order_ != cur_data_name_order) { + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value:%s not same as attr value:%s in node before, " + "check invalid", ATTR_USER_DESIGNEATE_SHAPE_ORDER.c_str(), + switch_n->GetName().c_str(), switch_n->GetType().c_str(), + formats::JoinToString(cur_data_name_order).c_str(), + formats::JoinToString(data_name_order_).c_str()); GELOGE(FAILED, "The ATTR_USER_DESIGNEATE_SHAPE_ORDER of switchN must be same: %s failed.", switch_n->GetName().c_str()); return FAILED; @@ -226,6 +252,7 @@ Status MultiBatchPass::GetUserDesignateShape() { } } if (data_name_order_.empty()) { + REPORT_INNER_ERROR("E19999", "Find Attr:%s in all switcnn node failed", ATTR_USER_DESIGNEATE_SHAPE_ORDER.c_str()); GELOGE(FAILED, "user shape order can not be empty"); return FAILED; } @@ -248,6 +275,8 @@ bool MultiBatchPass::CheckSwitchN(std::vector> &batch_shape if (batch_num == 0) { batch_num = tmp_num; } else if (batch_num != tmp_num) { + 
REPORT_INNER_ERROR("E19999", "Ouput size num:%u of node:%s(%s) not same as output size num:%d of node before, " + "check invalid", tmp_num, node->GetName().c_str(), node->GetType().c_str(), batch_num); GELOGE(FAILED, "Output size of SwitchN not equal;"); return false; } @@ -259,10 +288,12 @@ bool MultiBatchPass::CheckSwitchN(std::vector> &batch_shape } if (batch_shape.empty()) { + REPORT_INNER_ERROR("E19999", "batch_shape size is empty after GetBatchInfo, check invalid"); GELOGE(FAILED, "batch_shape is empty."); return false; } if (combined_batch.empty()) { + REPORT_INNER_ERROR("E19999", "combined_batch size is empty after GetBatchInfo, check invalid"); GELOGE(FAILED, "combined_batch is empty."); return false; } @@ -271,11 +302,15 @@ bool MultiBatchPass::CheckSwitchN(std::vector> &batch_shape for (uint32_t i = 1; i < batch_num; i++) { size_t tmp_dim_num = batch_shape[i].size(); if (dim_num != tmp_dim_num) { + REPORT_INNER_ERROR("E19999", "Dim num of batch_shape not equal, batch_0:%zu, batch_%u:%zu, check invalid", + dim_num, i, tmp_dim_num); GELOGE(FAILED, "Dim num of batch_shape not equal, batch_0:%zu, batch_%u:%zu.", dim_num, i, tmp_dim_num); return false; } size_t tmp_combined_dim_num = combined_batch[i].size(); if (combined_dim_num != tmp_combined_dim_num) { + REPORT_INNER_ERROR("E19999", "Dim num of combined_batch not equal, batch_0:%zu, batch_%u:%zu, check invalid", + combined_dim_num, i, tmp_combined_dim_num); GELOGE(FAILED, "Dim num of combined_batch not equal, batch_0:%zu, batch_%u:%zu.", combined_dim_num, i, tmp_combined_dim_num); return false; @@ -303,23 +338,32 @@ bool MultiBatchPass::GetBatchInfo(uint32_t batch_num, std::vectorGetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(FAILED, "CheckDims failed, get op_desc failed, node: %s.", node->GetName().c_str()); return false; } std::vector output_dims; if (!AttrUtils::GetListInt(op_desc->GetOutputDesc(i), ATTR_NAME_SWITCHN_PRED_VALUE, 
output_dims)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from output:%u tensor of op:%s(%s) failed", + ATTR_NAME_SWITCHN_PRED_VALUE.c_str(), i, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "CheckDims failed, get attr ATTR_NAME_SWITCHN_PRED_VALUE failed, batch_index=%u.", i); return false; } idx_batch_shape.emplace_back(output_dims); output_dims.clear(); if (!AttrUtils::GetListInt(op_desc->GetOutputDesc(i), ATTR_NAME_COMBINED_DYNAMIC_DIMS, output_dims)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from output:%u tensor of op:%s(%s) failed", + ATTR_NAME_COMBINED_DYNAMIC_DIMS.c_str(), i, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "CheckDims failed, get attr ATTR_NAME_COMBINED_DYNAMIC_DIMS failed, batch_index=%u.", i); return false; } idx_combined_batch.emplace_back(output_dims); } if (!CheckDims(idx_batch_shape)) { + REPORT_INNER_ERROR("E19999", "Attr:%s of all output:%u tensor in switcnn node not equal, or not exist, " + "check invalid", ATTR_NAME_SWITCHN_PRED_VALUE.c_str(), i); GELOGE(FAILED, "CheckDims failed, batch_index=%u.", i); return false; } @@ -351,6 +395,9 @@ Status MultiBatchPass::FindSwitchOutNodes(uint32_t batch_num) { } bypass_nodes_.emplace_back(out_node); if (GraphUtils::RemoveEdge(out_data_anchor, peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + node->GetName().c_str(), node->GetType().c_str(), i, + out_node->GetName().c_str(), out_node->GetType().c_str(), peer_in_anchor->GetIdx()); GELOGE(FAILED, "Remove SwitchN out_data_edge failed, %s->%s.", node->GetName().c_str(), out_node->GetName().c_str()); return FAILED; @@ -359,6 +406,9 @@ Status MultiBatchPass::FindSwitchOutNodes(uint32_t batch_num) { output_nodes.emplace_back(identity_out_node); if (GraphUtils::RemoveEdge(out_node->GetOutControlAnchor(), identity_out_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove 
control edge between op:%s(%s) and op:%s(%s) failed", + out_node->GetName().c_str(), out_node->GetType().c_str(), + identity_out_node->GetName().c_str(), identity_out_node->GetType().c_str()); GELOGE(FAILED, "Remove SwitchN out_data_edge failed, %s->%s.", node->GetName().c_str(), out_node->GetName().c_str()); return FAILED; @@ -401,6 +451,9 @@ Status MultiBatchPass::ReplaceSwitchN(const ComputeGraphPtr &graph, const OutDat // Add switchCase input edge if (GraphUtils::AddEdge(pred_value, switch_case->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:0) failed", + pred_value_node->GetName().c_str(), pred_value_node->GetType().c_str(), pred_value->GetIdx(), + switch_case->GetName().c_str(), switch_case->GetType().c_str()); GELOGE(FAILED, "Add SwitchCase in_data_edge failed, %s->%s.", pred_value_node->GetName().c_str(), switch_case->GetName().c_str()); return FAILED; @@ -448,6 +501,7 @@ NodePtr MultiBatchPass::CreateSwitchCaseNode(const ComputeGraphPtr &graph, const const std::vector> &combined_batch) { OpDescPtr op_desc = MakeShared(name, STREAMSWITCHN); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } @@ -455,41 +509,56 @@ NodePtr MultiBatchPass::CreateSwitchCaseNode(const ComputeGraphPtr &graph, const GELOGI("Create StreamSwitchN op:%s.", name.c_str()); OpDescPtr pred_desc = pred_value->GetOwnerNode()->GetOpDesc(); if (pred_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(FAILED, "Get pred_desc failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } if (op_desc->AddInputDesc(pred_desc->GetOutputDesc(pred_value->GetIdx())) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "AddInputDesc failed, 
StreamSwitchN:%s.", name.c_str()); return nullptr; } NodePtr switch_case_node = graph->AddNode(op_desc); if (switch_case_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "Create node failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } uint32_t batch_num = static_cast(batch_shape.size()); if (!AttrUtils::SetInt(op_desc, ATTR_NAME_BATCH_NUM, batch_num)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_BATCH_NUM.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "set attr ATTR_NAME_BATCH_NUM failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } if (!AttrUtils::SetInt(op_desc, ATTR_DYNAMIC_TYPE, dynamic_type_)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_DYNAMIC_TYPE.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Set attr ATTR_DYNAMIC_TYPE failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } if (!AttrUtils::SetListStr(op_desc, ATTR_USER_DESIGNEATE_SHAPE_ORDER, data_name_order_)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_USER_DESIGNEATE_SHAPE_ORDER.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Set attr ATTR_USER_DESIGNEATE_SHAPE_ORDER failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } for (uint32_t i = 0; i < batch_num; i++) { const std::string &attr_name = ATTR_NAME_PRED_VALUE + "_" + std::to_string(i); if (!AttrUtils::SetListInt(op_desc, attr_name, batch_shape[i])) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", attr_name.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "set attr ATTR_NAME_PRED_VALUE failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } const std::string &attr_combined_batch = ATTR_NAME_COMBINED_BATCH + "_" + std::to_string(i); if 
(!AttrUtils::SetListInt(op_desc, attr_combined_batch, combined_batch[i])) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", attr_combined_batch.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "set attr ATTR_NAME_COMBINED_BATCH failed, StreamSwitchN:%s.", name.c_str()); return nullptr; } @@ -507,11 +576,15 @@ NodePtr MultiBatchPass::CreateSwitchCaseNode(const ComputeGraphPtr &graph, const Status MultiBatchPass::BypassSwitchN(const NodePtr &switch_n_node, const NodePtr &switch_case) { InDataAnchorPtr in_data_anchor = switch_n_node->GetInDataAnchor(SWITCH_DATA_INPUT); if (in_data_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%u in data anchor of node:%s(%s) is nullptr, check invalid", + SWITCH_DATA_INPUT, switch_n_node->GetName().c_str(), switch_n_node->GetType().c_str()); GELOGE(FAILED, "Check in_data_anchor failed, SwitchN:%s.", switch_n_node->GetName().c_str()); return FAILED; } OutDataAnchorPtr peer_data_anchor = in_data_anchor->GetPeerOutAnchor(); if (peer_data_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%u in data anchor of node:%s(%s), its peer ahcnhor is nullptr, check invalid", + SWITCH_DATA_INPUT, switch_n_node->GetName().c_str(), switch_n_node->GetType().c_str()); GELOGE(FAILED, "Check peer_data_anchor failed, SwitchN:%s.", switch_n_node->GetName().c_str()); return FAILED; } @@ -519,11 +592,17 @@ Status MultiBatchPass::BypassSwitchN(const NodePtr &switch_n_node, const NodePtr // Remove SwitchN data input if (GraphUtils::RemoveEdge(peer_data_anchor, in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) failed", + data_input->GetName().c_str(), data_input->GetType().c_str(), peer_data_anchor->GetIdx(), + switch_n_node->GetName().c_str(), switch_n_node->GetType().c_str(), SWITCH_DATA_INPUT); GELOGE(FAILED, "Remove SwitchN in_data_edge failed, %s->%s.", data_input->GetName().c_str(), 
switch_n_node->GetName().c_str()); return FAILED; } if (GraphUtils::AddEdge(data_input->GetOutControlAnchor(), switch_case->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + data_input->GetName().c_str(), data_input->GetType().c_str(), + switch_case->GetName().c_str(), switch_case->GetType().c_str()); GELOGE(FAILED, "Add StreamSwitchN in_control_edge failed, %s->%s.", data_input->GetName().c_str(), switch_case->GetName().c_str()); return FAILED; @@ -535,11 +614,20 @@ Status MultiBatchPass::BypassSwitchN(const NodePtr &switch_n_node, const NodePtr NodePtr data_output = peer_in_anchor->GetOwnerNode(); if ((GraphUtils::RemoveEdge(out_data_anchor, peer_in_anchor) != GRAPH_SUCCESS) || (GraphUtils::AddEdge(peer_data_anchor, peer_in_anchor) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) or " + "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + switch_n_node->GetName().c_str(), switch_n_node->GetType().c_str(), out_data_anchor->GetIdx(), + data_output->GetName().c_str(), data_output->GetType().c_str(), peer_in_anchor->GetIdx(), + data_input->GetName().c_str(), data_input->GetType().c_str(), peer_data_anchor->GetIdx(), + data_output->GetName().c_str(), data_output->GetType().c_str(), peer_in_anchor->GetIdx()); GELOGE(FAILED, "Bypass SwitchN data_edge failed, %s->%s->%s.", data_input->GetName().c_str(), switch_n_node->GetName().c_str(), data_output->GetName().c_str()); return FAILED; } if (GraphUtils::AddEdge(switch_case->GetOutControlAnchor(), data_output->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + switch_case->GetName().c_str(), switch_case->GetType().c_str(), + data_output->GetName().c_str(), data_output->GetType().c_str()); GELOGE(FAILED, "Add SwitchCase out_control_edge failed, %s->%s.", 
switch_case->GetName().c_str(), data_output->GetName().c_str()); return FAILED; @@ -602,10 +690,15 @@ Status MultiBatchPass::AttachBatchLabel(uint32_t batch_idx) { if (cur_desc->HasAttr(ATTR_NAME_BATCH_LABEL)) { std::string tmp_label; if (!AttrUtils::GetStr(cur_desc, ATTR_NAME_BATCH_LABEL, tmp_label)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_BATCH_LABEL.c_str(), + cur_desc->GetName().c_str(), cur_desc->GetType().c_str()); GELOGE(FAILED, "get attr ATTR_NAME_BATCH_LABEL failed, node: %s.", cur_desc->GetName().c_str()); return FAILED; } if (tmp_label != batch_label) { + REPORT_INNER_ERROR("E19999", "Attr:%s from op:%s(%s) value:%s not equal to expect:%s, check invalid", + ATTR_NAME_BATCH_LABEL.c_str(), cur_desc->GetName().c_str(), cur_desc->GetType().c_str(), + tmp_label.c_str(), batch_label.c_str()); GELOGE(FAILED, "Reach other batch_branch, node:%s, cur_label:%s, batch_label:%s.", cur_desc->GetName().c_str(), tmp_label.c_str(), batch_label.c_str()); return FAILED; @@ -613,6 +706,8 @@ Status MultiBatchPass::AttachBatchLabel(uint32_t batch_idx) { } GELOGD("Attach batch_label %s to node %s.", batch_label.c_str(), cur_desc->GetName().c_str()); if (!AttrUtils::SetStr(cur_desc, ATTR_NAME_BATCH_LABEL, batch_label)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_BATCH_LABEL.c_str(), + cur_desc->GetName().c_str(), cur_desc->GetType().c_str()); GELOGE(FAILED, "set attr ATTR_NAME_BATCH_LABEL failed, node:%s.", cur_desc->GetName().c_str()); return FAILED; } @@ -625,6 +720,8 @@ Status MultiBatchPass::AttachBatchLabel(uint32_t batch_idx) { continue; } if (type == NETOUTPUT) { + REPORT_CALL_ERROR("E19999", "SReach net_output without Merge, cur_node:%s(%s), check invalid", + cur_node->GetName().c_str(), cur_node->GetType().c_str()); GELOGE(FAILED, "Reach net_output without Merge, cur_node:%s.", cur_node->GetName().c_str()); return FAILED; } @@ -661,6 +758,8 @@ Status MultiBatchPass::AttachStreamLabel(uint32_t 
batch_idx, const std::string & GELOGD("Attach stream_label %s to node %s.", stream_label.c_str(), cur_desc->GetName().c_str()); if (SetStreamLabel(cur_node, stream_label) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + stream_label.c_str(), cur_node->GetName().c_str(), cur_node->GetType().c_str()); GELOGE(FAILED, "Set stream_label failed, node:%s.", cur_node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/net_output_pass.cc b/ge/graph/passes/net_output_pass.cc index c553607f..aca7058d 100644 --- a/ge/graph/passes/net_output_pass.cc +++ b/ge/graph/passes/net_output_pass.cc @@ -40,6 +40,7 @@ static std::map output_type_str_to_datatype = { // the size of user defined output datatype or format string after split by ":". const size_t kUserDefinedElementCount = 2; +const size_t kNodesCount = 2; Status NetOutputPass::GetRetvalOutputInfo(const ge::NodePtr &node, std::map &retval_node_index_map) { @@ -47,10 +48,14 @@ Status NetOutputPass::GetRetvalOutputInfo(const ge::NodePtr &node, GE_CHECK_NOTNULL(node->GetOpDesc()); int64_t output_index = 0; if (!AttrUtils::GetInt(node->GetOpDesc(), RETVAL_ATTR_NAME_INDEX, output_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", RETVAL_ATTR_NAME_INDEX.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(PARAM_INVALID, "Get output index failed."); return PARAM_INVALID; } if (retval_node_index_map.count(output_index) > 0) { + REPORT_INNER_ERROR("E19999", "Attr:%s from op:%s(%s), value:%ld duplicate with other node, check invalid", + RETVAL_ATTR_NAME_INDEX.c_str(), node->GetName().c_str(), node->GetType().c_str(), output_index); GELOGE(PARAM_INVALID, "Retval has duplicate index."); return PARAM_INVALID; } @@ -109,7 +114,15 @@ Status NetOutputPass::GetOutputNode(const ge::ComputeGraphPtr &graph, std::vecto if (op_desc->HasAttr(ATTR_ATC_USER_DEFINE_OUTPUT_NODES)) { is_user_define_ouput_nodes = true; } - 
output_nodes_info.push_back({ele.first, ele.second, -1}); + int parent_index = -1; + auto output_desc = op_desc->MutableOutputDesc(ele.second); + if (output_desc == nullptr) { + GELOGE(FAILED, "[Get][OutputDesc]Can not find output tensor desc from node:%s, index %d", + op_desc->GetName().c_str(), ele.second); + return FAILED; + } + (void)ge::AttrUtils::GetInt(output_desc, ge::ATTR_NAME_PARENT_NODE_INDEX, parent_index); + output_nodes_info.push_back({ele.first, ele.second, parent_index}); } GELOGI("Output node set by user or leaf node, size:%zu.", output_nodes_info.size()); for (auto &ele : out_nodes_tmp) { @@ -129,10 +142,13 @@ Status NetOutputPass::CheckOutputNodeInfo(const ComputeGraphPtr &graph, const st for (auto &item : outputs) { NodePtr node = item.output_node; if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param outputs has item which output_node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Node in outputs is null."); return PARAM_INVALID; } else { if (graph->FindNode(node->GetName()) == nullptr) { + REPORT_INNER_ERROR("E19999", "Find node:%s from graph:%s failed", + node->GetName().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Out node (%s) is not in graph.", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -140,6 +156,8 @@ Status NetOutputPass::CheckOutputNodeInfo(const ComputeGraphPtr &graph, const st int32_t out_size = node->GetOpDesc()->GetOutputsSize(); int32_t index = item.node_output_index; if (index < 0 || index >= out_size) { + REPORT_INNER_ERROR("E19999", "Index:%d in param outputs item, < 0 or > output size:%d of node:%s(%s)", + index, out_size, node->GetName().c_str(), node->GetType().c_str()); GELOGE(PARAM_INVALID, "User declared out node (%s) output index:%d must be smaller " "than node ouput size:%d and cann't be negative!", @@ -169,6 +187,8 @@ Status NetOutputPass::RemoveUnusedNode(const ge::ComputeGraphPtr &graph) { continue; } if (graph->RemoveNode(node) != GRAPH_SUCCESS) { + REPORT_INNER_ERROR("E19999", 
"Remove node:%s(%s) from graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Remove node failed, node name:%s.", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -179,10 +199,13 @@ Status NetOutputPass::RemoveUnusedNode(const ge::ComputeGraphPtr &graph) { Status NetOutputPass::UpdateNetOutputDesc(const ge::NodePtr &net_output) { OpDescPtr net_output_desc = net_output->GetOpDesc(); if (net_output_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in Param net_output is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Opdesc of net output node is nullptr."); return INTERNAL_ERROR; } if (net_output_desc->GetInputsSize() == 0) { + REPORT_INNER_ERROR("E19999", "Input desc num of node:%s(%s) is 0, check invalid", + net_output_desc->GetName().c_str(), net_output_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Net output node input is empty."); return INTERNAL_ERROR; } @@ -192,6 +215,9 @@ Status NetOutputPass::UpdateNetOutputDesc(const ge::NodePtr &net_output) { GE_CHECK_NOTNULL(in_anchor); uint32_t index = static_cast(in_anchor->GetIdx()); if (index >= net_output_desc->GetAllInputsDesc().size()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has in_anchor index:%u >= its input desc num:%zu, check invalid", + net_output_desc->GetName().c_str(), net_output_desc->GetType().c_str(), index, + net_output_desc->GetAllInputsDesc().size()); GELOGE(INTERNAL_ERROR, "Index is invalid, index:%u, size:%zu.", index, net_output_desc->GetAllInputsDesc().size()); return INTERNAL_ERROR; @@ -203,6 +229,8 @@ Status NetOutputPass::UpdateNetOutputDesc(const ge::NodePtr &net_output) { uint32_t peer_index = static_cast(in_anchor->GetPeerOutAnchor()->GetIdx()); ge::GeTensorDesc output_in_desc = src_op_desc->GetOutputDesc(peer_index); if (net_output_desc->UpdateInputDesc(index, output_in_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input desc of op:%s(%s) failed, index:%u", + 
net_output_desc->GetName().c_str(), net_output_desc->GetType().c_str(), index); GELOGE(INTERNAL_ERROR, "Update input desc failed, index:%u.", index); return INTERNAL_ERROR; } @@ -216,6 +244,7 @@ Status NetOutputPass::UpdateNetOutputDesc(const ge::NodePtr &net_output) { Status NetOutputPass::AddCtrlEdgeForTargets(const ge::NodePtr &net_out_node) { if (net_out_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param net_out_node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "net out node is null."); return PARAM_INVALID; } @@ -227,6 +256,9 @@ Status NetOutputPass::AddCtrlEdgeForTargets(const ge::NodePtr &net_out_node) { // no need to check null because have handled it in run SaveAndRemoveTargets function graphStatus status = GraphUtils::AddEdge(node->GetOutControlAnchor(), net_out_node->GetInControlAnchor()); if (status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + net_out_node->GetName().c_str(), net_out_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Add ctrl edge to netoutput node[%s] for target node [%s] failed!", net_out_node->GetName().c_str(), node->GetName().c_str()); return INTERNAL_ERROR; @@ -258,6 +290,9 @@ Status NetOutputPass::AddEdgesForNetOutput(const ge::ComputeGraphPtr &graph, con graphStatus status = GraphUtils::AddEdge(src_node->GetOutDataAnchor(item.node_output_index), net_out_node->GetInDataAnchor(net_input_index)); if (status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%u) and op:%s(%s)(index:%d) failed", + src_node->GetName().c_str(), src_node->GetType().c_str(), item.node_output_index, + net_out_node->GetName().c_str(), net_out_node->GetType().c_str(), net_input_index); GELOGE(INTERNAL_ERROR, "AddEdge failed, src name:%s, src index:%d, dst index:%d.", src_node->GetName().c_str(), item.node_output_index, net_input_index); return INTERNAL_ERROR; @@ -269,10 +304,15 @@ Status 
NetOutputPass::AddEdgesForNetOutput(const ge::ComputeGraphPtr &graph, con graph->GetName().c_str()); auto input_desc = net_out_node->GetOpDesc()->MutableInputDesc(net_input_index); if (input_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Node:%s(%s) has no input desc index is %d, check invalid", + net_out_node->GetName().c_str(), net_out_node->GetType().c_str(), net_input_index); GELOGE(INTERNAL_ERROR, "Can not find intput tensor desc from NetOutput, index %d", net_input_index); return INTERNAL_ERROR; } if (!AttrUtils::SetInt(input_desc, ATTR_NAME_PARENT_NODE_INDEX, item.parent_node_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to input:%d tensor of op:%s(%s) failed", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), net_input_index, + net_out_node->GetName().c_str(), net_out_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add parent index to NetOutput, index %d", net_input_index); return INTERNAL_ERROR; } @@ -289,6 +329,8 @@ Status NetOutputPass::AddEdgesForNetOutput(const ge::ComputeGraphPtr &graph, con } // Add true stream, netoutput is 0 GE_IF_BOOL_EXEC(!ge::AttrUtils::SetInt(net_out_node->GetOpDesc(), ATTR_NAME_TRUE_BRANCH_STREAM, 0), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_TRUE_BRANCH_STREAM.c_str(), + net_out_node->GetName().c_str(), net_out_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_NAME_TRUE_BRANCH_STREAM failed"); return INTERNAL_ERROR); return SUCCESS; @@ -305,6 +347,7 @@ bool NetOutputPass::CheckNodeIsInOutputNodes(const ge::ComputeGraphPtr &graph, c } Status NetOutputPass::UnLinkDataAnchorOfNetoutput(const ge::ComputeGraphPtr &graph, const ge::NodePtr &net_out_node) { if (net_out_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param net_out_node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "net out node is null."); return PARAM_INVALID; } @@ -326,6 +369,10 @@ Status NetOutputPass::UnLinkDataAnchorOfNetoutput(const ge::ComputeGraphPtr &gra if (!CheckNodeIsInOutputNodes(graph, node)) 
{ ret = in_data_anchor->Unlink(peer_out_anchor); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d unlink from op:%s(%s) in index:%d failed", + net_out_node->GetName().c_str(), net_out_node->GetType().c_str(), in_data_anchor->GetIdx(), + node->GetName().c_str(), node->GetType().c_str(), peer_out_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Unlink peer_out_anchor fail!"); return ret; } @@ -340,12 +387,14 @@ Status NetOutputPass::UnLinkDataAnchorOfNetoutput(const ge::ComputeGraphPtr &gra Status NetOutputPass::UnLinkControlAnchorOfNetoutput(const ge::ComputeGraphPtr &graph, const ge::NodePtr &net_out_node) { if (net_out_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param net_out_node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "net out node is null."); return PARAM_INVALID; } Status ret = SUCCESS; auto in_control_anchor = net_out_node->GetInControlAnchor(); if (in_control_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param net_out_node's in control anchor is nullptr, check invalid"); GELOGE(PARAM_INVALID, "in control anchor is null."); return PARAM_INVALID; } @@ -360,6 +409,9 @@ Status NetOutputPass::UnLinkControlAnchorOfNetoutput(const ge::ComputeGraphPtr & if (CheckNodeIsInOutputNodes(graph, node) == false) { ret = in_control_anchor->Unlink(peer_out_data_anchor); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s) unlink control edge from op:%s(%s) failed", + net_out_node->GetName().c_str(), net_out_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Unlink peer_out_anchor fail!"); return ret; } @@ -424,14 +476,16 @@ Status NetOutputPass::AddCtrlEdgesBetweenLeafAndNetOutput(const ge::ComputeGraph GELOGI("No need to add ctrl edge to netoutput because user out nodes have been set."); return SUCCESS; } + bool graph_has_only_one_node_except_netoutput = (graph->GetDirectNodesSize() == kNodesCount); for (const auto &node : graph->GetDirectNode()) { if (node == nullptr 
|| node->GetOpDesc() == nullptr || node->GetOpDesc()->GetType() == NETOUTPUT) { continue; } - if ((node->GetInControlNodes().size() != 0 || node->GetInDataNodes().size() != 0) && + if ((node->GetInControlNodes().size() != 0 || node->GetInDataNodes().size() != 0 || + graph_has_only_one_node_except_netoutput) && node->GetOutDataNodesSize() == 0 && node->GetOutControlNodes().size() == 0) { - GE_CHK_STATUS_RET(GraphUtils::AddEdge(node->GetOutControlAnchor(), net_out_node->GetInControlAnchor()), - "add edge failed"); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(node->GetOutControlAnchor(), net_out_node->GetInControlAnchor()), + "add edge failed"); GELOGD("Add ctrl edge success. src name :%s, dst name :%s", node->GetName().c_str(), net_out_node->GetName().c_str()); } @@ -445,6 +499,7 @@ Status NetOutputPass::CreateNetOutputNode(OpDescPtr &net_output_desc, const ge:: (graph->GetParentGraph() != nullptr) ? (graph->GetName() + "_" + NODE_NAME_NET_OUTPUT) : NODE_NAME_NET_OUTPUT; net_output_desc = MakeShared(node_name, NETOUTPUT); if (net_output_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(MEMALLOC_FAILED, "Make shared net output op failed."); return MEMALLOC_FAILED; } @@ -455,10 +510,11 @@ Status NetOutputPass::CreateNetOutputNode(OpDescPtr &net_output_desc, const ge:: Status NetOutputPass::Run(ge::ComputeGraphPtr graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(GE_GRAPH_PARAM_NULLPTR, "Compute graph is null."); return GE_GRAPH_PARAM_NULLPTR; } - GELOGI("NetOutputPass Run.graph is [%s]", graph->GetName().c_str()); + GELOGI("[NETOUTPUT PASS] Run.graph is [%s]", graph->GetName().c_str()); NodePtr output_node = graph->FindFirstNodeMatchType(NETOUTPUT); // save user targets node SaveAndRemoveTargets(graph); @@ -493,10 +549,20 @@ Status NetOutputPass::AddNetOutputNodeToGraph(const ge::ComputeGraphPtr &graph, } GELOGI("[NETOUTPUT PASS] OutNodesInfo size:%zu, Targets Size:%zu, 
is_include_special_node_:%d", graph->GetGraphOutNodesInfo().size(), graph->GetGraphTargetNodesInfo().size(), is_include_special_node_); - // If user does not set out nodes and targets and no retval node, return false + // If user does not set out nodes and targets and no retval node, also add netoutput node if ((graph->GetGraphOutNodesInfo().empty()) && (graph->GetGraphTargetNodesInfo().empty()) && !is_include_special_node_) { - GELOGI("[NETOUTPUT PASS] output_nodes and target_nodes and special nodes is empty!It means no need netoutput!"); + GELOGI("[NETOUTPUT PASS] Both output, target and special nodes are empty! add net output node"); + output_node = graph->AddNode(net_output_desc); + GE_CHK_STATUS_RET(AddCtrlEdgesBetweenLeafAndNetOutput(graph, output_node), + "add ctrl edge between leaf and netoutput failed"); + if (!ge::AttrUtils::SetInt(output_node->GetOpDesc(), ATTR_NAME_TRUE_BRANCH_STREAM, 0)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_TRUE_BRANCH_STREAM.c_str(), + output_node->GetName().c_str(), output_node->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "set ATTR_NAME_TRUE_BRANCH_STREAM failed"); + return INTERNAL_ERROR; + } + GELOGI("[NETOUTPUT PASS] Add net output node succeed"); return SUCCESS; } GELOGI("[NETOUTPUT PASS] Output node size:%lu.", output_nodes_info.size()); @@ -504,12 +570,17 @@ Status NetOutputPass::AddNetOutputNodeToGraph(const ge::ComputeGraphPtr &graph, // because retval node is contained by output_nodes_info, here means targets is non-empty output_node = graph->AddNode(net_output_desc); if (output_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + net_output_desc->GetName().c_str(), net_output_desc->GetType().c_str(), + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Add output node failed."); return INTERNAL_ERROR; } GE_CHK_STATUS_RET(AddCtrlEdgeForTargets(output_node), "add ctrl edge for targets failed"); // Add true stream, netoutput is 0 
GE_IF_BOOL_EXEC(!ge::AttrUtils::SetInt(output_node->GetOpDesc(), ATTR_NAME_TRUE_BRANCH_STREAM, 0), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_TRUE_BRANCH_STREAM.c_str(), + output_node->GetName().c_str(), output_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_NAME_TRUE_BRANCH_STREAM failed"); return INTERNAL_ERROR); return SUCCESS; @@ -518,6 +589,9 @@ Status NetOutputPass::AddNetOutputNodeToGraph(const ge::ComputeGraphPtr &graph, AddInOutForNetOutputOp(graph, net_output_desc, output_nodes_info); output_node = graph->AddNode(net_output_desc); if (output_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + net_output_desc->GetName().c_str(), net_output_desc->GetType().c_str(), + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Add output node failed."); return INTERNAL_ERROR; } @@ -551,10 +625,14 @@ void NetOutputPass::AddInOutForNetOutputOp(const ComputeGraphPtr &graph, OpDescP /// Get the output attribute of src_node, /// and set to the input/output of net_out_node. 
if (src_node == nullptr || src_node->GetOpDesc() == nullptr || net_output_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Param output_nodes_info has RetvalInfo item, which src_node is invalid; " + "or Param net_output_desc is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "src node or net output desc is null."); return; } ge::GeTensorDesc out_desc = src_node->GetOpDesc()->GetOutputDesc(src_index); + out_desc.SetFormat(FORMAT_ND); + out_desc.SetOriginFormat(FORMAT_ND); GE_IF_BOOL_EXEC(net_output_desc->AddInputDesc(out_desc) != SUCCESS, GELOGW("add input desc failed"); return ); is_input_const.push_back(PassUtils::IsConstant(src_node)); ++iter; @@ -648,10 +726,14 @@ Status NetOutputPass::SetUserDefDTypeAndFormatFromAtcParams(const NodePtr &outpu } } if (!userdef_dtypes.empty() && !ge::AttrUtils::SetListStr(op_desc, ATTR_ATC_USER_DEFINE_DATATYPE, userdef_dtypes)) { + REPORT_INNER_ERROR("E19999", "User define datatype is empty or Set Attr:%s to op:%s(%s) failed", + ATTR_ATC_USER_DEFINE_DATATYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Set user_define_dtype attr list for netoutput failed."); return INTERNAL_ERROR; } if (!userdef_formats.empty() && !ge::AttrUtils::SetListStr(op_desc, ATTR_ATC_USER_DEFINE_FORMAT, userdef_formats)) { + REPORT_INNER_ERROR("E19999", "User define format is empty or Set Attr:%s to op:%s(%s) failed", + ATTR_ATC_USER_DEFINE_FORMAT.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Set user_define_format attr list for netoutput failed."); return INTERNAL_ERROR; } diff --git a/ge/graph/passes/next_iteration_pass.cc b/ge/graph/passes/next_iteration_pass.cc index cf46f09d..c52e6743 100644 --- a/ge/graph/passes/next_iteration_pass.cc +++ b/ge/graph/passes/next_iteration_pass.cc @@ -22,6 +22,10 @@ using std::string; namespace ge { +namespace { +const int64_t kLoopType = 1; +} + Status NextIterationPass::Run(ComputeGraphPtr graph) { GELOGD("NextIterationPass 
Enter"); /// Enter-----------+ @@ -67,6 +71,8 @@ Status NextIterationPass::GroupEnterNode(const NodePtr &enter_node) { GE_CHECK_NOTNULL(enter_desc); std::string frame_name; if (!ge::AttrUtils::GetStr(enter_desc, ENTER_ATTR_FRAME_NAME, frame_name) || frame_name.empty()) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ENTER_ATTR_FRAME_NAME.c_str(), + enter_desc->GetName().c_str(), enter_desc->GetType().c_str()); GELOGE(FAILED, "Get attr ENTER_ATTR_FRAME_NAME failed, node: %s", enter_desc->GetName().c_str()); return FAILED; } @@ -80,6 +86,7 @@ Status NextIterationPass::GroupEnterNode(const NodePtr &enter_node) { if (iter == loop_group_map_.end()) { LoopCondGroupPtr loop_group = MakeShared(); if (loop_group == nullptr) { + REPORT_CALL_ERROR("E19999", "New LoopCondGroup failed"); GELOGE(FAILED, "MakeShared for LoopCondGroup failed."); return FAILED; } @@ -101,7 +108,8 @@ Status NextIterationPass::FindWhileGroups() { const std::string &frame_name = loop_group_iter.first; for (const auto &enter_node : loop_group_iter.second->enter_nodes) { for (const auto &out_node : enter_node->GetOutAllNodes()) { - const string &type = out_node->GetType(); + std::string type; + GE_CHK_STATUS_RET(GetOriginalType(out_node, type), "Get node type failed."); if ((type != MERGE) && (type != REFMERGE)) { continue; } @@ -121,7 +129,12 @@ Status NextIterationPass::FindWhileGroups() { if (switch_node == nullptr) { continue; } - + if (!AttrUtils::SetInt(switch_node->GetOpDesc(), ATTR_NAME_STREAM_SWITCH_TYPE, kLoopType)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_STREAM_SWITCH_TYPE.c_str(), + switch_node->GetName().c_str(), switch_node->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "set int failed"); + return INTERNAL_ERROR; + } NodePtr loop_cond = nullptr; if (FindTargetNode(switch_node, LOOPCOND, true, loop_cond) != SUCCESS) { GELOGE(INTERNAL_ERROR, "Get LoopCond node failed, frame_name: %s.", frame_name.c_str()); @@ -130,6 +143,7 @@ Status 
NextIterationPass::FindWhileGroups() { if (loop_group_iter.second->loop_cond == nullptr) { loop_group_iter.second->loop_cond = loop_cond; } else if (loop_group_iter.second->loop_cond != loop_cond) { + REPORT_INNER_ERROR("E19999", "Multi LoopCond nodes exist, frame_name:%s, check invalid", frame_name.c_str()); GELOGE(FAILED, "Multi LoopCond nodes exist, frame_name: %s.", frame_name.c_str()); return FAILED; } @@ -149,16 +163,20 @@ bool NextIterationPass::VerifyWhileGroup() { for (const auto &loop_group_iter : loop_group_map_) { const std::string &frame_name = loop_group_iter.first; if (frame_name.empty()) { + REPORT_INNER_ERROR("E19999", "Verify while group failed, frame_name is empty"); GELOGE(INTERNAL_ERROR, "Verify while group failed, frame_name is empty."); return false; } if (loop_group_iter.second->loop_cond == nullptr) { + REPORT_INNER_ERROR("E19999", "Verify while group failed, LoopCond is null, frame_name:%s.", frame_name.c_str()); GELOGE(INTERNAL_ERROR, "Verify while group failed, LoopCond is null, frame_name: %s.", frame_name.c_str()); return false; } for (const auto &pair_iter : loop_group_iter.second->merge_next_pairs) { if ((pair_iter.first == nullptr) || (pair_iter.second == nullptr)) { + REPORT_INNER_ERROR("E19999", "Verify while group failed, merge_node/next_node is null, frame_name:%s.", + frame_name.c_str()); GELOGE(INTERNAL_ERROR, "Verify while group failed, merge_node/next_node is null, frame_name: %s.", frame_name.c_str()); return false; @@ -190,6 +208,9 @@ Status NextIterationPass::HandleWhileGroup(ComputeGraphPtr &graph) { for (const auto &enter_node : loop_cond_iter.second->enter_nodes) { // Enter --> Active if (GraphUtils::AddEdge(enter_node->GetOutControlAnchor(), enter_active->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + enter_node->GetName().c_str(), enter_node->GetType().c_str(), + enter_active->GetName().c_str(), enter_active->GetType().c_str()); 
GELOGE(INTERNAL_ERROR, "Add control edge from %s to %s failed.", enter_node->GetName().c_str(), enter_active->GetName().c_str()); return INTERNAL_ERROR; @@ -201,12 +222,18 @@ Status NextIterationPass::HandleWhileGroup(ComputeGraphPtr &graph) { NodePtr next_node = pair.second; // Active --> Merge if (GraphUtils::AddEdge(enter_active->GetOutControlAnchor(), merge_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + enter_active->GetName().c_str(), enter_active->GetType().c_str(), + merge_node->GetName().c_str(), merge_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Add control edge failed."); return INTERNAL_ERROR; } // NextIteration --> Active if (GraphUtils::AddEdge(next_node->GetOutControlAnchor(), next_active->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + next_node->GetName().c_str(), next_node->GetType().c_str(), + next_active->GetName().c_str(), next_active->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Add control edge failed."); return INTERNAL_ERROR; } @@ -237,17 +264,22 @@ Status NextIterationPass::HandleWhileGroup(ComputeGraphPtr &graph) { NodePtr NextIterationPass::CreateActiveNode(ComputeGraphPtr &graph, const std::string &name) { OpDescPtr op_desc = MakeShared(name, STREAMACTIVE); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); return nullptr; } GELOGI("Create StreamActive op:%s.", op_desc->GetName().c_str()); NodePtr active_node = graph->AddNode(op_desc); if (active_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Create node[%s] failed.", name.c_str()); return nullptr; } if (SetSwitchBranchNodeLabel(active_node, name) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set switch branch node label:%s to node:%s(%s) 
failed", + name.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Set attr SWITCH_BRANCH_NODE_LABEL for node: %s failed.", active_node->GetName().c_str()); return nullptr; } @@ -272,11 +304,17 @@ Status NextIterationPass::BreakNextIteration(const NodePtr &next_node, NodePtr & continue; } if (GraphUtils::RemoveEdge(out_anchor, in_anchor) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), out_anchor->GetOwnerNode()->GetType().c_str(), + out_anchor->GetIdx(), + merge_node->GetName().c_str(), merge_node->GetType().c_str(), in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Remove data edge failed, %s->%s.", next_node->GetName().c_str(), merge_node->GetName().c_str()); return INTERNAL_ERROR; } if (SetNextIteration(merge_node, next_node->GetName()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set attr NEXT_ITERATION value:%s to node:%s(%s) failed", + next_node->GetName().c_str(), merge_node->GetName().c_str(), merge_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Set attr NEXT_ITERATION for node %s failed.", merge_node->GetName().c_str()); return INTERNAL_ERROR; } @@ -295,6 +333,7 @@ Status NextIterationPass::BreakNextIteration(const NodePtr &next_node, NodePtr & Status NextIterationPass::FindTargetNode(const NodePtr &node, const std::string &target_type, bool is_input, NodePtr &target_node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "node is null."); return PARAM_INVALID; } @@ -310,7 +349,8 @@ Status NextIterationPass::FindTargetNode(const NodePtr &node, const std::string } for (const auto &tmp_node : nodes) { - const std::string type = tmp_node->GetType(); + std::string type; + GE_CHK_STATUS_RET(GetOriginalType(tmp_node, type), "Get node type failed."); if ((target_type == LOOPCOND) && (type == target_type)) { target_node = tmp_node; break; @@ 
-321,6 +361,8 @@ Status NextIterationPass::FindTargetNode(const NodePtr &node, const std::string } if ((target_type != SWITCH) && (target_node == nullptr)) { + REPORT_INNER_ERROR("E19999", "Find target_type:%s node around node:%s(%s) failed", + target_type.c_str(), node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Find node %s failed.", target_type.c_str()); return INTERNAL_ERROR; } diff --git a/ge/graph/passes/no_use_reshape_remove_pass.cc b/ge/graph/passes/no_use_reshape_remove_pass.cc index 1da939c6..ca71378e 100644 --- a/ge/graph/passes/no_use_reshape_remove_pass.cc +++ b/ge/graph/passes/no_use_reshape_remove_pass.cc @@ -37,6 +37,7 @@ Status NoUseReshapeRemovePass::Run(ge::NodePtr &node) { GE_CHECK_NOTNULL(node); OpDescPtr op_desc_ptr = node->GetOpDesc(); if (op_desc_ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node's op_desc is nullptr, check invalid"); GELOGE(PARAM_INVALID, "NoUseReshapeRemovePass enter. OpDesc is null."); return PARAM_INVALID; } @@ -48,6 +49,8 @@ Status NoUseReshapeRemovePass::Run(ge::NodePtr &node) { bool to_be_deleted = true; // compare input and output dims if (op_desc_ptr->GetAllInputsDesc().empty() || op_desc_ptr->GetAllOutputsDesc().empty()) { + REPORT_INNER_ERROR("E19999", "Input or Output desc num is zero in node:%s(%s), check invalid", + op_desc_ptr->GetName().c_str(), op_desc_ptr->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Input or output num is zero. 
node name:%s, input size:%zu, output size:%zu", op_desc_ptr->GetName().c_str(), op_desc_ptr->GetAllInputsDesc().size(), op_desc_ptr->GetAllOutputsDesc().size()); @@ -107,6 +110,8 @@ Status NoUseReshapeRemovePass::TryRemoveConstShapeInput(ge::NodePtr &reshape_nod // const input can unlink but should copy control_dependency auto ret = PassUtils::UnlinkNodeWithControlCopy(reshape_node, kReshapeShapeIndex); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Unlink op:%s(%s) data input:%u with control edge copy failed", + reshape_node->GetName().c_str(), reshape_node->GetType().c_str(), kReshapeShapeIndex); GELOGE(ret, "Unlink node %s with control copy failed.", shape_input->GetName().c_str()); return ret; } diff --git a/ge/graph/passes/parallel_concat_start_op_pass.cc b/ge/graph/passes/parallel_concat_start_op_pass.cc index 508d9b19..f64fa2f3 100755 --- a/ge/graph/passes/parallel_concat_start_op_pass.cc +++ b/ge/graph/passes/parallel_concat_start_op_pass.cc @@ -43,6 +43,9 @@ Status ParallelConcatStartOpPass::Run(NodePtr &node) { GELOGI("Start to replace operator _ParallelConcatStart with Constant, node name: %s.", node_name.c_str()); if (node_op_desc->GetOutputsSize() != kParallelConcatStartOutputSize) { + REPORT_INNER_ERROR("E19999", "Output tensor num:%zu of node:%s(%s) != %zu, check invalid", + node_op_desc->GetOutputsSize(), node_op_desc->GetName().c_str(), + node_op_desc->GetType().c_str(), kParallelConcatStartOutputSize); GELOGE(PARAM_INVALID, "Node[%s] output size is unexpected, the value is %zu.", node_name.c_str(), node_op_desc->GetOutputsSize()); return PARAM_INVALID; @@ -50,12 +53,15 @@ Status ParallelConcatStartOpPass::Run(NodePtr &node) { auto output_tensor_desc = node_op_desc->GetOutputDesc(kParallelConcatStartOutputDataIndex); GeTensorPtr output_ptr = MakeShared(output_tensor_desc); if (output_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(MEMALLOC_FAILED, "Malloc GeTensor failed, node name %s.", node_name.c_str()); 
return FAILED; } ge::DataType attr_dtype; if (!ge::AttrUtils::GetDataType(node_op_desc, kAttrDtype, attr_dtype)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", kAttrDtype, + node_op_desc->GetName().c_str(), node_op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Node:%s failed to get attribute dtype.", node_name.c_str()); return PARAM_INVALID; } @@ -63,6 +69,8 @@ Status ParallelConcatStartOpPass::Run(NodePtr &node) { vector attr_shape_list; if (!ge::AttrUtils::GetListInt(node_op_desc, kAttrShape, attr_shape_list)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", kAttrShape, + node_op_desc->GetName().c_str(), node_op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Node:%s failed to get attribute shape.", node_name.c_str()); return PARAM_INVALID; } diff --git a/ge/graph/passes/parallel_group_pass.cc b/ge/graph/passes/parallel_group_pass.cc new file mode 100644 index 00000000..9c93f6cf --- /dev/null +++ b/ge/graph/passes/parallel_group_pass.cc @@ -0,0 +1,348 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/passes/parallel_group_pass.h" + +#include "framework/common/debug/ge_log.h" +#include "common/ge/ge_util.h" +#include "framework/common/ge_inner_error_codes.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/node_utils.h" + +namespace ge { +namespace { +const int32_t kMaxRecursionDepth = 10; +const int64_t kLoopType = 1; +} + +Status ParallelGroupPass::Run(ComputeGraphPtr graph) { + GELOGD("ParallelGroupPass running"); + if (graph == nullptr) { + GELOGE(PARAM_INVALID, "[Check][Graph]Input param graph is null, skip ParallelGroupPass."); + REPORT_INNER_ERROR("E19999", "Input param graph is null, skip ParallelGroupPass."); + return PARAM_INVALID; + } + + if (graph->GetParentGraph() != nullptr) { + GELOGD("Current graph %s is a subgraph, this pass only support root graph.", + graph->GetName().c_str()); + return SUCCESS; + } + + if (graph->TopologicalSorting() != GRAPH_SUCCESS) { + GELOGE(FAILED, "[TopoSort][Graph]Graph:%s topological sort failed.", graph->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Graph:%s topological sort failed when ParallelGroupPass run.", + graph->GetName().c_str()); + return FAILED; + } + + std::unordered_set parallel_groups; + int depth = 0; + if (ProcessGraphGroupNodes(graph, depth, parallel_groups) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "[Process][Graph]Process group nodes of graph %s failed.", graph->GetName().c_str()); + return INTERNAL_ERROR; + } + + if (graph->TopologicalSorting() != GRAPH_SUCCESS) { + GELOGE(FAILED, "[TopoSort][Graph]Graph:%s topological sort failed.", graph->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Graph:%s topological sort failed when ParallelGroupPass run.", + graph->GetName().c_str()); + return FAILED; + } + + return SUCCESS; +} + +Status ParallelGroupPass::ProcessGraphGroupNodes(ComputeGraphPtr graph, int32_t depth, + std::unordered_set ¶llel_groups) { + if (depth >= kMaxRecursionDepth) { + GELOGE(FAILED, 
"[Process][SubGraph]There are too much subgraphs:%d > %d(max subgraphs)", depth, kMaxRecursionDepth); + REPORT_INNER_ERROR("E19999", "There are too much subgraphs:%d > %d(max subgraphs)", depth, kMaxRecursionDepth); + return FAILED; + } + std::map> group_nodes; + auto candidates = graph->GetDirectNode(); + auto root_graph = GraphUtils::FindRootGraph(graph); + for (const auto &node : candidates) { + OpDescPtr op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + std::string group_name; + if (AttrUtils::GetStr(op_desc, ATTR_NAME_PARALLEL_GROUP, group_name)) { + group_nodes[group_name].push_back(node); + parallel_groups.insert(group_name); + GELOGD("Find group node:%s, group_name:%s", node->GetName().c_str(), group_name.c_str()); + } + + const auto &subgraph_name = op_desc->GetSubgraphInstanceNames(); + GE_CHECK_NOTNULL(root_graph); + for (auto name_iter = subgraph_name.rbegin(); name_iter != subgraph_name.rend(); ++name_iter) { + const auto &sub_graph = root_graph->GetSubgraph(*name_iter); + GE_CHECK_NOTNULL(sub_graph); + // if the pass add control edge for known and unknown graph, then the known graph will become unknown graph + // the order between known and unknown graph is guaranteed by dynamic shape executor + // so the parallel group pass do nothing for unknown graph + if (sub_graph->GetGraphUnknownFlag()) { + continue; + } + std::unordered_set sub_parallel_groups; + auto ret = ProcessGraphGroupNodes(sub_graph, depth + 1, sub_parallel_groups); + if (ret != SUCCESS) { + GELOGE(FAILED, "[Process][SubGraph]Process sub graph %s failed.", sub_graph->GetName().c_str()); + return FAILED; + } + for (const auto &sub_parallel_group : sub_parallel_groups) { + parallel_groups.insert(sub_parallel_group); + group_nodes[sub_parallel_group].emplace_back(node); + } + } + } + + std::map, NodePtr>> node_2_switch_merge; + if (ProcessGroupNodeInSwitch(graph, node_2_switch_merge) != SUCCESS) { + GELOGE(FAILED, "[Process][Node]Process group node in switch failed, 
graph:%s.", graph->GetName().c_str()); + return FAILED; + } + + for (const auto &itr : group_nodes) { + const auto &nodes = itr.second; + if (nodes.empty()) { + continue; + } + NodePtr pre_node = nodes[0]; + NodePtr cur_node = nullptr; + for (std::size_t i = 1; i < nodes.size(); i++) { + cur_node = nodes[i]; + GELOGD("Original add ctrl anchor for node:%s->%s", pre_node->GetName().c_str(), cur_node->GetName().c_str()); + if (ReplaceWithSwitchAndMerge(pre_node, cur_node, node_2_switch_merge) != SUCCESS) { + GELOGE(FAILED, "[Replace][Node]Replace switch and merges for nodes: %s and %s failed.", + pre_node->GetName().c_str(), cur_node->GetName().c_str()); + return FAILED; + } + pre_node = cur_node; + } + } + + return SUCCESS; +} + +Status ParallelGroupPass::AddCtrlEdge(NodePtr pre_node, NodePtr cur_node) { + if (pre_node == cur_node) { + GELOGD("Pre_node and cur_node are same, no need add anchor"); + return SUCCESS; + } + auto in_nodes = cur_node->GetInAllNodes(); + for (const auto &node : in_nodes) { + if (pre_node == node) { + GELOGD("Node:%s and %s already linked", pre_node->GetName().c_str(), + cur_node->GetName().c_str()); + return SUCCESS; + } + } + GELOGD("Finally add ctrl anchor for node:%s->%s", pre_node->GetName().c_str(), cur_node->GetName().c_str()); + return GraphUtils::AddEdge(pre_node->GetOutControlAnchor(), cur_node->GetInControlAnchor()); +} + +Status ParallelGroupPass::ProcessGroupNodeInSwitch(ComputeGraphPtr graph, + std::map, NodePtr>> &node_2_switch_merge) { + + std::string type; + auto direct_nodes = graph->GetDirectNode(); + for (const auto &node : direct_nodes) { + type = node->GetType(); + if (type != STREAMSWITCH) { + continue; + } + + if (IsBigSmallLoopStreamSwitch(node->GetOpDesc()) || + IsWhileStreamSwitch(node->GetOpDesc())) { + continue; + } + + std::vector merge_nodes; + std::set group_nodes; + std::set stream_labels; + + FindGroupNodeAndMerge(node, group_nodes, merge_nodes, stream_labels); + + if (merge_nodes.empty() || 
(!group_nodes.empty() && stream_labels.size() > 1)) { + GELOGE(FAILED, "[Process][Node]Cannot find merge node or exist switch nestification, switch node:%s," + "merge_vec size:%zu, stream_labels size:%zu, graph:%s.", node->GetName().c_str(), + merge_nodes.size(), stream_labels.size(), graph->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Cannot find merge node or exist switch nest, switch node:%s," + "merge_vec size: %zu, stream_labels size: %zu, graph:%s.", node->GetName().c_str(), + merge_nodes.size(), stream_labels.size(), graph->GetName().c_str()); + return FAILED; + } + + std::sort(merge_nodes.begin(), merge_nodes.end(), + [] (NodePtr a, NodePtr b) -> bool { + return (a->GetOpDesc()->GetId() < b->GetOpDesc()->GetId()); + }); + + NodePtr cast_node = NodeUtils::GetInDataNodeByIndex(*node, 0); + GE_CHECK_NOTNULL(cast_node); + if (MappingNodeToSwitchAndMerge(group_nodes, merge_nodes, cast_node, node, node_2_switch_merge) != SUCCESS) { + GELOGE(FAILED, "[Mapping][Node]Mapping node to switch and merge failed, graph:%s.", graph->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[Mapping][Node]Mapping node to switch and merge failed, graph:%s.", + graph->GetName().c_str()); + return FAILED; + } + } + + return SUCCESS; +} + +void ParallelGroupPass::FindGroupNodeAndMerge(NodePtr stream_switch_node, std::set &group_nodes, + std::vector &merge_nodes, std::set &stream_labels) { + std::string type; + std::deque candidates; + std::set visited; + + candidates.push_back(stream_switch_node); + while (!candidates.empty()) { + NodePtr tmp_node = candidates.front(); + candidates.pop_front(); + for (const auto &out_node : tmp_node->GetOutAllNodes()) { + type = out_node->GetType(); + if (type == STREAMMERGE) { + merge_nodes.emplace_back(out_node); + continue; + } + const auto &op = out_node->GetOpDesc(); + if (op != nullptr && op->HasAttr(ATTR_NAME_PARALLEL_GROUP)) { + group_nodes.emplace(out_node); + } + if (visited.count(out_node) > 0) { + continue; + } + 
candidates.push_back(out_node); + visited.insert(out_node); + std::string stream_label; + if (ge::AttrUtils::GetStr(out_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) { + stream_labels.insert(stream_label); + } + } + } +} + +Status ParallelGroupPass::MappingNodeToSwitchAndMerge(const std::set &group_nodes, + const std::vector &merge_nodes, const NodePtr &cast_node, const NodePtr &switch_node, + std::map, NodePtr>> &node_2_switch_merge) { + for (const auto &group_node : group_nodes) { + auto itr = node_2_switch_merge.find(group_node); + if (itr != node_2_switch_merge.end()) { + auto &tmp = itr->second; + auto &switch_set = tmp.first; + const auto &merge_node = tmp.second; + GELOGD("Find group node: %s in switch %s and merge %s.", + group_node->GetName().c_str(), switch_node->GetName().c_str(), merge_node->GetName().c_str()); + if (merge_node != merge_nodes.back()) { + GELOGE(FAILED, "[Mapping][Node]Has two different merge nodes: %s and %s, graph's structure is invalid", + merge_node->GetName().c_str(), merge_nodes.back()->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Has two different merge nodes: %s and %s," + "graph's structure is invalid", + merge_node->GetName().c_str(), merge_nodes.back()->GetName().c_str()); + return FAILED; + } + switch_set.insert(cast_node); + } else { + node_2_switch_merge.emplace(group_node, + std::make_pair(std::set{cast_node}, merge_nodes.back())); + } + } + return SUCCESS; +} + +Status ParallelGroupPass::ReplaceWithSwitchAndMerge(NodePtr pre_node, NodePtr cur_node, + const std::map, NodePtr>> &node_2_switch_merge) { + auto pre_itr = node_2_switch_merge.find(pre_node); + auto cur_itr = node_2_switch_merge.find(cur_node); + if (pre_itr != node_2_switch_merge.end()) { + if (cur_itr != node_2_switch_merge.end()) { + const auto &pre_set = pre_itr->second.first; + const auto &cur_set = cur_itr->second.first; + if (!HasSameSwitch(pre_set, cur_set)) { + pre_node = pre_itr->second.second; + for (const auto &switch_node : 
cur_itr->second.first) { + if (AddCtrlEdge(pre_node, switch_node) != SUCCESS) { + GELOGE(FAILED, "[AddEdge][Node]Add edge for nodes: %s->%s failed.", + pre_node->GetName().c_str(), switch_node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[AddEdge][Node]Add edge for nodes: %s->%s failed.", + pre_node->GetName().c_str(), switch_node->GetName().c_str()); + return FAILED; + } + } + } + return SUCCESS; + } else { + pre_node = pre_itr->second.second; + return AddCtrlEdge(pre_node, cur_node); + } + } else { + if (cur_itr != node_2_switch_merge.end()) { + for (const auto &switch_node : cur_itr->second.first) { + int64_t pre_id = pre_node->GetOpDesc()->GetId(); + int64_t switch_id = switch_node->GetOpDesc()->GetId(); + // avoid ring + if (pre_id > switch_id) { + auto merge_node = cur_itr->second.second; + if (AddCtrlEdge(merge_node, pre_node) != SUCCESS) { + GELOGE(FAILED, "[AddEdge][Node]Add edge for nodes: %s->%s failed.", + pre_node->GetName().c_str(), switch_node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[AddEdge][Node]Add edge for nodes: %s->%s failed.", + pre_node->GetName().c_str(), switch_node->GetName().c_str()); + return FAILED; + } + } else { + if (AddCtrlEdge(pre_node, switch_node) != SUCCESS) { + GELOGE(FAILED, "[AddEdge][Node]Add edge for nodes: %s->%s failed.", + pre_node->GetName().c_str(), switch_node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[AddEdge][Node]Add edge for nodes: %s->%s failed.", + pre_node->GetName().c_str(), switch_node->GetName().c_str()); + return FAILED; + } + } + } + } else { + return AddCtrlEdge(pre_node, cur_node); + } + } + return SUCCESS; +} + +bool ParallelGroupPass::HasSameSwitch(const std::set &switch_set1, const std::set &switch_set2) { + for (const auto &node1 : switch_set1) { + auto itr = switch_set2.find(node1); + if (itr != switch_set2.end()) { + return true; + } + } + return false; +} + +bool ParallelGroupPass::IsBigSmallLoopStreamSwitch(OpDescPtr switch_op_desc) { + return 
!AttrUtils::HasAttr(switch_op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG); +} + +bool ParallelGroupPass::IsWhileStreamSwitch(OpDescPtr switch_op_desc) { + int64_t stream_switch_type = -1; + return (AttrUtils::GetInt(switch_op_desc, ATTR_NAME_STREAM_SWITCH_TYPE, stream_switch_type) && + stream_switch_type == kLoopType); +} +} // namespace ge diff --git a/ge/graph/passes/parallel_group_pass.h b/ge/graph/passes/parallel_group_pass.h new file mode 100644 index 00000000..9b895598 --- /dev/null +++ b/ge/graph/passes/parallel_group_pass.h @@ -0,0 +1,53 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_GRAPH_PASSES_PARALLEL_GROUP_PASS_H +#define GE_GRAPH_PASSES_PARALLEL_GROUP_PASS_H + +#include +#include +#include "graph/graph.h" +#include "inc/graph_pass.h" + +namespace ge { +class ParallelGroupPass : public GraphPass { + public: + Status Run(ComputeGraphPtr graph) override; + private: + Status ProcessGraphGroupNodes(ComputeGraphPtr graph, int32_t depth, std::unordered_set ¶llel_group); + + Status AddCtrlEdge(NodePtr pre_node, NodePtr cur_node); + + Status ReplaceWithSwitchAndMerge(NodePtr pre_node, NodePtr cur_node, + const std::map, NodePtr>> &node_2_switch_merge); + + bool HasSameSwitch(const std::set &a, const std::set &b); + + Status ProcessGroupNodeInSwitch(ComputeGraphPtr graph, + std::map, NodePtr>> &node_2_switch_merge); + + void FindGroupNodeAndMerge(NodePtr stream_switch_node, std::set &group_nodes, + std::vector &merge_nodes, std::set &stream_labels); + + Status MappingNodeToSwitchAndMerge(const std::set &group_set, const std::vector &merge_vec, + const NodePtr &cast_node, const NodePtr &switch_node, + std::map, NodePtr>> &node_2_switch_merge); + + bool IsBigSmallLoopStreamSwitch(OpDescPtr switch_op_desc); + bool IsWhileStreamSwitch(OpDescPtr switch_op_desc); +}; +} // namespace ge +#endif // GE_GRAPH_PASSES_PARALLEL_GROUP_PASS_H diff --git a/ge/graph/passes/pass_utils.cc b/ge/graph/passes/pass_utils.cc index 3adfbde3..db379433 100644 --- a/ge/graph/passes/pass_utils.cc +++ b/ge/graph/passes/pass_utils.cc @@ -35,9 +35,10 @@ #include "graph/utils/op_desc_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" +#include "utils/node_utils.h" +#include "common/formats/utils/formats_trans_utils.h" namespace ge { - Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, std::vector &data, std::vector &v_output, const bool scalar_output) { Status ret = SUCCESS; @@ -46,11 +47,13 @@ Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, std: if (data_type == DT_INT32) { 
unique_ptr buf(new (std::nothrow) int32_t[dim_size]()); if (buf == nullptr) { + REPORT_CALL_ERROR("E19999", "New buffer failed, size:%u", dim_size); GELOGE(MEMALLOC_FAILED, "new failed"); return MEMALLOC_FAILED; } for (uint32_t i = 0; i < dim_size; i++) { if (data[i] >= INT_MAX) { + REPORT_CALL_ERROR("E19999", "Param data:%s will overflow after multi", formats::JoinToString(data).c_str()); GELOGE(PARAM_INVALID, "int32 overflow, data[%u]:%ld", i, data[i]); return PARAM_INVALID; } @@ -60,6 +63,7 @@ Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, std: } else if (data_type == DT_INT64) { unique_ptr buf(new (std::nothrow) int64_t[dim_size]()); if (buf == nullptr) { + REPORT_CALL_ERROR("E19999", "New buffer failed, size:%u", dim_size); GELOGE(MEMALLOC_FAILED, "new failed"); return MEMALLOC_FAILED; } @@ -68,6 +72,8 @@ Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, std: } ret = ConstructTensorDescWithData(out_desc, buf.get(), dim_size, v_output, scalar_output); } else { + REPORT_CALL_ERROR("E19999", "Only support DT_INT32 and DT_INT64. Input data_type:%s not support", + formats::JoinToString(data).c_str()); GELOGE(PARAM_INVALID, "Only support DT_INT32 and DT_INT64. 
data_type:%s", TypeUtils::DataTypeToSerialString(data_type).c_str()); return PARAM_INVALID; @@ -92,6 +98,7 @@ Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, T *b GeTensorPtr output_tensor_ptr = MakeShared( output_tensor_desc, reinterpret_cast(buf), sizeof(T) * len); if (output_tensor_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(MEMALLOC_FAILED, "Make shared failed"); return MEMALLOC_FAILED; } @@ -102,6 +109,7 @@ Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, T *b bool PassUtils::IsConstant(const ConstNodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "node is null"); return false; } @@ -112,19 +120,25 @@ bool PassUtils::IsConstant(const ConstNodePtr &node) { } Status PassUtils::SetOutNodeWeight(const OutDataAnchorPtr &out_data_anchor, const NodePtr &src_node) { - GE_IF_BOOL_EXEC(src_node == nullptr, GELOGE(PARAM_INVALID, "src_node is null"); return PARAM_INVALID); + GE_IF_BOOL_EXEC(src_node == nullptr, + REPORT_INNER_ERROR("E19999", "Param src_node is nullptr, check invalid"); + GELOGE(PARAM_INVALID, "src_node is null"); return PARAM_INVALID); if (!IsConstant(src_node)) { return SUCCESS; } auto weights = OpDescUtils::MutableWeights(src_node); if (weights.empty()) { + REPORT_INNER_ERROR("E19999", "Weight of node:%s(%s) is empty, check invalid", + src_node->GetName().c_str(), src_node->GetType().c_str()); return PARAM_INVALID; } auto weight = weights.at(0); auto src_in_ctrl = src_node->GetInControlAnchor(); if ((src_in_ctrl == nullptr) || (out_data_anchor == nullptr)) { + REPORT_INNER_ERROR("E19999", "Param out_data_anchor or in control anchor in Param src_node:%s(%s) is nullptr, " + "check invalid", src_node->GetName().c_str(), src_node->GetType().c_str()); GELOGE(FAILED, "parameter is null."); return FAILED; } @@ -143,7 +157,7 @@ Status PassUtils::SetOutNodeWeight(const OutDataAnchorPtr 
&out_data_anchor, cons dst_op_desc->SetIsInputConst(is_input_const); } - GE_CHK_STATUS_RET(GraphUtils::RemoveEdge(out_data_anchor, dst_in_data), "remove edge failed"); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_data_anchor, dst_in_data), "remove edge failed"); graphStatus ret = OpDescUtils::AddConstOpToAnchor(dst_in_data, weight); if (ret != SUCCESS) { return ret; @@ -155,7 +169,7 @@ Status PassUtils::SetOutNodeWeight(const OutDataAnchorPtr &out_data_anchor, cons // restore control inputs to dynamically added constant ops, if any for (const auto &src_out_control_anchor : src_out_control_anchors) { - GE_CHK_STATUS_RET(GraphUtils::AddEdge(src_out_control_anchor, dynamic_const_node->GetInControlAnchor()), + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(src_out_control_anchor, dynamic_const_node->GetInControlAnchor()), "add edge failed"); } } @@ -166,7 +180,7 @@ Status PassUtils::SetOutNodeWeight(const OutDataAnchorPtr &out_data_anchor, cons /// Op1 - - - > Op2 for (const auto &dst_in_ctrl : out_data_anchor->GetPeerInControlAnchors()) { for (const auto &src_out_control_anchor : src_out_control_anchors) { - GE_CHK_STATUS_RET(GraphUtils::AddEdge(src_out_control_anchor, dst_in_ctrl), "add edge failed"); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(src_out_control_anchor, dst_in_ctrl), "add edge failed"); } } @@ -176,6 +190,7 @@ Status PassUtils::SetOutNodeWeight(const OutDataAnchorPtr &out_data_anchor, cons Status PassUtils::RemoveBranch(const NodePtr &node, std::vector &delete_nodes, std::vector &end_nodes) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return FAILED; } @@ -201,6 +216,8 @@ Status PassUtils::RemoveBranch(const NodePtr &node, std::vector &delete GE_CHK_STATUS_RET(GetOriginalType(dst_node, node_type), "get original type failed"); if (node_type == NETOUTPUT) { if (dst_in_anchor->IsTypeOf()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) nactive branch connected to 
NetOutput with data anchor, " + "check invalid", node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "[%s] Inactive branch connected to " "NetOutput with data anchor.", @@ -208,13 +225,13 @@ Status PassUtils::RemoveBranch(const NodePtr &node, std::vector &delete return INTERNAL_ERROR; } else { // safe to unlink control edges - GE_CHK_STATUS_RET(GraphUtils::RemoveEdge(src_out_anchor, dst_in_anchor), "remove edge failed"); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(src_out_anchor, dst_in_anchor), "remove edge failed"); end_nodes.push_back(dst_node); } } else if (node_type == MERGE) { /// Unlink connection between the inactive branch and Merge/NetOutput. /// The removal of inactive nodes will be handled in PrunePass - GE_CHK_STATUS_RET(GraphUtils::RemoveEdge(src_out_anchor, dst_in_anchor), "remove edge failed"); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(src_out_anchor, dst_in_anchor), "remove edge failed"); end_nodes.push_back(dst_node); GELOGD("Reach the end merge node %s, the branch removing stop", dst_node->GetName().c_str()); } else { @@ -246,6 +263,12 @@ NodePtr PassUtils::GetInDataNode(const ConstNodePtr &node, int index) { return src_node; } +NodePtr PassUtils::GetInNodeCrossSubgraphByIndex(const ConstNodePtr &node, int index) { + auto src_node = GetInDataNode(node, index); + + return NodeUtils::GetInNodeCrossSubgraph(src_node); +} + bool PassUtils::IsNeedTrainIteFlowCtrl(const ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { return false; @@ -267,6 +290,7 @@ bool PassUtils::IsNeedTrainIteFlowCtrl(const ComputeGraphPtr &compute_graph) { int PassUtils::GetUniqueInDataAnchorIndex(const NodePtr &node_ptr) { const int invalid_index = -1; if (node_ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node_ptr is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "GetUniqueInDataAnchorIndex: node is null"); return invalid_index; } @@ -276,6 +300,9 @@ int PassUtils::GetUniqueInDataAnchorIndex(const NodePtr &node_ptr) 
{ return (in_anchor->GetIdx()); } } + + REPORT_INNER_ERROR("E19999", "Failed to find in data anchor of node:%s(%s) with a valid peer out node", + node_ptr->GetName().c_str(), node_ptr->GetType().c_str()); GELOGE(INTERNAL_ERROR, "GetUniqueInDataAnchorIndex: [%s] failed to find " "in data anchor with a valid peer out node", @@ -285,6 +312,7 @@ int PassUtils::GetUniqueInDataAnchorIndex(const NodePtr &node_ptr) { Status PassUtils::UnlinkNodeWithControlCopy(NodePtr &node, int index) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "node is null."); return PARAM_INVALID; } @@ -295,6 +323,8 @@ Status PassUtils::UnlinkNodeWithControlCopy(NodePtr &node, int index) { } auto out_data_anchor = in_data_anchor->GetPeerOutAnchor(); if (out_data_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%d in data anchor of node:%s(%s), its peer anchor is nullptr, check invalid", + index, node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "[%s] peer out_data_anchor is null with index [%d].", node->GetName().c_str(), index); return FAILED; } @@ -304,6 +334,9 @@ Status PassUtils::UnlinkNodeWithControlCopy(NodePtr &node, int index) { auto father_node = out_data_anchor->GetOwnerNode(); // link father_node's in control nodes to node if (GraphUtils::CopyInCtrlEdges(father_node, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy in control edge from node:%s(%s) to node:%s(%s) failed", + father_node->GetName().c_str(), father_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } return SUCCESS; @@ -312,6 +345,7 @@ Status PassUtils::UnlinkNodeWithControlCopy(NodePtr &node, int index) { Status PassUtils::RemoveInactiveBranchToMerge(const OutDataAnchorPtr &inactive_output_anchor, std::vector &delete_nodes, std::vector &end_nodes) { if (inactive_output_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param inactive_output_anchor is nullptr, check 
invalid"); GELOGE(FAILED, "parameter is null."); return FAILED; } @@ -325,7 +359,7 @@ Status PassUtils::RemoveInactiveBranchToMerge(const OutDataAnchorPtr &inactive_o GE_CHK_STATUS_RET(GetOriginalType(dst_node, dst_node_type), "get original type failed"); if (dst_node_type == MERGE) { GELOGD("[%s] Switch connected directly to Merge", inactive_output_anchor->GetOwnerNode()->GetName().c_str()); - GE_CHK_STATUS_RET(GraphUtils::RemoveEdge(inactive_output_anchor, dst_anchor), "remove edge failed"); + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(inactive_output_anchor, dst_anchor), "remove edge failed"); continue; } diff --git a/ge/graph/passes/pass_utils.h b/ge/graph/passes/pass_utils.h index fbfb3b47..bd506d09 100755 --- a/ge/graph/passes/pass_utils.h +++ b/ge/graph/passes/pass_utils.h @@ -30,6 +30,8 @@ class PassUtils { static NodePtr GetInDataNode(const ConstNodePtr &node, int index); + static NodePtr GetInNodeCrossSubgraphByIndex(const ConstNodePtr &node, int index); + static bool IsConstant(const ConstNodePtr &node); static Status SetOutNodeWeight(const OutDataAnchorPtr &out_data_anchor, const NodePtr &src_node); diff --git a/ge/graph/passes/permute_pass.cc b/ge/graph/passes/permute_pass.cc index 73d9a7f1..8ac3aedf 100644 --- a/ge/graph/passes/permute_pass.cc +++ b/ge/graph/passes/permute_pass.cc @@ -108,6 +108,8 @@ Status PermutePass::Run(ComputeGraphPtr graph) { OpDescPtr op_desc_ptr = outNode->GetOpDesc(); GE_CHECK_NOTNULL(op_desc_ptr); if (!AttrUtils::SetBool(op_desc_ptr, ATTR_NAME_PRED_PERMUTE_DELETED, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_PRED_PERMUTE_DELETED.c_str(), + op_desc_ptr->GetName().c_str(), op_desc_ptr->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_NAME_PRED_PERMUTE_DELETED failed"); return INTERNAL_ERROR; } diff --git a/ge/graph/passes/print_op_pass.cc b/ge/graph/passes/print_op_pass.cc index 28b2332b..0b20da84 100755 --- a/ge/graph/passes/print_op_pass.cc +++ 
b/ge/graph/passes/print_op_pass.cc @@ -21,6 +21,7 @@ namespace ge { Status PrintOpPass::Run(ge::NodePtr &node) { GELOGD("PrintOpPass running"); if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [node] must not be null."); return PARAM_INVALID; } diff --git a/ge/graph/passes/prune_pass.cc b/ge/graph/passes/prune_pass.cc index 3c95f3b1..49daa037 100644 --- a/ge/graph/passes/prune_pass.cc +++ b/ge/graph/passes/prune_pass.cc @@ -29,6 +29,7 @@ namespace ge { Status PrunePass::Run(ge::ComputeGraphPtr graph) { GELOGD("PrunePass Start, graph is [%s]", graph->GetName().c_str()); if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(GE_GRAPH_ISNULL, "input compute graph is NULL."); return GE_GRAPH_ISNULL; } @@ -70,6 +71,9 @@ Status PrunePass::Run(ge::ComputeGraphPtr graph) { if (node_ptr->GetOpDesc()->GetType() == DATA || node_ptr->GetOpDesc()->GetType() == AIPPDATA) { Status status = ge::GraphUtils::AddEdge(node_ptr->GetOutControlAnchor(), out_nodes[0]->GetInControlAnchor()); if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + node_ptr->GetName().c_str(), node_ptr->GetType().c_str(), + out_nodes[0]->GetName().c_str(), out_nodes[0]->GetType().c_str()); GELOGE(INTERNAL_ERROR, "[PrunePass] add control edge fail between DATA node[%s] and NETOUTPUT node[%s]!", node_ptr->GetOpDesc()->GetName().c_str(), out_nodes[0]->GetOpDesc()->GetName().c_str()); return INTERNAL_ERROR; diff --git a/ge/graph/passes/ref_identity_delete_op_pass.cc b/ge/graph/passes/ref_identity_delete_op_pass.cc index 95f710f2..b729b443 100644 --- a/ge/graph/passes/ref_identity_delete_op_pass.cc +++ b/ge/graph/passes/ref_identity_delete_op_pass.cc @@ -29,6 +29,8 @@ Status RefIdentityDeleteOpPass::Run(ComputeGraphPtr graph) { int input_index = 0; NodePtr ref_node = GetRefNode(node, input_index); 
CHECK_FALSE_EXEC(GetRefNode(node, input_index) != nullptr, + REPORT_CALL_ERROR("E19999", "Get Ref node of node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Ref node of RefIdentity[%s] not found", node->GetName().c_str()); return FAILED); CHECK_FALSE_EXEC(DealNoOutputRef(ref_node, node, input_index, graph) == SUCCESS, @@ -61,6 +63,8 @@ Status RefIdentityDeleteOpPass::DealNoOutputRef(const NodePtr &node, const NodeP NodePtr first_node = nullptr; NodePtr variable_ref = GetVariableRef(node, ref_identity, first_node); if (variable_ref == nullptr) { + REPORT_CALL_ERROR("E19999", "Get variable ref of node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "[RefIdentityDeleteOpPass]Can not find variable ref for %s:%d", node->GetName().c_str(), input_index); return FAILED; @@ -83,11 +87,17 @@ Status RefIdentityDeleteOpPass::DealNoOutputRef(const NodePtr &node, const NodeP // +-----------+ +-----------+ auto ret = ge::GraphUtils::AddEdge(node->GetOutControlAnchor(), first_node->GetInControlAnchor()); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + first_node->GetName().c_str(), first_node->GetType().c_str()); GELOGE(FAILED, "Add control edge between ref node and trans node failed"); return FAILED; } ret = ge::GraphUtils::RemoveEdge(node->GetOutControlAnchor(), variable_ref->GetInControlAnchor()); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + first_node->GetName().c_str(), first_node->GetType().c_str()); GELOGE(FAILED, "Remove control edge between ref node and its peer node failed"); return FAILED; } @@ -113,11 +123,15 @@ Status RefIdentityDeleteOpPass::DealNoOutputRef(const NodePtr &node, const NodeP } // remove ref identity if (GraphUtils::IsolateNode(ref_identity, 
{0}) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate op:%s(%s) failed", + ref_identity->GetName().c_str(), ref_identity->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", ref_identity->GetName().c_str(), variable_ref->GetType().c_str()); return FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, ref_identity) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + ref_identity->GetName().c_str(), ref_identity->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Remove node: %s, type: %s without relink failed", ref_identity->GetName().c_str(), ref_identity->GetType().c_str()); return FAILED; @@ -214,6 +228,9 @@ Status RefIdentityDeleteOpPass::RemoveUselessControlEdge(const NodePtr &ref, con if (out_nodes_map.find(peer_node->GetName()) != out_nodes_map.end()) { auto ret = ge::GraphUtils::RemoveEdge(out_control_anchor, peer_in_control_anchor); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + variable_ref->GetName().c_str(), variable_ref->GetType().c_str(), + peer_node->GetName().c_str(), peer_node->GetType().c_str()); GELOGE(FAILED, "Remove control edge between variable ref node[%s] and ref node's peer node[%s] failed", variable_ref->GetName().c_str(), peer_node->GetName().c_str()); return FAILED; diff --git a/ge/graph/passes/remove_same_const_pass.cc b/ge/graph/passes/remove_same_const_pass.cc index 3d18a92d..ec38a0af 100644 --- a/ge/graph/passes/remove_same_const_pass.cc +++ b/ge/graph/passes/remove_same_const_pass.cc @@ -85,6 +85,9 @@ Status RemoveSameConstPass::Run(ComputeGraphPtr graph) { ret = GraphUtils::ReplaceNodeAnchors(iter->second, node, {}, output_map); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Replace node:%s(%s)'s anchor by node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + iter->second->GetName().c_str(), 
iter->second->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to replace node %s by node %s, ret=%u", node->GetName().c_str(), iter->second->GetName().c_str(), ret); return INTERNAL_ERROR; @@ -94,6 +97,8 @@ Status RemoveSameConstPass::Run(ComputeGraphPtr graph) { ret = GraphUtils::RemoveNodeWithoutRelink(graph, node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to remove node %s from graph", node->GetName().c_str()); return INTERNAL_ERROR; } diff --git a/ge/graph/passes/replace_transshape_pass.cc b/ge/graph/passes/replace_transshape_pass.cc index 9004df4e..ac654083 100644 --- a/ge/graph/passes/replace_transshape_pass.cc +++ b/ge/graph/passes/replace_transshape_pass.cc @@ -43,6 +43,8 @@ Status ReplaceTransShapePass::ReplaceTransShapeNode(ComputeGraphPtr &graph, Node std::string op_type; auto ret = GetOriginalType(trans_shape_node, op_type); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get OriginalType of op:%s(%s) failed", + trans_shape_node->GetName().c_str(), trans_shape_node->GetType().c_str()); GELOGE(FAILED, "Get node %s original type failede", trans_shape_node->GetName().c_str()); return FAILED; } @@ -52,6 +54,7 @@ Status ReplaceTransShapePass::ReplaceTransShapeNode(ComputeGraphPtr &graph, Node std::string node_name = trans_shape_node->GetName() + "ToMemcpy"; auto dst_op_desc = MakeShared(node_name, MEMCPYASYNC); if (dst_op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Make node %s opdesc failed", node_name.c_str()); return FAILED; } @@ -59,6 +62,8 @@ Status ReplaceTransShapePass::ReplaceTransShapeNode(ComputeGraphPtr &graph, Node for (InDataAnchorPtr &in_anchor : trans_shape_node->GetAllInDataAnchors()) { auto ret = dst_op_desc->AddInputDesc(src_op_desc->GetInputDesc(in_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + 
REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + dst_op_desc->GetName().c_str(), dst_op_desc->GetType().c_str()); GELOGE(FAILED, "Add input desc failed"); return FAILED; } @@ -66,6 +71,8 @@ Status ReplaceTransShapePass::ReplaceTransShapeNode(ComputeGraphPtr &graph, Node for (OutDataAnchorPtr &out_anchor : trans_shape_node->GetAllOutDataAnchors()) { auto ret = dst_op_desc->AddOutputDesc(src_op_desc->GetOutputDesc(out_anchor->GetIdx())); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + src_op_desc->GetName().c_str(), src_op_desc->GetType().c_str()); GELOGE(FAILED, "Add output desc failed"); return FAILED; } diff --git a/ge/graph/passes/replace_with_empty_const_pass.cc b/ge/graph/passes/replace_with_empty_const_pass.cc index 5962fe0e..bfa6ff95 100644 --- a/ge/graph/passes/replace_with_empty_const_pass.cc +++ b/ge/graph/passes/replace_with_empty_const_pass.cc @@ -26,10 +26,12 @@ namespace ge { Status ReplaceWithEmptyConstPass::Run(NodePtr &node) { GELOGD("ReplaceWithEmptyConstPass in."); if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Parameter is null."); return PARAM_INVALID; } if (node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node's op_desc is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Param [opDesc] must not be null."); return PARAM_INVALID; } diff --git a/ge/graph/passes/reshape_recovery_pass.cc b/ge/graph/passes/reshape_recovery_pass.cc index f0987ff5..7a9d085b 100644 --- a/ge/graph/passes/reshape_recovery_pass.cc +++ b/ge/graph/passes/reshape_recovery_pass.cc @@ -23,18 +23,25 @@ NodePtr CreateReshape(const ConstGeTensorDescPtr &src, const ConstGeTensorDescPt auto next_num = reshape_num.fetch_add(1); auto reshape = MakeShared("Reshape_ReshapeRecoveryPass_" + std::to_string(next_num), RESHAPE); if (reshape == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); return nullptr; } auto ret 
= reshape->AddInputDesc("x", *src); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, name:x", + reshape->GetName().c_str(), reshape->GetType().c_str()); return nullptr; } ret = reshape->AddInputDesc("shape", GeTensorDesc(GeShape(), Format(), DT_INT32)); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, name:shape", + reshape->GetName().c_str(), reshape->GetType().c_str()); return nullptr; } ret = reshape->AddOutputDesc("y", *dst); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, name:y", + reshape->GetName().c_str(), reshape->GetType().c_str()); return nullptr; } @@ -55,14 +62,27 @@ Status InsertReshapeIfNeed(const NodePtr &node) { GE_CHECK_NOTNULL(dst_node->GetOpDesc()); auto dst_tensor = dst_node->GetOpDesc()->GetInputDescPtr(dst_anchor->GetIdx()); GE_CHECK_NOTNULL(dst_tensor); - bool is_need_insert_reshape = src_tensor->GetShape().GetDims() != UNKNOWN_RANK && - dst_tensor->GetShape().GetDims() != UNKNOWN_RANK && - src_tensor->GetShape().GetDims() != dst_tensor->GetShape().GetDims(); + bool is_dynamic = false; + const auto &src_tensor_dims = src_tensor->GetShape().GetDims(); + const auto &dst_tensor_dims = dst_tensor->GetShape().GetDims(); + if ((std::any_of(src_tensor_dims.begin(), src_tensor_dims.end(), [](int64_t val) { return val < 0 ; })) + || (std::any_of(dst_tensor_dims.begin(), dst_tensor_dims.end(), [](int64_t val) { return val < 0; }))) { + GELOGD("No need to insert reshape node between %s nad %s.", node->GetName().c_str(), + dst_node->GetName().c_str()); + is_dynamic = true; + } + bool is_need_insert_reshape = src_tensor_dims != dst_tensor_dims && + !is_dynamic; if (is_need_insert_reshape) { auto reshape = CreateReshape(src_tensor, dst_tensor, node->GetOwnerComputeGraph()); GE_CHECK_NOTNULL(reshape); auto ret = GraphUtils::InsertNodeBetweenDataAnchors(src_anchor, dst_anchor, reshape); if (ret != GRAPH_SUCCESS) { + 
REPORT_CALL_ERROR("E19999", + "Insert node:%s(%s) between node:%s(%s)(out_index:%d) and node:%s(%s)(out_index:%d) failed", + reshape->GetName().c_str(), reshape->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str(), src_anchor->GetIdx(), + dst_node->GetName().c_str(), dst_node->GetType().c_str(), dst_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to insert reshape between node %s and %s", node->GetName().c_str(), dst_node->GetName().c_str()); return INTERNAL_ERROR; diff --git a/ge/graph/passes/resource_pair_add_control_pass.cc b/ge/graph/passes/resource_pair_add_control_pass.cc index 29a19f26..cd93725d 100755 --- a/ge/graph/passes/resource_pair_add_control_pass.cc +++ b/ge/graph/passes/resource_pair_add_control_pass.cc @@ -83,6 +83,9 @@ Status ResourcePairAddControlPass::Run(ComputeGraphPtr graph) { GE_CHECK_NOTNULL(to_anchor); graphStatus ret = from_anchor->LinkTo(to_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s) link control edge to op:%s(%s) failed", + from_node->GetName().c_str(), from_node->GetType().c_str(), + to_node->GetName().c_str(), to_node->GetType().c_str()); GELOGE(PARAM_INVALID, "link fail, from_node:%s, to_node:%s, from_type:%s, to_type:%s", from_node->GetName().c_str(), to_node->GetName().c_str(), resource_type_pair.first.c_str(), resource_type_pair.second.c_str()); diff --git a/ge/graph/passes/resource_pair_remove_control_pass.cc b/ge/graph/passes/resource_pair_remove_control_pass.cc index 7048ed1d..64b04a13 100755 --- a/ge/graph/passes/resource_pair_remove_control_pass.cc +++ b/ge/graph/passes/resource_pair_remove_control_pass.cc @@ -82,6 +82,9 @@ Status ResourcePairRemoveControlPass::Run(ComputeGraphPtr graph) { auto to_anchor = to_node->GetInControlAnchor(); graphStatus ret = from_anchor->Unlink(to_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s) unlink control edge to op:%s(%s) failed", + from_node->GetName().c_str(), from_node->GetType().c_str(), + 
to_node->GetName().c_str(), to_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "unlink fail, from_node:%s, to_node:%s, from_type:%s, to_type:%s", from_node->GetName().c_str(), to_node->GetName().c_str(), resource_type_pair.first.c_str(), resource_type_pair.second.c_str()); diff --git a/ge/graph/passes/same_transdata_breadth_fusion_pass.cc b/ge/graph/passes/same_transdata_breadth_fusion_pass.cc index 44778dd3..c0a3328e 100644 --- a/ge/graph/passes/same_transdata_breadth_fusion_pass.cc +++ b/ge/graph/passes/same_transdata_breadth_fusion_pass.cc @@ -71,6 +71,7 @@ OpDescPtr SameTransdataBreadthFusionPass::GetCastOp(const GeTensorDesc &in_desc, auto cast_op = ge::OpDescUtils::GetOpDescFromOperator(node_op); node_op.BreakConnect(); if (cast_op == nullptr) { + REPORT_INNER_ERROR("E19999", "Create Operator:%s(%s) failed", cast_op_name.str().c_str(), CAST); GELOGE(INTERNAL_ERROR, "new fusion cast op failed!"); return nullptr; } @@ -96,6 +97,8 @@ OpDescPtr SameTransdataBreadthFusionPass::GetCastOp(const GeTensorDesc &in_desc, } } if (!AttrUtils::SetInt(cast_op, CAST_ATTR_DST_TYPE, static_cast(out_desc.GetDataType()))) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", CAST_ATTR_DST_TYPE.c_str(), + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set dst_type attr failed"); return nullptr; } @@ -204,6 +207,12 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkDataOutput2PreNode(const NodeP GELOGI("remove edge.src:%s, dst:%s", out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(out_anchor, transdata_peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), + out_anchor->GetOwnerNode()->GetType().c_str(), out_anchor->GetIdx(), + transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str(), + 
transdata_peer_in_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "remove edge failed!src node:%s, dst node:%s", transdata_node->GetName().c_str(), transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str()); return GRAPH_FAILED; @@ -211,6 +220,12 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkDataOutput2PreNode(const NodeP GELOGI("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, transdata_peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), pre_out_anchor->GetIdx(), + transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add edge failed!src node:%s, dst node:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str()); @@ -231,6 +246,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutDataPeerInControlNodes2PreN GELOGD("remove edge.src:%s, dst:%s", out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(out_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), + out_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "remove edge failed!src node:%s, dst node:%s", transdata_node->GetName().c_str(), 
transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); return GRAPH_FAILED; @@ -240,6 +260,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutDataPeerInControlNodes2PreN GELOGD("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add edge failed!src node:%s, dst node:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); @@ -249,6 +274,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutDataPeerInControlNodes2PreN GELOGD("add edge.src node:%s, dst node:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(transdata_peer_out_control_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add edge failed!src node:%s, dst node:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); @@ -290,6 +320,11 @@ graphStatus 
SameTransdataBreadthFusionPass::ReLinkOutControlPeerInControlAnchors GELOGD("remove edge.src:%s, dst:%s", transdata_node_keep->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(out_control_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + out_control_anchor->GetOwnerNode()->GetName().c_str(), + out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "remove transdata control edge failed!"); return GRAPH_FAILED; } @@ -298,6 +333,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInControlAnchors GELOGD("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add control edge failed!"); return GRAPH_FAILED; } @@ -305,6 +345,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInControlAnchors GELOGD("add edge.src:%s, dst:%s", transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(transdata_peer_out_control_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + 
transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add control edge failed!"); return GRAPH_FAILED; } @@ -329,6 +374,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInDataAnchors( GELOGD("remove edge.src:%s, dst:%s", transdata_node_keep->GetName().c_str(), transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(out_control_anchor, transdata_peer_in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + out_control_anchor->GetOwnerNode()->GetName().c_str(), + out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "remove transdata control edge failed!"); return GRAPH_FAILED; } @@ -337,6 +387,12 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInDataAnchors( GELOGD("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, transdata_peer_in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), pre_out_anchor->GetIdx(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_data_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add control edge failed!"); return GRAPH_FAILED; } @@ -344,6 +400,11 @@ graphStatus 
SameTransdataBreadthFusionPass::ReLinkOutControlPeerInDataAnchors( GELOGD("add edge.src:%s, dst:%s", transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(transdata_peer_out_control_anchor, transdata_peer_in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add control edge failed!"); return GRAPH_FAILED; } @@ -460,6 +521,12 @@ graphStatus SameTransdataBreadthFusionPass::RelinkRemainTransdata(const ComputeG GELOGI("add edge.out node %s, in node %s", head_node->GetName().c_str(), transdata_node_keep->GetName().c_str()); if (GraphUtils::AddEdge(head_node_anchor, transdata_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + head_node_anchor->GetOwnerNode()->GetName().c_str(), + head_node_anchor->GetOwnerNode()->GetType().c_str(), head_node_anchor->GetIdx(), + transdata_in_anchor->GetOwnerNode()->GetName().c_str(), + transdata_in_anchor->GetOwnerNode()->GetType().c_str(), + transdata_in_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add edge failed!out node %s, in node %s", head_node->GetName().c_str(), transdata_node_keep->GetName().c_str()); return GRAPH_FAILED; @@ -545,6 +612,12 @@ graphStatus SameTransdataBreadthFusionPass::ReuseNodesBeforeTransdata(int anchor GELOGI("add edge.src:%s, dst:%s", transdata_node_keep->GetName().c_str(), head_node_peer_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(transdata_out_anchor, head_node_peer_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge 
between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + transdata_out_anchor->GetOwnerNode()->GetName().c_str(), + transdata_out_anchor->GetOwnerNode()->GetType().c_str(), transdata_out_anchor->GetIdx(), + head_node_peer_anchor->GetOwnerNode()->GetName().c_str(), + head_node_peer_anchor->GetOwnerNode()->GetType().c_str(), + head_node_peer_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add edge.src:%s, dst:%s", transdata_node_keep->GetName().c_str(), head_node_peer_anchor->GetOwnerNode()->GetName().c_str()); return GRAPH_FAILED; @@ -562,6 +635,8 @@ graphStatus SameTransdataBreadthFusionPass::ReuseNodesBeforeTransdata(int anchor auto input_desc = in_op_desc->GetInputDesc(in_data_anchor->GetIdx()); CopyTensorDesc(transdata_output_desc, input_desc); if (in_op_desc->UpdateInputDesc(in_data_anchor->GetIdx(), input_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input:%d desc in op:%s(%s) failed", in_data_anchor->GetIdx(), + in_op_desc->GetName().c_str(), in_op_desc->GetType().c_str()); GELOGE(FAILED, "UpdateInputDesc fail."); return FAILED; } @@ -569,6 +644,8 @@ graphStatus SameTransdataBreadthFusionPass::ReuseNodesBeforeTransdata(int anchor auto output_desc = in_op_desc->GetOutputDesc(output_idx); CopyTensorDesc(transdata_output_desc, output_desc); GE_IF_BOOL_EXEC(in_op_desc->UpdateOutputDesc(output_idx, output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Update output:%d desc in op:%s(%s) failed", output_idx, + in_op_desc->GetName().c_str(), in_op_desc->GetType().c_str()); GELOGE(GRAPH_FAILED, "update input desc failed"); return GRAPH_FAILED); // relink control edge @@ -610,6 +687,13 @@ graphStatus SameTransdataBreadthFusionPass::LinkNewCastNode2RemainTransdata( GELOGI("remove edge.src:%s, dst:%s", transdata_peer_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_remove_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(transdata_peer_out_anchor, transdata_remove_in_anchor) != GRAPH_SUCCESS) { + 
REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + transdata_peer_out_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_out_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_out_anchor->GetIdx(), + transdata_remove_in_anchor->GetOwnerNode()->GetName().c_str(), + transdata_remove_in_anchor->GetOwnerNode()->GetType().c_str(), + transdata_remove_in_anchor->GetIdx()); return GRAPH_FAILED; } @@ -642,6 +726,9 @@ graphStatus SameTransdataBreadthFusionPass::LinkNewCastNode2RemainTransdata( } if (graph->RemoveNode(transdata_node_remove) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) from graph:%s failed", + transdata_node_remove->GetName().c_str(), transdata_node_remove->GetType().c_str(), + graph->GetName().c_str()); GELOGE(GRAPH_FAILED, "remove node %s failed!", transdata_node_remove->GetName().c_str()); return GRAPH_FAILED; } @@ -660,6 +747,10 @@ graphStatus SameTransdataBreadthFusionPass::RelinkInControlEdge(const NodePtr &n GELOGD("remove edge.src:%s, dst:%s", peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), node_src->GetName().c_str()); if (GraphUtils::RemoveEdge(peer_out_control_anchor, node_src->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + node_src->GetName().c_str(), node_src->GetType().c_str()); GELOGE(GRAPH_FAILED, "remove edge faliled!src:%s, dst:%s", peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), node_src->GetName().c_str()); return GRAPH_FAILED; @@ -667,6 +758,10 @@ graphStatus SameTransdataBreadthFusionPass::RelinkInControlEdge(const NodePtr &n GELOGD("add edge.src:%s, dst:%s", peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), node_dst->GetName().c_str()); if (GraphUtils::AddEdge(peer_out_control_anchor, 
node_dst->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + node_dst->GetName().c_str(), node_dst->GetType().c_str()); GELOGE(GRAPH_FAILED, "add edge failed!src:%s, dst:%s", peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), node_dst->GetName().c_str()); return GRAPH_FAILED; @@ -713,10 +808,16 @@ graphStatus SameTransdataBreadthFusionPass::AddCastNode(const ComputeGraphPtr &g auto cast_node = graph->AddNode(cast_op_desc); if (cast_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + cast_op_desc->GetName().c_str(), cast_op_desc->GetType().c_str(), graph->GetName().c_str()); return GRAPH_FAILED; } GELOGD("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), cast_node->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, cast_node->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:0) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), pre_out_anchor->GetIdx(), + cast_node->GetName().c_str(), cast_node->GetType().c_str()); return GRAPH_FAILED; } if (i == 0) { @@ -724,6 +825,8 @@ graphStatus SameTransdataBreadthFusionPass::AddCastNode(const ComputeGraphPtr &g } if (!AttrUtils::SetBool(cast_op_desc, ATTR_NEED_COMPILE, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NEED_COMPILE.c_str(), + cast_op_desc->GetName().c_str(), cast_op_desc->GetType().c_str()); GELOGE(FAILED, "SetExtAttr fail."); return FAILED; } @@ -738,6 +841,7 @@ graphStatus SameTransdataBreadthFusionPass::GetSubGraphsBetweenNormalAndTransdat std::vector> &nodes_list) { graphStatus ret = GRAPH_SUCCESS; if (out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", 
"Param out_anchor is nullptr, check invalid"); GELOGE(GRAPH_FAILED, "out data anchor is null!This should not happen!"); return GRAPH_FAILED; } diff --git a/ge/graph/passes/save_pass.cc b/ge/graph/passes/save_pass.cc index a2e34b1d..b82a6420 100755 --- a/ge/graph/passes/save_pass.cc +++ b/ge/graph/passes/save_pass.cc @@ -47,7 +47,9 @@ Status SavePass::Run(ge::ComputeGraphPtr graph) { out_index.emplace_back(out_anchor->GetIdx()); ge::OpDescPtr op_desc = peer_node->GetOpDesc(); GE_IF_BOOL_EXEC(!ge::AttrUtils::SetStr(op_desc, kVarAttrVarIsSave, kVarIsSave), - GELOGE(INTERNAL_ERROR, "get kVarAttrVarIsSave failed"); return INTERNAL_ERROR); + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", kVarAttrVarIsSave, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "get kVarAttrVarIsSave failed"); return INTERNAL_ERROR); } } } @@ -65,6 +67,8 @@ Status SavePass::Run(ge::ComputeGraphPtr graph) { for (auto &node_ptr : del_nodes) { auto ret = graph->RemoveNode(node_ptr); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) from graph:%s failed", + node_ptr->GetName().c_str(), node_ptr->GetType().c_str(), graph->GetName().c_str()); GELOGE(ret, "GraphUtils::RemoveNodeWithoutRelink failed."); return ret; } diff --git a/ge/graph/passes/set_input_output_offset_pass.cc b/ge/graph/passes/set_input_output_offset_pass.cc index ec41d6be..d3c1e07d 100644 --- a/ge/graph/passes/set_input_output_offset_pass.cc +++ b/ge/graph/passes/set_input_output_offset_pass.cc @@ -54,6 +54,8 @@ Status SetInputOutputOffsetPass::SetInputOffsetForFusion(const std::vector input_offset_of_node; input_offset_of_node = op_desc->GetInputOffset(); if (input_offset_of_node.size() < i) { + REPORT_INNER_ERROR("E19999", "Input offsets size:%zu of node:%s(%s) < index:%zu, check invalid", + input_offset_of_node.size(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), i); GELOGE(PARAM_INVALID, "not get input_offset of %zu", i); return PARAM_INVALID; } 
@@ -77,10 +79,15 @@ Status SetInputOutputOffsetPass::SetInputOffsetForFusion(const std::vectorGetName().c_str(), data_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed."); return FAILED); GE_CHK_BOOL_EXEC( ge::AttrUtils::SetListInt(data_op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_RELATIVE_OFFSET.c_str(), + data_op_desc->GetName().c_str(), data_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_relative_offset failed."); return FAILED); } @@ -115,10 +122,15 @@ Status SetInputOutputOffsetPass::SetInputOffsetForHcom(const ge::NodePtr &node, zero_copy_basic_offset.emplace_back(output_offset); zero_copy_relative_offset.emplace_back(relative_offset); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(in_op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_ZERO_COPY_BASIC_OFFSET.c_str(), + in_op_desc->GetName().c_str(), in_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed."); return FAILED); GE_CHK_BOOL_EXEC( ge::AttrUtils::SetListInt(in_op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_RELATIVE_OFFSET.c_str(), + in_op_desc->GetName().c_str(), in_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_relative_offset failed."); return FAILED); } @@ -159,6 +171,9 @@ Status SetInputOutputOffsetPass::SetOutputOffsetForConcat(const NodePtr &node) { output_offset_of_concat = op_desc->GetOutputOffset(); // phony_concat has one output GE_IF_BOOL_EXEC(output_offset_of_concat.size() != 1, + REPORT_INNER_ERROR("E19999", "Output offsets size:%zu of node:%s(%s) not equal to 1, check invalid", + output_offset_of_concat.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); 
GELOGE(PARAM_INVALID, "%s should has one output.", node->GetName().c_str()); return PARAM_INVALID); NodePtr net_output = node->GetOutDataNodes().at(0); @@ -186,9 +201,14 @@ Status SetInputOutputOffsetPass::SetOutputOffsetForConcat(const NodePtr &node) { zero_copy_relative_offset.emplace_back(relative_offset); } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(out_op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_BASIC_OFFSET.c_str(), + out_op_desc->GetName().c_str(), out_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(out_op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_ZERO_COPY_RELATIVE_OFFSET.c_str(), + out_op_desc->GetName().c_str(), out_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_relative_offset failed."); return FAILED); return SUCCESS; @@ -232,9 +252,14 @@ Status SetInputOutputOffsetPass::SetOutputOffsetForHcom(const NodePtr &node, con } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(out_op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_BASIC_OFFSET.c_str(), + out_op_desc->GetName().c_str(), out_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(out_op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_ZERO_COPY_RELATIVE_OFFSET.c_str(), + out_op_desc->GetName().c_str(), out_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_relative_offset failed."); return FAILED); return SUCCESS; diff --git a/ge/graph/passes/snapshot_pass.cc b/ge/graph/passes/snapshot_pass.cc index 
2b578e51..469a70af 100644 --- a/ge/graph/passes/snapshot_pass.cc +++ b/ge/graph/passes/snapshot_pass.cc @@ -29,6 +29,8 @@ Status SnapshotPass::Run(NodePtr &node) { string type; Status status_ret = GetOriginalType(node, type); if (status_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get OriginalType of op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(status_ret, "SnapshotPass get original type failed."); return status_ret; } diff --git a/ge/graph/passes/stop_gradient_pass.cc b/ge/graph/passes/stop_gradient_pass.cc index 223e4513..33d07803 100644 --- a/ge/graph/passes/stop_gradient_pass.cc +++ b/ge/graph/passes/stop_gradient_pass.cc @@ -20,12 +20,15 @@ namespace ge { Status StopGradientPass::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return FAILED; } string type; Status status_ret = GetOriginalType(node, type); if (status_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get OriginalType of op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(status_ret, "StopGradientPass get original type failed."); return status_ret; } diff --git a/ge/graph/passes/subexpression_migration_pass.cc b/ge/graph/passes/subexpression_migration_pass.cc index dc4d2185..d70ed05d 100755 --- a/ge/graph/passes/subexpression_migration_pass.cc +++ b/ge/graph/passes/subexpression_migration_pass.cc @@ -144,6 +144,8 @@ Status SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_INNER_ERROR("E19999", "Get subgraph from graph:%s by name:%s failed", + graph->GetName().c_str(), name.c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -156,6 +158,8 @@ Status 
SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap uint32_t parent_index = 0; if (!AttrUtils::GetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", data->GetName().c_str()); return FAILED; } @@ -229,6 +233,7 @@ bool SubexpressionMigrationPass::IsParallelNodeSame(const mapsecond; auto data_it = data_nodes.find(node_idx); if (data_it == data_nodes.end()) { + REPORT_INNER_ERROR("E19999", "Find node in data_nodes by index:%u failed", node_idx); GELOGE(FAILED, "Data: %s not fount, index: %u", base_node->GetName().c_str(), node_idx); return false; } @@ -238,12 +243,15 @@ bool SubexpressionMigrationPass::IsParallelNodeSame(const mapGetPeerInDataAnchors(); const auto &in_anchor = in_anchors.at(anchor_idx); if (in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%u anchor not exist in out:%u data anchor's peer of node:%s(%s)", + node_idx, kDataOutIndex, work_data->GetName().c_str(), work_data->GetType().c_str()); GELOGE(FAILED, "Data anchor size: %u, anchor size: %zu", anchor_idx, in_anchors.size()); return false; } const auto &work_node = in_anchor->GetOwnerNode(); if (work_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Owner node of anchor is nullptr, check invalid"); GELOGE(FAILED, "Data: %s not found, index: %u", base_node->GetName().c_str(), node_idx); return false; } @@ -279,7 +287,7 @@ Status SubexpressionMigrationPass::GraphNodeMigration(const ComputeGraphPtr &gra const auto &in_anchor = in_anchors.at(i); const auto &base_node = in_anchor->GetOwnerNode(); GELOGD("Get Data direct node: %s", base_node->GetName().c_str()); - if (!base_node->GetHostNode()) { + if (!base_node->GetHostNode() || base_node->GetType() == SWITCH) { continue; } @@ -338,17 +346,22 @@ Status 
SubexpressionMigrationPass::AppendParallelNode(mapGetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); return FAILED; } if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, item.second)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); return FAILED; } @@ -392,12 +405,14 @@ Status SubexpressionMigrationPass::DetachParallelNode(const mapGetAllOutDataAnchorsSize(); ++i) { auto it_idx = outputs.find(i); if (it_idx == outputs.end()) { + REPORT_INNER_ERROR("E19999", "Node: %s parent index %u not found, check invalid", detach->GetName().c_str(), i); GELOGE(FAILED, "Node: %s parent index %u not found", detach->GetName().c_str(), i); return FAILED; } auto it_data = graph_datas.find(it_idx->second); if (it_data == graph_datas.end()) { + REPORT_INNER_ERROR("E19999", "Node: %s parent index %u not found, check invalid", detach->GetName().c_str(), i); GELOGE(FAILED, "Node: %s parent index %u not found", detach->GetName().c_str(), i); return FAILED; } @@ -444,6 +459,7 @@ Status SubexpressionMigrationPass::AttachParallelNode(const ComputeGraphPtr &gra for (uint32_t i = 0; i < attach->GetAllInDataAnchorsSize(); ++i) { auto it_idx = inputs.find(i); if (it_idx == inputs.end()) { + REPORT_INNER_ERROR("E19999", "Node: %s parent index %u not found, check invalid", attach->GetName().c_str(), i); GELOGE(FAILED, "Node: %s parent index %u not found", attach->GetName().c_str(), i); return FAILED; } @@ -505,6 +521,7 @@ Status SubexpressionMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph uint32_t anchor_idx, const map &inputs, const map &outputs) { if (inputs.empty()) { + REPORT_INNER_ERROR("E19999", "Param inputs is empty, check invalid"); GELOGE(FAILED, "Graph: %s, inputs is empty", 
graph->GetName().c_str()); return FAILED; } @@ -516,6 +533,8 @@ Status SubexpressionMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const auto &subnodes = groups.second; auto it = subnodes.find(base_index); if (it == subnodes.end()) { + REPORT_INNER_ERROR("E19999", "Index:%u data node not found in graph:%s, check invalid", + base_index, subgraph->GetName().c_str()); GELOGE(FAILED, "Graph: %s, Data: %u node not found", subgraph->GetName().c_str(), base_index); return FAILED; } @@ -525,12 +544,15 @@ Status SubexpressionMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const auto &in_anchors = out_anchor->GetPeerInDataAnchors(); const auto &in_anchor = in_anchors.at(anchor_idx); if (in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%u anchor not exist in out:%u data anchor's peer of node:%s(%s)", + anchor_idx, kDataOutIndex, base_data->GetName().c_str(), base_data->GetType().c_str()); GELOGE(FAILED, "Data anchor index: %u, anchor size: %zu", anchor_idx, in_anchors.size()); return FAILED; } move_node = in_anchor->GetOwnerNode(); if (move_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Owner node of anchor is nullptr, check invalid"); GELOGE(FAILED, "Data: %s not found, index: %u", base_data->GetName().c_str(), base_index); return FAILED; } diff --git a/ge/graph/passes/subgraph_const_migration_pass.cc b/ge/graph/passes/subgraph_const_migration_pass.cc index d27cacf7..3b3b7e0b 100644 --- a/ge/graph/passes/subgraph_const_migration_pass.cc +++ b/ge/graph/passes/subgraph_const_migration_pass.cc @@ -141,6 +141,8 @@ Status SubgraphConstMigrationPass::ClassifyGraphNodes(const ComputeGraphPtr &gra for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_INNER_ERROR("E19999", "Get subgraph from graph:%s by name:%s failed", + graph->GetName().c_str(), name.c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return 
GE_GRAPH_EMPTY_SUBGRAPH; } @@ -152,6 +154,8 @@ Status SubgraphConstMigrationPass::ClassifyGraphNodes(const ComputeGraphPtr &gra if (node->GetType() == DATA) { uint32_t parent_index = kInvalidParent; if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } @@ -166,8 +170,8 @@ Status SubgraphConstMigrationPass::ClassifyGraphNodes(const ComputeGraphPtr &gra string node_full_name = peer_node->GetName(); size_t pos = node_full_name.find(kMbatchNodeNameMark); if (pos == string::npos) { - GELOGE(FAILED, "find: %s of multi-batch in node: %s", kMbatchNodeNameMark.c_str(), node_full_name.c_str()); - return FAILED; + GELOGI("Can not find: %s of multi-batch in node: %s", kMbatchNodeNameMark.c_str(), node_full_name.c_str()); + continue; } string fixed_name = node_full_name.substr(0, pos); @@ -326,17 +330,22 @@ Status SubgraphConstMigrationPass::AppendParallelNode(const NodePtr &func_node, OpDescBuilder op_builder(data_name, DATA); const auto op_desc = op_builder.AddInput("x").AddOutput("y").Build(); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Build op:%s(%s) failed", data_name.c_str(), DATA); GELOGE(OUT_OF_MEMORY, "Create multi-batch subgraph data desc failed"); return OUT_OF_MEMORY; } uint32_t data_index = parent_index - kCaseInputBase; if (!AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); return FAILED; } if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + op_desc->GetName().c_str(), 
op_desc->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); return FAILED; } @@ -460,6 +469,8 @@ Status SubgraphConstMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const map> &all_data_nodes, const string &node_key, uint32_t parent_index) { if (node_key.empty() || parent_index == kInvalidParent) { + REPORT_INNER_ERROR("E19999", "Param node_key is empty or param parent_index is 0x%X, check invalid", + kInvalidParent); GELOGE(FAILED, "Graph: %s, node key: %s, parent index: %u invalid", graph->GetName().c_str(), node_key.c_str(), parent_index); return FAILED; @@ -470,6 +481,8 @@ Status SubgraphConstMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const auto &subgraph = item.first; const auto it_const = item.second.find(node_key); if (it_const == item.second.end()) { + REPORT_INNER_ERROR("E19999", "Const node name:%s not found in graph:%s, check invalid", + node_key.c_str(), subgraph->GetName().c_str()); GELOGE(FAILED, "Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str()); return FAILED; } @@ -477,11 +490,15 @@ Status SubgraphConstMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const auto it_nodes = all_data_nodes.find(subgraph); if (it_nodes == all_data_nodes.end()) { + REPORT_INNER_ERROR("E19999", "Const node name:%s not found in graph:%s, check invalid", + node_key.c_str(), subgraph->GetName().c_str()); GELOGE(FAILED, "Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str()); return FAILED; } const auto it_data = it_nodes->second.find(parent_index); if (it_data == it_nodes->second.end()) { + REPORT_INNER_ERROR("E19999", "Const node name:%s not found in graph:%s, check invalid", + node_key.c_str(), subgraph->GetName().c_str()); GELOGE(FAILED, "Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str()); return FAILED; } diff --git a/ge/graph/passes/subgraph_pass.cc 
b/ge/graph/passes/subgraph_pass.cc index f140644e..b931eea8 100755 --- a/ge/graph/passes/subgraph_pass.cc +++ b/ge/graph/passes/subgraph_pass.cc @@ -94,6 +94,8 @@ Status SubgraphPass::SubgraphInputNode(const ComputeGraphPtr &graph, const NodeP uint32_t parent_index = 0; if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Get attr PARENT_NODE_INDEX failed, node:%s.", node->GetName().c_str()); return FAILED; } @@ -208,6 +210,8 @@ Status SubgraphPass::WhileBodySubgraph(const ComputeGraphPtr &graph, const NodeP // index of body_subgraph is 1 ComputeGraphPtr while_body = NodeUtils::GetSubgraph(*node, 1); if (while_body == nullptr) { + REPORT_INNER_ERROR("E19999", "While_body of node:%s(%s) is nullptr, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "while_body of %s is NULL.", node->GetName().c_str()); return FAILED; } @@ -242,12 +246,16 @@ Status SubgraphPass::WhileBodySubgraph(const ComputeGraphPtr &graph, const NodeP if (output_node == nullptr) { output_node = n; } else { + REPORT_INNER_ERROR("E19999", "While_body graph:%s exists multi NetOutput nodes, check invalid", + while_body->GetName().c_str()); GELOGE(FAILED, "while_body %s exists multi NetOutput nodes.", while_body->GetName().c_str()); return FAILED; } } } if (output_node == nullptr) { + REPORT_INNER_ERROR("E19999", "While_body graph:%s has no output, check invalid", + while_body->GetName().c_str()); GELOGE(FAILED, "while_body %s has no output.", while_body->GetName().c_str()); return FAILED; } @@ -462,6 +470,10 @@ Status SubgraphPass::InsertMemcpyNode(const ComputeGraphPtr &graph, const OutDat (void)AttrUtils::SetBool(op_desc, ATTR_NO_NEED_CONSTANT_FOLDING, false); (void)AttrUtils::SetBool(op_desc, ATTR_NAME_CANNOT_BE_DELETED, true); if 
(GraphUtils::InsertNodeAfter(out_anchor, in_anchors, graph->AddNode(op_desc)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Insert Cast node %s(%s) after %s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + out_anchor->GetOwnerNode()->GetName().c_str(), + out_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(FAILED, "Insert IDENTITY node %s after %s failed.", name.c_str(), in_node->GetName().c_str()); return FAILED; } @@ -481,6 +493,9 @@ Status SubgraphPass::InsertMemcpyNode(const ComputeGraphPtr &graph, const OutDat Status SubgraphPass::InsertNodeBetween(const OutDataAnchorPtr &src, const std::vector &dsts, const NodePtr &insert_node, uint32_t input_index, uint32_t output_index) { if (GraphUtils::AddEdge(src, insert_node->GetInDataAnchor(input_index)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) failed", + src->GetOwnerNode()->GetName().c_str(), src->GetOwnerNode()->GetType().c_str(), src->GetIdx(), + insert_node->GetName().c_str(), insert_node->GetType().c_str(), input_index); GELOGE(FAILED, "Add data_edge %s:%d->%s:%u failed.", src->GetOwnerNode()->GetName().c_str(), src->GetIdx(), insert_node->GetName().c_str(), input_index); return FAILED; @@ -490,6 +505,12 @@ Status SubgraphPass::InsertNodeBetween(const OutDataAnchorPtr &src, const std::v dst->GetOwnerNode()->GetName().c_str()); if ((GraphUtils::RemoveEdge(src, dst) != GRAPH_SUCCESS) || (GraphUtils::AddEdge(insert_node->GetOutDataAnchor(output_index), dst) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) or " + "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) failed", + src->GetOwnerNode()->GetName().c_str(), src->GetOwnerNode()->GetType().c_str(), src->GetIdx(), + dst->GetOwnerNode()->GetName().c_str(), dst->GetOwnerNode()->GetType().c_str(), dst->GetIdx(), + insert_node->GetName().c_str(), insert_node->GetType().c_str(), output_index, 
+ dst->GetOwnerNode()->GetName().c_str(), dst->GetOwnerNode()->GetType().c_str(), dst->GetIdx()); GELOGE(FAILED, "Replace data_edge %s:%d->%s:%d by %s:%u->%s:%d failed.", src->GetOwnerNode()->GetName().c_str(), src->GetIdx(), dst->GetOwnerNode()->GetName().c_str(), dst->GetIdx(), diff --git a/ge/graph/passes/switch_data_edges_bypass.cc b/ge/graph/passes/switch_data_edges_bypass.cc index f7453dd7..6a925ae3 100644 --- a/ge/graph/passes/switch_data_edges_bypass.cc +++ b/ge/graph/passes/switch_data_edges_bypass.cc @@ -50,6 +50,8 @@ bool IsSwitchInWhileLoop(const NodePtr &node) { std::vector> GetOutDataNodesByIndex(const NodePtr &node, int index) { auto out_anchor = node->GetOutDataAnchor(index); if (out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d out data anchor, check invalid", + node->GetName().c_str(), node->GetType().c_str(), index); GELOGE(PARAM_INVALID, "Failed to get out data nodes of index %d from node %s, the anchor does not exists", index, node->GetName().c_str()); return {}; @@ -84,18 +86,23 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { auto node_desc = node->GetOpDesc(); if (node_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Failed to add identity after node %s index %d, the op desc is null", node->GetName().c_str(), index); return nullptr; } auto tensor = node_desc->GetOutputDescPtr(index); if (tensor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d output tensor, check invalid", + node_desc->GetName().c_str(), node_desc->GetType().c_str(), index); GELOGE(INTERNAL_ERROR, "Failed to find the tensor by index %d from node %s, can not add the identity node", index, node->GetName().c_str()); return nullptr; } auto anchor = node->GetOutDataAnchor(index); if (anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d out data anchor, check invalid", + node->GetName().c_str(), 
node->GetType().c_str(), index); GELOGE(OUT_OF_MEMORY, "Failed to add identity after node %s index %d, the out anchor does not exists", node->GetName().c_str(), index); return nullptr; @@ -104,6 +111,7 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { auto identity_opdesc = MakeShared("SwitchDataEdgesByPass_Identity_" + std::to_string(identity_counter), IDENTITY); if (identity_opdesc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to add identity after node %s index %d", node->GetName().c_str(), index); return nullptr; } @@ -111,6 +119,9 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { auto ret2 = identity_opdesc->AddOutputDesc("y", *tensor); auto identity = node->GetOwnerComputeGraph()->AddNode(identity_opdesc); if (ret1 != GRAPH_SUCCESS || ret2 != GRAPH_SUCCESS || identity == nullptr) { + REPORT_CALL_ERROR("E19999", "Add input ouput desc to op:%s(%s) failed or add it to graph:%s failed", + identity_opdesc->GetName().c_str(), identity_opdesc->GetType().c_str(), + node->GetOwnerComputeGraph()->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to add identity after node %s index %d", node->GetName().c_str(), index); return nullptr; } @@ -124,18 +135,23 @@ NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { auto node_desc = node->GetOpDesc(); if (node_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Failed to add memcpy before node %s index %d, null op desc", node->GetName().c_str(), index); return nullptr; } auto tensor = node_desc->GetInputDescPtr(index); if (tensor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d input tensor, check invalid", + node_desc->GetName().c_str(), node_desc->GetType().c_str(), index); GELOGE(INTERNAL_ERROR, "Failed to find the tensor by index %d from node %s, can not add the memcpy node", index, node->GetName().c_str()); return nullptr; } auto anchor = 
node->GetInDataAnchor(index); if (anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d in data anchor, check invalid", + node->GetName().c_str(), node->GetType().c_str(), index); GELOGE(INTERNAL_ERROR, "Failed to add memcpy before node %s index %d, the in anchor does not exists", node->GetName().c_str(), index); return nullptr; @@ -143,6 +159,7 @@ NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { auto memcpy_opdesc = MakeShared("SwitchDataEdgesByPass_Memcpy_" + std::to_string(counter), MEMCPYASYNC); if (memcpy_opdesc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to add memcpy before node %s index %d", node->GetName().c_str(), index); return nullptr; } @@ -150,6 +167,9 @@ NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { auto ret2 = memcpy_opdesc->AddOutputDesc(*tensor); auto memcpy_node = node->GetOwnerComputeGraph()->AddNode(memcpy_opdesc); if (ret1 != GRAPH_SUCCESS || ret2 != GRAPH_SUCCESS || memcpy_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add input ouput desc to op:%s(%s) failed or add it to graph:%s failed", + memcpy_opdesc->GetName().c_str(), memcpy_opdesc->GetType().c_str(), + node->GetOwnerComputeGraph()->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to add memcpy before node %s index %d", node->GetName().c_str(), index); return nullptr; } diff --git a/ge/graph/passes/switch_dead_branch_elimination.cc b/ge/graph/passes/switch_dead_branch_elimination.cc index 70105aea..b840bfc7 100644 --- a/ge/graph/passes/switch_dead_branch_elimination.cc +++ b/ge/graph/passes/switch_dead_branch_elimination.cc @@ -31,6 +31,7 @@ const int kDefaultInputIndex = -1; bool ParsePred(const ConstGeTensorPtr &tensor) { if (tensor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param tensor is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return false; } @@ -65,6 +66,8 @@ bool ParseOutDataAnchors(const NodePtr &node, const NodePtr &pred_node, OutDataA 
OutDataAnchorPtr &inactive_out_data_anchor) { auto tensors = OpDescUtils::MutableWeights(pred_node); if (tensors.empty()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no weight, check invalid", + pred_node->GetName().c_str(), pred_node->GetType().c_str()); return false; } @@ -72,6 +75,7 @@ bool ParseOutDataAnchors(const NodePtr &node, const NodePtr &pred_node, OutDataA int inactive_output_index = pred_value ? 0 : 1; if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return false; } @@ -91,11 +95,21 @@ bool ParseOutDataAnchors(const NodePtr &node, const NodePtr &pred_node, OutDataA Status SwitchDeadBranchElimination::DeleteSwitchNode(NodePtr &node, NodePtr &pred_node, const OutDataAnchorPtr &active_out_data_anchor) { if (node == nullptr || active_out_data_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node or active_out_data_anchor is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return FAILED; } + + // If two nodes aren't in same graph, get node's direct in_node instead of pred_node. 
+ if (node->GetOwnerComputeGraph() != pred_node->GetOwnerComputeGraph()) { + pred_node = PassUtils::GetInDataNode(node, kPredInputIndex); + } + // link pred's in control nodes to switch if (GraphUtils::CopyInCtrlEdges(pred_node, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy in control edge from node:%s(%s) to node:%s(%s) failed", + pred_node->GetName().c_str(), pred_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } // Remove link between pred and switch @@ -108,6 +122,8 @@ Status SwitchDeadBranchElimination::DeleteSwitchNode(NodePtr &node, NodePtr &pre std::vector switch_io_map = {kDefaultInputIndex, kDefaultInputIndex}; size_t out_index = static_cast(active_out_data_anchor->GetIdx()); if (out_index >= switch_io_map.size()) { + REPORT_INNER_ERROR("E19999", "Out index:%zu of node:%s(%s) >= %zu, check invalid", out_index, + node->GetName().c_str(), node->GetType().c_str(), switch_io_map.size()); GELOGE(FAILED, "[%s] out index check failed, out_index:%zu.", node->GetName().c_str(), out_index); return FAILED; } @@ -117,6 +133,7 @@ Status SwitchDeadBranchElimination::DeleteSwitchNode(NodePtr &node, NodePtr &pre Status SwitchDeadBranchElimination::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Param [node] must not be null."); return PARAM_INVALID; } @@ -131,7 +148,7 @@ Status SwitchDeadBranchElimination::Run(NodePtr &node) { return SUCCESS; } - auto pred_node = PassUtils::GetInDataNode(node, kPredInputIndex); + auto pred_node = PassUtils::GetInNodeCrossSubgraphByIndex(node, kPredInputIndex); if (pred_node == nullptr) { GELOGD("[%s] Pred input is null.", node->GetName().c_str()); return SUCCESS; @@ -143,7 +160,7 @@ Status SwitchDeadBranchElimination::Run(NodePtr &node) { return SUCCESS; } - auto input_node = PassUtils::GetInDataNode(node, kDataInputIndex); + auto input_node = 
PassUtils::GetInNodeCrossSubgraphByIndex(node, kDataInputIndex); if (input_node == nullptr) { GELOGD("[%s] Data input is null.", node->GetName().c_str()); return SUCCESS; @@ -162,6 +179,8 @@ Status SwitchDeadBranchElimination::Run(NodePtr &node) { std::vector end_nodes; Status ret = PassUtils::RemoveInactiveBranchToMerge(inactive_out_data_anchor, del_nodes, end_nodes); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove inactive branch from node:%s(%s) to merge failed", + node->GetName().c_str(), node->GetType().c_str()); return ret; } diff --git a/ge/graph/passes/switch_logic_remove_pass.cc b/ge/graph/passes/switch_logic_remove_pass.cc index a6758e86..bce714ad 100644 --- a/ge/graph/passes/switch_logic_remove_pass.cc +++ b/ge/graph/passes/switch_logic_remove_pass.cc @@ -45,11 +45,15 @@ Status GetPredNode(const NodePtr &switch_node, PredNodeAndOut &pred_node_index) GE_CHECK_NOTNULL(switch_node); auto pred_in_anchor = switch_node->GetInDataAnchor(kSwitchPredIndex); if (pred_in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d in data anchor, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), kSwitchPredIndex); GELOGE(INTERNAL_ERROR, "Failed to get pred node for switch %s, no pred anchor", switch_node->GetName().c_str()); return INTERNAL_ERROR; } auto pred_node_anchor = pred_in_anchor->GetPeerOutAnchor(); if (pred_node_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s)'s index:%d in data anchor, its peer anchor is nullptr, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), kSwitchPredIndex); GELOGE(INTERNAL_ERROR, "Failed to get pred node for switch %s, node peer out anchor", switch_node->GetName().c_str()); @@ -57,6 +61,8 @@ Status GetPredNode(const NodePtr &switch_node, PredNodeAndOut &pred_node_index) } auto pred_node = pred_node_anchor->GetOwnerNode(); if (pred_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s)'s index:%d in data anchor, 
its peer node is nullptr, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), kSwitchPredIndex); GELOGE(INTERNAL_ERROR, "Failed to get pred node for switch %s, null node", switch_node->GetName().c_str()); @@ -89,11 +95,15 @@ Status SwitchLogicRemovePass::Run(NodePtr &node) { } for (auto &in_anchor : out_anchor->GetPeerInDataAnchors()) { if (in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s)'s index:%d out data anchor, its peer anchors has nullptr, " + "check invalid", node->GetName().c_str(), node->GetType().c_str(), i); GELOGE(INTERNAL_ERROR, "The in-anchor from out anchor %d node %s is null", i, node->GetName().c_str()); return INTERNAL_ERROR; } auto dst_node = in_anchor->GetOwnerNode(); if (dst_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s)'s index:%d out data anchor, its peer nodes has nullptr, " + "check invalid", node->GetName().c_str(), node->GetType().c_str(), i); GELOGE(INTERNAL_ERROR, "The peer node from out anchor %d node %s is null", i, node->GetName().c_str()); return INTERNAL_ERROR; } @@ -143,6 +153,8 @@ Status SwitchLogicRemovePass::RemoveSwitchNodeLogically(int parent_index, NodePt std::vector end_nodes; auto ret = PassUtils::RemoveInactiveBranchToMerge(out_anchor, deleted_nodes, end_nodes); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove inactive branch from node:%s(%s) to merge failed", + switch_node->GetName().c_str(), switch_node->GetType().c_str()); return ret; } diff --git a/ge/graph/passes/switch_to_stream_switch_pass.cc b/ge/graph/passes/switch_to_stream_switch_pass.cc index 392968e7..97d9926f 100644 --- a/ge/graph/passes/switch_to_stream_switch_pass.cc +++ b/ge/graph/passes/switch_to_stream_switch_pass.cc @@ -33,8 +33,14 @@ Status SwitchToStreamSwitchPass::Run(ComputeGraphPtr graph) { GE_CHK_STATUS_RET(CombineSwitchNode(graph), "Combine StreamSwitch nodes failed."); for (const auto &node : bypass_nodes_) { - GE_CHK_BOOL_EXEC(graph->IsolateNode(node) == GRAPH_SUCCESS, 
return FAILED, "Isolate node failed."); - GE_CHK_BOOL_EXEC(GraphUtils::RemoveNodeWithoutRelink(graph, node) == GRAPH_SUCCESS, return FAILED, + GE_CHK_BOOL_EXEC(graph->IsolateNode(node) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); + return FAILED, "Isolate node failed."); + GE_CHK_BOOL_EXEC(GraphUtils::RemoveNodeWithoutRelink(graph, node) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); + return FAILED, "Remove switch node failed."); } @@ -159,7 +165,11 @@ Status SwitchToStreamSwitchPass::ReplaceSwitchNode(const ComputeGraphPtr &graph, OpDescPtr cond_desc = peer_cond_anchor->GetOwnerNode()->GetOpDesc(); GE_CHECK_NOTNULL(cond_desc); DataType cond_data_type = cond_desc->GetOutputDesc(peer_cond_anchor->GetIdx()).GetDataType(); - GE_CHK_BOOL_EXEC(cond_data_type == DT_BOOL, return FAILED, + GE_CHK_BOOL_EXEC(cond_data_type == DT_BOOL, + REPORT_INNER_ERROR("E19999", "Pred_input of Switch node:%s(%s) only support DT_BOOL data_type, " + "but %s exactly", switch_node->GetName().c_str(), switch_node->GetType().c_str(), + TypeUtils::DataTypeToSerialString(cond_data_type).c_str()); + return FAILED, "pred_input of Switch only support DT_BOOL data_type, but %s exactly.", TypeUtils::DataTypeToSerialString(cond_data_type).c_str()); @@ -176,6 +186,8 @@ Status SwitchToStreamSwitchPass::ReplaceSwitchNode(const ComputeGraphPtr &graph, stream_switch = CreateStreamSwitchNode(graph, switch_node, true_branch_flag ? 
"_t" : "_f", peer_cond_anchor); GE_CHK_BOOL_EXEC(stream_switch != nullptr, return FAILED, "Create stream_switch node failed."); if (SetSwitchTrueBranchFlag(stream_switch, true_branch_flag) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set switch true branch flag from node:%s(%s) failed", + stream_switch->GetName().c_str(), stream_switch->GetType().c_str()); GELOGE(FAILED, "SetSwitchTrueBranchFlag for node %s failed.", stream_switch->GetName().c_str()); return FAILED; } @@ -204,6 +216,8 @@ Status SwitchToStreamSwitchPass::ReplaceSwitchNode(const ComputeGraphPtr &graph, MoveCtrlEdges(switch_node, stream_switch); switch_node_map_[stream_switch] = out_node_list; if (SetOriginalNodeName(stream_switch, switch_node->GetName()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set original node name:%s to node:%s(%s) failed", switch_node->GetName().c_str(), + stream_switch->GetName().c_str(), stream_switch->GetType().c_str()); GELOGE(FAILED, "SetOriginalNodeName for node %s failed.", stream_switch->GetName().c_str()); return FAILED; } @@ -230,6 +244,10 @@ Status SwitchToStreamSwitchPass::BypassSwitchNode(const NodePtr &switch_node, Ou GE_CHECK_NOTNULL(peer_out_anchor); // Remove Switch data input. 
if (GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) failed", + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), peer_out_anchor->GetIdx(), + switch_node->GetName().c_str(), switch_node->GetType().c_str(), idx); GELOGE(FAILED, "Remove data edge %s->%s failed.", peer_out_anchor->GetOwnerNode()->GetName().c_str(), switch_node->GetName().c_str()); return FAILED; @@ -284,8 +302,13 @@ NodePtr SwitchToStreamSwitchPass::CreateStreamSwitchNode(const ComputeGraphPtr & const std::string &suffix, const OutDataAnchorPtr &peer_cond_anchor) { OpDescPtr switch_op_desc = switch_node->GetOpDesc(); - GE_CHK_BOOL_EXEC(switch_op_desc != nullptr, return nullptr, "OpDesc of Switch node is invalid."); + GE_CHK_BOOL_EXEC(switch_op_desc != nullptr, + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); + return nullptr, "OpDesc of Switch node is invalid."); GE_IF_BOOL_EXEC(switch_op_desc->GetInputsSize() != SWITCH_INPUT_NUM, { + REPORT_INNER_ERROR("E19999", "Input desc size:%zu of node:%s(%s) not equal to %u, check invalid", + switch_op_desc->GetInputsSize(), + switch_op_desc->GetName().c_str(), switch_op_desc->GetType().c_str(), SWITCH_INPUT_NUM); GELOGE(FAILED, "Switch input param invalid, input_size=%lu, should be %u.", switch_op_desc->GetInputsSize(), SWITCH_INPUT_NUM); return nullptr; @@ -295,6 +318,7 @@ NodePtr SwitchToStreamSwitchPass::CreateStreamSwitchNode(const ComputeGraphPtr & GELOGI("Create StreamSwitch, name=%s.", node_name.c_str()); OpDescPtr op_desc = MakeShared(node_name, STREAMSWITCH); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, StreamSwitch:%s.", node_name.c_str()); return nullptr; } @@ -307,8 +331,18 @@ NodePtr SwitchToStreamSwitchPass::CreateStreamSwitchNode(const ComputeGraphPtr & 
hccl_group_id.c_str()); } + int64_t switch_type; + if (AttrUtils::GetInt(switch_node->GetOpDesc(), ATTR_NAME_STREAM_SWITCH_TYPE, switch_type)) { + (void)AttrUtils::SetInt(op_desc, ATTR_NAME_STREAM_SWITCH_TYPE, switch_type); + GELOGD("Set attr ATTR_NAME_STREAM_SWITCH_TYPE for Stream_Switch %s, value is %ld.", node_name.c_str(), + switch_type); + } + if (!AttrUtils::SetInt(op_desc, ATTR_NAME_SWITCH_DATA_TYPE, RT_SWITCH_INT32) || !AttrUtils::SetInt(op_desc, ATTR_NAME_STREAM_SWITCH_COND, (int64_t)RT_EQUAL)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s or Attr:%s to op:%s(%s) failed", + ATTR_NAME_SWITCH_DATA_TYPE.c_str(), ATTR_NAME_STREAM_SWITCH_COND.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set int failed"); return nullptr; } @@ -316,13 +350,22 @@ NodePtr SwitchToStreamSwitchPass::CreateStreamSwitchNode(const ComputeGraphPtr & // Already checked, first input is Variable will passed, second is condition will checked. GeTensorDesc cond_input_desc = switch_op_desc->GetInputDesc(SWITCH_PRED_INPUT); GeTensorDesc input_desc(GeShape(cond_input_desc.GetShape().GetDims()), cond_input_desc.GetFormat(), DT_INT32); - GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(input_desc) == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(input_desc) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return nullptr, "Create StreamSwitch node: add input desc failed."); - GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(input_desc) == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(input_desc) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add ouput desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return nullptr, "Create StreamSwitch node: add input desc failed."); NodePtr stream_switch = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(stream_switch != nullptr, return nullptr, "Insert 
StreamSwitch node failed."); + GE_CHK_BOOL_EXEC(stream_switch != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); + return nullptr, "Insert StreamSwitch node failed."); GE_CHK_STATUS(GraphUtils::AddEdge(peer_cond_anchor, stream_switch->GetInDataAnchor(0)), "StreamSwitch node add cond edge failed."); @@ -354,6 +397,8 @@ Status SwitchToStreamSwitchPass::MarkBranches(const OutDataAnchorPtr &peer_cond_ it->second[switch_group_id] = switch_list; } else { GE_IF_BOOL_EXEC(switch_group_it->second.size() != SWITCH_OUTPUT_NUM, { + REPORT_INNER_ERROR("E19999", "switch group size:%zu not equal to %u, group_id:%ld, check invalid", + switch_group_it->second.size(), SWITCH_OUTPUT_NUM, switch_group_id); GELOGE(INTERNAL_ERROR, "Check size failed, node: %s", stream_switch->GetName().c_str()); return FAILED; }); @@ -436,6 +481,8 @@ Status SwitchToStreamSwitchPass::CombineSwitchNode(const ComputeGraphPtr &graph) GE_CHK_STATUS(GraphUtils::AddEdge(cast_node->GetOutControlAnchor(), active_node->GetInControlAnchor()), "StreamActive add ctl edge failed."); if (SetActiveLabelList(active_node, { cast_node->GetName() }) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set active label list:%s to op:%s(%s) failed", + cast_node->GetName().c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(FAILED, "Set active_label_list attr for node %s failed.", active_node->GetName().c_str()); return FAILED; } @@ -448,6 +495,14 @@ Status SwitchToStreamSwitchPass::CombineSwitchNode(const ComputeGraphPtr &graph) // select first stream_switch NodePtr stream_switch = switch_list.front(); + // set stream_label + if (SetStreamLabel(stream_switch, cast_node->GetName()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + cast_node->GetName().c_str(), stream_switch->GetName().c_str(), + stream_switch->GetType().c_str()); + GELOGE(FAILED, "Set 
stream label failed."); + return FAILED; + } OpDescPtr switch_desc = stream_switch->GetOpDesc(); GE_CHECK_NOTNULL(switch_desc); switch_desc->SetName(CheckDuplicateName(cond_group + "/" + STREAMSWITCH + (true_branch_flag ? "_t" : "_f"))); @@ -488,18 +543,27 @@ NodePtr SwitchToStreamSwitchPass::CreateActiveNode(const ComputeGraphPtr &graph, GELOGI("Create StreamActive op:%s.", node_name.c_str()); OpDescPtr op_desc = MakeShared(node_name, STREAMACTIVE); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, StreamActive:%s.", node_name.c_str()); return nullptr; } NodePtr active_node = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(active_node != nullptr, return nullptr, "Create StreamActive node failed."); + GE_CHK_BOOL_EXEC(active_node != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); + return nullptr, "Create StreamActive node failed."); GE_IF_BOOL_EXEC(GraphUtils::AddEdge(node->GetOutControlAnchor(), active_node->GetInControlAnchor()) != SUCCESS, + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add edge failed"); return nullptr); GE_IF_BOOL_EXEC(SetSwitchBranchNodeLabel(active_node, node_name) != SUCCESS, + REPORT_CALL_ERROR("E19999", "Set switch branch node label:%s to node:%s(%s) failed", + node_name.c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set switch branch node label failed"); return nullptr); @@ -520,6 +584,7 @@ NodePtr SwitchToStreamSwitchPass::CreateCastOp(const ComputeGraphPtr &graph, con GELOGI("Create cast_node: %s, input datatype:DT_BOOL, out datatype:DT_INT32", cast_name.c_str()); OpDescPtr cast_desc = MakeShared(cast_name, CAST); if (cast_desc == 
nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, Cast:%s.", cast_name.c_str()); return nullptr; } @@ -527,6 +592,10 @@ NodePtr SwitchToStreamSwitchPass::CreateCastOp(const ComputeGraphPtr &graph, con AttrUtils::SetInt(cast_desc, CAST_ATTR_DSTT, (int64_t)DT_INT32) && AttrUtils::SetInt(cast_desc, CAST_ATTR_DST_TYPE, (int64_t)DT_INT32) && AttrUtils::SetBool(cast_desc, CAST_ATTR_TRUNCATE, false))) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s or %s or %s or %s to op:%s(%s) failed", + CAST_ATTR_SRCT.c_str(), CAST_ATTR_DSTT.c_str(), + CAST_ATTR_DST_TYPE.c_str(), CAST_ATTR_TRUNCATE.c_str(), + cast_desc->GetName().c_str(), cast_desc->GetType().c_str()); GELOGE(FAILED, "Set CAST_ATTR_SRCT or CAST_ATTR_DSTT or CAST_ATTR_DST_TYPE or CAST_ATTR_TRUNCATE failed, node: %s.", cast_name.c_str()); return nullptr; @@ -534,14 +603,24 @@ NodePtr SwitchToStreamSwitchPass::CreateCastOp(const ComputeGraphPtr &graph, con GeTensorDesc tensor_desc = cond_desc->GetOutputDesc(peer_cond_anchor->GetIdx()); tensor_desc.SetDataType(DT_BOOL); - GE_CHK_BOOL_EXEC(cast_desc->AddInputDesc(tensor_desc) == SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(cast_desc->AddInputDesc(tensor_desc) == SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + cast_desc->GetName().c_str(), cast_desc->GetType().c_str()); + return nullptr, "Cast_node add input desc failed."); tensor_desc.SetDataType(DT_INT32); - GE_CHK_BOOL_EXEC(cast_desc->AddOutputDesc(tensor_desc) == SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(cast_desc->AddOutputDesc(tensor_desc) == SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + cast_desc->GetName().c_str(), cast_desc->GetType().c_str()); + return nullptr, "Cast_node add output desc failed."); NodePtr cast_node = graph->AddNode(cast_desc); - GE_CHK_BOOL_EXEC(cast_node != nullptr, return nullptr, "Create cast_node failed."); + GE_CHK_BOOL_EXEC(cast_node != nullptr, + REPORT_CALL_ERROR("E19999", 
"Add node:%s(%s) to graph:%s failed", + cast_desc->GetName().c_str(), cast_desc->GetType().c_str(), + graph->GetName().c_str()); + return nullptr, "Create cast_node failed."); // Cast node has and only has one input GE_CHK_STATUS(GraphUtils::AddEdge(peer_cond_anchor, cast_node->GetInDataAnchor(0)), "Cast add data edge failed."); @@ -558,13 +637,18 @@ Status SwitchToStreamSwitchPass::AddConstNode(const ComputeGraphPtr &graph, cons OpDescPtr op_desc = stream_switch->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); bool value = false; - GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), return FAILED, + GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", + ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return FAILED, "StreamSwitch get attr TRUE_BRANCH_STREAM failed."); const std::string &const_node_name = op_desc->GetName() + "_Constant_" + (value ? 
"t" : "f"); GELOGI("Create const op: %s", const_node_name.c_str()); OpDescPtr const_op_desc = MakeShared(const_node_name, CONSTANT); if (const_op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, Constant:%s.", const_node_name.c_str()); return FAILED; } @@ -574,15 +658,26 @@ Status SwitchToStreamSwitchPass::AddConstNode(const ComputeGraphPtr &graph, cons GeTensorPtr const_value = MakeShared(data_desc, reinterpret_cast(&resize_value), sizeof(int32_t)); if (const_value == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "Create tensor failed."); return FAILED; } - GE_CHK_BOOL_EXEC(AttrUtils::SetTensor(const_op_desc, ATTR_NAME_WEIGHTS, const_value), return FAILED); - GE_CHK_BOOL_EXEC(const_op_desc->AddOutputDesc(data_desc) == GRAPH_SUCCESS, return FAILED, + GE_CHK_BOOL_EXEC(AttrUtils::SetTensor(const_op_desc, ATTR_NAME_WEIGHTS, const_value), + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + const_op_desc->GetName().c_str(), const_op_desc->GetType().c_str()); + return FAILED); + GE_CHK_BOOL_EXEC(const_op_desc->AddOutputDesc(data_desc) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + const_op_desc->GetName().c_str(), const_op_desc->GetType().c_str()); + return FAILED, "Create Const op: add output desc failed."); NodePtr const_node = graph->AddNode(const_op_desc); - GE_CHK_BOOL_EXEC(const_node != nullptr, return FAILED, "Insert Const node failed."); + GE_CHK_BOOL_EXEC(const_node != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + const_op_desc->GetName().c_str(), const_op_desc->GetType().c_str(), + graph->GetName().c_str()); + return FAILED, "Insert Const node failed."); GE_CHK_STATUS(GraphUtils::AddEdge(const_node->GetOutDataAnchor(0), stream_switch->GetInDataAnchor(1)), "StreamSwitch node add ctl edge failed."); @@ -604,6 +699,8 @@ Status 
SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_no OpDescPtr switch_desc = switch_node->GetOpDesc(); GE_CHECK_NOTNULL(switch_desc); if (!AttrUtils::GetStr(switch_desc, ATTR_NAME_ORIG_NODE_NAME, orig_switch_name) || orig_switch_name.empty()) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_ORIG_NODE_NAME.c_str(), + switch_desc->GetName().c_str(), switch_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get attr ATTR_NAME_ORIG_NODE_NAME failed, node: %s", switch_desc->GetName().c_str()); return INTERNAL_ERROR; } @@ -625,6 +722,8 @@ Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_no auto find_res1 = switch_node_map_.find(in_ctrl_node); GE_IF_BOOL_EXEC(find_res1 == switch_node_map_.end(), { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) can't find in switch_node_map_, check invalid", + in_ctrl_node->GetName().c_str(), in_ctrl_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "StreamSwitch node %s not found in switch_node_map_.", in_ctrl_node->GetName().c_str()); return INTERNAL_ERROR; }); @@ -653,10 +752,14 @@ Status SwitchToStreamSwitchPass::ModifySwitchOutCtlEdges(const NodePtr &switch_n stream_switch->GetName().c_str(), active_node->GetName().c_str()); auto find_res = switch_node_map_.find(switch_node); GE_IF_BOOL_EXEC(find_res == switch_node_map_.end(), { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) can't find in switch_node_map_, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "StreamSwitch node %s not found in switch_node_map_.", switch_node->GetName().c_str()); return INTERNAL_ERROR; }); GE_IF_BOOL_EXEC(find_res->second.empty(), { + REPORT_INNER_ERROR("E19999", "True_nodes of StreamSwitch node:%s(%s) is empty, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "true_nodes of StreamSwitch node %s is empty.", switch_node->GetName().c_str()); return INTERNAL_ERROR; }); @@ 
-669,6 +772,8 @@ Status SwitchToStreamSwitchPass::ModifySwitchOutCtlEdges(const NodePtr &switch_n std::string orig_name = op_desc->GetName(); GE_IF_BOOL_EXEC(op_desc->HasAttr(ATTR_NAME_ORIG_NODE_NAME), { if (!AttrUtils::GetStr(op_desc, ATTR_NAME_ORIG_NODE_NAME, orig_name) || orig_name.empty()) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_ORIG_NODE_NAME.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get attr ATTR_NAME_ORIG_NODE_NAME failed, node: %s.", op_desc->GetName().c_str()); return INTERNAL_ERROR; } diff --git a/ge/graph/passes/transop_breadth_fusion_pass.cc b/ge/graph/passes/transop_breadth_fusion_pass.cc index 654c3822..a52f4389 100644 --- a/ge/graph/passes/transop_breadth_fusion_pass.cc +++ b/ge/graph/passes/transop_breadth_fusion_pass.cc @@ -31,6 +31,7 @@ Status TransOpBreadthFusionPass::Run(ge::ComputeGraphPtr graph) { // breadth fusion pass requires new topologic Status ret_topo = graph->TopologicalSorting(); if (ret_topo != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Topological sorting for graph:%s failed", graph->GetName().c_str()); GELOGE(ret_topo, "TopologicalSorting the merged graph failed."); return ret_topo; } @@ -60,7 +61,9 @@ std::string TransOpBreadthFusionPass::GetNodeId(const int anchor_index, const No bool trans_format = false; bool trans_shape = false; - GE_IF_BOOL_EXEC(node == nullptr || node->GetOpDesc() == nullptr, GELOGE(FAILED, "node is null"); return ""); + GE_IF_BOOL_EXEC(node == nullptr || node->GetOpDesc() == nullptr, + REPORT_INNER_ERROR("E19999", "Param node or its op_desc is nullptr, check invalid"); + GELOGE(FAILED, "node is null"); return ""); if (node->GetType() == CAST) { trans_data_type = true; } else if (node->GetType() == TRANSPOSE || node->GetType() == TRANSPOSED || node->GetType() == EXPANDDIMS) { diff --git a/ge/graph/passes/transop_depth_fusion_pass.cc b/ge/graph/passes/transop_depth_fusion_pass.cc index 85106e08..05b55307 100755 --- 
a/ge/graph/passes/transop_depth_fusion_pass.cc +++ b/ge/graph/passes/transop_depth_fusion_pass.cc @@ -82,6 +82,7 @@ graphStatus TransOpDepthFusionPass::RecursiveInDepth(const InDataAnchorPtr &dst_ if (dst_in_anchor == nullptr || dst_in_anchor->GetOwnerNode() == nullptr || dst_in_anchor->GetOwnerNode()->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param dst_in_anchor related node info has nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return GRAPH_FAILED; } @@ -257,11 +258,13 @@ graphStatus TransOpDepthFusionPass::RelinkEdges(const OutDataAnchorPtr &new_out_ const OutDataAnchorPtr &old_out_anchor, const InDataAnchorPtr &in_data_anchor) { if (new_out_anchor == nullptr || old_out_anchor == nullptr || in_data_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param anchor info has nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "new_out_anchor or old_out_anchor or in_data_anchor is nullptr"); return GRAPH_FAILED; } if (new_out_anchor->GetOwnerNode() == nullptr || old_out_anchor->GetOwnerNode() == nullptr || in_data_anchor->GetOwnerNode() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param anchor info owner node has nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "anchor's owner node is nullptr"); return GRAPH_FAILED; } @@ -305,11 +308,14 @@ graphStatus TransOpDepthFusionPass::RemoveNode(const NodePtr &node, const ge::Co return GRAPH_FAILED; } if (GraphUtils::IsolateNode(node, {0}) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) failed", node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", node->GetName().c_str(), node->GetType().c_str()); return GRAPH_FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Remove node: %s, type: %s 
without relink failed", node->GetName().c_str(), node->GetType().c_str()); return GRAPH_FAILED; diff --git a/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc b/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc index b207abe9..78c60eda 100644 --- a/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc +++ b/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc @@ -99,6 +99,9 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt auto in_data_anchors = node->GetAllInDataAnchors(); auto out_data_anchors = node->GetAllOutDataAnchors(); if (in_data_anchors.size() != out_data_anchors.size()) { + REPORT_INNER_ERROR("E19999", "In data anchors size:%zu not equal to out data anchors size:%zu in node:%s(%s), " + "check invalid", in_data_anchors.size(), out_data_anchors.size(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "in and out data anchor size are not equal, node=%s, in_size=%zu, out_size=%zu", node->GetName().c_str(), in_data_anchors.size(), out_data_anchors.size()); return FAILED; @@ -143,6 +146,8 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt // delete in_node if (IsolateAndDeleteNode(in_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + in_node->GetName().c_str(), in_node->GetType().c_str()); GELOGE(FAILED, "remove node %s failed", in_node->GetName().c_str()); return FAILED; } @@ -150,6 +155,8 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt // delete out_node if (IsolateAndDeleteNode(out_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + out_node->GetName().c_str(), out_node->GetType().c_str()); GELOGE(FAILED, "remove node %s failed", out_node->GetName().c_str()); return FAILED; } @@ -162,9 +169,13 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt auto input_desc = 
in_node->GetOpDesc()->GetInputDesc(0); auto output_desc = out_node->GetOpDesc()->GetOutputDesc(0); if (node->GetOpDesc()->UpdateInputDesc(static_cast(i), input_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input:%zu desc in op:%s(%s) failed", + i, node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "UpdateInputDesc fail."); } if (node->GetOpDesc()->UpdateOutputDesc(static_cast(i), output_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update output:%zu desc in op:%s(%s) failed", + i, node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "UpdateOutputDesc"); } GELOGI("successfully remove paired transop (%s and %s) for node %s", diff --git a/ge/graph/passes/transop_symmetry_elimination_pass.cc b/ge/graph/passes/transop_symmetry_elimination_pass.cc index 9db3aea1..2ea7fac1 100644 --- a/ge/graph/passes/transop_symmetry_elimination_pass.cc +++ b/ge/graph/passes/transop_symmetry_elimination_pass.cc @@ -172,6 +172,12 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const // 1.Unlink T1->T2 auto ret = src_out_anchor->Unlink(dst_in_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d unlink from op:%s(%s) in index:%d failed", + src_out_anchor->GetOwnerNode()->GetName().c_str(), + src_out_anchor->GetOwnerNode()->GetType().c_str(), src_out_anchor->GetIdx(), + dst_in_anchor->GetOwnerNode()->GetName().c_str(), + dst_in_anchor->GetOwnerNode()->GetType().c_str(), dst_in_anchor->GetIdx()); GELOGE(FAILED, "Unlink data anchor from %s to %s.", src_node->GetName().c_str(), dst_node->GetName().c_str()); return ret; } @@ -183,6 +189,11 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const auto pre_normal_node = in_anchor->GetPeerOutAnchor()->GetOwnerNode(); ret = GraphUtils::AddEdge(in_anchor->GetPeerOutAnchor(), dst_in_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and 
op:%s(%s)(index:%d) failed", + pre_normal_node->GetName().c_str(), pre_normal_node->GetType().c_str(), + in_anchor->GetPeerOutAnchor()->GetIdx(), + dst_in_anchor->GetOwnerNode()->GetName().c_str(), + dst_in_anchor->GetOwnerNode()->GetType().c_str(), dst_in_anchor->GetIdx()); GELOGE(FAILED, "Add data edge from %s to %s failed.", pre_normal_node->GetName().c_str(), dst_node->GetName().c_str()); return ret; @@ -190,6 +201,9 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const // 3.Copy in-control/data-in-control from T1->T2 ret = GraphUtils::CopyInCtrlEdges(src_node, dst_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy in control edge from node:%s(%s) to node:%s(%s) failed", + src_node->GetName().c_str(), src_node->GetType().c_str(), + dst_node->GetName().c_str(), dst_node->GetType().c_str()); GELOGE(FAILED, "Copy control edge from %s to %s failed.", src_node->GetName().c_str(), dst_node->GetName().c_str()); return ret; } @@ -198,6 +212,9 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const if (in_node->GetName() == pre_normal_node->GetName()) { continue; } ret = GraphUtils::AddEdge(in_node->GetOutControlAnchor(), dst_node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + in_node->GetName().c_str(), in_node->GetType().c_str(), + dst_node->GetName().c_str(), dst_node->GetType().c_str()); GELOGE(FAILED, "Add control edge from %s to %s failed.", in_node->GetName().c_str(), dst_node->GetName().c_str()); return ret; } @@ -205,6 +222,8 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const // 5.IsolateAndDelete T2, A will link to B automatically, and all control edge will also relink. 
ret = IsolateAndDeleteNode(dst_node, {0}); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + dst_node->GetName().c_str(), dst_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", dst_node->GetName().c_str(), dst_node->GetType().c_str()); return ret; @@ -223,6 +242,9 @@ Status TransOpSymmetryEliminationPass::RemoveTransOpWithoutOutput(NodePtr &pre_n // 6.1 Copy out control to pre normal node Status ret = GraphUtils::CopyOutCtrlEdges(trans_node, pre_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy out control edge from node:%s(%s) to node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str(), + pre_node->GetName().c_str(), pre_node->GetType().c_str()); GELOGE(FAILED, "Copy control edge from %s to %s failed.", trans_node->GetName().c_str(), pre_node->GetName().c_str()); return ret; @@ -230,6 +252,8 @@ Status TransOpSymmetryEliminationPass::RemoveTransOpWithoutOutput(NodePtr &pre_n // 6.2 Isolate and delete T1 ret = IsolateAndDeleteNode(trans_node, {}); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", trans_node->GetName().c_str(), trans_node->GetType().c_str()); return ret; diff --git a/ge/graph/passes/transop_without_reshape_fusion_pass.cc b/ge/graph/passes/transop_without_reshape_fusion_pass.cc index 6bea9edc..00896235 100644 --- a/ge/graph/passes/transop_without_reshape_fusion_pass.cc +++ b/ge/graph/passes/transop_without_reshape_fusion_pass.cc @@ -63,7 +63,10 @@ void TransOpWithoutReshapeFusionPass::SetRemainNode( continue; } GELOGI("SetRemainNode node is %s", op_desc->GetName().c_str()); - GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return); + 
GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", kRemainNode, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "set ext attr failed"); return); } } @@ -74,17 +77,29 @@ bool TransOpWithoutReshapeFusionPass::FormatContinuousCheck(const OutDataAnchorP return false; } auto in_node = in_anchor->GetOwnerNode(); - GE_IF_BOOL_EXEC(in_node == nullptr, GELOGE(INTERNAL_ERROR, "in_node is null"); return false); + GE_IF_BOOL_EXEC(in_node == nullptr, + REPORT_INNER_ERROR("E19999", "Param in_anchor's owner node is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "in_node is null"); return false); auto in_op = in_node->GetOpDesc(); auto out_owner_node = out_anchor->GetOwnerNode(); - GE_IF_BOOL_EXEC(out_owner_node == nullptr, GELOGE(INTERNAL_ERROR, "out_owner_node is null"); return false); + GE_IF_BOOL_EXEC(out_owner_node == nullptr, + REPORT_INNER_ERROR("E19999", "Param out_anchor's owner node is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "out_owner_node is null"); return false); auto out_op = out_owner_node->GetOpDesc(); - GE_IF_BOOL_EXEC(in_op == nullptr, GELOGE(INTERNAL_ERROR, "in_op is null"); return false); - GE_IF_BOOL_EXEC(out_op == nullptr, GELOGE(INTERNAL_ERROR, "out_op is null"); return false); + GE_IF_BOOL_EXEC(in_op == nullptr, + REPORT_INNER_ERROR("E19999", "Param in_anchor's owner op_desc is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "in_op is null"); return false); + GE_IF_BOOL_EXEC(out_op == nullptr, + REPORT_INNER_ERROR("E19999", "Param out_anchor's owner op_desc is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "out_op is null"); return false); auto in_op_desc = in_op->GetInputDescPtr(in_anchor->GetIdx()); auto out_op_desc = out_op->GetOutputDescPtr(out_anchor->GetIdx()); - GE_IF_BOOL_EXEC(in_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "in_op_desc is null"); return false); - GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(INTERNAL_ERROR, 
"out_op_desc is null"); return false); + GE_IF_BOOL_EXEC(in_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Param in_anchor corresponding tensor is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "in_op_desc is null"); return false); + GE_IF_BOOL_EXEC(out_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Param out_anchor corresponding tensor is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "out_op_desc is null"); return false); if (!ShapeEqualCheck(in_op_desc->GetShape(), out_op_desc->GetShape())) { return false; } @@ -357,6 +372,9 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkSubGraphControlEdges( GELOGI("add control edge.src:%s, dst:%s", out_owner_node->GetName().c_str(), in_owner_node->GetName().c_str()); if (GraphUtils::AddEdge(out_owner_node->GetOutControlAnchor(), in_owner_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str()); return GRAPH_FAILED; } } @@ -365,6 +383,9 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkSubGraphControlEdges( GELOGI("add out data 2 in contorl edge.src:%s, dst:%s", out_owner_node->GetName().c_str(), in_owner_node->GetName().c_str()); if (GraphUtils::AddEdge(out_anchor, in_owner_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str()); return GRAPH_FAILED; } } @@ -392,6 +413,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdgesWhenDescNotChange GELOGI("add control edge.src:%s, dst:%s, dst idx:%d", out_owner_node->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetIdx()); if 
(GraphUtils::AddEdge(out_owner_node->GetOutControlAnchor(), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -401,6 +426,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdgesWhenDescNotChange GELOGI("add control edge.src:%s, src idx:%d, dst:%s", peer_out_anchor->GetOwnerNode()->GetName().c_str(), peer_out_anchor->GetIdx(), in_owner_node->GetName().c_str()); if (GraphUtils::AddEdge(peer_out_anchor, in_owner_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str()); return GRAPH_FAILED; } } @@ -410,6 +439,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdgesWhenDescNotChange GELOGI("add out control 2 in data edge.src:%s, dst:%s, dst idx:%d", out_owner_node->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetIdx()); if (GraphUtils::AddEdge(out_owner_node->GetOutControlAnchor(), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -419,6 +452,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdgesWhenDescNotChange GELOGI("add out data 2 in control edge.src:%s, dst:%s, dst idx:%d", out_owner_node->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str(), 
peer_in_anchor->GetIdx()); if (GraphUtils::AddEdge(out_anchor, peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -443,6 +480,9 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkNodesWhenDescNotChanged( GELOGI("relink node.src node:%s, src idx:%d, dst node:%s, dst idx:%d", out_owner_node->GetName().c_str(), out_anchor->GetIdx(), in_owner_node->GetName().c_str(), in_anchor->GetIdx()); if (GraphUtils::AddEdge(out_anchor, in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), out_anchor->GetIdx(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str(), in_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add edge failed!src:%s, src idx:%d, dst:%s, dst idx:%d", out_owner_node->GetName().c_str(), out_anchor->GetIdx(), in_owner_node->GetName().c_str(), in_anchor->GetIdx()); return GRAPH_FAILED; @@ -466,16 +506,21 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetFormatTransferOp(const GeTensorDes format_transfer_op_name << "fusion_format_transfer_" << fusion_format_transfer_op_count; OpDescPtr format_transfer_op = MakeShared(format_transfer_op_name.str().c_str(), TRANSDATA); if (format_transfer_op == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(INTERNAL_ERROR, "new format transfer op failed!"); return nullptr; } GE_IF_BOOL_EXEC(!AttrUtils::SetInt(format_transfer_op, ATTR_NAME_INPUT_FORMAT, static_cast(format_trans_input_desc.GetFormat())), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_INPUT_FORMAT.c_str(), + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); 
GELOGE(INTERNAL_ERROR, "set ATTR_NAME_INPUT_FORMAT failed"); return nullptr); GE_IF_BOOL_EXEC(!AttrUtils::SetInt(format_transfer_op, ATTR_NAME_OUTPUT_FORMAT, static_cast(format_trans_output_desc.GetFormat())), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_OUTPUT_FORMAT.c_str(), + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_NAME_OUTPUT_FORMAT failed"); return nullptr); @@ -483,22 +528,32 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetFormatTransferOp(const GeTensorDes string dst_format = TypeUtils::FormatToSerialString(format_trans_output_desc.GetFormat()); GE_IF_BOOL_EXEC(!AttrUtils::SetStr(format_transfer_op, kAttrNameSrcFormat, src_format), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", kAttrNameSrcFormat, + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set kAttrNameSrcFormat failed"); return nullptr); GE_IF_BOOL_EXEC(!AttrUtils::SetStr(format_transfer_op, kAttrNameDstFormat, dst_format), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", kAttrNameDstFormat, + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set kAttrNameDstFormat failed"); return nullptr); GE_IF_BOOL_EXEC(format_transfer_op->AddInputDesc(format_trans_input_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add input desc failed"); return nullptr); GE_IF_BOOL_EXEC(format_transfer_op->AddOutputDesc(format_trans_output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add ouput desc to op:%s(%s) failed", + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add output desc failed"); return nullptr); GE_IF_BOOL_EXEC(!ge::AttrUtils::SetBool(format_transfer_op, 
ATTR_NEED_COMPILE, true), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NEED_COMPILE.c_str(), + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ext attr failed"); return nullptr); return format_transfer_op; @@ -515,6 +570,7 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetCastOp(const GeTensorDesc &cast_in auto cast_op = ge::OpDescUtils::GetOpDescFromOperator(node_op); node_op.BreakConnect(); if (cast_op == nullptr) { + REPORT_CALL_ERROR("E19999", "Create operator:%s(%s) failed", cast_op_name.str().c_str(), CAST); GELOGE(INTERNAL_ERROR, "new cast op failed!"); return nullptr; } @@ -522,29 +578,41 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetCastOp(const GeTensorDesc &cast_in const int default_output_index = 0; if (cast_op->GetInputsSize() == 0) { GE_IF_BOOL_EXEC(cast_op->AddInputDesc(cast_input_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add input desc failed"); return nullptr); } else { GE_IF_BOOL_EXEC(cast_op->UpdateInputDesc(default_input_index, cast_input_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Update input:%d desc of op:%s(%s) failed", default_input_index, + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "update input desc failed"); return nullptr); } if (cast_op->GetOutputsSize() == 0) { GE_IF_BOOL_EXEC(cast_op->AddOutputDesc(cast_output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add output desc failed"); return nullptr); } else { GE_IF_BOOL_EXEC(cast_op->UpdateOutputDesc(default_output_index, cast_output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Update output:%d desc of op:%s(%s) failed", default_output_index, + cast_op->GetName().c_str(), 
cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "update output desc failed"); return nullptr); } if (!AttrUtils::SetInt(cast_op, CAST_ATTR_DST_TYPE, static_cast(cast_output_desc.GetDataType()))) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", CAST_ATTR_DST_TYPE.c_str(), + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set dst_type attr failed"); return nullptr; } if (!AttrUtils::SetBool(cast_op, ATTR_NEED_COMPILE, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NEED_COMPILE.c_str(), + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set need_compile attr failed"); return nullptr; } @@ -879,6 +947,8 @@ graphStatus TransOpWithoutReshapeFusionPass::AddTransNode(const ComputeGraphPtr trans_node = graph->AddNode(transop); if (trans_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + transop->GetName().c_str(), transop->GetType().c_str(), graph->GetName().c_str()); GELOGE(GRAPH_FAILED, "add node failed!"); return GRAPH_FAILED; } @@ -945,6 +1015,9 @@ graphStatus TransOpWithoutReshapeFusionPass::InsertNewTransOp(const ComputeGraph GELOGI("add edge.src:%s, src idx:%d, dst:%s", out_anchor->GetOwnerNode()->GetName().c_str(), out_anchor->GetIdx(), new_trans_nodes.front()->GetName().c_str()); if (GraphUtils::AddEdge(out_anchor, new_trans_nodes.front()->GetInAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:0) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), out_anchor->GetIdx(), + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str()); return GRAPH_FAILED; } else { auto old_peer_in_anchor = begin_out.second; @@ -957,6 +1030,9 @@ graphStatus TransOpWithoutReshapeFusionPass::InsertNewTransOp(const ComputeGraph new_trans_nodes.back()->GetName().c_str()); if 
(GraphUtils::AddEdge(new_trans_nodes.front()->GetOutAnchor(0), new_trans_nodes.back()->GetInAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:0) failed", + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str(), + new_trans_nodes.back()->GetName().c_str(), new_trans_nodes.back()->GetType().c_str()); return GRAPH_FAILED; } else { auto old_peer_out_anchor = end_in.first; @@ -967,6 +1043,9 @@ graphStatus TransOpWithoutReshapeFusionPass::InsertNewTransOp(const ComputeGraph GELOGI("add edge.src:%s, dst:%s, dst idx:%d", new_trans_nodes.back()->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetIdx()); if (GraphUtils::AddEdge(new_trans_nodes.back()->GetOutAnchor(0), in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:%d) failed", + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str(), in_anchor->GetIdx()); return GRAPH_FAILED; } @@ -977,6 +1056,7 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, const vector &new_trans_nodes) { GE_CHECK_NOTNULL(out_anchor); if (new_trans_nodes.front() == nullptr || new_trans_nodes.back() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param new_trans_nodes front or back is nullptr, check invalid"); return GRAPH_FAILED; } if (sub_graph_has_control_edge_[index]) { @@ -984,6 +1064,9 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, new_trans_nodes.front()->GetName().c_str()); if (GraphUtils::AddEdge(out_anchor->GetOwnerNode()->GetOutControlAnchor(), new_trans_nodes.front()->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), 
out_anchor->GetOwnerNode()->GetType().c_str(), + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str()); return GRAPH_FAILED; } } @@ -993,6 +1076,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, GELOGI("add control edge.src:%s, dst:%s", new_trans_nodes.back()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(new_trans_nodes.back()->GetOutControlAnchor(), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + new_trans_nodes.back()->GetName().c_str(), new_trans_nodes.back()->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -1002,6 +1089,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, GELOGI("add control edge.src:%s, dst:%s", peer_out_anchor->GetOwnerNode()->GetName().c_str(), new_trans_nodes.front()->GetName().c_str()); if (GraphUtils::AddEdge(peer_out_anchor, new_trans_nodes.front()->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str()); return GRAPH_FAILED; } } @@ -1011,6 +1102,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, GELOGI("add control edge.src:%s, dst:%s", new_trans_nodes.back()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(new_trans_nodes.back()->GetOutControlAnchor(), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + new_trans_nodes.back()->GetName().c_str(), 
new_trans_nodes.back()->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -1020,6 +1115,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, GELOGI("add control edge.src:%s, dst:%s", new_trans_nodes.back()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(new_trans_nodes.back()->GetOutDataAnchor(0), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:%d) failed", + new_trans_nodes.back()->GetName().c_str(), new_trans_nodes.back()->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str(), peer_in_anchor->GetIdx()); return GRAPH_FAILED; } } @@ -1081,6 +1180,7 @@ graphStatus TransOpWithoutReshapeFusionPass::GetSubGraphsBetweenNormalNode( vector> &nodes_list) { graphStatus ret = GRAPH_SUCCESS; if (out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param out_anchor is nullptr, check invalid"); return GRAPH_FAILED; } diff --git a/ge/graph/passes/transpose_transdata_pass.cc b/ge/graph/passes/transpose_transdata_pass.cc index 810f5639..674804bd 100644 --- a/ge/graph/passes/transpose_transdata_pass.cc +++ b/ge/graph/passes/transpose_transdata_pass.cc @@ -34,11 +34,13 @@ const char *const kAttrNameSrcFormat = "src_format"; namespace ge { Status TransposeTransDataPass::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [node] must not be null."); return PARAM_INVALID; } auto op_desc = node->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node's op_desc is nullptr, check invalid"); GELOGE(PARAM_INVALID, "OpDesc of param [node] must not be null."); return PARAM_INVALID; } @@ -77,6 +79,7 @@ Status TransposeTransDataPass::Run(NodePtr 
&node) { GE_CHECK_NOTNULL(out_node); OpDescPtr out_op_desc = out_node->GetOpDesc(); if (out_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(FAILED, "OpDesc of out data node of [%s] must not be null.", node->GetName().c_str()); return FAILED; } @@ -111,6 +114,10 @@ Status TransposeTransDataPass::CheckOneInAndOneOutDataAnchor(NodePtr &node) cons // Trans op has one input data node, maybe has N output data nodes uint32_t in_data_node_nums = node->GetInDataNodes().size(); if (in_data_anchor_nums != 1 || out_data_anchor_nums != 1 || in_data_node_nums != 1) { + REPORT_INNER_ERROR("E19999", "In data anchor num:%u, out data anchor num:%u, in data node num:%u of node:%s(%s) " + "must be all equal to 1, check invalid", + in_data_anchor_nums, out_data_anchor_nums, in_data_node_nums, + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "[%s] %s has %u in %u out data anchor, has %u in data node.", node->GetType().c_str(), node->GetName().c_str(), in_data_anchor_nums, out_data_anchor_nums, in_data_node_nums); return FAILED; @@ -122,6 +129,8 @@ Status TransposeTransDataPass::RemoveTranspose(NodePtr &node) { GE_CHECK_NOTNULL(node); ComputeGraphPtr graph = node->GetOwnerComputeGraph(); if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Owner graph of node:%s(%s) is nullptr, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "[%s] The owner graph must not be null.", node->GetName().c_str()); return FAILED; } @@ -146,6 +155,8 @@ Status TransposeTransDataPass::RemoveTranspose(NodePtr &node) { } AddNodeDeleted(node); if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "[%s] RemoveNodeWithoutRelink failed.", node->GetName().c_str()); return FAILED; } diff --git 
a/ge/graph/passes/unused_args_clean_pass.cc b/ge/graph/passes/unused_args_clean_pass.cc index ec66b129..df70e99b 100755 --- a/ge/graph/passes/unused_args_clean_pass.cc +++ b/ge/graph/passes/unused_args_clean_pass.cc @@ -101,6 +101,8 @@ Status UnusedArgsCleanPass::ClassifyDataNodes(const ComputeGraphPtr &graph, cons for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_CALL_ERROR("E19999", "Get subgraph from graph:%s by name:%s failed", + graph->GetName().c_str(), name.c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -113,6 +115,8 @@ Status UnusedArgsCleanPass::ClassifyDataNodes(const ComputeGraphPtr &graph, cons uint32_t parent_index = 0; if (!AttrUtils::GetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", data->GetName().c_str()); return FAILED; } @@ -150,6 +154,8 @@ Status UnusedArgsCleanPass::UpdateInputTensor(const mapsecond; if (!AttrUtils::SetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, update_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Set parent index failed, name: %s", data->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/unused_const_pass.cc b/ge/graph/passes/unused_const_pass.cc index 7c57c53e..80e43d08 100644 --- a/ge/graph/passes/unused_const_pass.cc +++ b/ge/graph/passes/unused_const_pass.cc @@ -27,10 +27,12 @@ namespace ge { /// Status UnusedConstPass::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is 
null."); return FAILED; } if (node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node's op_desc is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [opDesc] must not be null."); return PARAM_INVALID; } diff --git a/ge/graph/passes/var_is_initialized_op_pass.cc b/ge/graph/passes/var_is_initialized_op_pass.cc index b9c752d8..e1f982d6 100644 --- a/ge/graph/passes/var_is_initialized_op_pass.cc +++ b/ge/graph/passes/var_is_initialized_op_pass.cc @@ -61,6 +61,8 @@ Status VarIsInitializedOpPass::CheckSrcNode(const NodePtr &node, bool &inited) c GE_CHECK_NOTNULL(node); auto input_nodes = node->GetInDataNodes(); if (input_nodes.size() != kVarIsInitializedIOCnt) { + REPORT_INNER_ERROR("E19999", "In data node num:%zu of node:%s(%s) not equal to %d, check invalid", + input_nodes.size(), node->GetName().c_str(), node->GetType().c_str(), kVarIsInitializedIOCnt); GELOGE(FAILED, "[%s] Node input data nodes size [%zu] is not equal 1.", node->GetName().c_str(), @@ -73,6 +75,9 @@ Status VarIsInitializedOpPass::CheckSrcNode(const NodePtr &node, bool &inited) c auto input_node_name = input_node->GetName(); auto input_node_type = input_node->GetType(); if (input_node_type != VARIABLE) { + REPORT_INNER_ERROR("E19999", "Index:%d In data node of node:%s(%s), type:%s not %s, check invalid", + kVarIsInitVarInputIndex, node->GetName().c_str(), node->GetType().c_str(), + input_node_type.c_str(), VARIABLE); GELOGE(FAILED, "[%s] Src node %s is not Variable, is %s.", node->GetName().c_str(), input_node_name.c_str(), input_node_type.c_str()); return FAILED; @@ -95,6 +100,7 @@ Status VarIsInitializedOpPass::CreateConstant(NodePtr &node, OpDescPtr &op_desc, // 1. 
create Constant OpDesc op_desc = MakeShared(node->GetName().c_str(), CONSTANT); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "[%s] Make shared of Constant op desc failed.", node->GetName().c_str()); return FAILED; } @@ -102,6 +108,7 @@ Status VarIsInitializedOpPass::CreateConstant(NodePtr &node, OpDescPtr &op_desc, // 2. get OpDesc of VarIsInitializedOp OpDescPtr original_op_desc = node->GetOpDesc(); if (original_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(FAILED, "[%s] Op desc must not be null.", node->GetName().c_str()); return FAILED; } @@ -111,10 +118,13 @@ Status VarIsInitializedOpPass::CreateConstant(NodePtr &node, OpDescPtr &op_desc, bool val = inited; GeTensorPtr const_tensor_ptr = MakeShared(original_desc, reinterpret_cast(&val), sizeof(bool)); if (const_tensor_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "[%s] Make shared of Constant tensor failed.", node->GetName().c_str()); return FAILED; } if (!AttrUtils::SetTensor(op_desc, ATTR_NAME_WEIGHTS, const_tensor_ptr)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "get ATTR_NAME_WEIGHTS failed"); return FAILED; } @@ -131,6 +141,9 @@ Status VarIsInitializedOpPass::ProcessInAnchor(NodePtr &node, NodePtr &new_node) auto out_anchors = node->GetAllOutDataAnchors(); if ((in_anchors.size() != kVarIsInitializedIOCnt) || (out_anchors.size() != kVarIsInitializedIOCnt)) { + REPORT_INNER_ERROR("E19999", "In data anchor num:%zu and out data anchor num:%zu of node:%s(%s), " + "must botch equal to %d, check invalid", in_anchors.size(), out_anchors.size(), + node->GetName().c_str(), node->GetType().c_str(), kVarIsInitializedIOCnt); GELOGE(FAILED, "[%s] Node input/output data anchors" " size [%lu][%lu] is not all equal 1.", @@ -144,22 +157,36 @@ 
Status VarIsInitializedOpPass::ProcessInAnchor(NodePtr &node, NodePtr &new_node) auto peer_out_anchor = in_anchor->GetPeerOutAnchor(); GE_CHECK_NOTNULL(peer_out_anchor); if (GraphUtils::RemoveEdge(in_anchor, peer_out_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + in_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetOwnerNode()->GetType().c_str(), + in_anchor->GetIdx(), + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), peer_out_anchor->GetIdx()); GELOGE(FAILED, "[%s] Remove in data edge failed.", node->GetName().c_str()); return FAILED; } auto src_node = peer_out_anchor->GetOwnerNode(); if (GraphUtils::AddEdge(src_node->GetOutControlAnchor(), new_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + src_node->GetName().c_str(), src_node->GetType().c_str(), + new_node->GetName().c_str(), new_node->GetType().c_str()); GELOGE(FAILED, "Failed to link control edges from var %s to new const %s", src_node->GetName().c_str(), new_node->GetName().c_str()); return FAILED; } if (GraphUtils::MoveInCtrlEdges(node, new_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move in control edge from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + new_node->GetName().c_str(), new_node->GetType().c_str()); GELOGE(FAILED, "Failed to move in ctrl edges from %s to new const", node->GetName().c_str()); return FAILED; } if (GraphUtils::MoveOutCtrlEdges(node, new_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move out control edge from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + new_node->GetName().c_str(), new_node->GetType().c_str()); GELOGE(FAILED, "Failed to move out ctrl edges from %s to new const", node->GetName().c_str()); return FAILED; } @@ -177,6 
+204,9 @@ Status VarIsInitializedOpPass::ChangeNodeToConstant(NodePtr &node, bool inited) NodePtr const_node = graph->AddNodeFront(constant_op_desc); if (const_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s front failed", + constant_op_desc->GetName().c_str(), constant_op_desc->GetType().c_str(), + graph->GetName().c_str()); return FAILED; } @@ -185,11 +215,16 @@ Status VarIsInitializedOpPass::ChangeNodeToConstant(NodePtr &node, bool inited) } if (NodeUtils::MoveOutputEdges(node, const_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move out edge from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + const_node->GetName().c_str(), const_node->GetType().c_str()); GELOGE(FAILED, "[%s] Move output edges to new node failed.", node->GetName().c_str()); return FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "[%s] RemoveNodeWithoutRelink failed.", node->GetName().c_str()); return FAILED; } @@ -263,6 +298,7 @@ Status VarIsInitializedOpPass::UpdateInitedVars(const NodePtr &node) { std::set *VarIsInitializedOpPass::CreateInitedVars() { std::unique_ptr> inited_vars_keeper(new(std::nothrow) std::set()); if (inited_vars_keeper == nullptr) { + REPORT_CALL_ERROR("E19999", "New set failed"); GELOGE(OUT_OF_MEMORY, "Failed to alloc set memory"); return nullptr; } diff --git a/ge/graph/passes/variable_op_pass.cc b/ge/graph/passes/variable_op_pass.cc index f1843d94..c605d305 100644 --- a/ge/graph/passes/variable_op_pass.cc +++ b/ge/graph/passes/variable_op_pass.cc @@ -47,6 +47,9 @@ Status ByPassTransNode(NodePtr &trans_node, NodePtr &ref_node) { GELOGD("Begin to bypass trans node %s", trans_node->GetName().c_str()); auto ret = GraphUtils::CopyInCtrlEdges(trans_node, ref_node); if (ret != 
GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy in control edge from node:%s(%s) to node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str(), + ref_node->GetName().c_str(), ref_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to move control edges from trans " "node %s to var-ref %s", @@ -55,6 +58,8 @@ Status ByPassTransNode(NodePtr &trans_node, NodePtr &ref_node) { } auto ref_in_anchor = ref_node->GetInDataAnchor(0); if (ref_in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no input anchor, check invalid", + ref_node->GetName().c_str(), ref_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "The variable ref node %s does not have an " "input anchor", @@ -64,6 +69,8 @@ Status ByPassTransNode(NodePtr &trans_node, NodePtr &ref_node) { ref_in_anchor->UnlinkAll(); auto trans_in_anchor = trans_node->GetInDataAnchor(0); if (trans_in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no input anchor, check invalid", + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to get the in data anchor from trans" " node %s type %s", @@ -79,6 +86,11 @@ Status ByPassTransNode(NodePtr &trans_node, NodePtr &ref_node) { } else { ret = GraphUtils::AddEdge(prev_trans_node_out_anchor, ref_in_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:0) failed", + prev_trans_node_out_anchor->GetOwnerNode()->GetName().c_str(), + prev_trans_node_out_anchor->GetOwnerNode()->GetType().c_str(), + prev_trans_node_out_anchor->GetIdx(), + ref_node->GetName().c_str(), ref_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add edge between ref node %s " "and the prev node of trans node %s", @@ -115,14 +127,17 @@ bool IsTransSupport(const TransNodeInfo &trans_info) { Status VariableOpPass::Run(ge::ComputeGraphPtr graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check 
invalid"); GELOGE(INTERNAL_ERROR, "Failed to run variable op pass, null graph"); return INTERNAL_ERROR; } + auto graph_id = GraphUtils::FindRootGraph(graph)->GetGraphID(); GELOGD("Begin to run variable op pass on graph %s, session %lu, graph id %u", graph->GetName().c_str(), - GetContext().SessionId(), graph->GetGraphID()); + GetContext().SessionId(), graph_id); if (var_accelerate_ctrl_ == nullptr) { + REPORT_INNER_ERROR("E19999", "The variable accelerate control is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Failed to run var op pass, the variable accelerate control is null"); return INTERNAL_ERROR; } @@ -173,11 +188,15 @@ Status VariableOpPass::Run(ge::ComputeGraphPtr graph) { ret = VarManager::Instance(graph->GetSessionID())->SetTransRoad(node->GetName(), fusion_road); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set Trans road for node:%s(%s) failed, session_id:%lu", + node->GetName().c_str(), node->GetType().c_str(), graph->GetSessionID()); GELOGE(INTERNAL_ERROR, "Failed to update the format fusion road for var %s", node->GetName().c_str()); return INTERNAL_ERROR; } - ret = VarManager::Instance(graph->GetSessionID())->SetChangedGraphId(node->GetName(), graph->GetGraphID()); + ret = VarManager::Instance(graph->GetSessionID())->SetChangedGraphId(node->GetName(), graph_id); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update graph_id:%u for node:%s(%s) failed, session_id:%lu", + graph_id, node->GetName().c_str(), node->GetType().c_str(), graph->GetSessionID()); GELOGE(INTERNAL_ERROR, "Failed to update the graph id for var %s", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -209,10 +228,14 @@ Status VariableOpPass::DealFusion(const ge::NodePtr &var_node) { trans_node->GetType().c_str(), var_node->GetName().c_str()); if (GraphUtils::IsolateNode(trans_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str()); return 
GE_GRAPH_VARIABLE_OP_PASS_FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, trans_node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str(), graph->GetName().c_str()); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } } @@ -244,9 +267,13 @@ Status VariableOpPass::DealFusion(const ge::NodePtr &var_node) { " one output data nodes, isolate and remove it.", trans_node->GetName().c_str(), trans_node->GetType().c_str(), ref_node->GetName().c_str()); if (GraphUtils::IsolateNode(trans_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str()); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, trans_node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str(), graph->GetName().c_str()); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } } @@ -364,6 +391,7 @@ Status VariableOpPass::CheckVariableRefLegally(const ge::NodePtr &var_node, bool Status VariableOpPass::UpdateVarAndRefOutputFormatInfo(const GeTensorDesc &final_output, const ge::NodePtr &node) { if (node == nullptr || node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node or its op_desc is nullptr, check invalid"); GELOGE(FAILED, "node or opdesc is nullptr"); return FAILED; } @@ -376,6 +404,8 @@ Status VariableOpPass::UpdateVarAndRefOutputFormatInfo(const GeTensorDesc &final auto node_desc = node->GetOpDesc()->GetOutputDesc(0); CopyVariableFormatDataTypeAndShape(final_output, node_desc); if (node->GetOpDesc()->UpdateOutputDesc(0, node_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update ouput:0 desc in op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "update output desc fail."); return FAILED; } @@ -459,6 +489,10 @@ 
Status VariableOpPass::CheckVarAndVarRefAreAlike(const NodePtr &var_node, const GELOGD("var_ref_node_trans_nodes size is %zu.", var_ref_node_trans_nodes.size()); if (var_ref_node_trans_nodes.size() > 1) { + REPORT_INNER_ERROR("E19999", "In data node num:%zu of node:%s(%s) bigger than 1, check invalid", + var_ref_node_trans_nodes.size(), + var_ref_node->GetName().c_str(), var_ref_node->GetType().c_str()); + GELOGE(GE_GRAPH_VARIABLE_OP_PASS_FAILED, "var_ref_node_trans_nodes.size() > 1."); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } @@ -524,6 +558,7 @@ void VariableOpPass::CopyVariableFormatDataTypeAndShape(const GeTensorDesc &src_ Status VariableOpPass::CheckIfCouldBeOptimized(const ge::NodePtr &node, bool &flag, VarTransRoad &fusion_road) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); return FAILED; } bool is_matched = false; @@ -601,6 +636,8 @@ Status VariableOpPass::RenewVarDesc(ge::ComputeGraphPtr &graph) { GE_CHECK_NOTNULL(node->GetOpDesc()); ret = ge::VarManager::Instance(graph->GetSessionID())->RenewCurVarDesc(node->GetName(), node->GetOpDesc()); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Renew descriptor for node:%s(%s) failed, session_id:%lu", + node->GetName().c_str(), node->GetType().c_str(), graph->GetSessionID()); GELOGE(FAILED, "var manager renew var[%s] descriptor failed!", node->GetName().c_str()); return FAILED; } @@ -625,6 +662,8 @@ Status VariableOpPass::RenewVarDesc(uint64_t session_id, const NodePtr &node, co GE_CHECK_NOTNULL(node->GetOpDesc()); Status ret = ge::VarManager::Instance(session_id)->RenewCurVarDesc(node->GetName(), node->GetOpDesc()); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Renew descriptor for node:%s(%s) failed, session_id:%lu", + node->GetName().c_str(), node->GetType().c_str(), session_id); GELOGE(FAILED, "var manager renew var[%s] descriptor failed!", node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/variable_ref_delete_op_pass.cc 
b/ge/graph/passes/variable_ref_delete_op_pass.cc index 8e625857..a0e0bcba 100644 --- a/ge/graph/passes/variable_ref_delete_op_pass.cc +++ b/ge/graph/passes/variable_ref_delete_op_pass.cc @@ -35,6 +35,8 @@ Status VariableRefDeleteOpPass::Run(ge::ComputeGraphPtr graph) { continue; } if (all_var_names.count(ref_var_src_var_name) == 0) { + REPORT_INNER_ERROR("E19999", "Can not find source variable[%s] of variable ref[%s], check invalid", + ref_var_src_var_name.c_str(), node->GetName().c_str()); GELOGE(FAILED, "Can not find source variable[%s] of variable ref[%s]", ref_var_src_var_name.c_str(), node->GetName().c_str()); return FAILED; @@ -53,6 +55,8 @@ Status VariableRefDeleteOpPass::DealVariableRef(ge::ComputeGraphPtr &graph, ge:: GE_CHECK_NOTNULL(variable_ref); auto inAnchor0 = variable_ref->GetInDataAnchor(0); if (inAnchor0 == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no input anchor, check invalid", + variable_ref->GetName().c_str(), variable_ref->GetType().c_str()); GELOGE(FAILED, "variable_ref [%s] no input", variable_ref->GetName().c_str()); return FAILED; } @@ -73,17 +77,23 @@ Status VariableRefDeleteOpPass::DealVariableRef(ge::ComputeGraphPtr &graph, ge:: GELOGI("[%s-%d]: add attr [REF_VAR_SRC_VAR_NAME: %s ] ", peer_node->GetName().c_str(), index, ref_var_src_var_name.c_str()); } else { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to output:%d desc of op:%s(%s) failed", REF_VAR_SRC_VAR_NAME.c_str(), + index, op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "[%s-%d]: add attr [REF_VAR_SRC_VAR_NAME: %s ] failed", peer_node->GetName().c_str(), index, ref_var_src_var_name.c_str()); return FAILED; } // remove variable_ref if (GraphUtils::IsolateNode(variable_ref, {0}) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) failed", + variable_ref->GetName().c_str(), variable_ref->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", variable_ref->GetName().c_str(), 
variable_ref->GetType().c_str()); return FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, variable_ref) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + variable_ref->GetName().c_str(), variable_ref->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Remove node: %s, type: %s without relink failed", variable_ref->GetName().c_str(), variable_ref->GetType().c_str()); return FAILED; diff --git a/ge/graph/preprocess/graph_preprocess.cc b/ge/graph/preprocess/graph_preprocess.cc index db17e091..4fb80646 100644 --- a/ge/graph/preprocess/graph_preprocess.cc +++ b/ge/graph/preprocess/graph_preprocess.cc @@ -23,6 +23,7 @@ #include "common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.h" #include "common/formats/format_transfers/format_transfer_transpose.h" #include "common/formats/utils/formats_trans_utils.h" +#include "common/util/error_manager/error_manager.h" #include "common/helper/model_helper.h" #include "common/math/math_util.h" #include "common/op/ge_op_utils.h" @@ -98,6 +99,7 @@ const int64_t kInvalidDynaimcDimsType = -1; OpDescPtr CreateTensorShape(const GeTensorDesc &data_tensor) { GeTensorPtr tensor = MakeShared(); if (tensor == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(INTERNAL_ERROR, "Create shared ptr for GeTensor failed"); return nullptr; } @@ -109,6 +111,7 @@ OpDescPtr CreateTensorShape(const GeTensorDesc &data_tensor) { tensor->MutableTensorDesc().SetShape(GeShape()); int32_t dst_shape = 1; if (tensor->SetData(reinterpret_cast(&dst_shape), sizeof(int32_t)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set data to tensor failed"); GELOGE(INTERNAL_ERROR, "tensor set data failed"); return nullptr; } @@ -116,6 +119,7 @@ OpDescPtr CreateTensorShape(const GeTensorDesc &data_tensor) { tensor->MutableTensorDesc().SetShape(GeShape(std::vector({dim_cnt}))); unique_ptr dst_shape(new (std::nothrow) int32_t[dim_cnt]()); if (dst_shape == 
nullptr) { + REPORT_CALL_ERROR("E19999", "Malloc buffer failed, size:%zu", dim_cnt); GELOGE(INTERNAL_ERROR, "Create unique ptr failed"); return nullptr; } @@ -125,6 +129,7 @@ OpDescPtr CreateTensorShape(const GeTensorDesc &data_tensor) { GE_IF_BOOL_EXEC( tensor->SetData(reinterpret_cast(dst_shape.get()), dim_cnt * sizeof(int32_t)) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Set data to tensor failed"); GELOGE(INTERNAL_ERROR, "tensor set data failed"); return nullptr;) } @@ -171,11 +176,15 @@ void AddTransNodeAttr(const std::string &node_type, const GeTensorDesc &input, c NodePtr CreateTransNode(const std::string &name, const std::string &node_type, const GeTensorDesc &input, const GeTensorDesc &output, NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, trans_name:%s, trans_type:%s, check invalid", + name.c_str(), node_type.c_str()); GELOGE(PARAM_INVALID, "node is null."); return nullptr; } auto graph = node->GetOwnerComputeGraph(); if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Owner graph in node is nullptr, trans_name:%s, trans_type:%s, check invalid", + name.c_str(), node_type.c_str()); GELOGE(PARAM_INVALID, "Owner graph is null, node name:%s.", node->GetName().c_str()); return nullptr; } @@ -190,6 +199,8 @@ NodePtr CreateTransNode(const std::string &name, const std::string &node_type, c } OpDescPtr op_desc = MakeShared(name, node_type); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed, trans_name:%s, trans_type:%s,", + name.c_str(), node_type.c_str()); GELOGE(INTERNAL_ERROR, "Create shared ptr for OpDesc failed"); return nullptr; } @@ -202,11 +213,15 @@ NodePtr CreateTransNode(const std::string &name, const std::string &node_type, c // Default single input and single output auto ret = op_desc->AddInputDesc(input); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc into op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); 
GELOGE(INTERNAL_ERROR, "Failed to add input desc when create node %s type %s", name.c_str(), node_type.c_str()); return nullptr; } ret = op_desc->AddOutputDesc(output); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc into op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add output desc when create node %s type %s", name.c_str(), node_type.c_str()); return nullptr; } @@ -223,12 +238,17 @@ NodePtr CreateTransNode(const std::string &name, const std::string &node_type, c } ret = op_desc->AddInputDesc(shape_desc->GetOutputDesc(0)); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc into op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add the first input for reshape %s", name.c_str()); return nullptr; } shape_node = graph->AddNode(shape_desc); if (shape_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + shape_desc->GetName().c_str(), shape_desc->GetType().c_str(), + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add shape node for reshape %s, can not add the shape to graph", name.c_str()); return nullptr; } @@ -236,12 +256,18 @@ NodePtr CreateTransNode(const std::string &name, const std::string &node_type, c auto trans_node = graph->AddNode(op_desc); if (trans_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add trans node %s to graph", name.c_str()); return nullptr; } if (node_type == RESHAPE) { if (GraphUtils::AddEdge(shape_node->GetOutDataAnchor(0), trans_node->GetInDataAnchor(1)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(out_index:0) and op:%s(%s)(in_index:0) failed", + shape_node->GetName().c_str(), shape_node->GetType().c_str(), + 
trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add shape node for reshape %s, can not add the edge", name.c_str()); return nullptr; } @@ -260,6 +286,9 @@ Status RecoverOneTransNodeForVar(const std::string &name, const TransNodeInfo &t auto ret = GraphUtils::ReplaceNodeDataAnchors(trans_node, node, {}, {0}); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Replace out anchors of node:%s(%s) by node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to replace out anchors when recover trans node for %s type %s", node->GetName().c_str(), node->GetType().c_str()); return INTERNAL_ERROR; @@ -267,6 +296,9 @@ Status RecoverOneTransNodeForVar(const std::string &name, const TransNodeInfo &t ret = GraphUtils::AddEdge(node->GetOutDataAnchor(0), trans_node->GetInDataAnchor(0)); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(out_index:0) and op:%s(%s)(in_index:0) failed", + node->GetName().c_str(), node->GetType().c_str(), + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to connect node %s to trans node %s", node->GetName().c_str(), trans_node->GetName().c_str()); return INTERNAL_ERROR; @@ -274,6 +306,9 @@ Status RecoverOneTransNodeForVar(const std::string &name, const TransNodeInfo &t ret = GraphUtils::MoveOutCtrlEdges(node, trans_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move out control edges from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to move out control edges from %s to %s when recover trans node.", node->GetName().c_str(), trans_node->GetName().c_str()); return INTERNAL_ERROR; @@ -292,6 +327,9 @@ Status RecoverOneTransNodeForVarRef(const std::string 
&name, const TransNodeInfo auto ret = GraphUtils::ReplaceNodeDataAnchors(trans_node, node, {0}, {}); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Replace out anchors of node:%s(%s) by node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to replace int anchors when recover trans node for %s type %s", node->GetName().c_str(), node->GetType().c_str()); return INTERNAL_ERROR; @@ -299,6 +337,9 @@ Status RecoverOneTransNodeForVarRef(const std::string &name, const TransNodeInfo ret = GraphUtils::AddEdge(trans_node->GetOutDataAnchor(0), node->GetInDataAnchor(0)); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(out_index:0) and op:%s(%s)(in_index:0) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to connect trans node %s to node %s", trans_node->GetName().c_str(), node->GetName().c_str()); return INTERNAL_ERROR; @@ -306,6 +347,9 @@ Status RecoverOneTransNodeForVarRef(const std::string &name, const TransNodeInfo ret = GraphUtils::MoveInCtrlEdges(node, trans_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move out control edges from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to move int control edges from %s to %s when recover trans node.", node->GetName().c_str(), trans_node->GetName().c_str()); return INTERNAL_ERROR; @@ -326,6 +370,8 @@ Status UpdateVarFormats(const NodePtr &var, const GeTensorDesc &tensor_desc) { output_desc.SetOriginDataType(tensor_desc.GetOriginDataType()); output_desc.SetOriginShape(tensor_desc.GetOriginShape()); GE_IF_BOOL_EXEC(var->GetOpDesc()->UpdateOutputDesc(0, output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", 
"Update output desc of node:%s(%s) failed, index:0,", + var->GetName().c_str(), var->GetType().c_str()); GELOGE(INTERNAL_ERROR, "UpdateOutputDesc failed"); return INTERNAL_ERROR;); } @@ -339,6 +385,8 @@ Status UpdateVarFormats(const NodePtr &var, const GeTensorDesc &tensor_desc) { desc.SetOriginDataType(tensor_desc.GetOriginDataType()); desc.SetOriginShape(tensor_desc.GetOriginShape()); GE_IF_BOOL_EXEC(var->GetOpDesc()->UpdateInputDesc(0, desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Update input desc of node:%s(%s) failed, index:0,", + var->GetName().c_str(), var->GetType().c_str()); GELOGE(INTERNAL_ERROR, "UpdateInputDesc failed"); return INTERNAL_ERROR;) } @@ -365,9 +413,18 @@ Status RecoverTransRoadForVar(const NodePtr &var, const VarTransRoad &road) { std::string stream_label; (void)AttrUtils::GetStr(var_desc, ATTR_NAME_STREAM_LABEL, stream_label); if (!stream_label.empty()) { - GE_CHK_STATUS_RET(SetStreamLabel(last_node, stream_label), "set stream label failed"); + auto status = SetStreamLabel(last_node, stream_label); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + stream_label.c_str(), last_node->GetName().c_str(), last_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } } GE_CHK_BOOL_EXEC((ge::AttrUtils::SetBool(last_node->GetOpDesc(), ge::ATTR_INSERTED_BY_GE, true)), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ge::ATTR_INSERTED_BY_GE.c_str(), + last_node->GetName().c_str(), last_node->GetType().c_str()); return INTERNAL_ERROR, "Set attr ATTR_INSERTED_BY_GE failed."); GELOGD("Recover trans node %s type %s success", trans_name.c_str(), iter->node_type.c_str()); } @@ -399,10 +456,19 @@ Status RecoverTransRoadForVarRef(const std::set &nodes, const VarTransR std::string stream_label; (void)AttrUtils::GetStr(var_desc, ATTR_NAME_STREAM_LABEL, stream_label); if (!stream_label.empty()) { - GE_CHK_STATUS_RET(SetStreamLabel(last_node, 
stream_label), "set stream label failed"); + auto status = SetStreamLabel(last_node, stream_label); + if (status != ge::SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + stream_label.c_str(), last_node->GetName().c_str(), last_node->GetType().c_str()); + GELOGE(status, "Set stream label failed."); + return status; + } } GE_CHK_BOOL_EXEC((ge::AttrUtils::SetBool(last_node->GetOpDesc(), ge::ATTR_INSERTED_BY_GE, true)), + REPORT_CALL_ERROR("E19999", "Set Attr:%s of node:%s(%s) failed", + ge::ATTR_INSERTED_BY_GE.c_str(), + last_node->GetName().c_str(), last_node->GetType().c_str()); return INTERNAL_ERROR, "Set attr ATTR_INSERTED_BY_GE failed."); } if (!(road.empty()) && (UpdateVarFormats(var, road.rbegin()->output) != SUCCESS)) { @@ -418,6 +484,7 @@ VarNamesToRefs CollectVarNamesToRefs(const ComputeGraphPtr &graph) { VarNamesToRefs names_to_refs; std::string var_name; if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(PARAM_INVALID, "graph is null."); return names_to_refs; } @@ -461,6 +528,8 @@ Status ModifyInputFormatAndShape(NodePtr &node_ptr) { ge::DataType dt = input->GetDataType(); std::vector dst_shape_dims; if (TransferShape2NC1HWC0(old_format, old_shape, dt, FORMAT_NC1HWC0, dst_shape_dims) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Transfer shape to NC1HWC0 failed, op:%s(%s),", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Trans shape failed"); return FAILED; } @@ -476,6 +545,8 @@ Status ModifyInputFormatAndShape(NodePtr &node_ptr) { int64_t size = 0; graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(*output, size); if (graph_status != ge::GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get output tensor size failed, op:%s(%s), index:0", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(graph_status, "GetTensorSizeInBytes failed!"); return FAILED; } @@ -520,6 +591,8 @@ Status 
ModifyDataNetOutputFormatAndShape(OpDescPtr &op_desc, uint32_t index, For int64_t size = 0; graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(*output, size); if (graph_status != ge::GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get output tensor size failed, op:%s(%s), index:%u", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), index); GELOGE(graph_status, "GetTensorSizeInBytes failed!"); return FAILED; } @@ -618,19 +691,27 @@ Status ProcessInputDtDynShape(NodePtr &node_ptr, bool &is_dynamic_batch, NodePtr return SUCCESS; } input->SetDataType(dt_set); - int64_t input_shape_size = 0; - int64_t output_shape_size = 0; - ge::graphStatus input_graph_status = ge::TensorUtils::GetTensorSizeInBytes(*input, input_shape_size); - ge::graphStatus output_graph_status = ge::TensorUtils::GetTensorMemorySizeInBytes(*input, output_shape_size); - if (input_graph_status != ge::GRAPH_SUCCESS && output_graph_status != ge::GRAPH_SUCCESS) { - GELOGE(GRAPH_FAILED, "GetTensorSize failed!"); - return FAILED; - } - ge::TensorUtils::SetSize(*input, input_shape_size); const GeTensorDescPtr &output = op_desc->MutableOutputDesc(0); GE_CHECK_NOTNULL(output); output->SetDataType(dt_set); - ge::TensorUtils::SetSize(*output, output_shape_size); + + GeShape shape = input->GetShape(); + if (!shape.IsUnknownShape()) { + int64_t input_shape_size = 0; + int64_t output_shape_size = 0; + ge::graphStatus input_graph_status = ge::TensorUtils::GetTensorSizeInBytes(*input, input_shape_size); + ge::graphStatus output_graph_status = ge::TensorUtils::GetTensorMemorySizeInBytes(*input, output_shape_size); + if (input_graph_status != ge::GRAPH_SUCCESS && output_graph_status != ge::GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get input tensor size failed, op:%s(%s), index:0", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(GRAPH_FAILED, "[Process][InputOp] Get tensor size of op [%s] failed!", node_ptr->GetName().c_str()); + return FAILED; + } + 
ge::TensorUtils::SetSize(*input, input_shape_size); + ge::TensorUtils::SetSize(*output, output_shape_size); + GELOGI("[Process][InputDynShape] Set input and output size of node [%s] success.", node_ptr->GetName().c_str()); + } + if (is_dynamic_batch) { GELOGI("The node [%s] dtype set fp16", switchn_node->GetName().c_str()); auto switchn_op_desc = switchn_node->GetOpDesc(); @@ -673,6 +754,8 @@ Status ProcessInputNC1HWC0DynShape(NodePtr &node_ptr, bool &is_dynamic_batch, No GE_CHECK_NOTNULL(switchn_op_desc); const GeTensorDescPtr &switchn_input = switchn_op_desc->MutableInputDesc(0); if (ModifyFormatAndShapeForSingleTensor(switchn_input) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Modify format and shape of input:0 in op:%s(%s) failed", + switchn_op_desc->GetName().c_str(), switchn_op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "modify format and shape failed"); return FAILED; } @@ -682,6 +765,8 @@ Status ProcessInputNC1HWC0DynShape(NodePtr &node_ptr, bool &is_dynamic_batch, No old_format = switchn_output->GetFormat(); old_shape = switchn_output->GetShape(); if (ModifyFormatAndShapeForSingleTensor(switchn_output) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Modify format and shape of output:%u in op:%s(%s) failed", i, + switchn_op_desc->GetName().c_str(), switchn_op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "modify format and shape failed"); return FAILED; } @@ -782,6 +867,9 @@ Status ProcessNetoutputNodeFp16Nc1hwc0DynShape(GeTensorDesc &src_desc, GeTensorD std::vector dst_shape_dims; std::vector src_shape_dims = src_shape.GetDims(); if (TransferShape2NC1HWC0(src_format, src_shape_dims, DT_FLOAT16, FORMAT_NC1HWC0, dst_shape_dims) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Transfer output:0 shape of op:%s(%s) to NC1HWC0 format failed, shape:%s, format:%s", + src_op_desc->GetName().c_str(), src_op_desc->GetType().c_str(), + src_shape.ToString().c_str(), TypeUtils::FormatToSerialString(src_format).c_str()); GELOGE(INTERNAL_ERROR, "Trans shape failed"); 
return FAILED; } @@ -792,6 +880,8 @@ Status ProcessNetoutputNodeFp16Nc1hwc0DynShape(GeTensorDesc &src_desc, GeTensorD auto merge_out = src_op_desc->MutableOutputDesc(0); GE_CHECK_NOTNULL(merge_out); if (ModifyFormatAndShapeForSingleTensor(merge_out) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Modify format and shape of output:0 in op:%s(%s) failed", + src_op_desc->GetName().c_str(), src_op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "modify format and shape failed"); return FAILED; } @@ -799,6 +889,8 @@ Status ProcessNetoutputNodeFp16Nc1hwc0DynShape(GeTensorDesc &src_desc, GeTensorD auto merge_in = src_op_desc->MutableInputDesc(i); GE_CHECK_NOTNULL(merge_in); if (ModifyFormatAndShapeForSingleTensor(merge_in) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Modify format and shape of input:%u in op:%s(%s) failed", i, + src_op_desc->GetName().c_str(), src_op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "modify format and shape failed"); return FAILED; } @@ -904,13 +996,13 @@ long StringToLongNoThrow(const string &str) { return std::stol(str); } catch (const std::invalid_argument) { GELOGE(PARAM_INVALID, - "Parse shape range of input failed when transfer from string to int64. Given %s, while correct example: " + "Parse shape range of input failed when transfer from string to int64. Given %s, while correct example:" "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", str.c_str()); return PARAM_INVALID; } catch (const std::out_of_range) { GELOGE(PARAM_INVALID, - "Parse shape range of input failed when transfer from string to int64. Given %s, while correct example: " + "Parse shape range of input failed when transfer from string to int64. 
Given %s, while correct example:" "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", str.c_str()); return PARAM_INVALID; @@ -924,12 +1016,15 @@ long StringToLongNoThrow(const string &str) { Status ParseDynamicInputShapeRange(const std::string &shape_range, std::vector>> &range) { if (shape_range.size() < 2) { + REPORT_INNER_ERROR("E19999", "shape_range.size:%zu < 2, check invalid", shape_range.size()); GELOGE(PARAM_INVALID, "Shape range %s is invalid.", shape_range.c_str()); return PARAM_INVALID; } // different shape_range of single input are split by ']' vector shape_range_set = ge::StringUtils::Split(shape_range, ']'); if (shape_range_set.empty()) { + REPORT_INNER_ERROR("E19999", "Shape range %s is not valid. Correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", + shape_range.c_str()); GELOGE(PARAM_INVALID, "Shape range %s is not valid. Correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", shape_range.c_str()); return PARAM_INVALID; @@ -968,6 +1063,8 @@ Status ParseDynamicInputShapeRange(const std::string &shape_range, auto range_left = StringToLongNoThrow(range_pair_set.at(0).c_str()); auto range_right = StringToLongNoThrow(range_pair_set.at(1).c_str()); if (range_left < 0 || range_right < 0) { + REPORT_INNER_ERROR("E19999", "Shape range of input is invalid. Given range pair [%ld,%ld], " + "while correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", range_left, range_right); GELOGE(PARAM_INVALID, "Shape range of input is invalid. Given range pair [%ld,%ld], while correct example: " "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", @@ -976,6 +1073,8 @@ Status ParseDynamicInputShapeRange(const std::string &shape_range, } range_pair = std::make_pair(range_left, range_right); } else { + REPORT_INNER_ERROR("E19999", "Shape range of input is invalid. Given %s, " + "while correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", shape_range.c_str()); GELOGE(PARAM_INVALID, "Shape range of input is invalid. 
Given %s, while correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", shape_range.c_str()); @@ -1009,6 +1108,8 @@ Status GetDynamicInputShapeRange(const std::vector &user_input, const } else if (!enable_dynamic_execute_mode && !enable_input_shape_range) { return SUCCESS; } else { + REPORT_INNER_ERROR("E19999", "Graph option: %s and %s should be enabled at the same time, check invalid", + OPTION_EXEC_DYNAMIC_EXECUTE_MODE, OPTION_EXEC_DATA_INPUTS_SHAPE_RANGE); GELOGE(PARAM_INVALID, "Graph option: %s and %s should be enabled at the same time.", OPTION_EXEC_DYNAMIC_EXECUTE_MODE, OPTION_EXEC_DATA_INPUTS_SHAPE_RANGE); return PARAM_INVALID; @@ -1030,6 +1131,9 @@ Status UpdateDynamicInputShapeRange(const ge::GeAttrValue::INT index, auto origin_shape = desc.GetShape(); auto current_shape_range_vec = range_vec.at(index); if (current_shape_range_vec.size() != origin_shape.GetDimNum()) { + REPORT_INNER_ERROR("E19999", "Given shape_range dim num is %zu, current dim:%s num is %zu, not match, " + "check invalid", current_shape_range_vec.size(), origin_shape.ToString().c_str(), + origin_shape.GetDimNum()); GELOGE(PARAM_INVALID, "Given shape_range dim num is %zu, current dim num is %zu, not match.Pleace Check.", current_shape_range_vec.size(), origin_shape.GetDimNum()); return PARAM_INVALID; @@ -1041,6 +1145,8 @@ Status UpdateDynamicInputShapeRange(const ge::GeAttrValue::INT index, if (left_range == right_range) { // given shape_range is known dim, check is same as origin or not if (curr_dim != left_range) { + REPORT_INNER_ERROR("E19999", "Given shape range is %ld, current dim shape is %ld, not match, dim_index:%zu, " + "check invalid", left_range, curr_dim, i); GELOGE(PARAM_INVALID, "Given shape range is %ld, current dim shape is %ld, not match.Pleace Check.", left_range, curr_dim); return PARAM_INVALID; @@ -1050,6 +1156,9 @@ Status UpdateDynamicInputShapeRange(const ge::GeAttrValue::INT index, // given shape_range is fix range, check input_shape is in this range or not if 
(right_range != UNKNOWN_DIM) { if ((curr_dim < left_range) || (curr_dim > right_range)) { + REPORT_INNER_ERROR("E19999", "Given shape range is [%ld~%ld], current dim shape is %ld, out of range, " + "dim_index:%zu, check invalid", + left_range, right_range, curr_dim, i); GELOGE(PARAM_INVALID, "Given shape range is [%ld~%ld], current dim shape is %ld, out of range.Pleace Check.", left_range, right_range, curr_dim); return PARAM_INVALID; @@ -1062,9 +1171,9 @@ Status UpdateDynamicInputShapeRange(const ge::GeAttrValue::INT index, desc.SetShapeRange(current_shape_range_vec); graphStatus graph_ret = op->UpdateInputDesc(0, desc); - GE_CHK_STATUS_RET(graph_ret, "UpdateInputDesc fail, graph ret: %u", graph_ret); + GE_CHK_GRAPH_STATUS_RET(graph_ret, "UpdateInputDesc fail, graph ret: %u", graph_ret); graph_ret = op->UpdateOutputDesc(0, desc); - GE_CHK_STATUS_RET(graph_ret, "UpdateInputDesc fail, graph ret: %u", graph_ret); + GE_CHK_GRAPH_STATUS_RET(graph_ret, "UpdateInputDesc fail, graph ret: %u", graph_ret); return SUCCESS; } } // namespace @@ -1138,17 +1247,20 @@ Status GraphPrepare::Init(const ge::Graph &graph, uint64_t session_id) { Status GraphPrepare::CheckGraph() { if (compute_graph_ == nullptr) { + REPORT_INNER_ERROR("E19999", "compute_graph_ is nullptr, check invalid"); GELOGE(GE_GRAPH_INIT_FAILED, "Graph prepare init compute graph is NULLPTR"); return GE_GRAPH_INIT_FAILED; } auto nodes = compute_graph_->GetAllNodes(); if (nodes.empty()) { + REPORT_INNER_ERROR("E19999", "nodes in graph is empty, check invalid"); GELOGE(GE_GRAPH_INIT_FAILED, "Invalid graph, no nodes in this graph."); return GE_GRAPH_INIT_FAILED; } for (const NodePtr &node : compute_graph_->GetAllNodes()) { GE_CHECK_NOTNULL(node); if (node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "node without opdesc exist in graph, check invalid"); GELOGE(GE_GRAPH_INIT_FAILED, "Check Graph node opdesc is NULL"); return GE_GRAPH_INIT_FAILED; } @@ -1184,6 +1296,9 @@ Status 
GraphPrepare::CheckRefInputNode(const NodePtr &node, const std::string &i auto input_type = input_op_desc->GetType(); if (input_type == ge::FRAMEWORKOP) { if (!ge::AttrUtils::GetStr(input_op_desc, ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, input_type)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s of op:%s(%s) failed", + ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE.c_str(), + input_op_desc->GetName().c_str(), input_op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Get original type failed."); return PARAM_INVALID; } @@ -1207,11 +1322,13 @@ Status GraphPrepare::CheckRefOp() { std::set ref_nodes; for (const NodePtr &node : compute_graph_->GetDirectNode()) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "nullptr node exist in graph, check invalid"); GELOGE(PARAM_INVALID, "param [node] must not be null."); return PARAM_INVALID; } auto op_desc = node->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "node without opdesc exist in graph, check invalid"); GELOGE(PARAM_INVALID, "OpDesc of param [node] must not be null."); return PARAM_INVALID; } @@ -1245,15 +1362,23 @@ Status GraphPrepare::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode) { Status GraphPrepare::AdjustDataOpOutput(const NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "Input node is NULL"); return GE_GRAPH_GRAPH_NODE_NULL; } OpDescPtr op_desc_ptr = node->GetOpDesc(); if (op_desc_ptr == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node's op_desc is nullptr, check invalid"); GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "Input node opdesc is NULL"); return GE_GRAPH_GRAPH_NODE_NULL; } GeTensorDesc output = op_desc_ptr->GetOutputDesc(0); + GeShape output_shape = output.GetShape(); + if (output_shape.IsUnknownShape()) { + GELOGD("[Adjust][DataOpOutput] Shape of op [%s] output is unknown.", node->GetName().c_str()); + return SUCCESS; + } + int64_t tensor_size = 0; graphStatus graph_status = 
TensorUtils::GetTensorMemorySizeInBytes(output, tensor_size); if (graph_status != GRAPH_SUCCESS) { @@ -1265,6 +1390,8 @@ Status GraphPrepare::AdjustDataOpOutput(const NodePtr &node) { TensorUtils::SetSize(output, tensor_size); graphStatus graph_ret = op_desc_ptr->UpdateOutputDesc(0, output); if (graph_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update output desc of op:%s(%s) failed, index:0", + op_desc_ptr->GetName().c_str(), op_desc_ptr->GetType().c_str()); GELOGE(graph_ret, "UpdateOutputDesc fail, graph_ret:%u", graph_ret); return graph_ret; } @@ -1304,7 +1431,8 @@ Status GraphPrepare::UpdateInput(const std::vector &user_input, auto format = desc.GetFormat(); auto origin_format = desc.GetOriginFormat(); // data maybe internal format [FRACTAL_NZ] at singleop process such as GEMM. - bool need_check_internal_format = (!IsTansDataOpData(input_node)) && (!options_.is_single_op); + auto tune_flag = (options_.build_mode == BUILD_MODE_TUNING) && (options_.build_step == BUILD_STEP_AFTER_BUILDER); + bool need_check_internal_format = (!IsTansDataOpData(input_node)) && (!options_.is_single_op) && (!tune_flag); if (need_check_internal_format) { bool is_internal = TypeUtils::IsInternalFormat(format) || TypeUtils::IsInternalFormat(origin_format); if (is_internal) { @@ -1334,6 +1462,7 @@ Status GraphPrepare::UpdateInput(const std::vector &user_input, GE_IF_BOOL_EXEC(shape_size == 0 && desc.GetShape().GetDimNum() == 0, shape_size = static_cast(length)); int64_t size = 0; GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(desc, size) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Get size of user input tensor failed, index:%ld", index); GELOGE(INTERNAL_ERROR, "TensorUtils GetSize failed"); return FAILED); bool size_check = (size != 0 && shape_size != size); @@ -1346,19 +1475,26 @@ Status GraphPrepare::UpdateInput(const std::vector &user_input, return FAILED; } ge::TensorUtils::SetSize(desc, shape_size); - graphStatus graph_ret = op->UpdateInputDesc(0, desc); - if (graph_ret != 
GRAPH_SUCCESS) { - GELOGE(graph_ret, "UpdateInputDesc fail, graph_ret:%u", graph_ret); - return graph_ret; - } - // Size will be recalculated in the build stage - ge::TensorUtils::SetSize(desc, 0); - graph_ret = op->UpdateOutputDesc(0, desc); - if (graph_ret != GRAPH_SUCCESS) { - GELOGE(graph_ret, "UpdateOutputDesc fail, graph_ret:%u", graph_ret); - return graph_ret; + if (!tune_flag) { + graphStatus graph_ret = op->UpdateInputDesc(0, desc); + if (graph_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input desc of op:%s(%s) failed, index:0", + op->GetName().c_str(), op->GetType().c_str()); + GELOGE(graph_ret, "UpdateInputDesc fail, graph_ret:%u", graph_ret); + return graph_ret; + } + // Size will be recalculated in the build stage + ge::TensorUtils::SetSize(desc, 0); + graph_ret = op->UpdateOutputDesc(0, desc); + if (graph_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update output desc of op:%s(%s) failed, index:0", + op->GetName().c_str(), op->GetType().c_str()); + GELOGE(graph_ret, "UpdateOutputDesc fail, graph_ret:%u", graph_ret); + return graph_ret; + } + } else { + GELOGI("data %s skip update info in tune mode", op->GetName().c_str()); } - if (!dynamic_shape_range_vec.empty()) { ret = UpdateDynamicInputShapeRange(index, dynamic_shape_range_vec, op, desc); GE_CHK_STATUS_RET(ret, "Fail to update dynamic input shape range on %s.", op->GetName().c_str()); @@ -1451,6 +1587,7 @@ Status GraphPrepare::ResourcePairProcess(const std::string &action) { new ResourcePairRemoveControlPass); } } catch (std::bad_alloc &e) { + REPORT_INNER_ERROR("E19999", "bad memory allocation occur when add ResourcePair Pass"); GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occur, action:%s.", action.c_str()); return INTERNAL_ERROR; } @@ -1587,6 +1724,7 @@ Status GraphPrepare::PrepareRunningFormatRefiner() { Status GraphPrepare::SwitchOpOptimize(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param 
compute_graph is nullptr, check invalid"); GELOGE(GE_GRAPH_NULL_INPUT, "Input Graph is NULL"); return GE_GRAPH_NULL_INPUT; } @@ -1602,6 +1740,7 @@ Status GraphPrepare::SwitchOpOptimize(ComputeGraphPtr &compute_graph) { } ret = compute_graph->TopologicalSorting(); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Topological sorting failed"); GELOGE(ret, "Graph topological sort failed, ret:%u.", ret); return ret; } @@ -1612,6 +1751,7 @@ Status GraphPrepare::SwitchOpOptimize(ComputeGraphPtr &compute_graph) { Status GraphPrepare::GenerateInfershapeGraph(ConstGraphPtr graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(GE_GRAPH_NULL_INPUT, "Input Graph is NULL"); return GE_GRAPH_NULL_INPUT; } @@ -1626,6 +1766,7 @@ Status GraphPrepare::GenerateInfershapeGraph(ConstGraphPtr graph) { ret = compute_graph_->InferOriginFormat(); GE_DUMP(compute_graph_, "after_inferformat"); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Infer OriginFormat failed"); GELOGE(ret, "Prepare Graph inferformat failed"); return ret; } @@ -1652,6 +1793,7 @@ Status GraphPrepare::CheckConstOp() { } else if (node_ptr->GetType() == FRAMEWORKOP) { auto op_desc = node_ptr->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "op_desc is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Get op desc failed"); return PARAM_INVALID; } @@ -1673,6 +1815,8 @@ Status GraphPrepare::VerifyConstOp(const NodePtr &node) { GE_CHECK_NOTNULL(op_desc); ConstGeTensorPtr ge_tensor_ptr; if (!(AttrUtils::GetTensor(op_desc, ATTR_NAME_WEIGHTS, ge_tensor_ptr))) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s of op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "Get value from const attr failed"); return PARAM_INVALID; } @@ -1747,6 +1891,8 @@ Status GraphPrepare::CheckUserInput(const std::vector &user_input) { data_num++; GeAttrValue::INT index = 0; if 
(!(AttrUtils::GetInt(op, ATTR_NAME_INDEX, index))) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s of op:%s(%s) failed", ATTR_NAME_INDEX.c_str(), + op->GetName().c_str(), op->GetType().c_str()); GELOGE(GE_GRAPH_INIT_FAILED, "Get index from attr failed"); return GE_GRAPH_INIT_FAILED; } @@ -1763,13 +1909,13 @@ Status GraphPrepare::CheckUserInput(const std::vector &user_input) { GeTensorDesc desc(user_input[index].GetTensorDesc()); for (size_t i = 0; i < desc.GetShape().GetDimNum(); ++i) { - if (desc.GetShape().GetDim(i) < 0) { - std::string situation = "data dim[" + std::to_string(i) + "][" + - std::to_string(desc.GetShape().GetDim(i)) + "]" ; - std::string reason = "it need >= 0"; - ErrorManager::GetInstance().ATCReportErrMessage("E19025", {"situation", "reason"}, {situation, reason}); - GELOGE(GE_GRAPH_INIT_FAILED, "data dim %zu is not supported, need >= 0, real:%ld.", i, - desc.GetShape().GetDim(i)); + int64_t dim = desc.GetShape().GetDim(i); + if (dim < UNKNOWN_DIM_NUM) { + std::string situation = "data dim[" + std::to_string(i) + "][" + std::to_string(dim) + "]" ; + std::string reason = "it need >= -2"; + REPORT_INPUT_ERROR( + "E19025", std::vector({"situation", "reason"}), std::vector({situation, reason})); + GELOGE(GE_GRAPH_INIT_FAILED, "[Check][InputDim]data dim %zu is not supported, need >= -2, real:%ld.", i, dim); return GE_GRAPH_INIT_FAILED; } } @@ -1851,6 +1997,7 @@ Status GraphPrepare::PrepareOptimize() { (void)original_graph_passes.AddPass("PrepareOptimize::ReplaceTransShapePass", new ReplaceTransShapePass); (void)original_graph_passes.AddPass("PrepareOptimize::MarkAgnosticPass", new MarkAgnosticPass); } catch (std::bad_alloc &e) { + REPORT_INNER_ERROR("E19999", "bad memory allocation occur when add Pass"); GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); return INTERNAL_ERROR; } @@ -1914,6 +2061,7 @@ Status GraphPrepare::PrepareOptimize() { // can't move to optimize1/2 directly, may cause more identity insert, cause CI fail
(void)graph_pass.AddPass("PrepareOptimize::HcclMemcpyPass", new HcclMemcpyPass); } catch (std::bad_alloc &e) { + REPORT_INNER_ERROR("E19999", "bad memory allocation occur when add Pass"); GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); return INTERNAL_ERROR; } @@ -1930,6 +2078,7 @@ Status GraphPrepare::PrepareOptimize() { ret = compute_graph_->TopologicalSorting(); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Topological sorting failed"); GELOGE(ret, "Graph topological sort failed, ret:%u.", ret); return ret; } @@ -2000,6 +2149,7 @@ Status GraphPrepare::ProcessNetOutput() { graph_passes_before_infershape.AddPass("ProcessNetOutput::DataPass", new (std::nothrow) DataPass); // Add NetOutput first. } catch (std::bad_alloc) { + REPORT_INNER_ERROR("E19999", "bad memory allocation occur when add Pass"); GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); return INTERNAL_ERROR; } @@ -2039,6 +2189,7 @@ Status GraphPrepare::CheckAndUpdateInput(const std::vector &user_input } else { ret = compute_graph_->TopologicalSorting(); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Topological sorting failed"); GELOGE(ret, "graph prepare error: compute_graph_->Topological Sorting"); return FAILED; } diff --git a/ge/graph/preprocess/insert_op/ge_aipp_op.cc b/ge/graph/preprocess/insert_op/ge_aipp_op.cc index 7c8d9073..d46cb0f3 100755 --- a/ge/graph/preprocess/insert_op/ge_aipp_op.cc +++ b/ge/graph/preprocess/insert_op/ge_aipp_op.cc @@ -110,6 +110,12 @@ Status GetDataDimN(const ge::NodePtr &data_node, ge::Format format, int64_t &bat batch = shape[NHWC_DIM_N]; return SUCCESS; default: + REPORT_INPUT_ERROR("E10001", std::vector({"parameter", "value", "reason"}), + std::vector({ + data_node->GetName() + " format", + TypeUtils::FormatToSerialString(format), + "only format " + TypeUtils::FormatToSerialString(FORMAT_NCHW) + " and " + + TypeUtils::FormatToSerialString(FORMAT_NHWC) + " supported"})); GELOGE(PARAM_INVALID, "Not support 
data format: %s", TypeUtils::FormatToSerialString(format).c_str()); return PARAM_INVALID; } @@ -156,6 +162,7 @@ Format GetAndCheckFormat() { Status AippOp::Init(domi::AippOpParams *aipp_params) { aipp_params_ = new (std::nothrow) domi::AippOpParams(); if (aipp_params_ == nullptr) { + REPORT_CALL_ERROR("E19999", "New AippOpParams failed"); return FAILED; } aipp_params_->CopyFrom(*aipp_params); @@ -190,6 +197,12 @@ Status AippOp::InsertAippToGraph(ComputeGraphPtr &graph, std::string &aippConfig auto ret = GraphUtils::InsertNodeBetweenDataAnchors(out_in_anchors.first, out_in_anchors.second, aipp); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Insert aipp:%s(%s) node between op:%s(%s) and op:%s:%s failed", + aipp->GetName().c_str(), aipp->GetType().c_str(), + out_in_anchors.first->GetOwnerNode()->GetName().c_str(), + out_in_anchors.first->GetOwnerNode()->GetType().c_str(), + out_in_anchors.second->GetOwnerNode()->GetName().c_str(), + out_in_anchors.second->GetOwnerNode()->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to link edges for aipp node %s", aipp->GetName().c_str()); return INTERNAL_ERROR; } @@ -209,6 +222,10 @@ Status AippOp::InsertAippToGraph(ComputeGraphPtr &graph, std::string &aippConfig auto &aipp = iter->second; auto ret = out_in_anchors.second->LinkFrom(aipp->GetOutDataAnchor(0)); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "link aipp:%s(%s) to peer op:%s(%s) failed", + aipp->GetName().c_str(), aipp->GetType().c_str(), + out_in_anchors.second->GetOwnerNode()->GetName().c_str(), + out_in_anchors.second->GetOwnerNode()->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to link aipp %s to the peer node %s", aipp->GetName().c_str(), out_in_anchors.second->GetOwnerNode()->GetName().c_str()); return INTERNAL_ERROR; @@ -224,6 +241,7 @@ NodePtr AippOp::CreateAipp(const OutDataAnchorPtr &out_anchor, std::string current_name = node->GetName() + "_" + std::to_string(out_anchor->GetIdx()) + "_huawei_aipp"; auto aipp_opdesc_ptr = 
MakeShared(current_name, AIPP); if (aipp_opdesc_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to alloc aipp desc, name %s", current_name.c_str()); return nullptr; } @@ -250,6 +268,9 @@ NodePtr AippOp::CreateAipp(const OutDataAnchorPtr &out_anchor, // but the InferFormat process before InferShape can not infer the format // if the tensor on the Aipp has an unknown shape if (aipp_opdesc_ptr->UpdateInputDesc(kAippImageInputIndex, opdesc_src_data) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update the output desc from node:%s(%s) to aipp:%s(%s) failed", + node_desc->GetName().c_str(), node_desc->GetType().c_str(), + aipp_opdesc_ptr->GetName().c_str(), aipp_opdesc_ptr->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to update the output desc from node %s to aipp %s", node_desc->GetName().c_str(), aipp_opdesc_ptr->GetName().c_str()); return nullptr; @@ -341,6 +362,8 @@ Status AippOp::GetAndCheckTarget(const ComputeGraphPtr &graph, int rank, NodePtr GeAttrValue::NAMED_ATTRS aipp_attr; ConvertParamToAttr(aipp_attr); if (!AttrUtils::SetNamedAttrs(data_opdesc, ATTR_NAME_AIPP, aipp_attr)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s for op:%s(%s) failed", ATTR_NAME_AIPP.c_str(), + data_opdesc->GetName().c_str(), data_opdesc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Set name attrs for Data node failed. 
id: %d", rank); return INTERNAL_ERROR; } @@ -371,12 +394,17 @@ Status AippOp::GetStaticTargetNode(const ComputeGraphPtr &graph, NodePtr &data_n std::string related_node_name; if (AttrUtils::GetStr(data_node->GetOpDesc(), kMbatchSwitchnName, related_node_name)) { if (related_node_name.empty()) { + REPORT_INNER_ERROR("E19999", "The data node %s has switchn node flag, but the value of attr:%s is empty, " + "check invalid", data_node->GetName().c_str(), + kMbatchSwitchnName); GELOGE(INTERNAL_ERROR, "The data node %s has switchn node flag, but the value is empty", data_node->GetName().c_str()); return INTERNAL_ERROR; } auto switchn = graph->FindNode(related_node_name); if (switchn == nullptr) { + REPORT_INNER_ERROR("E19999", "The data node %s has switchn node %s, but can not find it on the graph, " + "check invalid", data_node->GetName().c_str(), related_node_name.c_str()); GELOGE(INTERNAL_ERROR, "The data node %s has switchn node %s, but can not find it on the graph", data_node->GetName().c_str(), related_node_name.c_str()); return INTERNAL_ERROR; @@ -428,7 +456,8 @@ Status AippOp::ConvertRelatedInputNameToRank() { if (!convert_flag) { string error_msg = "Top name " + related_input_name + "convert rank failed, Please" " ensure top name in aipp config is the top name of data node."; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error_msg.c_str()); + GELOGE(PARAM_INVALID, "[Check][InputParam]%s", error_msg.c_str()); + REPORT_INPUT_ERROR("E19021", std::vector({"reason"}), std::vector({error_msg})); return PARAM_INVALID; } @@ -465,6 +494,9 @@ Status AippOp::GetTargetPosition(ComputeGraphPtr graph, NodePtr &target_input, for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_INNER_ERROR("E19999", "Subgraph:%s of op:%s(%s) not find in graph:%s, check invalid", + name.c_str(), func_desc->GetName().c_str(), func_desc->GetType().c_str(), + graph->GetName().c_str()); 
GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -665,11 +697,15 @@ Status AippOp::GenerateOpDesc(OpDescPtr op_desc) { // Add two InputDesc, add the second after the first one is added successfully. if ((op_desc->AddInputDesc(GeTensorDesc()) != GRAPH_SUCCESS) || (op_desc->AddInputDesc(GeTensorDesc()) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Add input desc into op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "failed to add input desc"); return FAILED; } if (op_desc->AddOutputDesc(GeTensorDesc()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc into op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "add output desc failed."); return FAILED; } @@ -677,6 +713,8 @@ Status AippOp::GenerateOpDesc(OpDescPtr op_desc) { ConvertParamToAttr(aipp_attrs); GE_IF_BOOL_EXEC(!AttrUtils::SetNamedAttrs(op_desc, ATTR_NAME_AIPP, aipp_attrs), + REPORT_INNER_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_AIPP.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "failed to set ATTR_NAME_AIPP"); return FAILED); @@ -857,12 +895,18 @@ Status AippOp::AddNodeToGraph(const NodePtr &aipp_node, int64_t max_dynamic_aipp // add node desc for aipp node auto stat3 = aipp_node->GetOpDesc()->UpdateInputDesc(kAippParamsInputIndex, output_tensor); if (stat1 != GRAPH_SUCCESS || stat2 != GRAPH_SUCCESS || stat3 != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add and Update InputDesc to op:%s(%s) failed, index:%d", + aipp_node->GetName().c_str(), aipp_node->GetType().c_str(), kAippParamsInputIndex); GELOGE(INTERNAL_ERROR, "node process desc failed!"); return INTERNAL_ERROR; } // aipp_node should have two input data but now tbe only one input if (GraphUtils::AddEdge(aipp_data_node_ptr->GetOutDataAnchor(kAippDataOutputIndex), aipp_node->GetInDataAnchor(kAippParamsInputIndex)) != 
GRAPH_SUCCESS) { + REPORT_INNER_ERROR("E19999", "Add edge between op:%s(%s)(out_index:%u) and op:%s(%s)(in_index:%u) failed", + aipp_data_node_ptr->GetName().c_str(), aipp_data_node_ptr->GetType().c_str(), + kAippDataOutputIndex, aipp_node->GetName().c_str(), aipp_node->GetType().c_str(), + kAippParamsInputIndex); GELOGE(INTERNAL_ERROR, "Add Anchor anchor between aipp data node and aipp failed!"); return INTERNAL_ERROR; } diff --git a/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc index b1534eb4..3bc8e3e4 100755 --- a/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc +++ b/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc @@ -1,4 +1,4 @@ -/** + /** * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -99,7 +99,7 @@ Status InsertNewOpUtil::InsertAippOps(ComputeGraphPtr &graph, std::string &aippC GE_CHK_STATUS_RET(CheckGraph(graph), "after inserting all ops, check graph failed"); - GE_CHK_STATUS_RET(graph->TopologicalSorting(), "after insert dynamic op, sort graph failed"); + GE_CHK_GRAPH_STATUS_RET(graph->TopologicalSorting(), "after insert dynamic op, sort graph failed"); ClearNewOps(); @@ -124,13 +124,15 @@ Status InsertNewOpUtil::CheckInputNamePositionNotRepeat() { if (another_item->related_input_name().empty()) { string error_msg = "Can not both set related_input_name and related_input_rank!" " Please ensure param is the same with the first aipp config(related_input_name)."; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error_msg.c_str()); + GELOGE(PARAM_INVALID, "[Check][InputParam]%s", error_msg.c_str()); + REPORT_INPUT_ERROR("E19021", std::vector({"reason"}), std::vector({error_msg})); return PARAM_INVALID; } if (item->related_input_name() == another_item->related_input_name()) { string error_msg = "Can not insert aipp to the same postion! 
Please ensure related_input_name" " param is different in different aipp config."; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error_msg.c_str()); + GELOGE(PARAM_INVALID, "[Check][InputParam]%s", error_msg.c_str()); + REPORT_INPUT_ERROR("E19021", std::vector({"reason"}), std::vector({error_msg})); return PARAM_INVALID; } } @@ -150,13 +152,15 @@ Status InsertNewOpUtil::CheckInputRankPositionNoRepeat() { if (!another_item->related_input_name().empty()) { string error_msg = "Can not both set related_input_rank and related_input_name!" " Please ensure param is the same with the first aipp config(related_input_rank)."; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error_msg.c_str()); + GELOGE(PARAM_INVALID, "[Check][InputParam]%s", error_msg.c_str()); + REPORT_INPUT_ERROR("E19021", std::vector({"reason"}), std::vector({error_msg})); return PARAM_INVALID; } if (item->related_input_rank() == another_item->related_input_rank()) { string error_msg = "Can not insert aipp to the same postion! Please ensure related_input_rank" " param is different in different aipp config."; - GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error_msg.c_str()); + GELOGE(PARAM_INVALID, "[Check][InputParam]%s", error_msg.c_str()); + REPORT_INPUT_ERROR("E19021", std::vector({"reason"}), std::vector({error_msg})); return PARAM_INVALID; } } @@ -212,7 +216,7 @@ Status InsertNewOpUtil::CheckGraph(const ComputeGraphPtr &graph) { } } } - GE_CHK_LOG_AND_ERRORMSG((aippNodes.size() == 0) || (aippNodes.size() == next_nodes_cnt), + GE_CHK_LOG_AND_ERRORMSG((aippNodes.size() == 0) || (aippNodes.size() == next_nodes_cnt), PARAM_INVALID, "Can not config part of outputs of Data node to support AIPP, config all " "of the outputs of Data to support AIPP, or config none of them"); @@ -302,6 +306,9 @@ Status InsertNewOpUtil::FindMaxSizeNode(const ComputeGraphPtr &graph, const Node for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + 
REPORT_INNER_ERROR("E19999", "Subgraph:%s of op:%s(%s) not find in graph:%s, check invalid", + name.c_str(), func_desc->GetName().c_str(), + func_desc->GetType().c_str(), graph->GetName().c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -321,6 +328,9 @@ Status InsertNewOpUtil::FindMaxSizeNode(const ComputeGraphPtr &graph, const Node uint32_t parent_index = 0; if (!AttrUtils::GetInt(src_op, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s of op:%s(%s) failed", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), + src_op->GetName().c_str(), src_op->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", src_op->GetName().c_str()); return FAILED; } @@ -372,12 +382,16 @@ Status InsertNewOpUtil::UpdateCaseNode(const ComputeGraphPtr &graph, const NodeP auto ret = data_opdesc->UpdateOutputDesc(0, *input_desc); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update OutputDesc to op:%s(%s) failed, index:0", + data_opdesc->GetName().c_str(), data_opdesc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to update data %s output using case %s", data->GetName().c_str(), case_node->GetName().c_str()); return INTERNAL_ERROR; } ret = data_opdesc->UpdateInputDesc(0, *input_desc); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update InputDesc to op:%s(%s) failed, index:0", + data_opdesc->GetName().c_str(), data_opdesc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to update data %s input using case %s", data->GetName().c_str(), case_node->GetName().c_str()); return INTERNAL_ERROR; @@ -400,11 +414,15 @@ Status InsertNewOpUtil::UpdatePrevNodeByAipp(NodePtr &node, std::set &s int64_t size = 0; graphStatus graph_ret = ge::TensorUtils::GetSize(*aipp_input, size); if (graph_ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get input size of op:%s(%s), index:0, failed", + aipp_op_desc->GetName().c_str(), 
aipp_op_desc->GetType().c_str()); GELOGE(FAILED, "UpdateOutputDesc fail, graph_ret:%d", graph_ret); return FAILED; } GELOGI("Get input size [%ld] from aipp [%s].", size, aipp_op_desc->GetName().c_str()); if (size == 0) { + REPORT_CALL_ERROR("E19999", "Tensor size of op:%s(%s) is 0, input_index:0, check invalid", + aipp_op_desc->GetName().c_str(), aipp_op_desc->GetType().c_str()); GELOGE(FAILED, "Can not get size from aipp [%s]", aipp_op_desc->GetName().c_str()); return FAILED; } @@ -491,12 +509,16 @@ Status InsertNewOpUtil::UpdateDataBySwitchN(const NodePtr &switchn, const NodePt auto ret = data_opdesc->UpdateOutputDesc(0, *input_desc); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update OutputDesc to op:%s(%s) failed, index:0", + data_opdesc->GetName().c_str(), data_opdesc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to update data %s output using switchn %s", data->GetName().c_str(), switchn->GetName().c_str()); return INTERNAL_ERROR; } ret = data_opdesc->UpdateInputDesc(0, *input_desc); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update InputDesc to op:%s(%s) failed, index:0", + data_opdesc->GetName().c_str(), data_opdesc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to update data %s input using switchn %s", data->GetName().c_str(), switchn->GetName().c_str()); return INTERNAL_ERROR; @@ -596,6 +618,9 @@ Status InsertNewOpUtil::GetAllAipps(const NodePtr &data_node, const NodePtr &nod for (const auto &name : op->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_INNER_ERROR("E19999", "Subgraph:%s of op:%s(%s) not find in graph:%s, check invalid", + name.c_str(), op->GetName().c_str(), + op->GetType().c_str(), graph->GetName().c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -607,6 +632,9 @@ Status InsertNewOpUtil::GetAllAipps(const NodePtr &data_node, const NodePtr &nod 
GE_CHECK_NOTNULL(src_op); uint32_t parent_index = 0; if (!AttrUtils::GetInt(src_op, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_INNER_ERROR("E19999", "Get Attr:%s of op:%s(%s) failed", + ATTR_NAME_PARENT_NODE_INDEX.c_str(), + src_op->GetName().c_str(), src_op->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", src_op->GetName().c_str()); return FAILED; } @@ -746,6 +774,9 @@ Status InsertNewOpUtil::SetModelInputDims(NodePtr &data_node, NodePtr &aipp_node } GELOGD("After set N or H/W to -1, the model input dims: %s.", formats::JoinToString(model_input_dims).c_str()); if (!AttrUtils::SetListInt(data_opdesc, ATTR_NAME_INPUT_DIMS, model_input_dims)) { + REPORT_INNER_ERROR("E19999", "Set Attr:%s of op:%s(%s) failed", + ATTR_NAME_INPUT_DIMS.c_str(), + data_opdesc->GetName().c_str(), data_opdesc->GetType().c_str()); GELOGE(FAILED, "SetListInt of %s failed.", ATTR_NAME_INPUT_DIMS.c_str()); return FAILED; } diff --git a/ge/graph/preprocess/multi_batch_copy_graph.cc b/ge/graph/preprocess/multi_batch_copy_graph.cc index 12987f29..22f39d26 100644 --- a/ge/graph/preprocess/multi_batch_copy_graph.cc +++ b/ge/graph/preprocess/multi_batch_copy_graph.cc @@ -78,6 +78,7 @@ inline bool IsGetNextType(const NodePtr &node) { NodePtr InsertMergeNodeToGraph(const std::string &name, size_t input_num, const ComputeGraphPtr &graph) { OpDescPtr desc = MakeShared(); if (desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to insert merge node, name %s", name.c_str()); return nullptr; } @@ -87,24 +88,33 @@ NodePtr InsertMergeNodeToGraph(const std::string &name, size_t input_num, const for (size_t i = 0; i < input_num; ++i) { auto ret = desc->AddInputDesc("x" + std::to_string(i), tensor_desc); GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, input desc name:%s,", + desc->GetName().c_str(), desc->GetType().c_str(), + ("x" + std::to_string(i)).c_str()); 
GELOGE(INTERNAL_ERROR, "Failed to create merge node %s, failed to add input %zu, error-code %u", name.c_str(), i, ret); return nullptr); } auto ret = desc->AddOutputDesc("y", tensor_desc); GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed, output desc name:%s,", + desc->GetName().c_str(), desc->GetType().c_str(), "y"); GELOGE(INTERNAL_ERROR, "Failed to create merge node %s, failed to add output 'y', error-code %u", name.c_str(), ret); return nullptr); tensor_desc.SetDataType(DT_INT32); ret = desc->AddOutputDesc("value_index", tensor_desc); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed, output desc name:%s,", + desc->GetName().c_str(), desc->GetType().c_str(), "value_index"); GELOGE(INTERNAL_ERROR, "Failed to create merge node %s, failed to add output 'value_index', error-code %u", name.c_str(), ret); return nullptr; } if (!AttrUtils::SetBool(desc, ATTR_INSERT_BY_MBATCH, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_INSERT_BY_MBATCH.c_str(), + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to create merge node %s, failed to add attr", name.c_str()); return nullptr; } @@ -114,13 +124,18 @@ NodePtr InsertMergeNodeToGraph(const std::string &name, size_t input_num, const NodePtr InsertCopyNode(const NodePtr &node, size_t n) { const std::string &name = node->GetName() + "_ascend_mbatch_batch_" + std::to_string(n); auto src_op_desc = node->GetOpDesc(); - GE_IF_BOOL_EXEC(src_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "Failed to copy node %s to %s, the OpDesc is null", - node->GetName().c_str(), name.c_str()); + GE_IF_BOOL_EXEC(src_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Param opdesc in node is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "Failed to copy node %s to %s, the OpDesc is null", + node->GetName().c_str(), name.c_str()); return nullptr); auto desc = 
AttrUtils::CopyOpDesc(src_op_desc); - GE_IF_BOOL_EXEC(desc == nullptr, GELOGE(OUT_OF_MEMORY, "Failed to create op desc for copy node for node %s name %s", - node->GetName().c_str(), name.c_str()); + GE_IF_BOOL_EXEC(desc == nullptr, + REPORT_CALL_ERROR("E19999", "Copy OpDesc from op:%s(%s) failed", + src_op_desc->GetName().c_str(), src_op_desc->GetType().c_str()); + GELOGE(OUT_OF_MEMORY, "Failed to create op desc for copy node for node %s name %s", + node->GetName().c_str(), name.c_str()); return nullptr); desc->SetName(name); @@ -128,6 +143,8 @@ NodePtr InsertCopyNode(const NodePtr &node, size_t n) { for (uint32_t i = 0; i < node->GetAllInDataAnchorsSize(); ++i) { auto input_desc = desc->MutableInputDesc(i); GE_IF_BOOL_EXEC(input_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Input desc of op:%s(%s) not exist, index:%u, check invalid", + desc->GetName().c_str(), desc->GetType().c_str(), i); GELOGW("Get null input desc by index %u from node %s when copy from %s", i, desc->GetName().c_str(), node->GetName().c_str()); continue); @@ -137,6 +154,8 @@ NodePtr InsertCopyNode(const NodePtr &node, size_t n) { for (uint32_t i = 0; i < node->GetAllOutDataAnchorsSize(); ++i) { auto output_desc = desc->MutableOutputDesc(i); GE_IF_BOOL_EXEC(output_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Ouput desc of op:%s(%s) not exist, index:%u, check invalid", + desc->GetName().c_str(), desc->GetType().c_str(), i); GELOGE(INTERNAL_ERROR, "Failed to get output desc by index %u from node %s when copy from %s", i, desc->GetName().c_str(), node->GetName().c_str()); return nullptr); @@ -145,6 +164,8 @@ NodePtr InsertCopyNode(const NodePtr &node, size_t n) { } const std::string &batch_label = "Batch_" + std::to_string(n); if (!AttrUtils::SetStr(desc, ATTR_NAME_BATCH_LABEL, batch_label)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_BATCH_LABEL.c_str(), + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(FAILED, "set attr ATTR_NAME_BATCH_LABEL 
failed, node:%s.", name.c_str()); return nullptr; } @@ -167,6 +188,7 @@ bool IsAllDimsPositive(const std::vector &dims) { NodePtr InsertConst(const std::string &name, const ComputeGraphPtr &graph) { auto desc = MakeShared(); if (desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to create const op %s, out of memory", name.c_str()); return nullptr; } @@ -175,14 +197,20 @@ NodePtr InsertConst(const std::string &name, const ComputeGraphPtr &graph) { GeTensor tensor; tensor.SetData(std::vector({0})); if (!AttrUtils::SetTensor(desc, ATTR_NAME_WEIGHTS, tensor)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to init tensor value for const %s", name.c_str()); return nullptr; } if (!AttrUtils::SetBool(desc, ATTR_INSERT_BY_MBATCH, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_INSERT_BY_MBATCH.c_str(), + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to set insert flag for const node %s", name.c_str()); return nullptr; } if (desc->AddOutputDesc(GeTensorDesc()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to add output desc for const node %s", name.c_str()); return nullptr; } @@ -297,7 +325,7 @@ Status MultiBatchGraphCopyer::RelinkConstCtrlEdge() { continue; } if (!out_ctrl_anchor_of_in_ctrl_node->IsLinkedWith(out_node->GetInControlAnchor())) { - GE_CHK_STATUS_RET(out_ctrl_anchor_of_in_ctrl_node->LinkTo(out_node->GetInControlAnchor())) + GE_CHK_GRAPH_STATUS_RET(out_ctrl_anchor_of_in_ctrl_node->LinkTo(out_node->GetInControlAnchor())) } } } @@ -371,6 +399,9 @@ Status MultiBatchGraphCopyer::GetEnterNodesGroupByFrame(mapGetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Get attr frame_name of 
enter[%s] failed.", node->GetName().c_str()); return FAILED; } @@ -435,7 +466,7 @@ Status MultiBatchGraphCopyer::MoveInEntersInDataAnchorDown(NodePtr &node, OpDesc GE_CHECK_NOTNULL(peer_out_data_anchor); auto peer_in_data_node = peer_out_data_anchor->GetOwnerNode(); if (IsEnterType(peer_in_data_node->GetType())) { - GE_CHK_STATUS_RET(peer_out_data_anchor->Unlink(in_data_anchor)) + GE_CHK_GRAPH_STATUS_RET(peer_out_data_anchor->Unlink(in_data_anchor)) GELOGD("Unlink data edge from %s to %s.", peer_in_data_node->GetName().c_str(), node->GetName().c_str()); auto enter_in_data_anchors = peer_in_data_node->GetAllInDataAnchors(); for (auto &enter_in_data_anchor : enter_in_data_anchors) { @@ -444,7 +475,7 @@ Status MultiBatchGraphCopyer::MoveInEntersInDataAnchorDown(NodePtr &node, OpDesc if (peer_out_data_anchor_of_enter->IsLinkedWith(in_data_anchor)) { continue; } - GE_CHK_STATUS_RET(peer_out_data_anchor_of_enter->LinkTo(in_data_anchor)) + GE_CHK_GRAPH_STATUS_RET(peer_out_data_anchor_of_enter->LinkTo(in_data_anchor)) GELOGD("Relink data edge from %s to %s.", peer_out_data_anchor_of_enter->GetOwnerNode()->GetName().c_str(), node->GetName().c_str()); } @@ -481,17 +512,17 @@ Status MultiBatchGraphCopyer::InsertEnterAfterNode(NodePtr &node, const OpDescPt GELOGD("Create Enter op %s after %s.", name.c_str(), node->GetName().c_str()); auto enter_desc = AttrUtils::CopyOpDesc(copy_desc); enter_desc->SetName(name); - GE_CHK_STATUS_RET( + GE_CHK_GRAPH_STATUS_RET( enter_desc->UpdateInputDesc("x", node_desc->GetOutputDesc(outanchor_inanchors_nodes.first->GetIdx()))) - GE_CHK_STATUS_RET( + GE_CHK_GRAPH_STATUS_RET( enter_desc->UpdateOutputDesc("y", node_desc->GetOutputDesc(outanchor_inanchors_nodes.first->GetIdx()))) auto enter_node = graph_->AddNode(enter_desc); GE_CHECK_NOTNULL(enter_node); - GE_CHK_STATUS_RET(outanchor_inanchors_nodes.first->LinkTo(enter_node->GetInDataAnchor(kDataInIndex))) + 
GE_CHK_GRAPH_STATUS_RET(outanchor_inanchors_nodes.first->LinkTo(enter_node->GetInDataAnchor(kDataInIndex))) GE_CHECK_NOTNULL(enter_node->GetOutDataAnchor(kDataInIndex)); for (auto &inanchor_node : outanchor_inanchors_nodes.second) { - GE_CHK_STATUS_RET(outanchor_inanchors_nodes.first->Unlink(inanchor_node.first)) - GE_CHK_STATUS_RET(enter_node->GetOutDataAnchor(kDataInIndex)->LinkTo(inanchor_node.first)) + GE_CHK_GRAPH_STATUS_RET(outanchor_inanchors_nodes.first->Unlink(inanchor_node.first)) + GE_CHK_GRAPH_STATUS_RET(enter_node->GetOutDataAnchor(kDataInIndex)->LinkTo(inanchor_node.first)) GELOGD("Unlink from %s to %s, link from %s to %s then to %s.", node->GetName().c_str(), inanchor_node.second->GetName().c_str(), node->GetName().c_str(), enter_node->GetName().c_str(), inanchor_node.second->GetName().c_str()); @@ -507,14 +538,14 @@ Status MultiBatchGraphCopyer::MoveCtrlEdgeToOutNodes(NodePtr &node, set GE_CHECK_NOTNULL(in_ctrl_anchor); auto peer_out_ctrl_anchors = in_ctrl_anchor->GetPeerOutControlAnchors(); for (auto &peer_out_ctrl_anchor : peer_out_ctrl_anchors) { - GE_CHK_STATUS_RET(peer_out_ctrl_anchor->Unlink(in_ctrl_anchor)) + GE_CHK_GRAPH_STATUS_RET(peer_out_ctrl_anchor->Unlink(in_ctrl_anchor)) GELOGD("Unlink control edge from %s to %s.", peer_out_ctrl_anchor->GetOwnerNode()->GetName().c_str(), node->GetName().c_str()); for (auto &out_node : out_nodes) { auto in_ctrl_anchor_of_out_node = out_node->GetInControlAnchor(); GE_CHECK_NOTNULL(in_ctrl_anchor_of_out_node); if (!peer_out_ctrl_anchor->IsLinkedWith(in_ctrl_anchor_of_out_node)) { - GE_CHK_STATUS_RET(peer_out_ctrl_anchor->LinkTo(in_ctrl_anchor_of_out_node)) + GE_CHK_GRAPH_STATUS_RET(peer_out_ctrl_anchor->LinkTo(in_ctrl_anchor_of_out_node)) GELOGD("Link control edge from %s to %s.", peer_out_ctrl_anchor->GetOwnerNode()->GetName().c_str(), out_node->GetName().c_str()); } @@ -531,8 +562,8 @@ Status MultiBatchGraphCopyer::DeleteEnterWithoutDataOut() { auto out_nodes = node->GetOutAllNodes(); if 
(out_nodes.empty()) { GELOGD("Delete enter node: %s which has no output.", node->GetName().c_str()); - GE_CHK_STATUS_RET(GraphUtils::IsolateNode(node, {})) - GE_CHK_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(graph_, node)) + GE_CHK_GRAPH_STATUS_RET(GraphUtils::IsolateNode(node, {})) + GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(graph_, node)) } } } @@ -578,7 +609,9 @@ Status MultiBatchGraphCopyer::LabelInBatchBranchStatus() { GELOGD("Start label in batch branch status."); for (const auto &data : origin_data_nodes_) { auto op_desc = data->GetOpDesc(); - GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(PARAM_INVALID, "Op desc is nullptr."); + GE_IF_BOOL_EXEC(op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "op_desc in origin_data_nodes_ is nullptr, check invalid"); + GELOGE(PARAM_INVALID, "Op desc is nullptr."); return PARAM_INVALID); LabelStatusForData(data); if (!GetLocalOmgContext().dynamic_node_type.empty()) { @@ -853,6 +886,9 @@ NodePtr MultiBatchGraphCopyer::FindSwitchnNodeForDataEdge(const OutDataAnchorPtr GELOGI("The output idx %d has %zu referenced nums.", output_idx, data_out_anchor->GetPeerInDataAnchors().size()); for (const auto &peer_in_anchor : data_out_anchor->GetPeerInDataAnchors()) { if (peer_in_anchor->GetOwnerNode()->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "peer op_desc of op:%s(%s)'s out_index:%d anchor exist nullptr, " + "check invalid", + data_node->GetName().c_str(), data_node->GetType().c_str(), output_idx); GELOGE(INTERNAL_ERROR, "Op desc should not be nullptr."); return nullptr; } @@ -862,6 +898,11 @@ NodePtr MultiBatchGraphCopyer::FindSwitchnNodeForDataEdge(const OutDataAnchorPtr } if (output_idx >= static_cast(getnext_nodes_to_switchn_.size()) || referenced_index >= getnext_nodes_to_switchn_.at(output_idx).size()) { + REPORT_INNER_ERROR("E19999", "output_index:%d of op:%s(%s) > getnext_nodes_to_switchn_.size():%zu or " + "referenced_index:%zu >= getnext_nodes_to_switchn_.at(output_idx).size():%zu, " + 
"check invalid", output_idx, + data_node->GetName().c_str(), data_node->GetType().c_str(), getnext_nodes_to_switchn_.size(), + referenced_index, getnext_nodes_to_switchn_.at(output_idx).size()); GELOGE(INTERNAL_ERROR, "Output idx is %d, referenced index is %zu", output_idx, referenced_index); return nullptr; } @@ -891,6 +932,10 @@ Status MultiBatchGraphCopyer::CopyInDataEdges(const NodePtr &origin_node, int ba if (switchn != nullptr) { auto ret = GraphUtils::AddEdge(switchn->GetOutDataAnchor(batch_num), dst_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(out_index:%d) and op:%s(%s)(in_index:%d) failed", + switchn->GetName().c_str(), switchn->GetType().c_str(), + batch_num, copyed_node->GetName().c_str(), copyed_node->GetType().c_str(), + in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to add data edge between %s(%d) to %s(%d), error-code %u", switchn->GetName().c_str(), batch_num, copyed_node->GetName().c_str(), in_anchor->GetIdx(), ret); @@ -906,6 +951,11 @@ Status MultiBatchGraphCopyer::CopyInDataEdges(const NodePtr &origin_node, int ba auto src_batch_node = batch_branch_iter->second.at(batch_num); auto ret = GraphUtils::AddEdge(src_batch_node->GetOutDataAnchor(origin_src_anchor->GetIdx()), dst_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(out_index:%d) and op:%s(%s)(in_index:%d) failed", + src_batch_node->GetName().c_str(), + src_batch_node->GetType().c_str(), origin_src_anchor->GetIdx(), + copyed_node->GetName().c_str(), copyed_node->GetType().c_str(), + in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to add data edge between %s(%d) to %s(%d), error-code %u", src_batch_node->GetName().c_str(), batch_num, copyed_node->GetName().c_str(), in_anchor->GetIdx(), ret); return INTERNAL_ERROR; @@ -917,6 +967,11 @@ Status MultiBatchGraphCopyer::CopyInDataEdges(const NodePtr &origin_node, int ba auto ret = GraphUtils::AddEdge(origin_src_anchor, dst_anchor); if (ret != 
GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(out_index:%d) and op:%s(%s)(in_index:%d) failed", + origin_src_node->GetName().c_str(), + origin_src_node->GetType().c_str(), origin_src_anchor->GetIdx(), + copyed_node->GetName().c_str(), copyed_node->GetType().c_str(), + in_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to add data edge between origin node %s(%d) to copyed %s(%d)", origin_src_node->GetName().c_str(), origin_src_anchor->GetIdx(), copyed_node->GetName().c_str(), dst_anchor->GetIdx()); @@ -936,6 +991,9 @@ Status MultiBatchGraphCopyer::CopyInControlEdges(const NodePtr &node, int batch_ // reconnect data node auto ret = GraphUtils::AddEdge(switchn_iter->second->GetOutControlAnchor(), copyed_node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge between op:%s(%s) and op:%s(%s) failed", + switchn_iter->second->GetName().c_str(), switchn_iter->second->GetType().c_str(), + copyed_node->GetName().c_str(), copyed_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add control edge between %s to %s, error-code %u", switchn_iter->second->GetName().c_str(), copyed_node->GetName().c_str(), ret); return INTERNAL_ERROR; @@ -950,6 +1008,9 @@ Status MultiBatchGraphCopyer::CopyInControlEdges(const NodePtr &node, int batch_ auto src_batch_node = batch_branch_iter->second.at(batch_num); auto ret = GraphUtils::AddEdge(src_batch_node->GetOutControlAnchor(), copyed_node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge between op:%s(%s) and op:%s(%s) failed", + src_batch_node->GetName().c_str(), src_batch_node->GetType().c_str(), + copyed_node->GetName().c_str(), copyed_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add data edge between %s to %s, error-code %u", src_batch_node->GetName().c_str(), copyed_node->GetName().c_str(), ret); return INTERNAL_ERROR; @@ -960,6 +1021,9 @@ Status MultiBatchGraphCopyer::CopyInControlEdges(const 
NodePtr &node, int batch_ auto ret = GraphUtils::AddEdge(origin_src_node->GetOutControlAnchor(), copyed_node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add ctrl edge between op:%s(%s) and op:%s(%s) failed", + origin_src_node->GetName().c_str(), origin_src_node->GetType().c_str(), + copyed_node->GetName().c_str(), copyed_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add control edge from origin %s to copyed %s", origin_src_node->GetName().c_str(), copyed_node->GetName().c_str()); return INTERNAL_ERROR; @@ -973,6 +1037,7 @@ Status MultiBatchGraphCopyer::CopyInControlEdges(const NodePtr &node, int batch_ NodePtr MultiBatchGraphCopyer::InsertShapeDataNode() { auto desc = MakeShared(); if (desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to create shape data node, out of memory"); return nullptr; } @@ -987,27 +1052,38 @@ NodePtr MultiBatchGraphCopyer::InsertShapeDataNode() { GeTensorDesc tensor_desc(GeShape({static_cast(shapes_.at(0).size())}), FORMAT_ND, DT_INT64); auto ret = desc->AddInputDesc(tensor_desc); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data"); return nullptr; } ret = desc->AddOutputDesc(tensor_desc); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc into op:%s(%s) failed", + desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add output desc for created data"); return nullptr; } if (!AttrUtils::SetBool(desc, ATTR_INSERT_BY_MBATCH, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_INSERT_BY_MBATCH.c_str(), desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add attr for created data"); return nullptr; } auto data_node = graph_->AddNode(desc); if (data_node == 
nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + desc->GetName().c_str(), desc->GetType().c_str(), graph_->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add shape data node to graph"); return nullptr; } ret = GraphUtils::AppendInputNode(graph_, data_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Append input node:%s(%s) to graph:%s failed", + data_node->GetName().c_str(), data_node->GetType().c_str(), + graph_->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to append data node %s as input to graph", data_node->GetName().c_str()); return nullptr; } @@ -1019,6 +1095,7 @@ NodePtr MultiBatchGraphCopyer::InsertGetDynamicDimsNode() { GELOGD("Start insert getdynamicdims node to get shape info."); auto desc = MakeShared(); if (desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to create shape data node, out of memory"); return nullptr; } @@ -1040,33 +1117,49 @@ NodePtr MultiBatchGraphCopyer::InsertGetDynamicDimsNode() { tensor_desc.SetFormat(FORMAT_ND); tensor_desc.SetDataType(DT_INT64); auto ret = desc->AddInputDesc(tensor_desc); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data"); - return nullptr); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + desc->GetName().c_str(), desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data"); + return nullptr); continue; } GeTensorDesc tensor_desc(GeShape({static_cast(input_shape_dims)}), FORMAT_ND, DT_INT64); auto ret = desc->AddInputDesc(tensor_desc); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data"); - return nullptr); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + desc->GetName().c_str(), desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed 
to add input desc for created data"); + return nullptr); } GeTensorDesc tensor_desc(GeShape({static_cast(shapes_.at(0).size())}), FORMAT_ND, DT_INT64); auto ret = desc->AddOutputDesc(tensor_desc); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add output desc for created data"); - return nullptr); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + desc->GetName().c_str(), desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to add output desc for created data"); + return nullptr); if (!AttrUtils::SetBool(desc, ATTR_INSERT_BY_MBATCH, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_INSERT_BY_MBATCH.c_str(), desc->GetName().c_str(), desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add attr for created data"); return nullptr; } auto data_node = graph_->AddNode(desc); if (data_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + desc->GetName().c_str(), desc->GetType().c_str(), graph_->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add shape data node to graph"); return nullptr; } ret = GraphUtils::AppendInputNode(graph_, data_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Append input node:%s(%s) to graph:%s failed", + data_node->GetName().c_str(), data_node->GetType().c_str(), + graph_->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Failed to append data node %s as input to graph", data_node->GetName().c_str()); return nullptr; } @@ -1076,6 +1169,7 @@ NodePtr MultiBatchGraphCopyer::InsertGetDynamicDimsNode() { Status MultiBatchGraphCopyer::CheckArguments() { if (graph_ == nullptr) { + REPORT_INNER_ERROR("E19999", "graph_ is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Failed to copy graph, the graph is null"); return PARAM_INVALID; } @@ -1122,6 +1216,9 @@ Status MultiBatchGraphCopyer::LinkDataToMerge(const NodePtr &data, const NodePtr for (size_t i = 0; i < 
shapes_.size(); ++i) { auto ret = GraphUtils::AddEdge(switchn->GetOutDataAnchor(i), merge->GetInDataAnchor(i)); GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%zu) and op:%s(%s)(index:%zu) failed", + switchn->GetName().c_str(), switchn->GetType().c_str(), i, + merge->GetName().c_str(), merge->GetType().c_str(), i); GELOGE(INTERNAL_ERROR, "Failed to add edge between switchn %s(%zu) to merge %s(%zu), error-code %u", switchn->GetName().c_str(), i, merge->GetName().c_str(), i, ret); return INTERNAL_ERROR); @@ -1132,6 +1229,9 @@ Status MultiBatchGraphCopyer::LinkDataToMerge(const NodePtr &data, const NodePtr Status MultiBatchGraphCopyer::LinkNodeToMerge(const NodePtr &node, int out_index, const NodePtr &merge) { auto ©ed_nodes = nodes_to_batch_nodes_[node.get()]; if (copyed_nodes.size() != shapes_.size()) { + REPORT_INNER_ERROR("E19999", "Create merge node for node %s failed, " + "the copyed nodes for it count %zu different with shape %zu, check invalid", + node->GetName().c_str(), copyed_nodes.size(), shapes_.size()); GELOGE(INTERNAL_ERROR, "Failed to create merge node for node %s, the copyed nodes for it count %zu different with shape %zu", node->GetName().c_str(), copyed_nodes.size(), shapes_.size()); @@ -1153,14 +1253,21 @@ Status MultiBatchGraphCopyer::LinkNodeToMerge(const NodePtr &node, int out_index return OUT_OF_MEMORY); auto ret = GraphUtils::AddEdge(src_node->GetOutControlAnchor(), const_node->GetInControlAnchor()); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add control edge from %s to %s", - src_node->GetName().c_str(), const_node->GetName().c_str()); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add ctrl edge between op:%s(%s) and op:%s(%s) failed", + src_node->GetName().c_str(), src_node->GetType().c_str(), + const_node->GetName().c_str(), const_node->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to add control edge from %s to %s", + 
src_node->GetName().c_str(), const_node->GetName().c_str()); return INTERNAL_ERROR); src_node = const_node; } auto ret = GraphUtils::AddEdge(src_node->GetOutDataAnchor(out_index), merge->GetInDataAnchor(i)); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%zu) failed", + src_node->GetName().c_str(), src_node->GetType().c_str(), out_index, + merge->GetName().c_str(), merge->GetType().c_str(), i); GELOGE(INTERNAL_ERROR, "Failed to add edge between copyed node %s(%d) to inserted merge node %s(%zu), error-code %u", copyed_nodes[i]->GetName().c_str(), out_index, merge->GetName().c_str(), i, ret); @@ -1219,6 +1326,8 @@ Status MultiBatchGraphCopyer::UpdateShapeOfShapeNode(const NodePtr &node, size_t GeShape output_shape(output_dims); output_desc.SetShape(output_shape); if (node->GetOpDesc()->UpdateOutputDesc(shape_index, output_desc) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update output desc to op:%s(%s) failed, index:%zu", + node->GetName().c_str(), node->GetType().c_str(), shape_index); GELOGE(FAILED, "Update output desc fail."); return FAILED; } @@ -1253,6 +1362,9 @@ Status MultiBatchGraphCopyer::UpdateMaxShapeToData(const NodePtr &node, size_t o int64_t size = 1; for (auto dim : data_to_dynamic_info_.at(data_name).at(i)) { if (INT64_MAX / dim < size) { + REPORT_CALL_ERROR("E19999", "Op:%s(%s)'s shape:%s size will overflow after multi, check invalid", + node->GetName().c_str(), node->GetType().c_str(), + formats::ShapeToString(data_to_dynamic_info_[data_name].at(i)).c_str()); GELOGE(PARAM_INVALID, "The shape %s size overflow", formats::ShapeToString(data_to_dynamic_info_[data_name].at(i)).c_str()); return PARAM_INVALID; @@ -1267,11 +1379,11 @@ Status MultiBatchGraphCopyer::UpdateMaxShapeToData(const NodePtr &node, size_t o // must not be error, the calc result has been checked in function InsertSwitchNForData (void)CalcShape(data_to_dynamic_info_.at(data_name).at(max_shape_index), data_shape); 
auto ret = NodeUtils::UpdateOutputShape(*node, out_anchor_index, data_shape); - GE_CHK_STATUS_RET(ret, "Failed to update output shape for data %s", node->GetName().c_str()); + GE_CHK_GRAPH_STATUS_RET(ret, "Failed to update output shape for data %s", node->GetName().c_str()); // getnext_sink not has input if (!getnext_sink_dynamic_dims_) { ret = NodeUtils::UpdateInputShape(*node, kDataInIndex, data_shape); - GE_CHK_STATUS_RET(ret, "Failed to update input shape for data %s", node->GetName().c_str()); + GE_CHK_GRAPH_STATUS_RET(ret, "Failed to update input shape for data %s", node->GetName().c_str()); } else { // need to update shape of Shape_node when getnext_sink_dynamic GE_CHK_STATUS_RET(UpdateShapeOfShapeNode(node, out_anchor_index), "Failed to update shape of shape node"); @@ -1300,6 +1412,7 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &node, const si auto switchn_desc = MakeShared(); GE_IF_BOOL_EXEC(switchn_desc == nullptr, + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to create switchn for data %s", node->GetName().c_str()); return OUT_OF_MEMORY); string switchn_name = node->GetName() + "_ascend_mbatch_switchn"; @@ -1313,10 +1426,16 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &node, const si GeTensorDesc tensor(NodeUtils::GetOutputDesc(*node, out_anchor_index)); GE_IF_BOOL_EXEC(switchn_desc->AddInputDesc("data", tensor) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, input desc name:%s", + switchn_desc->GetName().c_str(), switchn_desc->GetType().c_str(), + "data"); GELOGE(OUT_OF_MEMORY, "Failed to add input tensor desc for %s", switchn_desc->GetName().c_str()); return OUT_OF_MEMORY); GeTensorDesc pred_tensor; GE_IF_BOOL_EXEC(switchn_desc->AddInputDesc("pred_value", pred_tensor) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed, input desc name:%s", + switchn_desc->GetName().c_str(), 
switchn_desc->GetType().c_str(), + "pred_value"); GELOGE(OUT_OF_MEMORY, "Failed to add input pred tensor desc for %s", switchn_desc->GetName().c_str()); return OUT_OF_MEMORY); std::vector input_dims_str; @@ -1340,11 +1459,17 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &node, const si formats::JoinToString(tensor.GetShape().GetDims()); input_dims_str.emplace_back(input_str); if (!AttrUtils::SetListInt(tensor, ATTR_NAME_SWITCHN_PRED_VALUE, shapes_.at(i))) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to output tensor of node:%s(%s) failed, index:%zu", + ATTR_NAME_SWITCHN_PRED_VALUE.c_str(), + node->GetName().c_str(), node->GetType().c_str(), out_anchor_index); GELOGE(INTERNAL_ERROR, "Failed to add attr value on output %zu tensor", i); return INTERNAL_ERROR; } (void) AttrUtils::SetListInt(tensor, ATTR_NAME_COMBINED_DYNAMIC_DIMS, shape.GetDims()); if (switchn_desc->AddOutputDesc("output" + std::to_string(i), tensor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed, output desc name:%s", + switchn_desc->GetName().c_str(), switchn_desc->GetType().c_str(), + ("output" + std::to_string(i)).c_str()); GELOGE(GRAPH_FAILED, "Opdesc AddOutputDesc failed"); return GRAPH_FAILED; } @@ -1352,15 +1477,22 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &node, const si } (void)AttrUtils::SetListStr(node->GetOpDesc(), "_all_origin_gears_inputs", input_dims_str); if (!AttrUtils::SetListStr(switchn_desc, ATTR_USER_DESIGNEATE_SHAPE_ORDER, data_name_order_)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_USER_DESIGNEATE_SHAPE_ORDER.c_str(), + switchn_desc->GetName().c_str(), switchn_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add user designate shape order attr on switchn node %s", switchn_desc->GetName().c_str()); return INTERNAL_ERROR; } if (!AttrUtils::SetBool(switchn_desc, ATTR_INSERT_BY_MBATCH, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) 
failed", + ATTR_INSERT_BY_MBATCH.c_str(), switchn_desc->GetName().c_str(), switchn_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add insert attr on switchn node %s", switchn_desc->GetName().c_str()); return INTERNAL_ERROR; } if (!AttrUtils::SetStr(node->GetOpDesc(), kMbatchSwitchnName, switchn_desc->GetName())) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + kMbatchSwitchnName, node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add switchn attr on data node %s", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -1371,6 +1503,9 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &node, const si auto switchn = graph_->AddNode(switchn_desc); GE_IF_BOOL_EXEC(switchn == nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + switchn_desc->GetName().c_str(), switchn_desc->GetType().c_str(), + graph_->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to create switchn %s from desc", switchn_desc->GetName().c_str()); return OUT_OF_MEMORY); if (!getnext_sink_dynamic_dims_) { @@ -1416,6 +1551,8 @@ Status MultiBatchGraphCopyer::LinkGetDynamicDimsToNetOutput(const NodePtr &node) if (node->GetType() == NETOUTPUT) { if (!GetLocalOmgContext().dynamic_node_type.empty()) { if (!AttrUtils::SetStr(node->GetOpDesc(), ATTR_ALL_GEARS_INFO, GetLocalOmgContext().dynamic_dims)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_ALL_GEARS_INFO.c_str(), node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to set all gears info attr on netoutput %s.", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -1423,15 +1560,24 @@ Status MultiBatchGraphCopyer::LinkGetDynamicDimsToNetOutput(const NodePtr &node) if (getnext_sink_dynamic_dims_) { size_t input_index = node->GetAllInDataAnchors().size(); if (NodeUtils::AppendInputAnchor(node, input_index + 1) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Append %zu input anchors to 
node:%s(%s) failed", + input_index + 1, node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Append input anchor of %s of %zu failed.", node->GetName().c_str(), input_index); return INTERNAL_ERROR; } auto ret = ge::GraphUtils::AddEdge(shape_data_->GetOutDataAnchor(kDataOutIndex), node->GetInDataAnchor(input_index)); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link netoutput %s to getdynamicdims %s", - node->GetName().c_str(), shape_data_->GetName().c_str()); + GE_IF_BOOL_EXEC( + ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%zu) failed", + shape_data_->GetName().c_str(), shape_data_->GetType().c_str(), kDataOutIndex, + node->GetName().c_str(), node->GetType().c_str(), input_index); + GELOGE(INTERNAL_ERROR, "Failed to link netoutput %s to getdynamicdims %s", + node->GetName().c_str(), shape_data_->GetName().c_str()); return INTERNAL_ERROR); if (!AttrUtils::SetBool(node->GetOpDesc(), ATTR_GETNEXT_SINK_DYNMAIC, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_GETNEXT_SINK_DYNMAIC.c_str(), node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to set getnext sink dynamic attr on netoutput %s.", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -1459,6 +1605,9 @@ Status MultiBatchGraphCopyer::AddAttrForGetDynamicDims(const NodePtr &node) { GELOGD("Add attr for :%s, type is %s:", shape_data_->GetName().c_str(), shape_data_->GetType().c_str()); size_t data_count = node->GetAllOutDataAnchors().size() / kDivisionConst; if (!AttrUtils::SetInt(shape_data_->GetOpDesc(), ATTR_GETNEXT_SINK_DATA_COUNT, data_count)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_GETNEXT_SINK_DATA_COUNT.c_str(), + shape_data_->GetName().c_str(), shape_data_->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_GETNEXT_SINK_DATA_COUNT failed"); return INTERNAL_ERROR; } @@ -1475,6 +1624,9 @@ 
Status MultiBatchGraphCopyer::AddAttrForGetDynamicDims(const NodePtr &node) { } } if (!AttrUtils::SetListInt(shape_data_->GetOpDesc(), ATTR_GETNEXT_SINK_SHAPE_INFO, shape_info)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_GETNEXT_SINK_SHAPE_INFO.c_str(), + shape_data_->GetName().c_str(), shape_data_->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_GETNEXT_SINK_SHAPE_INFO failed"); return INTERNAL_ERROR; } @@ -1491,8 +1643,13 @@ Status MultiBatchGraphCopyer::AddLinkForGetDynamicDims(const NodePtr &node) { shape_data_->GetName().c_str(), input_index); auto out_data_anchor = node->GetOutDataAnchor(out_index); auto ret = GraphUtils::AddEdge(out_data_anchor, shape_data_->GetInDataAnchor(input_index)); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link getnext %s to getdynamicdims %s", - node->GetName().c_str(), shape_data_->GetName().c_str()); + GE_IF_BOOL_EXEC( + ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%zu) and op:%s(%s)(index:%zu) failed", + node->GetName().c_str(), node->GetType().c_str(), out_index, + shape_data_->GetName().c_str(), shape_data_->GetType().c_str(), input_index); + GELOGE(INTERNAL_ERROR, "Failed to link getnext %s to getdynamicdims %s", + node->GetName().c_str(), shape_data_->GetName().c_str()); return INTERNAL_ERROR); } return SUCCESS; @@ -1506,6 +1663,9 @@ Status MultiBatchGraphCopyer::LinkEdges() { if (data_nodes_to_switchn_.count(node.get()) > 0) { auto switchn = data_nodes_to_switchn_[node.get()]; GE_IF_BOOL_EXEC(switchn == nullptr, + REPORT_INNER_ERROR("E19999", + "swithn in data_nodes_to_switchn_ for op:%s(%s) is nullptr, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(PARAM_INVALID, "Switchn should not be nullptr for %s.", node->GetName().c_str()); return OUT_OF_MEMORY); ret = LinkDataToSwitchN(node, switchn, kDataOutIndex); @@ -1545,14 +1705,24 @@ Status MultiBatchGraphCopyer::LinkEdges() { Status 
MultiBatchGraphCopyer::LinkDataToSwitchN(const NodePtr &data, const NodePtr &switchn, const int &out_index) { auto ret = GraphUtils::AddEdge(shape_data_->GetOutDataAnchor(kDataOutIndex), switchn->GetInDataAnchor(kSwitchNPredIndex)); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link shape data %s to switchn %s", - shape_data_->GetName().c_str(), switchn->GetName().c_str()); - return INTERNAL_ERROR); + GE_IF_BOOL_EXEC( + ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + shape_data_->GetName().c_str(), shape_data_->GetType().c_str(), kDataOutIndex, + switchn->GetName().c_str(), switchn->GetType().c_str(), kSwitchNPredIndex); + GELOGE(INTERNAL_ERROR, "Failed to link shape data %s to switchn %s", + shape_data_->GetName().c_str(), switchn->GetName().c_str()); + return INTERNAL_ERROR); ret = GraphUtils::AddEdge(data->GetOutDataAnchor(out_index), switchn->GetInDataAnchor(kSwitchNDataIndex)); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link data %s to switchn %s", - data->GetName().c_str(), switchn->GetName().c_str()); - return INTERNAL_ERROR); + GE_IF_BOOL_EXEC( + ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + data->GetName().c_str(), data->GetType().c_str(), out_index, + switchn->GetName().c_str(), switchn->GetType().c_str(), kSwitchNDataIndex); + GELOGE(INTERNAL_ERROR, "Failed to link data %s to switchn %s", + data->GetName().c_str(), switchn->GetName().c_str()); + return INTERNAL_ERROR); return SUCCESS; } @@ -1594,6 +1764,8 @@ Status MultiBatchGraphCopyer::LinkToMerge(const NodePtr &node) { } continue; } + REPORT_INNER_ERROR("E19999", "The merge node %s is created, index %zu, but can not find the src node, " + "check invalid", merge_node->GetName().c_str(), i); GELOGE(INTERNAL_ERROR, "The merge node %s is created, index %zu, but can not find the src node", 
merge_node->GetName().c_str(), i); return INTERNAL_ERROR; @@ -1630,24 +1802,38 @@ Status MultiBatchGraphCopyer::LinkToNodeOutBranch(const NodePtr &node) { } auto iter = nodes_to_merge_nodes_.find(in_node.get()); if (iter == nodes_to_merge_nodes_.end()) { + REPORT_INNER_ERROR("E19999", "Failed to link data edge from %s(%s)(index:%d) to %s(%s)(index:%d), " + "cause no merge node found, check invalid", + in_node->GetName().c_str(), in_node->GetType().c_str(), src_out_anchor->GetIdx(), + node->GetName().c_str(), node->GetType().c_str(), in_data_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to link IO data edge from %s(%d) to %s(%d), no merge node found", in_node->GetName().c_str(), src_out_anchor->GetIdx(), node->GetName().c_str(), in_data_anchor->GetIdx()); return INTERNAL_ERROR; } auto merge_node = iter->second[src_out_anchor->GetIdx()]; if (merge_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Failed to link data edge from %s(%s)(index:%d) to %s(%s)(index:%d), " + "cause no merge node found, check invalid", + in_node->GetName().c_str(), in_node->GetType().c_str(), src_out_anchor->GetIdx(), + node->GetName().c_str(), node->GetType().c_str(), in_data_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to link IO data edge from %s(%d) to %s(%d), no merge node found", in_node->GetName().c_str(), src_out_anchor->GetIdx(), node->GetName().c_str(), in_data_anchor->GetIdx()); return INTERNAL_ERROR; } auto ret = src_out_anchor->Unlink(in_data_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_INNER_ERROR("E19999", "Unlink edge from %s(%s)(index:%d) to %s(%s)(index:%d) failed", + in_node->GetName().c_str(), in_node->GetType().c_str(), src_out_anchor->GetIdx(), + node->GetName().c_str(), node->GetType().c_str(), in_data_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to unlink the control edge from %s(%d) to %s(%d)", in_node->GetName().c_str(), src_out_anchor->GetIdx(), node->GetName().c_str(), in_data_anchor->GetIdx()); return INTERNAL_ERROR; } ret = 
GraphUtils::AddEdge(merge_node->GetOutDataAnchor(kMergeDataOutIndex), in_data_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + merge_node->GetName().c_str(), merge_node->GetType().c_str(), kMergeDataOutIndex, + node->GetName().c_str(), node->GetType().c_str(), in_data_anchor->GetIdx()); GELOGE(INTERNAL_ERROR, "Failed to add data edge from %s(%d) to %s(%d)", merge_node->GetName().c_str(), src_out_anchor->GetIdx(), node->GetName().c_str(), in_data_anchor->GetIdx()); return INTERNAL_ERROR; @@ -1662,28 +1848,47 @@ Status MultiBatchGraphCopyer::LinkToNodeOutBranch(const NodePtr &node) { } auto iter = nodes_to_merge_nodes_.find(in_node.get()); if (iter == nodes_to_merge_nodes_.end()) { + REPORT_INNER_ERROR("E19999", "Failed to link IO control edge from %s(%s) to %s(%s), no merge node found," + "check invalid", + in_node->GetName().c_str(), in_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to link IO control edge from %s to %s, no merge node found", in_node->GetName().c_str(), node->GetName().c_str()); return INTERNAL_ERROR; } auto merge_node = iter->second[0]; if (merge_node == nullptr) { + REPORT_INNER_ERROR("E19999", + "Failed to link IO control edge from %s(%s) to %s(%s), no merge node found, check invalid", + in_node->GetName().c_str(), in_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to link IO control edge from %s to %s, no merge node found", in_node->GetName().c_str(), node->GetName().c_str()); return INTERNAL_ERROR; } GE_IF_BOOL_EXEC(in_node->GetOutControlAnchor() == nullptr, + REPORT_INNER_ERROR("E19999", "Out control anchor of op:%s(%s) is nullptr, check invalid", + in_node->GetName().c_str(), in_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Innode outputControlAnchor is null"); return INTERNAL_ERROR); auto ret = 
in_node->GetOutControlAnchor()->Unlink(node->GetInControlAnchor()); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to unlink the control edge from %s to %s", - in_node->GetName().c_str(), node->GetName().c_str()); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + REPORT_INNER_ERROR("E19999", "Unlink ctrl edge from %s(%s) to %s(%s) failed", + in_node->GetName().c_str(), in_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to unlink the control edge from %s to %s", + in_node->GetName().c_str(), node->GetName().c_str()); return INTERNAL_ERROR); ret = GraphUtils::AddEdge(merge_node->GetOutControlAnchor(), node->GetInControlAnchor()); - GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add control edge from %s to %s", - merge_node->GetName().c_str(), node->GetName().c_str()); - return INTERNAL_ERROR); + GE_IF_BOOL_EXEC( + ret != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add ctrl edge between op:%s(%s) and op:%s(%s) failed", + merge_node->GetName().c_str(), merge_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to add control edge from %s to %s", + merge_node->GetName().c_str(), node->GetName().c_str()); + return INTERNAL_ERROR); GELOGI("Link control edge from merge %s(from %s) to %s", merge_node->GetName().c_str(), in_node->GetName().c_str(), node->GetName().c_str()); } @@ -1779,6 +1984,8 @@ void GetDynamicShapeByGraph(const ComputeGraphPtr &graph, const NodePtr &node, for (size_t j = 0; j < dynamic_branch_names.size(); ++j) { const auto &subgraph = graph->GetSubgraph(dynamic_branch_names[j]); if (subgraph == nullptr) { + REPORT_INNER_ERROR("E19999", "Get subgraph:%s from graph:%s failed", + dynamic_branch_names[j].c_str(), graph->GetName().c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", dynamic_branch_names[j].c_str()); dynamic_output_dims.clear(); return; @@ -1786,6 +1993,8 @@ 
void GetDynamicShapeByGraph(const ComputeGraphPtr &graph, const NodePtr &node, const auto &out_node = subgraph->FindFirstNodeMatchType(NETOUTPUT); if (out_node == nullptr) { + REPORT_INNER_ERROR("E19999", "No netoutput node exist in subgraph:%s, check invalid", + subgraph->GetName().c_str()); GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "NetOutput not found, name: %s", dynamic_branch_names[j].c_str()); dynamic_output_dims.clear(); return; @@ -1794,6 +2003,9 @@ void GetDynamicShapeByGraph(const ComputeGraphPtr &graph, const NodePtr &node, GELOGI("Find the subgraph Output node %s and the index is %zu", out_node->GetName().c_str(), i); const auto &out_desc = out_node->GetOpDesc(); if (out_desc == nullptr || out_desc->GetInputsSize() <= i) { + REPORT_INNER_ERROR("E19999", + "op_desc of node in subgraph:%s is nullptr or input desc size:%zu <= %zu, check invalid", + subgraph->GetName().c_str(), out_desc->GetInputsSize(), i); GELOGE(GE_GRAPH_GRAPH_NODE_NULL, "Get Input desc failed, name: %s, index: %zu", out_node->GetName().c_str(), i); dynamic_output_dims.clear(); return; @@ -1895,6 +2107,9 @@ Status GetDynamicOutputShape(ComputeGraphPtr &graph) { if ((net_output != nullptr) && !dynamic_output_dims.empty()) { GetDirectOutputShape(graph, net_output, dynamic_output_index, dynamic_output_dims); if (!AttrUtils::SetListStr(net_output->GetOpDesc(), ATTR_NAME_DYNAMIC_OUTPUT_DIMS, dynamic_output_dims)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_NAME_DYNAMIC_OUTPUT_DIMS.c_str(), + net_output->GetName().c_str(), net_output->GetType().c_str()); GELOGE(FAILED, "Set dynamic output dims attr failed"); return FAILED; } diff --git a/ge/graph/preprocess/multi_batch_options.cc b/ge/graph/preprocess/multi_batch_options.cc index 3bde0efb..b82d1034 100644 --- a/ge/graph/preprocess/multi_batch_options.cc +++ b/ge/graph/preprocess/multi_batch_options.cc @@ -102,6 +102,9 @@ Status DistinguishGetNextAndData(ComputeGraphPtr &graph, vector &data_n Status 
CheckSequenceOfData(ComputeGraphPtr &graph, const vector &data_nodes) { GELOGD("Start check input sequence from data nodes and input shape."); if (data_nodes.size() != GetLocalOmgContext().user_input_dims.size()) { + REPORT_INNER_ERROR("E19999", "Count:%zu of data_nodes in graph:%s should be equal to " + "input_shape count:%zu from option, check invalid", + data_nodes.size(), graph->GetName().c_str(), GetLocalOmgContext().user_input_dims.size()); GELOGE(PARAM_INVALID, "The count of input shape:%zu should be equal to the count of data num:%zu.", GetLocalOmgContext().user_input_dims.size(), data_nodes.size()); return PARAM_INVALID; @@ -119,6 +122,11 @@ Status CheckSequenceOfData(ComputeGraphPtr &graph, const vector &data_n continue; } if (dynamic_dims.size() != output_shape.size()) { + REPORT_INNER_ERROR("E19999", "The output shape of %s is %s, the input shape from options of %s is %s, graph:%s," + "check invalid", data_node->GetName().c_str(), + formats::JoinToString(output_shape).c_str(), + GetLocalOmgContext().user_input_dims.at(i).first.c_str(), + formats::JoinToString(dynamic_dims).c_str(), graph->GetName().c_str()); GELOGE(PARAM_INVALID, "The output shape of %s is %s, the input shape from options of %s is %s.", data_node->GetName().c_str(), formats::JoinToString(output_shape).c_str(), GetLocalOmgContext().user_input_dims.at(i).first.c_str(), formats::JoinToString(dynamic_dims).c_str()); @@ -126,6 +134,11 @@ Status CheckSequenceOfData(ComputeGraphPtr &graph, const vector &data_n } for (size_t j = 0; j < dynamic_dims.size(); ++j) { if (dynamic_dims.at(j) != kDynmaicDims && dynamic_dims.at(j) != output_shape.at(j)) { + REPORT_INNER_ERROR("E19999", "Value of input shape %s from option and output shape %s of data op:%s " + "should be equal to %d, index:%zu, graph:%s, check invalid", + formats::JoinToString(dynamic_dims).c_str(), + formats::JoinToString(output_shape).c_str(), data_node->GetName().c_str(), kDynmaicDims, + j, graph->GetName().c_str()); 
GELOGE(INTERNAL_ERROR, "Value of input shape %s should be equal to %s.", formats::JoinToString(dynamic_dims).c_str(), formats::JoinToString(output_shape).c_str()); return INTERNAL_ERROR; @@ -138,6 +151,9 @@ Status CheckSequenceOfData(ComputeGraphPtr &graph, const vector &data_n Status CheckSequenceOfGetnext(ComputeGraphPtr &graph, const vector &getnext_sink_node) { GELOGD("Start check input sequence from getnext sink nodes and input shape."); if (getnext_sink_node.size() != kNumOfGetnextNode) { + REPORT_INNER_ERROR("E19999", "Not support dynamic dims when a graph with multi getnext nodes, graph:%s, " + "num of getnext node:%zu, check invalid", + graph->GetName().c_str(), getnext_sink_node.size()); GELOGE(PARAM_INVALID, "Not support dynamic dims when a graph with multi getnext nodes."); return PARAM_INVALID; } @@ -147,6 +163,9 @@ Status CheckSequenceOfGetnext(ComputeGraphPtr &graph, const vector &get GE_CHECK_NOTNULL(op_desc); size_t data_count = data_node->GetAllOutDataAnchors().size() / kDivisionConst; if (data_count != GetLocalOmgContext().user_input_dims.size()) { + REPORT_INNER_ERROR("E19999", "Output desc count of %s is %zu, should be equal to count of input shape: %zu, " + "graph:%s, check invalid", op_desc->GetName().c_str(), data_count, + GetLocalOmgContext().user_input_dims.size(), graph->GetName().c_str()); GELOGE(PARAM_INVALID, "Output count of %s is %zu, should be equal to count of input shape: %zu", op_desc->GetName().c_str(), data_count, GetLocalOmgContext().user_input_dims.size()); return PARAM_INVALID; @@ -161,6 +180,12 @@ Status CheckSequenceOfGetnext(ComputeGraphPtr &graph, const vector &get continue; } if (dynamic_dims.size() != output_shape.size()) { + REPORT_INNER_ERROR("E19999", "The %zu output_shape of %s is %s not equal to the input_shape:%s " + "from options of %s, graph:%s, check invalid", i, + data_node->GetName().c_str(), formats::JoinToString(output_shape).c_str(), + formats::JoinToString(dynamic_dims).c_str(), + 
GetLocalOmgContext().user_input_dims.at(i).first.c_str(), + graph->GetName().c_str()); GELOGE(PARAM_INVALID, "the output_shape of %s is %s, the input_shape from options of %s is %s.", data_node->GetName().c_str(), formats::JoinToString(output_shape).c_str(), GetLocalOmgContext().user_input_dims.at(i).first.c_str(), formats::JoinToString(dynamic_dims).c_str()); @@ -168,6 +193,11 @@ Status CheckSequenceOfGetnext(ComputeGraphPtr &graph, const vector &get } for (size_t j = 0; j < dynamic_dims.size(); ++j) { if (dynamic_dims.at(j) != kDynmaicDims && dynamic_dims.at(j) != output_shape.at(j)) { + REPORT_INNER_ERROR("E19999", "Value of input shape %s from option and output shape %s of data op:%s " + "should be equal to %d, index:%zu, graph:%s, check invalid", + formats::JoinToString(dynamic_dims).c_str(), + formats::JoinToString(output_shape).c_str(), data_node->GetName().c_str(), kDynmaicDims, + j, graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "value of input_shape %s should be equal to %s.", formats::JoinToString(dynamic_dims).c_str(), formats::JoinToString(output_shape).c_str()); return INTERNAL_ERROR; @@ -215,6 +245,9 @@ Status CheckSequenceOfOptions(ComputeGraphPtr &graph, vector &data_node Status UpdateNameOfData(ComputeGraphPtr &graph, const vector &data_nodes) { GELOGD("Update first value of input shape by data nodes."); if (data_nodes.size() != GetLocalOmgContext().user_input_dims.size()) { + REPORT_INNER_ERROR("E19999", "Count:%zu of data_nodes in graph:%s should be equal to " + "input_shape count:%zu from option, check invalid", + data_nodes.size(), graph->GetName().c_str(), GetLocalOmgContext().user_input_dims.size()); GELOGE(PARAM_INVALID, "count of data_nodes: %zu should be equal to input_shape count: %zu.", data_nodes.size(), GetLocalOmgContext().user_input_dims.size()); return PARAM_INVALID; @@ -229,6 +262,9 @@ Status UpdateNameOfData(ComputeGraphPtr &graph, const vector &data_node Status UpdateNameOfGetnext(ComputeGraphPtr &graph, const vector 
&getnext_sink_nodes) { GELOGD("Update first value of input shape by getnext sink nodes."); if (getnext_sink_nodes.size() != kNumOfGetnextNode) { + REPORT_INNER_ERROR("E19999", "Not support dynamic dims when a graph with multi getnext nodes, graph:%s, " + "num of getnext node:%zu, check invalid", + graph->GetName().c_str(), getnext_sink_nodes.size()); GELOGE(PARAM_INVALID, "Not support dynamic dims when a graph with multi getnext nodes."); return PARAM_INVALID; } @@ -239,6 +275,9 @@ Status UpdateNameOfGetnext(ComputeGraphPtr &graph, const vector &getnex // user want getnext dynamic, just getnext or data+getnext_sink size_t data_count = input_node->GetAllOutDataAnchors().size() / kDivisionConst; if (data_count != GetLocalOmgContext().user_input_dims.size()) { + REPORT_INNER_ERROR("E19999", "Output desc count of %s is %zu, should be equal to count of input shape: %zu, " + "graph:%s, check invalid", op_desc->GetName().c_str(), data_count, + GetLocalOmgContext().user_input_dims.size(), graph->GetName().c_str()); GELOGE(PARAM_INVALID, "Output count of %s is %zu, should be equal to count of input shape: %zu", op_desc->GetName().c_str(), data_count, GetLocalOmgContext().user_input_dims.size()); return PARAM_INVALID; @@ -299,6 +338,8 @@ Status DeleteIdentityInsertByAdapter(ComputeGraphPtr &graph) { if (dst_node->GetType() == IDENTITY) { GELOGI("Need to remove %s.", dst_node->GetName().c_str()); if (ge::GraphUtils::RemoveNodeWithoutRelink(graph, dst_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) from graph:%s failed", + dst_node->GetName().c_str(), dst_node->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "Remove Identity node %s failed.", dst_node->GetName().c_str()); return FAILED; } @@ -322,6 +363,8 @@ Status CheckNegativeCountOfOptions(const std::vector> &shap } for (size_t i = 0; i < shapes.size(); ++i) { if (shapes.at(i).size() != negative_count) { + REPORT_INNER_ERROR("E19999", "gear num of dynamic_dims is %zu should be equal 
to num:%zu from option, " + "check invalid", shapes.at(i).size(), negative_count); GELOGE(PARAM_INVALID, "Each gear num of dynamic_dims is %zu should be equal to %zu.", shapes.at(i).size(), negative_count); return PARAM_INVALID; @@ -533,6 +576,8 @@ Status StampDynamicType(const OpDescPtr &op_desc) { dynamic_type = static_cast(DYNAMIC_DIMS); } if (!AttrUtils::SetInt(op_desc, ATTR_DYNAMIC_TYPE, dynamic_type)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to node:%s(%s) failed", + ATTR_DYNAMIC_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add dynamic type attr for node %s", op_desc->GetName().c_str()); return INTERNAL_ERROR; } diff --git a/ge/host_cpu_engine/CMakeLists.txt b/ge/host_cpu_engine/CMakeLists.txt index 13cb7434..8d84ee28 100644 --- a/ge/host_cpu_engine/CMakeLists.txt +++ b/ge/host_cpu_engine/CMakeLists.txt @@ -3,6 +3,7 @@ set(PROTO_LIST ) protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) +protobuf_generate(ge_atcstub PROTO_ATCSTUB_SRCS PROTO_ATCSTUB_HDRS ${PROTO_LIST}) set(SRC_LIST "engine/host_cpu_engine.cc" @@ -61,7 +62,7 @@ target_link_libraries(host_cpu_engine PRIVATE ) ############ atcstub/libhost_cpu_engine.so ############ -add_library(atc_host_cpu_engine SHARED ${SRC_LIST} ${PROTO_HDRS}) +add_library(atc_host_cpu_engine SHARED ${SRC_LIST} ${PROTO_ATCSTUB_HDRS}) target_compile_options(atc_host_cpu_engine PRIVATE -Werror @@ -84,7 +85,7 @@ target_include_directories(atc_host_cpu_engine PRIVATE ${METADEF_DIR}/inc/external ${METADEF_DIR}/inc/external/graph ${CMAKE_BINARY_DIR} - ${CMAKE_BINARY_DIR}/proto/ge + ${CMAKE_BINARY_DIR}/proto/ge_atcstub #### yellow zone #### ${GE_CODE_DIR}/../inc #### blue zone #### diff --git a/ge/host_cpu_engine/engine/host_cpu_engine.cc b/ge/host_cpu_engine/engine/host_cpu_engine.cc index cdbad1ed..5e8394f0 100644 --- a/ge/host_cpu_engine/engine/host_cpu_engine.cc +++ b/ge/host_cpu_engine/engine/host_cpu_engine.cc @@ -18,6 +18,7 @@ #include #include #include 
+#include #include "framework/common/debug/ge_log.h" #include "common/ge/ge_util.h" #include "host_cpu_engine/common/constant/constant.h" @@ -34,7 +35,8 @@ Status HostCpuEngine::Initialize(const std::map &options) { if (ops_kernel_store_ == nullptr) { ops_kernel_store_ = MakeShared(); if (ops_kernel_store_ == nullptr) { - GELOGE(FAILED, "Make HostCpuOpsKernelInfoStore failed."); + GELOGE(FAILED, "[Create][HostCpuEngine]Make HostCpuOpsKernelInfoStore failed."); + REPORT_INNER_ERROR("E19999", "HostCpuEngine::Initialize failed for new HostCpuEngine."); return FAILED; } } diff --git a/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.cc b/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.cc index adb252bc..47809ae1 100644 --- a/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.cc +++ b/ge/host_cpu_engine/ops_kernel_store/host_cpu_ops_kernel_builder.cc @@ -21,6 +21,7 @@ #include "graph/utils/node_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" +#include #include "framework/common/debug/ge_log.h" #include "host_cpu_engine/common/constant/constant.h" #include "register/ops_kernel_builder_registry.h" @@ -39,7 +40,8 @@ Status HostCpuOpsKernelBuilder::Initialize(const map & Status HostCpuOpsKernelBuilder::CalcOpRunningParam(Node &ge_node) { OpDescPtr op_desc = ge_node.GetOpDesc(); if (op_desc == nullptr) { - GELOGE(FAILED, "CalcOpRunningParam failed, as op desc is null"); + GELOGE(FAILED, "[Get][OpDesc]CalcOpRunningParam failed, as op desc is null"); + REPORT_INNER_ERROR("E19999", "GetOpDesc failed."); return FAILED; } @@ -73,9 +75,14 @@ Status HostCpuOpsKernelBuilder::CalcOpRunningParam(Node &ge_node) { GeShape output_shape = output_tensor.GetShape(); if ((TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size) != GRAPH_SUCCESS) || (output_mem_size < 0)) { - GELOGE(FAILED, "Calc op[%s:%s] out[%zu] mem size failed, mem_size=%ld, format=%s, data_type=%s.", - name.c_str(), 
type.c_str(), i, output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str()); + GELOGE(FAILED, + "[Calc][TensorMemSize] fail for op[%s:%s] out[%zu] mem size, mem_size=%ld, format=%s, data_type=%s.", + name.c_str(), type.c_str(), i, output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_CALL_ERROR("E19999", + "CalcTensorMemSize failed for op[%s:%s] out[%zu] mem size, mem_size=%ld, format=%s, data_type=%s.", + name.c_str(), type.c_str(), i, output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); return FAILED; } GELOGI("Calc op[%s:%s] out[%zu] mem size is %ld, format=%s, data_type=%s.", @@ -84,8 +91,13 @@ Status HostCpuOpsKernelBuilder::CalcOpRunningParam(Node &ge_node) { TensorUtils::SetSize(output_tensor, output_mem_size); if (op_desc->UpdateOutputDesc(static_cast(i), output_tensor) != GRAPH_SUCCESS) { - GELOGE(FAILED, "Update op[%s:%s] out[%zu] desc failed, format=%s, data_type=%s.", name.c_str(), type.c_str(), i, - TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); + GELOGE(FAILED, + "[Update][OutputDesc] fail for op[%s:%s] out[%zu] desc , format=%s, data_type=%s.", + name.c_str(), type.c_str(), i, + TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_CALL_ERROR("E19999", "UpdateOutputDesc failed for op[%s:%s] out[%zu] desc , format=%s, data_type=%s.", + name.c_str(), type.c_str(), i, + TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); return FAILED; } } diff --git a/ge/host_cpu_engine/ops_kernel_store/op/host_op.cc b/ge/host_cpu_engine/ops_kernel_store/op/host_op.cc index 7f709f03..3e619635 100644 --- a/ge/host_cpu_engine/ops_kernel_store/op/host_op.cc +++ 
b/ge/host_cpu_engine/ops_kernel_store/op/host_op.cc @@ -35,5 +35,6 @@ REGISTER_OP_CREATOR(Mul, HostOp); REGISTER_OP_CREATOR(ConcatV2, HostOp); REGISTER_OP_CREATOR(Data, HostOp); REGISTER_OP_CREATOR(Fill, HostOp); +REGISTER_OP_CREATOR(NetOutput, HostOp); } // namespace host_cpu } // namespace ge diff --git a/ge/host_kernels/concat_offset_kernel.cc b/ge/host_kernels/concat_offset_kernel.cc index ff597873..b6940eb4 100644 --- a/ge/host_kernels/concat_offset_kernel.cc +++ b/ge/host_kernels/concat_offset_kernel.cc @@ -33,7 +33,7 @@ const int kNumOne = 1; } // namespace Status ConcatOffsetKernel::Compute(const OpDescPtr op_desc_ptr, const vector &input, vector &v_output) { - GELOGI("ConcatOffsetKernel in."); + GELOGD("ConcatOffsetKernel in"); if (op_desc_ptr == nullptr) { GELOGE(PARAM_INVALID, "input opdesc is nullptr."); return PARAM_INVALID; @@ -41,7 +41,7 @@ Status ConcatOffsetKernel::Compute(const OpDescPtr op_desc_ptr, const vector(reinterpret_cast(input_0->GetData().data()))); // validate inputs if ((static_cast(input.size()) != (N + kNumOne)) || (input.size() <= kConcatOffsetInputIndexOne)) { - GELOGW("The number of input for concat offset must be equal to %d, and must be more than one.", (N + kNumOne)); + GELOGW("The number of input for concat offset must be equal to %d, and must be more than one", (N + kNumOne)); return NOT_CHANGED; } @@ -61,7 +61,7 @@ Status ConcatOffsetKernel::Compute(const OpDescPtr op_desc_ptr, const vectorMutableTensorDesc().SetShape(output_shape); GE_IF_BOOL_EXEC(output_ptr->SetData(reinterpret_cast(buf.get()), static_cast(sizeof(DT_INT32) * output_size)) != GRAPH_SUCCESS, - GELOGW("set data failed"); + GELOGW("set data failed."); return NOT_CHANGED); v_output.push_back(output_ptr); // caculate offset @@ -99,7 +99,7 @@ Status ConcatOffsetKernel::Compute(const OpDescPtr op_desc_ptr, const vector(reinterpret_cast(indices_tensor_ptr->GetData().data())); for (int64_t i = 0; i < indices_shape.GetShapeSize(); i++) { if (*(indices_ptr + i) < 0 || 
*(indices_ptr + i) >= x_shape.GetDim(axis)) { - GELOGW("indices %ld value is not in range [0, %ld)", i, x_shape.GetDim(axis)); + GELOGW("indices %ld value is not in range [0, %ld).", i, x_shape.GetDim(axis)); return NOT_CHANGED; } indicates_.push_back(*(indices_ptr + i)); @@ -288,7 +288,7 @@ Status GatherV2Kernel::SaveIndicesByDataType(ConstGeTensorPtr indices_tensor_ptr auto indices_ptr = const_cast(reinterpret_cast(indices_tensor_ptr->GetData().data())); for (int64_t i = 0; i < indices_shape.GetShapeSize(); i++) { if (*(indices_ptr + i) < 0 || *(indices_ptr + i) >= x_shape.GetDim(axis)) { - GELOGW("indices %ld value is not in range [0, %ld)", i, x_shape.GetDim(axis)); + GELOGW("indices %ld value is not in range [0, %ld).", i, x_shape.GetDim(axis)); return NOT_CHANGED; } indicates_.push_back(*(indices_ptr + i)); @@ -344,30 +344,30 @@ Status GatherV2Kernel::Check(const OpDescPtr &op_desc_ptr, const vectorGetTensorDesc().GetDataType(); bool is_valid_indices_data_type = indices_data_type == DT_INT32 || indices_data_type == DT_INT64; if (!is_valid_indices_data_type) { - GELOGW("indices datatype must be DT_INT32 or DT_INT64"); + GELOGW("indices datatype must be DT_INT32 or DT_INT64."); return NOT_CHANGED; } if (indices_shape.GetDimNum() > kMaxIndicatesDims) { - GELOGW("indices input only support 0 or 1 dims"); + GELOGW("indices input only support 0 or 1 dims."); return NOT_CHANGED; } return SUCCESS; } void GatherV2Kernel::DebugPrint(int64_t axis, const GeShape &x_shape, const GeShape &indices_shape, const std::vector &y_shape) { - GELOGD("GatherV2Kernel axis:%ld x_shape:%zu indices_shape:%zu y_shape:%zu", axis, x_shape.GetDimNum(), + GELOGD("GatherV2Kernel axis:%ld x_shape:%zu indices_shape:%zu y_shape:%zu.", axis, x_shape.GetDimNum(), indices_shape.GetDimNum(), y_shape.size()); for (size_t i = 0; i < x_shape.GetDimNum(); i++) { - GELOGD("GatherV2Kernel x_shape[%zu]: %ld", i, x_shape.GetDim(i)); + GELOGD("GatherV2Kernel x_shape[%zu]: %ld.", i, x_shape.GetDim(i)); } for 
(size_t i = 0; i < indices_shape.GetDimNum(); i++) { - GELOGD("GatherV2Kernel indices_shape[%zu]: %ld", i, indices_shape.GetDim(i)); + GELOGD("GatherV2Kernel indices_shape[%zu]: %ld.", i, indices_shape.GetDim(i)); } for (size_t i = 0; i < y_shape.size(); i++) { - GELOGD("GatherV2Kernel y_shape[%zu]: %ld", i, y_shape[i]); + GELOGD("GatherV2Kernel y_shape[%zu]: %ld.", i, y_shape[i]); } for (auto ele : indicates_) { - GELOGD("GatherV2Kernel indices:%ld", ele); + GELOGD("GatherV2Kernel indices:%ld.", ele); } } @@ -376,10 +376,10 @@ Status GatherV2Kernel::Compute(const OpDescPtr op_desc_ptr, const vectorGetName().c_str()); + GELOGI("GatherV2Kernel[%s] start Process", op_desc_ptr->GetName().c_str()); ConstGeTensorPtr tensor0 = input.at(kGatherV2InputIndexZero); ConstGeTensorPtr tensor1 = input.at(kGatherV2InputIndexOne); ConstGeTensorPtr tensor2 = input.at(kGatherV2InputIndexTwo); @@ -394,7 +394,7 @@ Status GatherV2Kernel::Compute(const OpDescPtr op_desc_ptr, const vector= 0 ? axis : axis + x_shape.GetDimNum(); // check axis value if (axis < 0 || (axis + 1) > static_cast(x_shape.GetDimNum())) { - GELOGW("axis is invalid"); + GELOGW("axis is invalid!"); return NOT_CHANGED; } auto indices_data_type = tensor1->GetTensorDesc().GetDataType(); @@ -407,7 +407,8 @@ Status GatherV2Kernel::Compute(const OpDescPtr op_desc_ptr, const vectorGetTensorDesc().GetDataType(); if (supported_type.find(x_data_type) == supported_type.end()) { - GELOGI("GatherV2Kernel does not support this Data type:%s", TypeUtils::DataTypeToSerialString(x_data_type).c_str()); + GELOGI("GatherV2Kernel does not support this Data type:%s.", + TypeUtils::DataTypeToSerialString(x_data_type).c_str()); return NOT_CHANGED; } // calc output shape diff --git a/ge/host_kernels/identity_kernel.cc b/ge/host_kernels/identity_kernel.cc index 702f5c93..ef1446a8 100644 --- a/ge/host_kernels/identity_kernel.cc +++ b/ge/host_kernels/identity_kernel.cc @@ -61,4 +61,5 @@ Status IdentityKernel::Compute(const ge::OpDescPtr op_desc, 
const std::vector &input, vector &input, vector &v_output) { - GELOGD("StridedSliceKernel in."); + GELOGD("StridedSliceKernel in"); // 1.Check input and attrs if (CheckAndGetAttr(attr) != SUCCESS) { - GELOGW("Check and get attrs failed.Ignore kernel."); + GELOGW("Check and get attrs failed.Ignore kernel"); return NOT_CHANGED; } if (CheckInputParam(input) != SUCCESS) { - GELOGW("Check input params failed.Ignore kernel."); + GELOGW("Check input params failed.Ignore kernel"); return NOT_CHANGED; } // 2.Init param with mask attrs. @@ -120,7 +120,7 @@ Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector(data_size), data_type, input_dims, begin_vec, output_dims, output_ptr.get(), stride_vec); if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, "SetOutputSliceData failed."); + GELOGE(INTERNAL_ERROR, "SetOutputSliceData failed"); return NOT_CHANGED; } @@ -133,7 +133,7 @@ Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector & return PARAM_INVALID; } if (kIndexNumberType.find(begin_tensor_desc.GetDataType()) == kIndexNumberType.end()) { - GELOGW("Data type of StridedSlice OP(begin,end,strides) must be int32 or int64."); + GELOGW("Data type of StridedSlice OP(begin,end,strides) must be int32 or int64"); return PARAM_INVALID; } @@ -250,7 +250,7 @@ Status StridedSliceKernel::InitParamWithAttrs(const std::vector kMaxHbmMemorySize) { - GELOGE(PARAM_INVALID, "Invalid HBM memory size: %zu", allocate_size); + GELOGE(PARAM_INVALID, "[Check][Param:size_t]Invalid HBM memory size: %zu bigger than limit:%lu, check invalid.", + allocate_size, kMaxHbmMemorySize); + REPORT_CALL_ERROR("E19999", "Invalid HBM memory size: %zu bigger than limit:%lu, check invalid.", + allocate_size, kMaxHbmMemorySize); return nullptr; } void *try_reuse_addr = nullptr; @@ -87,7 +94,10 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { .Malloc(allocate_size, reinterpret_cast(try_reuse_addr), device_id_); } if (buffer == nullptr) { - 
GELOGE(MEMALLOC_FAILED, "Failed to malloc memory, device_id = %u, size = %zu", device_id_, allocate_size); + GELOGE(MEMALLOC_FAILED, "[Malloc][Memory] Failed, device_id = %u, size = %zu", + device_id_, allocate_size); + REPORT_CALL_ERROR("E19999", "malloc memory failed, device_id = %u, size = %zu", + device_id_, allocate_size); return nullptr; } diff --git a/ge/hybrid/common/tensor_value.cc b/ge/hybrid/common/tensor_value.cc index c691c6f3..275e24f6 100644 --- a/ge/hybrid/common/tensor_value.cc +++ b/ge/hybrid/common/tensor_value.cc @@ -32,7 +32,8 @@ std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator } if (allocator == nullptr) { - GELOGE(INTERNAL_ERROR, "allocator is NULL"); + GELOGE(INTERNAL_ERROR, "[Check][Param:NpuMemoryAllocator] allocator is NULL."); + REPORT_INNER_ERROR("E19999", "input allocator is NULL."); return nullptr; } @@ -42,7 +43,8 @@ std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator } buffer = allocator->Allocate(size, attr); if (buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to allocate memory. size = %zu", size); + GELOGE(MEMALLOC_FAILED, "[Allocate][Memory] Failed. 
size = %zu.", size); + REPORT_CALL_ERROR("E19999", "allocate failed, size = %zu.", size); return nullptr; } diff --git a/ge/hybrid/executor/hybrid_execution_context.cc b/ge/hybrid/executor/hybrid_execution_context.cc index 50f6287c..f1357285 100644 --- a/ge/hybrid/executor/hybrid_execution_context.cc +++ b/ge/hybrid/executor/hybrid_execution_context.cc @@ -59,8 +59,31 @@ Status GraphExecutionContext::Synchronize(rtStream_t rt_stream) { return SUCCESS; } - GELOGE(RT_FAILED, "Failed to invoke rtStreamSynchronize, ret = %d", rt_ret); + GELOGE(RT_FAILED, "[Invoke][rtStreamSynchronize] failed, ret = %d", rt_ret); + REPORT_CALL_ERROR("E19999", "invoke rtStreamSynchronize failed, ret = %d", rt_ret); return RT_FAILED; } + +Status GraphExecutionContext::DumpExceptionInfo(const std::vector &exception_infos) { + if (exception_infos.empty()) { + GELOGI("[Dump][ExceptionInfo] Exception info is null."); + return SUCCESS; + } + GELOGI("[Dump][ExceptionInfo] Start to search dynamic op info and to dump."); + if (exception_dumper.DumpExceptionInfo(exception_infos) != SUCCESS) { + GELOGE(FAILED, "[Dump][Exception] Dump dynamic op exception info failed."); + return FAILED; + } + GELOGI("[Dump][ExceptionInfo] Start to search static op info and to dump."); + for (const auto &iter : davinci_model) { + if (iter != nullptr) { + if (iter->DumpExceptionInfo(exception_infos) != SUCCESS) { + GELOGE(FAILED, "[Dump][ExceptionInfo] Dump static op exception info failed."); + return FAILED; + } + } + } + return SUCCESS; +} } // namespace hybrid } // namespace ge \ No newline at end of file diff --git a/ge/hybrid/executor/hybrid_execution_context.h b/ge/hybrid/executor/hybrid_execution_context.h index 4dc010df..67a96e98 100644 --- a/ge/hybrid/executor/hybrid_execution_context.h +++ b/ge/hybrid/executor/hybrid_execution_context.h @@ -23,6 +23,7 @@ #include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" #include "graph/ge_local_context.h" +#include 
"graph/load/model_manager/davinci_model.h" #include "hybrid/common/npu_memory_allocator.h" #include "hybrid/common/tensor_value.h" #include "hybrid/executor/hybrid_profiler.h" @@ -54,6 +55,7 @@ struct GraphExecutionContext { void SetErrorCode(Status error_code); Status GetStatus() const; Status Synchronize(rtStream_t rt_stream); + Status DumpExceptionInfo(const std::vector &exception_infos); uint64_t session_id = 0; uint64_t context_id = 0; @@ -68,9 +70,12 @@ struct GraphExecutionContext { DumpProperties dump_properties; bool trace_enabled = false; bool dump_enabled = false; - std::atomic_bool is_eos_; + ExceptionDumper exception_dumper; + std::vector> davinci_model; + std::atomic_bool is_eos_{false}; long profiling_level = 0; long iteration = 0; + void *global_step = nullptr; private: Status status = SUCCESS; diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index b6c4dc9e..f3f1e1f5 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -29,7 +29,7 @@ const size_t kMinimumPiplineStages = 2; const int kDefaultLoopCount = 10; } HybridModelAsyncExecutor::HybridModelAsyncExecutor(HybridModel *model) - : model_(model), run_flag_(false) { + : model_(model), run_flag_(false), data_dumper_(nullptr) { } HybridModelAsyncExecutor::~HybridModelAsyncExecutor() { @@ -46,13 +46,13 @@ void HybridModelAsyncExecutor::SetModelId(uint32_t model_id) { model_id_ = model_id; } -void HybridModelAsyncExecutor::SetModelName(const string &model_name) { - om_name_ = model_name; -} - Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr &data) { - GE_CHK_STATUS_EXEC(data_inputer_->Push(data), return domi::DATA_QUEUE_ISFULL, - "Data queue is full, please call again later, model_id %u ", model_id_); + if (data_inputer_->Push(data) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Data queue is full, please call again later, model_id %u.", model_id_); + 
GELOGE(domi::DATA_QUEUE_ISFULL, + "[Push][Data] Data queue is full, please call again later, model_id %u ", model_id_); + return domi::DATA_QUEUE_ISFULL; + } GELOGD("EnqueueData successfully. model_id = %u, data_index = %u", data->GetInput().model_id, data->GetInput().index); return SUCCESS; } @@ -60,17 +60,22 @@ Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr Status HybridModelAsyncExecutor::Start(const std::shared_ptr &listener) { GELOGD("HybridModelExecutor::Start IN, has listener = %d", listener != nullptr); std::lock_guard lk(mu_); - GE_CHK_BOOL_RET_STATUS(!run_flag_, INTERNAL_ERROR, "Model already started."); - + if (run_flag_) { + REPORT_INNER_ERROR("E19999", "Model already started, model_id:%u.", model_id_); + GELOGE(INTERNAL_ERROR, "[Check][RunState] Model already started, model_id:%u.", model_id_); + return INTERNAL_ERROR; + } run_flag_ = true; listener_ = listener; future_ = std::async(std::launch::async, [&]() -> Status { GetThreadLocalContext() = *executor_->GetContext()->ge_context; GetContext().SetSessionId(executor_->GetContext()->session_id); + GetContext().SetContextId(executor_->GetContext()->context_id); return RunInternal(); }); - GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, "Failed to start."); + GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, + "[Check][RunState] Failed to start, model_id:%u.", model_id_); GELOGD("HybridModelExecutor::Start successfully"); return SUCCESS; } @@ -104,26 +109,29 @@ Status HybridModelAsyncExecutor::Init() { executor_ = std::unique_ptr(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_)); GE_CHECK_NOTNULL(executor_); - GE_CHK_STATUS_RET(executor_->Init(), "Failed to init hybrid engine"); - GE_CHK_STATUS_RET(DumpOpDebug(),"Dump op debug failed in hybrid engine"); + GE_CHK_STATUS_RET(executor_->Init(), + "[Init][HybridModelExecutor] failed, model_id:%u.", model_id_); + GE_CHK_STATUS_RET(DumpOpDebug(), "[Dump][OpDebug] failed, model_id:%u.", model_id_); 
GELOGI("HybridModel stage nums:%zu", model_->GetRootGraphItem()->NumGroups()); if (model_->GetRootGraphItem()->NumGroups() >= kMinimumPiplineStages) { pipe_executor_ = std::unique_ptr(new(std::nothrow) HybridModelPipelineExecutor(model_, device_id_)); GE_CHECK_NOTNULL(pipe_executor_); - GE_CHK_STATUS_RET(pipe_executor_->Init(), "Failed to init hybrid engine"); + GE_CHK_STATUS_RET(pipe_executor_->Init(), + "[Init][HybridModelPipelineExecutor] failed, model_id:%u.", model_id_); } - GE_CHK_STATUS_RET(InitInputDesc(), "Failed to init input tensors"); + GE_CHK_STATUS_RET(InitInputDesc(), "[Init][InputDesc] failed, model_id:%u.", model_id_); return SUCCESS; } Status HybridModelAsyncExecutor::PreRun(InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args) { - GE_CHK_STATUS_RET(SyncVarData(), "Failed to sync var data"); + GE_CHK_STATUS_RET(SyncVarData(), "[Invoke][SyncVarData] failed, model_id:%u.", model_id_); RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[SyncVarData] End"); - GE_CHK_STATUS_RET(PrepareInputs(current_data, args), "Failed to copy input data to model"); + GE_CHK_STATUS_RET(PrepareInputs(current_data, args), + "[Invoke][PrepareInputs] failed to copy input data to model, model_id:%u.", model_id_); RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[CopyInputData] End"); return SUCCESS; } @@ -136,8 +144,12 @@ Status HybridModelAsyncExecutor::RunInternal() { GE_MAKE_GUARD(not_used_var, [&] { GE_CHK_RT(rtDeviceReset(device_id)); }); while (run_flag_) { + // Model has not indeedly started running before received data + SetRunningFlag(false); std::shared_ptr data_wrapper; Status ret = data_inputer_->Pop(data_wrapper); + // Model indeedly start running + SetRunningFlag(true); if (data_wrapper == nullptr || ret != SUCCESS) { GELOGI("data_wrapper is null!, ret = %u", ret); continue; @@ -154,7 +166,7 @@ Status HybridModelAsyncExecutor::RunInternal() { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, 
data_wrapper->GetOutput()); CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); - continue, "PreRun failed."); // [No need to check value] + continue, "[Invoke][PreRun] failed, model_id:%u.", model_id_); // [No need to check value] if (pipe_executor_ != nullptr) { GELOGI("HybridModel will execute in pipeline mode"); @@ -166,6 +178,7 @@ Status HybridModelAsyncExecutor::RunInternal() { } else { GELOGI("HybridModel will execute in singleline mode"); ge::GetContext().SetSessionId(executor_->GetContext()->session_id); + ge::GetContext().SetContextId(executor_->GetContext()->context_id); ret = executor_->Execute(args); } ret = HandleResult(ret, current_data.index, args, data_wrapper->GetOutput()); @@ -176,7 +189,8 @@ Status HybridModelAsyncExecutor::RunInternal() { RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[RunInternal] [iteration = %d] End", iterator_count_); iterator_count_++; - GELOGI("run iterator count is %lu", iterator_count_); + SetRunningFlag(false); + GELOGI("run iterator count is %lu, model_id:%u", iterator_count_, model_id_); } CsaInteract::GetInstance().WriteInternalErrorCode(); @@ -197,7 +211,8 @@ Status HybridModelAsyncExecutor::HandleResult(Status exec_ret, } if (exec_ret != SUCCESS) { - GELOGE(exec_ret, "Failed to execute graph. model_id = %u", model_id_); + GELOGE(exec_ret, "[Check][Param:Status] failed to execute graph. model_id = %u", model_id_); + REPORT_INNER_ERROR("E19999", "failed to execute graph. 
model_id = %u", model_id_); return OnComputeDone(data_id, INTERNAL_ERROR, output_tensor_info_list); } @@ -233,8 +248,11 @@ Status HybridModelAsyncExecutor::SyncVarData() { Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args) { if (current_data.blobs.size() < input_tensor_desc_.size()) { - GELOGE(PARAM_INVALID, "Blob size mismatches, expect at least %zu, but got %zu", - input_tensor_desc_.size(), current_data.blobs.size()); + GELOGE(PARAM_INVALID, + "[Check][Size]Blob size mismatches, expect at least %zu, but got %zu, model_id = %u", + input_tensor_desc_.size(), current_data.blobs.size(), model_id_); + REPORT_INNER_ERROR("E19999", "Blob size mismatches, expect at least %zu, but got %zu, model_id = %u.", + input_tensor_desc_.size(), current_data.blobs.size(), model_id_); return PARAM_INVALID; } @@ -246,8 +264,11 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy auto tensor_size = input_sizes_[input_index]; if (is_input_dynamic_[input_index]) { if (input_index >= current_data.shapes.size()) { - GELOGE(PARAM_INVALID, "Shape index out of range, index = %zu, shape size = %zu", - input_index, current_data.shapes.size()); + GELOGE(PARAM_INVALID, + "[Check][Range]Shape index out of range, index = %zu, shape size = %zu model_id = %u.", + input_index, current_data.shapes.size(), model_id_); + REPORT_INNER_ERROR("E19999", "Shape index out of range, index = %zu, shape size = %zu, model_id = %u.", + input_index, current_data.shapes.size(), model_id_); return PARAM_INVALID; } auto &tensor_desc = input_tensor_desc_[input_index]; @@ -255,15 +276,19 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy std::vector> range; auto range_ret = tensor_desc->GetShapeRange(range); GE_CHK_BOOL_RET_STATUS(range_ret == GRAPH_SUCCESS, INTERNAL_ERROR, - "Get shape range failed, ret=%u.", range_ret); + "[Invoke][GetShapeRange] failed, ret=%u, model_id = %u.", range_ret, model_id_); for 
(size_t k = 0; k < range.size(); ++k) { if (k >= shape.GetDimNum()) { break; } // range[k].second can be -1 if (shape.GetDim(k) < range[k].first || (range[k].second >= 0 && shape.GetDim(k) > range[k].second)) { - GELOGE(PARAM_INVALID, "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld]", - input_index, k, shape.GetDim(k), range[k].first, range[k].second); + GELOGE(PARAM_INVALID, "[Check][Range]Dim out of range, shape idx = %zu, dim idx = %zu," + "dim = %ld, range = [%ld, %ld], model_id = %u.", + input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_); + REPORT_INNER_ERROR("E19999", "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld," + "range = [%ld, %ld], model_id = %u.", + input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_); return PARAM_INVALID; } } @@ -271,9 +296,9 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy args.input_desc[input_index] = tensor_desc; GELOGD("Update shape of input[%zu] to [%s]", input_index, tensor_desc->MutableShape().ToString().c_str()); GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, tensor_size), - "Failed to calc tensor size, index = %zu, shape = [%s]", - input_index, - tensor_desc->GetShape().ToString().c_str()); + "[Invoke][GetTensorMemorySizeInBytes]Failed to calc tensor size," + "index = %zu, shape = [%s], model_id = %u.", + input_index, tensor_desc->GetShape().ToString().c_str(), model_id_); GELOGD("Input tensor[%zu] size = %zu", input_index, tensor_size); } @@ -289,25 +314,30 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy GELOGD("To copy input data for input[%zu]", input_index); const DataBuffer &data_buf = blobs[input_index]; auto mem_size = static_cast(tensor_size); - GE_CHK_BOOL_RET_STATUS(mem_size >= data_buf.length, - PARAM_INVALID, - "input data size(%lu) does not match model required size(%lu), ret failed.", - data_buf.length, - mem_size); - - 
GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%zu] memaddr[%p] mem_size[%zu] datasize[%lu]", - model_->root_runtime_param_.graph_id, - input_index, - args.inputs[input_index].GetData(), - mem_size, - data_buf.length); - GE_CHK_RT_RET(rtMemcpy(args.inputs[input_index].MutableData(), - mem_size, - data_buf.data, - data_buf.length, - RT_MEMCPY_HOST_TO_DEVICE)); - } + if (mem_size < data_buf.length) { + REPORT_INNER_ERROR("E19999", + "input data size(%lu) does not match model required size(%lu), ret failed, model_id = %u.", + data_buf.length, mem_size, model_id_); + GELOGE(PARAM_INVALID, + "[Check][Size]input data size(%lu) does not match model required size(%lu), ret failed, model_id = %u.", + data_buf.length, mem_size, model_id_); + return PARAM_INVALID; + } + if (data_buf.length > 0) { + GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%zu] memaddr[%p] mem_size[%zu] datasize[%lu]", + model_->root_runtime_param_.graph_id, + input_index, + args.inputs[input_index].GetData(), + mem_size, + data_buf.length); + GE_CHK_RT_RET(rtMemcpy(args.inputs[input_index].MutableData(), + mem_size, + data_buf.data, + data_buf.length, + RT_MEMCPY_HOST_TO_DEVICE)); + } + } return SUCCESS; } @@ -348,7 +378,7 @@ Status HybridModelAsyncExecutor::OnComputeDone(uint32_t data_index, uint32_t res GELOGD("OnComputeDone. model id = %u, data index = %u, execution ret = %u", model_id_, data_index, result_code); if (listener_ != nullptr) { GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_index, result_code, outputs), - "OnComputeDone failed"); + "[Invoke][OnComputeDone] failed, model_id = %u.", model_id_); } return result_code; @@ -362,9 +392,11 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a std::vector &output_tensors = args.outputs; if (output_tensor_desc_list.size() != output_tensors.size()) { GELOGE(INTERNAL_ERROR, - "Output sizes mismatch. 
From op_desc = %zu, and from output tensors = %zu", - output_tensor_desc_list.size(), - output_tensors.size()); + "[Check][Size]Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu, model_id = %u.", + output_tensor_desc_list.size(), output_tensors.size(), model_id_); + REPORT_INNER_ERROR("E19999", + "Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu, model_id = %u.", + output_tensor_desc_list.size(), output_tensors.size(), model_id_); return INTERNAL_ERROR; } @@ -379,7 +411,7 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a tensor_desc->GetFormat(), tensor_desc->GetDataType(), output_size), - "Failed to calc tensor size for output[%zu]. shape = [%s], type = %s, format = %s", + "[Calc][TensorMemSize]Failed for output[%zu]. shape = [%s], type = %s, format = %s", i, tensor_desc->GetShape().ToString().c_str(), TypeUtils::DataTypeToSerialString(tensor_desc->GetDataType()).c_str(), @@ -396,8 +428,10 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a GE_CHECK_LE(output_size, UINT32_MAX); if (output_tensor.GetSize() < static_cast(output_size)) { GELOGE(INTERNAL_ERROR, - "output[%zu] tensor size(%zu) is not enough for output shape [%s]", - i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str()); + "[Check][Size]output[%zu] tensor size(%zu) is not enough for output shape [%s], model_id = %u.", + i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str(), model_id_); + REPORT_INNER_ERROR("E19999", "output[%zu] tensor size(%zu) is not enough for output shape [%s] model_id = %u", + i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str(), model_id_); return INTERNAL_ERROR; } @@ -444,29 +478,18 @@ Status HybridModelAsyncExecutor::Execute(const std::vector &inputs, TensorValue tensor_value(inputs[i].data, inputs[i].length); args.inputs[i] = tensor_value; } - GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model."); 
- for (const auto &output_tensor_desc : args.output_desc) { - output_desc.emplace_back(*output_tensor_desc); + for (size_t i = 0; i < outputs.size(); ++i) { + args.outputs.emplace_back(TensorValue(outputs[i].data, outputs[i].length)); + } + // usr must designate input tensorDesc when input shape is dynamic in inference + for (size_t i = 0; i < input_desc.size(); ++i) { + ConstGeTensorDescPtr tensor_desc_ptr = MakeShared(input_desc[i]); + args.input_desc.emplace_back(tensor_desc_ptr); } - for (size_t i = 0; i < args.outputs.size(); ++i) { - int64_t output_real_size = 0; - ge::graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(output_desc[i], output_real_size); - if (graph_status != GRAPH_SUCCESS) { - GELOGE(FAILED, "Get tensor size in bytes failed."); - return FAILED; - } - if (output_real_size > 0) { - if (outputs[i].length < static_cast(output_real_size)) { - GELOGE(FAILED, "output idx[%zu], the memory size of output[%lu] given by " - "user should be greater than or equal to the real size of output[%ld]", - i, outputs[i].length, output_real_size); - return FAILED; - } - GE_CHK_RT_RET(rtMemcpy(outputs[i].data, outputs[i].length, args.outputs[i].GetData(), output_real_size, - RT_MEMCPY_DEVICE_TO_DEVICE)); - } - outputs[i].length = output_real_size; + GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed, model_id = %u.", model_id_); + for (const auto &output_tensor_desc : args.output_desc) { + output_desc.emplace_back(*output_tensor_desc); } return SUCCESS; @@ -485,13 +508,15 @@ Status HybridModelAsyncExecutor::Execute(const vector &inputs, vector< } HybridModelExecutor::ExecuteArgs args; - GE_CHK_STATUS_RET(PrepareInputs(input_data, args), "Failed to copy input data to model"); + GE_CHK_STATUS_RET(PrepareInputs(input_data, args), + "[Invoke][PrepareInputs]Failed to copy input data to model, model_id = %u", model_id_); GELOGD("Done copying input data successfully."); - GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model."); 
+ GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed, model_id = %u.", model_id_); std::vector output_tensor_info_list; OutputData output_data; - GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), "Failed to copy outputs."); + GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), + "[Invoke][CopyOutputs]Failed to copy outputs, model_id = %u.", model_id_); GELOGD("Done copying output data successfully. output count = %zu", output_tensor_info_list.size()); int out_index = 0; @@ -542,7 +567,8 @@ Status HybridModelAsyncExecutor::DumpOpDebug() { loop_cond = const_cast(varible_loop_cond->GetData()); } data_dumper_.SetLoopAddr(global_step, loop_per_iter, loop_cond); - GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "LoadDumpInfo failed in hybrid engine"); + GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), + "[Invoke][LoadDumpInfo] failed in hybrid engine, model_id = %u.", model_id_); GELOGD("Dump op debug SUCCESS in hybrid engine"); } return SUCCESS; diff --git a/ge/hybrid/executor/hybrid_model_async_executor.h b/ge/hybrid/executor/hybrid_model_async_executor.h index 69d8a3f4..c5a6533a 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.h +++ b/ge/hybrid/executor/hybrid_model_async_executor.h @@ -51,12 +51,18 @@ class HybridModelAsyncExecutor { void SetModelId(uint32_t model_id); - void SetModelName(const string &model_name); - Status Stop(); Status EnqueueData(const std::shared_ptr &data); + uint32_t GetDataInputerSize() { return data_inputer_->Size(); } + + bool GetRunningFlag() const { return running_flag_; } + + void SetRunningFlag(bool flag) { running_flag_ = flag; } + + const GraphExecutionContext * GeContext() { return executor_->GetContext(); } + private: Status InitInputDesc(); @@ -86,6 +92,8 @@ class HybridModelAsyncExecutor { uint32_t device_id_ = 0U; uint32_t model_id_ = 0U; std::atomic_bool run_flag_; + // check whether model is running with data + bool running_flag_ = false; 
std::unique_ptr data_inputer_; std::unique_ptr executor_; std::unique_ptr pipe_executor_; @@ -97,7 +105,6 @@ class HybridModelAsyncExecutor { std::map input_tensor_desc_; std::vector is_input_dynamic_; std::shared_ptr listener_; - string om_name_; DataDumper data_dumper_; bool is_op_debug_reg_ = false; OpdebugRegister op_debug_register_; diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc index 80b8983a..ea4e6912 100755 --- a/ge/hybrid/executor/hybrid_model_executor.cc +++ b/ge/hybrid/executor/hybrid_model_executor.cc @@ -17,7 +17,10 @@ #include "hybrid_model_executor.h" #include "graph/ge_context.h" #include "graph/runtime_inference_context.h" +#include "graph/utils/tensor_utils.h" +#include "graph/load/model_manager/model_manager.h" #include "common/dump/dump_manager.h" +#include "common/profiling/profiling_manager.h" namespace ge { namespace hybrid { @@ -47,6 +50,16 @@ Status HybridModelExecutor::Execute(HybridModelExecutor::ExecuteArgs &args) { auto root_graph_item = model_->GetRootGraphItem(); GE_CHECK_NOTNULL(root_graph_item); + if (root_graph_item->IsDynamic()) { + GE_CHK_STATUS_RET(CheckInputShapeByShapeRange(root_graph_item, args), + "[%s] check input node shape by shape range failed.", + root_graph_item->GetName().c_str()); + } + + if (context_.global_step != nullptr) { + GE_CHK_RT_RET(rtMemcpyAsync(context_.global_step, sizeof(uint64_t), &context_.iteration, + sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE_EX, context_.stream)); + } SubgraphExecutor executor(model_->GetRootGraphItem(), &context_); auto ret = ExecuteGraphInternal(executor, args); Cleanup(); @@ -61,7 +74,7 @@ Status HybridModelExecutor::Execute(HybridModelExecutor::ExecuteArgs &args) { if (ret == END_OF_SEQUENCE) { args.is_eos = true; } else { - GE_CHK_STATUS_RET(ret, "Failed to execute model"); + GE_CHK_STATUS_RET(ret, "[Invoke][ExecuteGraphInternal] Failed, ret:%d.", ret); } return SUCCESS; } @@ -72,12 +85,37 @@ Status 
HybridModelExecutor::ExecuteGraphInternal(SubgraphExecutor &executor, GE_CHK_STATUS_RET_NOLOG(ResetExecutionContext(context_)); RECORD_MODEL_EXECUTION_EVENT(&context_, "[InitContext] End"); + uint64_t index_id = context_.iteration + 1; + uint64_t model_id = static_cast(model_->GetModelId()); + int32_t device_id = static_cast(device_id_); + auto &prof_mgr = ProfilingManager::Instance(); + // tag_id 0 means step begin, 1 meas step end. + if (!model_->IsSingleOp() && prof_mgr.ProfilingModelLoadOn()) { + GE_CHK_STATUS_RET_NOLOG(prof_mgr.ProfileStepInfo(index_id, model_id, 0, stream_, device_id)); + } + HYBRID_CHK_STATUS_RET(executor.ExecuteAsync(args.inputs, args.input_desc, args.outputs), "Failed to execute partitioned call."); RECORD_MODEL_EXECUTION_EVENT(&context_, "[ExecuteAsync] End"); - HYBRID_CHK_STATUS_RET(executor.Synchronize(), "Failed to sync root graph."); - RECORD_MODEL_EXECUTION_EVENT(&context_, "[Synchronize] End"); + if (!model_->IsSingleOp() && prof_mgr.ProfilingModelLoadOn()) { + GE_CHK_STATUS_RET_NOLOG(prof_mgr.ProfileStepInfo(index_id, model_id, 1, stream_, device_id)); + } + + if (!model_->IsSingleOp()) { + Status ret = executor.Synchronize(); + if (ret != ge::SUCCESS) { + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + auto exception_infos = model_manager->GetExceptionInfos(); + if (!exception_infos.empty()) { + HYBRID_CHK_STATUS_RET(context_.DumpExceptionInfo(exception_infos), + "[Execute][GraphInternal] Dump exception info failed."); + } + GELOGE(ret, "[Execute][GraphInternal] Synchronize failed."); + } + RECORD_MODEL_EXECUTION_EVENT(&context_, "[Synchronize] End"); + } args.outputs.clear(); HYBRID_CHK_STATUS_RET(executor.GetOutputs(args.outputs, args.output_desc), "Failed to get outputs"); @@ -98,6 +136,7 @@ Status HybridModelExecutor::InitExecutionContext() { GE_CHK_RT_RET(rtCtxCreate(&context_.rt_gen_context, RT_CTX_GEN_MODE, 0)); GE_CHK_RT_RET(rtCtxSetCurrent(context_.rt_context)); + 
context_.global_step = model_->GetGlobalStep(); context_.stream = stream_; context_.model = model_; context_.is_eos_ = false; @@ -130,6 +169,70 @@ Status HybridModelExecutor::ResetExecutionContext(GraphExecutionContext &context string ctx_id = std::to_string(context.context_id); RuntimeInferenceContext::DestroyContext(ctx_id); GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::CreateContext(ctx_id), "Failed to Destroy RuntimeInferenceContext"); + RuntimeInferenceContext *ctx = nullptr; + GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::GetContext(ctx_id, &ctx), "Failed to get context"); + for (auto &host_tensor : context.model->GetHostTensors()) { + auto node_id = host_tensor.first; + for (const auto &output_idx_and_tensor : host_tensor.second) { + auto output_idx = output_idx_and_tensor.first; + GELOGD("Preload const host tensor, node_id = %ld, output id = %d", node_id, output_idx); + ctx->SetTensor(node_id, output_idx, output_idx_and_tensor.second.Clone()); + } + } + return SUCCESS; +} + +Status HybridModelExecutor::CheckInputShapeByShapeRange(const GraphItem *graph_item, + HybridModelExecutor::ExecuteArgs &args) { + GE_CHECK_NOTNULL(graph_item); + auto input_nodes = graph_item->GetInputNodes(); + for (size_t i = 0; i < input_nodes.size(); ++i) { + auto &input_node = input_nodes[i]; + if (input_node == nullptr) { + GELOGD("[%s] Input[%zu] is not needed by graph, skip it.", graph_item->GetName().c_str(), i); + continue; + } + if (!input_node->is_dynamic) { + GELOGD("[%s] Input[%zu] is not dynamic, skip it.", graph_item->GetName().c_str(), i); + continue; + } + GeTensorDescPtr model_input_desc = input_node->MutableInputDesc(0); + GE_CHECK_NOTNULL(model_input_desc); + std::vector> shape_range; + if (model_input_desc->GetShapeRange(shape_range) != SUCCESS) { + REPORT_INNER_ERROR("E19999", "[%s] Input[%zu] get shape range failed", graph_item->GetName().c_str(), i); + GELOGE(INTERNAL_ERROR, "[%s] Input[%zu] get shape range failed", graph_item->GetName().c_str(), i); + 
return INTERNAL_ERROR; + } + if (shape_range.empty()) { + GELOGD("[%s] Input[%zu] shape is not needed to check by shape range, skip it.", graph_item->GetName().c_str(), i); + continue; + } + if (i >= args.input_desc.size()) { + REPORT_INNER_ERROR("E19999", "[%s] Inputs[%zu] is greater than or equal to input desc size[%zu].", + graph_item->GetName().c_str(), i, args.input_desc.size()); + GELOGE(INTERNAL_ERROR, "[%s] inputs[%zu] is greater than or equal to input desc size[%zu].", + graph_item->GetName().c_str(), i, args.input_desc.size()); + return INTERNAL_ERROR; + } + ConstGeTensorDescPtr args_tensor_desc = args.input_desc[i]; + GE_CHECK_NOTNULL(args_tensor_desc); + GeShape shape = args_tensor_desc->GetShape(); + if (shape.IsUnknownShape()) { + REPORT_INNER_ERROR("E19999", "[%s] Input desc shape [%zu] designed by user must be static.", + graph_item->GetName().c_str(), i); + GELOGE(INTERNAL_ERROR, "[%s] Input desc shape [%zu] designed by user must be static.", + graph_item->GetName().c_str(), i); + return INTERNAL_ERROR; + } + + if (TensorUtils::CheckShapeByShapeRange(shape, shape_range) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Check][InputShape] [%s] check input [%zu] shape failed by shape range.", + graph_item->GetName().c_str(), i); + return PARAM_INVALID; + } + } + return SUCCESS; } } // namespace hybrid diff --git a/ge/hybrid/executor/hybrid_model_executor.h b/ge/hybrid/executor/hybrid_model_executor.h index 0b2cd1ed..566043d9 100644 --- a/ge/hybrid/executor/hybrid_model_executor.h +++ b/ge/hybrid/executor/hybrid_model_executor.h @@ -52,6 +52,7 @@ class HybridModelExecutor { Status Cleanup(); Status InitExecutionContext(); static Status ResetExecutionContext(GraphExecutionContext &context); + static Status CheckInputShapeByShapeRange(const GraphItem *graph_item, HybridModelExecutor::ExecuteArgs &args); HybridModel *model_; uint32_t device_id_; diff --git a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc 
index 4706fa97..b2a77653 100644 --- a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc +++ b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc @@ -4,6 +4,7 @@ #include "common/dump/dump_manager.h" #include "graph/ge_context.h" #include "graph/runtime_inference_context.h" +#include "graph/load/model_manager/model_manager.h" namespace ge { namespace hybrid { @@ -38,6 +39,16 @@ Status StageExecutor::ResetExecutionContext(GraphExecutionContext &context) { string ctx_id = std::to_string(context.context_id); RuntimeInferenceContext::DestroyContext(ctx_id); GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::CreateContext(ctx_id), "Failed to Destroy RuntimeInferenceContext"); + RuntimeInferenceContext *ctx = nullptr; + GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::GetContext(ctx_id, &ctx), "Failed to get context"); + for (auto &host_tensor : context.model->GetHostTensors()) { + auto node_id = host_tensor.first; + for (const auto &output_idx_and_tensor : host_tensor.second) { + auto output_idx = output_idx_and_tensor.first; + GELOGD("Preload const host tensor, node_id = %ld, output id = %d", node_id, output_idx); + ctx->SetTensor(node_id, output_idx, output_idx_and_tensor.second.Clone()); + } + } return SUCCESS; } @@ -59,23 +70,25 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v task_queue_.Pop(task_info); GELOGD("[Executor: %d] Got task, stage = %d, iteration = %ld", id_, task_info.stage, task_info.iteration); if (task_info.iteration >= pipe_config_->iteration_end) { - GELOGE(INTERNAL_ERROR, "[Executor: %d] Unexpected iteration: %d", id_, task_info.iteration); + GELOGE(INTERNAL_ERROR, "[Check][Range][Executor: %d] Unexpected iteration: %ld.", id_, task_info.iteration); + REPORT_INNER_ERROR("E19999", "[Executor: %d] Unexpected iteration: %ld.", id_, task_info.iteration); return INTERNAL_ERROR; } if (task_info.event != nullptr) { GELOGD("[%d] Add StreamWaitEvent", id_); GE_CHK_RT_RET(rtStreamWaitEvent(stream_, task_info.event)); - 
RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] End", task_info.iteration - 1, + RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] [Stage = %d] End", task_info.iteration - 1, task_info.stage); } - RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] Start", task_info.iteration, + RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %lld] [Stage = %d] Start", task_info.iteration, task_info.stage); if (task_info.stage == 0) { GELOGD("[Executor: %d] To ResetExecutionContext", id_); - GE_CHK_STATUS_RET(ResetExecutionContext(context_), "[Executor: %d] Failed to reset context", id_); + GE_CHK_STATUS_RET(ResetExecutionContext(context_), + "[Invoke][ResetExecutionContext][Executor: %d] Failed to reset context", id_); context_.iteration = task_info.iteration; GE_CHK_STATUS_RET_NOLOG(SetInputs(inputs, input_desc)); } @@ -92,19 +105,22 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v auto sync_result = Synchronize(); if (sync_result != SUCCESS) { - GELOGE(sync_result, "[Executor: %d] Failed to sync result. iteration = %d", id_, task_info.iteration); - + GELOGE(sync_result, + "[Invoke][Synchronize][Executor: %d] Failed to sync result:%d. iteration = %ld", + id_, sync_result, task_info.iteration); + REPORT_CALL_ERROR("E19999", "[Executor: %d] Failed to sync result:%d. 
iteration = %ld", + id_, sync_result, task_info.iteration); context_.profiler->Dump(std::cout); context_.callback_manager->Destroy(); RuntimeInferenceContext::DestroyContext(std::to_string(context_.context_id)); return sync_result; } - RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] End", task_info.iteration, task_info.stage); + RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] [Stage = %d] End", task_info.iteration, task_info.stage); // if not end stage if (task_info.stage >= pipe_config_->num_stages - 1) { - RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] Schedule End", task_info.iteration); + RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] Schedule End", task_info.iteration); GELOGD("[Executor: %d] End of iteration [%ld]", id_, task_info.iteration); context_.callback_manager->Destroy(); RuntimeInferenceContext::DestroyContext(std::to_string(context_.context_id)); @@ -242,7 +258,8 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar GELOGD("Start to sync result of executor[%zu]", i); auto ret = futures[i].get(); if (ret != SUCCESS) { - GELOGE(ret, "[Executor: %zu] Failed to schedule tasks.", i); + GELOGE(ret, "[Check][Result][Executor: %zu] Failed to schedule tasks.", i); + REPORT_INNER_ERROR("E19999", "[Executor: %zu] Failed to schedule tasks.", i); has_error = true; continue; } @@ -250,7 +267,15 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar ret = stage_executors_[i]->Synchronize(); if (ret != SUCCESS) { - GELOGE(ret, "[Executor: %zu] Failed to synchronize result.", i); + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + auto exception_infos = model_manager->GetExceptionInfos(); + if (!exception_infos.empty()) { + HYBRID_CHK_STATUS_RET(context_.DumpExceptionInfo(exception_infos), + "[Execute][GraphInternal] Dump exception info failed."); + } + GELOGE(ret, "[Invoke][Synchronize] failed for [Executor: 
%zu].", i); + REPORT_CALL_ERROR("E19999", "[Executor: %zu] failed to Synchronize result.", i); has_error = true; continue; } @@ -266,13 +291,14 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar iteration_ = config_.iteration_end; if (has_error) { - GELOGE(FAILED, "Error occurred while execution"); + GELOGE(FAILED, "[Check][Error]Error occurred while execution."); + REPORT_INNER_ERROR("E19999", "Error occurred while execution."); return FAILED; } auto last_iter_executor_idx = loop_count % stage_executors_.size(); GE_CHK_STATUS_RET(stage_executors_[last_iter_executor_idx]->GetOutputs(args.outputs, args.output_desc), - "Failed to get output from executor[%zu]", last_iter_executor_idx); + "[Get][Outputs]Failed from executor[%zu]", last_iter_executor_idx); return SUCCESS; } diff --git a/ge/hybrid/executor/hybrid_profiler.cc b/ge/hybrid/executor/hybrid_profiler.cc index 3b6865bb..384dc770 100644 --- a/ge/hybrid/executor/hybrid_profiler.cc +++ b/ge/hybrid/executor/hybrid_profiler.cc @@ -40,7 +40,8 @@ void HybridProfiler::RecordEvent(EventType event_type, const char *fmt, ...) { char buf[kEventDescMax]; if (vsnprintf_s(buf, kEventDescMax, kEventDescMax - 1, fmt, args) == -1) { - GELOGE(FAILED, "Format %s failed.", fmt); + GELOGE(FAILED, "[Parse][Param:fmt]Format %s failed.", fmt); + REPORT_CALL_ERROR("E19999", "Parse Format %s failed.", fmt); va_end(args); return; } @@ -48,7 +49,9 @@ void HybridProfiler::RecordEvent(EventType event_type, const char *fmt, ...) { va_end(args); auto index = counter_++; if (index >= static_cast(events_.size())) { - GELOGE(INTERNAL_ERROR, "index out of range. index = %d, max event size = %zu", index, events_.size()); + GELOGE(INTERNAL_ERROR, + "[Check][Range]index out of range. index = %d, max event size = %zu", index, events_.size()); + REPORT_INNER_ERROR("E19999", "index out of range. 
index = %d, max event size = %zu", index, events_.size()); return; } auto &evt = events_[index]; diff --git a/ge/hybrid/executor/node_done_manager.cc b/ge/hybrid/executor/node_done_manager.cc index f0d4324a..f33ffcd9 100644 --- a/ge/hybrid/executor/node_done_manager.cc +++ b/ge/hybrid/executor/node_done_manager.cc @@ -28,7 +28,8 @@ bool NodeDoneManager::Cond::Await() { if (!cv_.wait_for(lk, std::chrono::seconds(kDefaultWaitTimeoutInSec), [&]() { return is_released_ || is_cancelled_; })) { - GELOGE(INTERNAL_ERROR, "Wait timed out."); + GELOGE(INTERNAL_ERROR, "[Invoke][wait_for]Wait timed out."); + REPORT_INNER_ERROR("E19999", "wait timed out[%d].", kDefaultWaitTimeoutInSec); return false; } diff --git a/ge/hybrid/executor/node_state.cc b/ge/hybrid/executor/node_state.cc index 3ec967d3..ce8304b0 100644 --- a/ge/hybrid/executor/node_state.cc +++ b/ge/hybrid/executor/node_state.cc @@ -35,12 +35,14 @@ ShapeInferenceState::ShapeInferenceState(const NodeItem &node_item) : node_item( node_item.NodeName().c_str(), this->num_pending_shapes_); - for (int i = 0; i < node_item.num_inputs; ++i){ - input_tensor_desc.emplace_back(*node_item.MutableInputDesc(i)); + input_tensor_desc.resize(node_item.num_inputs); + for (int i = 0; i < node_item.num_inputs; ++i) { + node_item.GetInputDesc(i, input_tensor_desc[i]); } - for (int i = 0; i < node_item.num_outputs; ++i){ - output_tensor_desc.emplace_back(*node_item.MutableOutputDesc(i)); + output_tensor_desc.resize(node_item.num_outputs); + for (int i = 0; i < node_item.num_outputs; ++i) { + node_item.GetOutputDesc(i, output_tensor_desc[i]); } } @@ -54,19 +56,28 @@ Status ShapeInferenceState::UpdateInputShape(int idx, const GeTensorDesc &target return SUCCESS; } + std::lock_guard lk(mu_); + auto &input_desc = input_tensor_desc[idx]; + GeShape shape = target.GetShape(); + input_desc.SetShape(shape); + input_desc.SetOriginShape(target.GetOriginShape()); int64_t tensor_size = -1; (void) TensorUtils::GetSize(target, tensor_size); + if 
(tensor_size <= 0) { + Format format = input_desc.GetFormat(); + DataType data_type = input_desc.GetDataType(); + if (TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size) != GRAPH_SUCCESS) { + GELOGE(FAILED, "[Invoke][CalcTensorMemSize] failed for [%s].", node_item.NodeName().c_str()); + REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed for [%s].", node_item.NodeName().c_str()); + return FAILED; + } + } GELOGD("[%s] Update input shape [%d] with Shape: [%s] and OriginalShape: [%s], size = %ld", node_item.NodeName().c_str(), idx, - target.GetShape().ToString().c_str(), + shape.ToString().c_str(), target.GetOriginShape().ToString().c_str(), tensor_size); - - std::lock_guard lk(mu_); - auto &input_desc = input_tensor_desc[idx]; - input_desc.SetShape(target.GetShape()); - input_desc.SetOriginShape(target.GetOriginShape()); (void) TensorUtils::SetSize(input_desc, tensor_size); if (--num_pending_shapes_ <= 0) { ready_cv_.notify_all(); @@ -111,13 +122,15 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex } if (context.GetStatus() != SUCCESS) { - GELOGE(FAILED, "[%s] Await pending shape cancelled", node_item.NodeName().c_str()); + GELOGE(FAILED, "[Check][Status][%s] Await pending shape cancelled.", node_item.NodeName().c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Await pending shape cancelled.", node_item.NodeName().c_str()); break; } } if (!wait_success) { - GELOGE(FAILED, "[%s] Wait for shape timeout.", node_item.NodeName().c_str()); + GELOGE(FAILED, "[Check][Status][%s] Wait for shape timeout:%d.", node_item.NodeName().c_str(), kWaitInternal); + REPORT_CALL_ERROR("E19999", "[%s] Wait for shape timeout:%d.", node_item.NodeName().c_str(), kWaitInternal); return FAILED; } } @@ -221,8 +234,7 @@ Status NodeState::AwaitInputTensors(GraphExecutionContext &context) const { Status NodeState::WaitForPrepareDone() { if (prepare_future_.valid()) { GELOGD("[%s] Start to wait for prepare future.", GetName().c_str()); - 
GE_CHK_STATUS_RET(prepare_future_.get(), - "[%s] PreRun failed.", GetName().c_str()); + GE_CHK_STATUS_RET(prepare_future_.get(), "[Check][Status][%s] PreRun failed.", GetName().c_str()); } return SUCCESS; diff --git a/ge/hybrid/executor/rt_callback_manager.cc b/ge/hybrid/executor/rt_callback_manager.cc index d3989f31..90f579ab 100644 --- a/ge/hybrid/executor/rt_callback_manager.cc +++ b/ge/hybrid/executor/rt_callback_manager.cc @@ -27,7 +27,8 @@ Status CallbackManager::RegisterCallback(rtStream_t stream, rtCallback_t callbac GE_CHK_RT_RET(rtEventCreate(&event)); auto rt_ret = rtEventRecord(event, stream); if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to invoke rtEventRecord, error code = %d", rt_ret); + GELOGE(RT_FAILED, "[Invoke][rtEventRecord] failed, error code = %d", rt_ret); + REPORT_CALL_ERROR("E19999", "Invoke rtEventRecord failed, error code = %d", rt_ret); (void) rtEventDestroy(event); return RT_FAILED; } @@ -50,7 +51,8 @@ Status CallbackManager::Init() { return CallbackProcess(context); }, ctx); if (!ret_future_.valid()) { - GELOGE(INTERNAL_ERROR, "Failed to init callback manager."); + GELOGE(INTERNAL_ERROR, "[Check][ShareState]Failed to init callback manager."); + REPORT_INNER_ERROR("E19999", "Failed to init callback manager."); return INTERNAL_ERROR; } @@ -73,7 +75,8 @@ Status CallbackManager::CallbackProcess(rtContext_t context) { auto rt_err = rtEventSynchronize(event); if (rt_err != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtEventSynchronize failed. ret = %d", rt_err); + GELOGE(RT_FAILED, "[Invoke][rtEventSynchronize] failed. 
ret = %d", rt_err); + REPORT_CALL_ERROR("E19999", "Invoke rtEventSynchronize failed, ret = %d.", rt_err); GE_CHK_RT(rtEventDestroy(event)); return RT_FAILED; } diff --git a/ge/hybrid/executor/subgraph_context.cc b/ge/hybrid/executor/subgraph_context.cc index 0fa112a4..b26afb9c 100644 --- a/ge/hybrid/executor/subgraph_context.cc +++ b/ge/hybrid/executor/subgraph_context.cc @@ -50,9 +50,10 @@ NodeStatePtr SubgraphContext::GetOrCreateNodeState(const NodeItem *node_item) { Status SubgraphContext::SetInput(int index, const TensorValue &tensor) { if (static_cast(index) >= all_inputs_.size()) { GELOGE(INTERNAL_ERROR, - "output index output range. all input num = %zu, input index = %d", - all_inputs_.size(), - index); + "[Check][Param:index]input index out of range. all input num = %zu, input index = %d", + all_inputs_.size(), index); + REPORT_INNER_ERROR("E19999", "input param index out of range, all input num = %zu, input index = %d.", + all_inputs_.size(), index); return INTERNAL_ERROR; } all_inputs_[index] = tensor; @@ -67,11 +68,11 @@ Status SubgraphContext::SetInput(const NodeItem &node_item, int input_index, con Status SubgraphContext::SetOutput(const NodeItem &node_item, int output_index, const TensorValue &tensor) { auto index = node_item.output_start + output_index; if ((output_index >= node_item.num_outputs) || (static_cast(index) >= all_outputs_.size())) { - GELOGE(INTERNAL_ERROR, - "output index output range. all output num = %zu, node_item = %s, output index = %d", - all_outputs_.size(), - node_item.DebugString().c_str(), - output_index); + GELOGE(INTERNAL_ERROR, "[Check][Param:output_index]output index out of range. all output num = %zu," + "node_item = %s, output index = %d.", + all_outputs_.size(), node_item.DebugString().c_str(), output_index); + REPORT_INNER_ERROR("E19999", "output index out of range. 
all output num = %zu, node_item = %s, output index = %d.", + all_outputs_.size(), node_item.DebugString().c_str(), output_index); return INTERNAL_ERROR; } @@ -126,7 +127,10 @@ Status SubgraphContext::Await(const NodePtr &node) { void SubgraphContext::OnError(Status error) { if (error != END_OF_SEQUENCE) { - GELOGE(error, "[%s] Error occurred while executing graph.", graph_item_->GetName().c_str()); + GELOGE(error, "[Check][Param:error][%s] Error:%d occurred while executing graph.", + graph_item_->GetName().c_str(), error); + REPORT_INNER_ERROR("E19999", "[%s] Error:%d occurred while executing graph.", + graph_item_->GetName().c_str(), error); } node_done_manager_.Destroy(); } diff --git a/ge/hybrid/executor/subgraph_executor.cc b/ge/hybrid/executor/subgraph_executor.cc index 45db9936..e41ab253 100644 --- a/ge/hybrid/executor/subgraph_executor.cc +++ b/ge/hybrid/executor/subgraph_executor.cc @@ -44,7 +44,8 @@ Status SubgraphExecutor::Init(const std::vector &inputs, const std::vector &input_desc) { subgraph_context_.reset(new(std::nothrow)SubgraphContext(graph_item_, context_)); GE_CHECK_NOTNULL(subgraph_context_); - GE_CHK_STATUS_RET(subgraph_context_->Init(), "[%s] Failed to init subgraph context.", graph_item_->GetName().c_str()); + GE_CHK_STATUS_RET(subgraph_context_->Init(), + "[Init][SubgraphContext][%s] Failed to init subgraph context.", graph_item_->GetName().c_str()); shape_inference_engine_.reset(new(std::nothrow) ShapeInferenceEngine(context_, subgraph_context_.get())); GE_CHECK_NOTNULL(shape_inference_engine_); @@ -55,8 +56,8 @@ Status SubgraphExecutor::Init(const std::vector &inputs, graph_item_->GetName().c_str()); } else { GE_CHK_STATUS_RET(InitInputsForKnownShape(inputs), - "[%s] Failed to init subgraph executor for known shape subgraph.", - graph_item_->GetName().c_str()); + "[Invoke][InitInputsForKnownShape][%s] Failed to init subgraph executor for known shape subgraph.", + graph_item_->GetName().c_str()); } return SUCCESS; @@ -67,8 +68,12 @@ Status 
SubgraphExecutor::InitInputsForUnknownShape(const std::vectorGetInputNodes(); if (inputs.size() < input_nodes.size()) { - GELOGE(INTERNAL_ERROR, "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs.", + GELOGE(INTERNAL_ERROR, + "[Check][Size][%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs.", graph_item_->GetName().c_str(), inputs.size(), input_nodes.size()); + REPORT_INNER_ERROR("E19999", + "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs.", + graph_item_->GetName().c_str(), inputs.size(), input_nodes.size()); return INTERNAL_ERROR; } @@ -87,9 +92,8 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vectorSetInput(*input_node, kDataInputIndex, input_tensor), - "[%s] Failed to set input tensor[%zu]", - graph_item_->GetName().c_str(), - i); + "[Invoke][SetInput] failed for grap_item[%s] input tensor[%zu]", + graph_item_->GetName().c_str(), i); if (force_infer_shape_ || input_node->is_dynamic) { GELOGD("[%s] Start to update input[%zu] for subgraph data node.", graph_item_->GetName().c_str(), i); @@ -111,12 +115,12 @@ Status SubgraphExecutor::InitInputsForKnownShape(const std::vector for (size_t i = 0; i < input_index_mapping.size(); ++i) { auto &parent_input_index = input_index_mapping[i]; if (static_cast(parent_input_index) >= inputs.size()) { - GELOGE(INTERNAL_ERROR, - "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs at lease [%d] inputs", - graph_item_->GetName().c_str(), - inputs.size(), + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Number of inputs [%zu] is not sufficient for subgraph" + "which needs at lease [%d] inputs", graph_item_->GetName().c_str(), inputs.size(), parent_input_index + 1); - + REPORT_INNER_ERROR("E19999", "[%s] Number of inputs [%zu] is not sufficient for subgraph" + "which needs at lease [%d] inputs", + graph_item_->GetName().c_str(), inputs.size(), parent_input_index + 1); return INTERNAL_ERROR; } @@ 
-136,10 +140,10 @@ Status SubgraphExecutor::ExecuteAsync(const std::vector &inputs, const std::vector &input_desc, const std::vector &outputs) { GELOGD("[%s] is dynamic = %s", graph_item_->GetName().c_str(), graph_item_->IsDynamic() ? "true" : "false"); - GE_CHK_STATUS_RET(Init(inputs, input_desc), "[%s] Failed to init executor.", graph_item_->GetName().c_str()); + GE_CHK_STATUS_RET(Init(inputs, input_desc), "[Invoke][Init]failed for [%s].", graph_item_->GetName().c_str()); if (!outputs.empty()) { GE_CHK_STATUS_RET(EnableOutputZeroCopy(outputs), - "Failed to enable output zero copy by user provided outputs."); + "[Invoke][EnableOutputZeroCopy] Failed by user provided outputs."); } if (!graph_item_->IsDynamic()) { return ExecuteAsyncForKnownShape(inputs); @@ -158,10 +162,10 @@ Status SubgraphExecutor::ExecuteAsync(const std::vector &inputs, Status SubgraphExecutor::ExecuteAsyncForKnownShape(const std::vector &inputs) { GELOGD("[%s] subgraph is not dynamic.", graph_item_->GetName().c_str()); if (graph_item_->GetAllNodes().size() != 1) { - GELOGE(INTERNAL_ERROR, - "[%s] Invalid known shape subgraph. node size = %zu", - graph_item_->GetName().c_str(), - graph_item_->GetAllNodes().size()); + REPORT_INNER_ERROR("E19999", "[%s] Invalid known shape subgraph. node size = %zu", + graph_item_->GetName().c_str(), graph_item_->GetAllNodes().size()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Invalid known shape subgraph. 
node size = %zu", + graph_item_->GetName().c_str(), graph_item_->GetAllNodes().size()); return INTERNAL_ERROR; } @@ -193,12 +197,11 @@ Status SubgraphExecutor::ExecuteAsync(TaskContext &task_context) { input_desc.emplace_back(task_context.GetInputDesc(i)); } - GE_CHK_STATUS_RET(ExecuteAsync(inputs, input_desc), - "[%s] Failed to execute subgraph.", + GE_CHK_STATUS_RET(ExecuteAsync(inputs, input_desc), "[Invoke][ExecuteAsync] failed for [%s].", graph_item_->GetName().c_str()); GE_CHK_STATUS_RET(SetOutputsToParentNode(task_context), - "[%s] Failed to set output shapes to parent node.", + "[Invoke][SetOutputsToParentNode][%s] Failed to set output shapes to parent node.", graph_item_->GetName().c_str()); return SUCCESS; } @@ -227,6 +230,7 @@ Status SubgraphExecutor::PrepareNodes(int group) { if (node_item.is_dynamic) { auto prepare_future = pre_run_pool_.commit([this, p_node_state]() -> Status { GetContext().SetSessionId(context_->session_id); + GetContext().SetContextId(context_->context_id); GE_CHK_STATUS_RET_NOLOG(InferShape(shape_inference_engine_.get(), *p_node_state)); return PrepareForExecution(context_, *p_node_state); }); @@ -238,7 +242,7 @@ Status SubgraphExecutor::PrepareNodes(int group) { if (node_item.kernel_task == nullptr) { GELOGW("[%s] Node of static shape got no task.", node_item.NodeName().c_str()); GE_CHK_STATUS_RET(TaskCompileEngine::Compile(*p_node_state, context_), - "[%s] Failed to create task.", p_node_state->GetName().c_str()); + "[Invoke][Compile] failed for [%s].", p_node_state->GetName().c_str()); } else { node_state->SetKernelTask(node_item.kernel_task); } @@ -247,7 +251,8 @@ Status SubgraphExecutor::PrepareNodes(int group) { GE_CHECK_NOTNULL(unique_task_context); const auto &task = node_state->GetKernelTask(); if (task == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][KernelTask] failed for[%s], NodeTask is null.", node_state->GetName().c_str()); + 
REPORT_CALL_ERROR("E19999", "GetKernelTask failed for %s, nodetask is null.", node_state->GetName().c_str()); return INTERNAL_ERROR; } auto shared_task_context = std::shared_ptr(unique_task_context.release()); @@ -260,8 +265,10 @@ Status SubgraphExecutor::PrepareNodes(int group) { GELOGD("Got end of sequence"); return SUCCESS; } - GELOGE(INTERNAL_ERROR, "[%s] Error occurs while launching tasks. quit from preparing nodes.", + GELOGE(INTERNAL_ERROR, "[Check][State][%s] Error occurs while launching tasks. quit from preparing nodes.", graph_item_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Error occurs while launching tasks. quit from preparing nodes.", + graph_item_->GetName().c_str()); return INTERNAL_ERROR; } @@ -273,12 +280,10 @@ Status SubgraphExecutor::PrepareNodes(int group) { } Status SubgraphExecutor::InferShape(ShapeInferenceEngine *shape_inference_engine, NodeState &node_state) const { - GetContext().SetSessionId(context_->context_id); HYBRID_CHK_STATUS_RET(shape_inference_engine->InferShape(node_state), - "[%s] Failed to InferShape.", node_state.GetName().c_str()); - GetContext().SetSessionId(context_->session_id); + "[Invoke][InferShape] failed for [%s].", node_state.GetName().c_str()); HYBRID_CHK_STATUS_RET(shape_inference_engine->PropagateOutputShapes(node_state), - "[%s] Failed to PropagateOutputShapes.", node_state.GetName().c_str()); + "[Invoke][PropagateOutputShapes] failed for [%s].", node_state.GetName().c_str()); return SUCCESS; } @@ -286,7 +291,7 @@ Status SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta auto &node_item = *node_state.GetNodeItem(); if (node_item.kernel_task == nullptr) { GE_CHK_STATUS_RET(TaskCompileEngine::Compile(node_state, ctx), - "Failed to create task for node[%s]", node_state.GetName().c_str()); + "[Invoke][Compile] Failed for node[%s]", node_state.GetName().c_str()); } else { node_state.SetKernelTask(node_item.kernel_task); } @@ -294,7 +299,8 @@ Status 
SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta GE_CHECK_NOTNULL(unique_task_context); const auto &task = node_state.GetKernelTask(); if (task == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state.GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetKernelTask] failed for[%s], NodeTask is null.", node_state.GetName().c_str()); + REPORT_CALL_ERROR("E19999", "invoke GetKernelTask failed for %s, NodeTask is null.", node_state.GetName().c_str()); return INTERNAL_ERROR; } auto shared_task_context = std::shared_ptr(unique_task_context.release()); @@ -310,7 +316,8 @@ Status SubgraphExecutor::LaunchTasks() { while (true) { NodeState *node_state = nullptr; if (!ready_queue_.Pop(node_state)) { - GELOGE(INTERNAL_ERROR, "[%s] Failed to pop node.", graph_item_->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][Pop] failed for [%s].", graph_item_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "invoke pop failed for %s.", graph_item_->GetName().c_str()); return INTERNAL_ERROR; } @@ -335,8 +342,7 @@ Status SubgraphExecutor::LaunchTasks() { GE_CHECK_NOTNULL(shared_task_context); shared_task_context->SetForceInferShape(force_infer_shape_); HYBRID_CHK_STATUS_RET(ExecutionEngine::ExecuteAsync(*node_state, shared_task_context, *context_), - "[%s] Execute node failed.", - node_state->GetName().c_str()); + "[Invoke][ExecuteAsync] failed for [%s].", node_state->GetName().c_str()); GELOGD("[%s] Done executing node successfully.", node_state->GetName().c_str()); } } @@ -345,6 +351,7 @@ Status SubgraphExecutor::ScheduleTasks(int group) { GELOGD("[%s] Start to schedule prepare workers.", graph_item_->GetName().c_str()); auto prepare_future = std::async(std::launch::async, [&]() -> Status { GetContext().SetSessionId(context_->session_id); + GetContext().SetContextId(context_->context_id); auto ret = PrepareNodes(group); ready_queue_.Push(nullptr); return ret; @@ -360,8 +367,7 @@ Status SubgraphExecutor::ScheduleTasks(int group) { 
return ret; } - GE_CHK_STATUS_RET(prepare_future.get(), - "[%s] Error occurred in task preparation.", + GE_CHK_STATUS_RET(prepare_future.get(), "[Invoke][get] [%s] Error occurred in task preparation.", graph_item_->GetName().c_str()); GELOGD("[%s] Done launching all tasks successfully.", graph_item_->GetName().c_str()); @@ -373,17 +379,17 @@ Status SubgraphExecutor::GetOutputs(vector &outputs) { } Status SubgraphExecutor::GetOutputs(vector &outputs, std::vector &output_desc) { - GE_CHK_STATUS_RET(GetOutputs(outputs), "[%s] Failed to get output tensors.", graph_item_->GetName().c_str()); + GE_CHK_STATUS_RET(GetOutputs(outputs), "[Invoke][GetOutputs] failed for [%s].", graph_item_->GetName().c_str()); // copy output data from op to designated position GE_CHK_STATUS_RET(graph_item_->GetOutputDescList(output_desc), - "[%s] Failed to get output tensor desc.", + "[Invoke][GetOutputDescList][%s] Failed to get output tensor desc.", graph_item_->GetName().c_str()); if (outputs.size() != output_desc.size()) { - GELOGE(INTERNAL_ERROR, - "Number of output tensors(%zu) mismatch number of output tensor desc(%zu).", - outputs.size(), - output_desc.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size]Number of outputs(%zu) mismatch number of output_desc(%zu).", + outputs.size(), output_desc.size()); + REPORT_INNER_ERROR("E19999", "Number of outputs(%zu) mismatch number of output_desc(%zu).", + outputs.size(), output_desc.size()); return INTERNAL_ERROR; } return SUCCESS; @@ -400,18 +406,17 @@ Status SubgraphExecutor::SetOutputsToParentNode(TaskContext &task_context) { // get output tensors and tensor desc list std::vector outputs; std::vector output_desc_list; - GE_CHK_STATUS_RET(subgraph_context_->GetOutputs(outputs), - "[%s] Failed to get output tensors.", + GE_CHK_STATUS_RET(subgraph_context_->GetOutputs(outputs), "[Invoke][GetOutputs][%s] Failed to get output tensors.", graph_item_->GetName().c_str()); GE_CHK_STATUS_RET(graph_item_->GetOutputDescList(output_desc_list), - "[%s] 
Failed to get output tensor desc.", + "[Invoke][GetOutputDescList][%s] Failed to get output tensor desc.", graph_item_->GetName().c_str()); if (outputs.size() != output_desc_list.size()) { - GELOGE(INTERNAL_ERROR, "[%s] num output tensors = %zu, num output tensor desc = %zu", - graph_item_->GetName().c_str(), - outputs.size(), - output_desc_list.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] num of output tensors = %zu, num of output tensor desc = %zu not equal", + graph_item_->GetName().c_str(), outputs.size(), output_desc_list.size()); + REPORT_INNER_ERROR("E19999", "%s num of output tensors = %zu, num of output tensor desc = %zu not equal", + graph_item_->GetName().c_str(), outputs.size(), output_desc_list.size()); return INTERNAL_ERROR; } @@ -460,9 +465,10 @@ Status SubgraphExecutor::EnableOutputZeroCopy(const vector &outputs const auto &output_edges = graph_item_->GetOutputEdges(); // Op -> MetOutput, set the output tensor of Op that output to the NetOutput node if (outputs.size() != output_edges.size()) { - GELOGE(PARAM_INVALID, "Output number mismatches, expect = %zu, but given = %zu", - output_edges.size(), - outputs.size()); + GELOGE(PARAM_INVALID, "[Check][Size]Output number mismatches, expect = %zu, but given = %zu", + output_edges.size(), outputs.size()); + REPORT_INNER_ERROR("E19999", "Output number mismatches, expect = %zu, but given = %zu", + output_edges.size(), outputs.size()); return PARAM_INVALID; } @@ -478,9 +484,8 @@ Status SubgraphExecutor::EnableOutputZeroCopy(const vector &outputs output_tensor.DebugString().c_str()); GE_CHK_STATUS_RET(subgraph_context_->SetOutput(*output_node, output_idx, output_tensor), - "[%s] Failed to set input tensor[%zu]", - graph_item_->GetName().c_str(), - i); + "[Invoke][SetOutput][%s] Failed to set input tensor[%zu]", + graph_item_->GetName().c_str(), i); } GELOGD("Done enabling zero copy for outputs successfully."); diff --git a/ge/hybrid/executor/worker/execution_engine.cc 
b/ge/hybrid/executor/worker/execution_engine.cc index 673c82dd..dcb3f300 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -19,6 +19,7 @@ #include "graph/utils/tensor_utils.h" #include "graph/utils/tensor_adapter.h" #include "graph/debug/ge_attr_define.h" +#include "graph/load/model_manager/model_manager.h" #include "hybrid/node_executor/node_executor.h" #include "hybrid/executor//worker//shape_inference_engine.h" #include "common/dump/dump_op.h" @@ -70,6 +71,7 @@ class NodeDoneCallback { Status PrepareConstInputs(const NodeItem &node_item); Status DumpDynamicNode(); Status ProfilingReport(); + Status SaveDumpOpInfo(); Status GetTaskDescInfo(const NodePtr node, const HybridModel *model, std::vector &task_desc_info); GraphExecutionContext *graph_context_; @@ -102,11 +104,13 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) { if (output_tensor->GetSize() < static_cast(tensor_size)) { GELOGE(INTERNAL_ERROR, - "[%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s", - node_item.NodeName().c_str(), - output_idx, - tensor_size, - output_tensor->DebugString().c_str()); + "[Check][Size][%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s.", + node_item.NodeName().c_str(), output_idx, tensor_size, + output_tensor->DebugString().c_str()); + REPORT_INNER_ERROR("E19999", + "[%s] Tensor size is not enough. 
output index = %d, required size = %ld, tensor = %s.", + node_item.NodeName().c_str(), output_idx, tensor_size, + output_tensor->DebugString().c_str()); return INTERNAL_ERROR; } @@ -128,7 +132,7 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) { GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::GetContext(context_id, &runtime_infer_ctx), "Failed to get RuntimeInferenceContext, context_id = %s", context_id.c_str()); GE_CHK_STATUS_RET(runtime_infer_ctx->SetTensor(node_item.node_id, output_idx, std::move(tensor)), - "Failed to SetTensor, node = %s, output_index = %d", node_item.NodeName().c_str(), output_idx); + "[Set][Tensor] Failed, node = %s, output_index = %d", node_item.NodeName().c_str(), output_idx); GELOGD("[%s] Output[%d] cached successfully in context: %s. node_id = %d, shape = [%s]", node_item.NodeName().c_str(), output_idx, @@ -173,7 +177,8 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel * Status NodeDoneCallback::ProfilingReport() { auto node = context_->GetNodeItem().node; if (node == nullptr) { - GELOGE(PARAM_INVALID, "Get node is nullptr"); + GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr."); + REPORT_INNER_ERROR("E19999", "TaskContext GetNodeItem value is nullptr."); return PARAM_INVALID; } @@ -190,7 +195,8 @@ Status NodeDoneCallback::ProfilingReport() { std::vector task_desc_info; auto profiling_ret = GetTaskDescInfo(node, model, task_desc_info); if (profiling_ret != RT_ERROR_NONE) { - GELOGE(profiling_ret, "Get task info of node[%s] failed.", node->GetName().c_str()); + GELOGE(profiling_ret, "[Get][TaskDescInfo] of node:%s failed.", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "GetTaskDescInfo of node:%s failed.", node->GetName().c_str()); return profiling_ret; } @@ -202,41 +208,41 @@ Status NodeDoneCallback::ProfilingReport() { Status NodeDoneCallback::DumpDynamicNode() { auto node = context_->GetNodeItem().node; if (node == nullptr) { - GELOGE(PARAM_INVALID, "Get node is 
nullptr"); + GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr."); + REPORT_INNER_ERROR("E19999", "get node value is nullptr."); return PARAM_INVALID; } auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(graph_context_); + const HybridModel *model = graph_context_->model; + GE_CHECK_NOTNULL(model); + std::string dynamic_model_name = model->GetModelName(); + std::string dynamic_om_name = model->GetOmName(); + uint32_t model_id = model->GetModelId(); + if (!context_->GetDumpProperties().IsLayerNeedDump(dynamic_model_name, dynamic_om_name, op_desc->GetName())) { + GELOGI("[%s] is not in dump list, no need dump", op_desc->GetName().c_str()); + return SUCCESS; + } + dump_op_.SetDynamicModelInfo(dynamic_model_name, dynamic_om_name, model_id); + auto stream = context_->GetStream(); vector input_addrs; vector output_addrs; for (int i = 0; i < context_->NumInputs(); i++) { auto tensor_value = context_->GetInput(i); - GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "Tensor value is nullptr"); - uint64_t input_addr = reinterpret_cast(tensor_value->GetData()); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Get][Tensor] value is nullptr."); + uintptr_t input_addr = reinterpret_cast(tensor_value->GetData()); input_addrs.emplace_back(input_addr); } for (int j = 0; j < context_->NumOutputs(); j++) { auto tensor_value = context_->GetOutput(j); - GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "Tensor value is nullptr"); - uint64_t output_addr = reinterpret_cast(tensor_value->GetData()); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Get][Tensor] value is nullptr."); + uintptr_t output_addr = reinterpret_cast(tensor_value->GetData()); output_addrs.emplace_back(output_addr); } dump_op_.SetDumpInfo(context_->GetDumpProperties(), op_desc, input_addrs, output_addrs, stream); - GE_CHECK_NOTNULL(graph_context_); - const HybridModel *model = graph_context_->model; - GE_CHECK_NOTNULL(model); - std::string 
dynamic_model_name = model->GetModelName(); - uint32_t model_id = model->GetModelId(); - dump_op_.SetDynamicModelInfo(dynamic_model_name, model_id); - - void *global_step = nullptr; - TensorValue *varible_global_step = context_->GetVariable(NODE_NAME_GLOBAL_STEP); - if (varible_global_step != nullptr) { - global_step = const_cast(varible_global_step->GetData()); - } - void *loop_per_iter = nullptr; TensorValue *varible_loop_per_iter = context_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_PER_ITER); if (varible_loop_per_iter != nullptr) { @@ -248,18 +254,54 @@ Status NodeDoneCallback::DumpDynamicNode() { if (varible_loop_cond != nullptr) { loop_cond = const_cast(varible_loop_cond->GetData()); } + void *global_step = context_->GetExecutionContext()->global_step; dump_op_.SetLoopAddr(global_step, loop_per_iter, loop_cond); - GE_CHK_STATUS_RET(dump_op_.LaunchDumpOp(), "Failed to launch dump op in hybird model"); + GE_CHK_STATUS_RET(dump_op_.LaunchDumpOp(), "[Launch][DumpOp] failed in hybird model."); auto rt_ret = rtStreamSynchronize(stream); if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtStreamSynchronize failed"); + GELOGE(rt_ret, "[Call][rtStreamSynchronize] failed, ret = %d.", rt_ret); + REPORT_CALL_ERROR("E19999", "call rtStreamSynchronize failed, ret = %d.", rt_ret); return rt_ret; } return SUCCESS; } +Status NodeDoneCallback::SaveDumpOpInfo() { + GE_CHECK_NOTNULL(graph_context_); + GE_CHECK_NOTNULL(graph_context_->model); + + auto node = context_->GetNodeItem().node; + if (node == nullptr) { + GELOGE(PARAM_INVALID, "[Save][DumpOpInfo] Get node is nullptr."); + return PARAM_INVALID; + } + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + + vector input_addrs; + vector output_addrs; + for (int i = 0; i < context_->NumInputs(); i++) { + auto tensor_value = context_->GetInput(i); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Save][DumpOpInfo] Tensor value is nullptr."); + void *input_addr = const_cast(tensor_value->GetData()); + 
input_addrs.emplace_back(input_addr); + } + for (int j = 0; j < context_->NumOutputs(); j++) { + auto tensor_value = context_->GetOutput(j); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Save][DumpOpInfo] Tensor value is nullptr."); + void *output_addr = const_cast(tensor_value->GetData()); + output_addrs.emplace_back(output_addr); + } + + uint32_t stream_id = context_->GetStreamId(); + uint32_t task_id = context_->GetTaskId(); + graph_context_->exception_dumper.SaveDumpOpInfo(op_desc, task_id, stream_id, input_addrs, output_addrs); + + return SUCCESS; +} + Status NodeDoneCallback::OnNodeDone() { auto &node_item = context_->GetNodeItem(); GELOGI("[%s] Start callback process.", node_item.NodeName().c_str()); @@ -269,12 +311,17 @@ Status NodeDoneCallback::OnNodeDone() { const DumpProperties &dump_properties = context_->GetDumpProperties(); if (dump_properties.IsDumpOpen() || context_->IsOverFlow()) { GELOGI("Start to dump dynamic shape op"); - GE_CHK_STATUS_RET(DumpDynamicNode(), "Failed to dump dynamic node"); + GE_CHK_STATUS_RET(DumpDynamicNode(), "[Call][DumpDynamicNode] Failed."); + } + + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + if (model_manager->IsDumpExceptionOpen()) { + GE_CHK_STATUS_RET(SaveDumpOpInfo(), "[Save][DumpOpInfo] Failed to dump op info."); } if (ProfilingManager::Instance().ProfilingModelExecuteOn()) { - GE_CHK_STATUS_RET(ProfilingReport(), "Report node[%s] to profiling failed.", - node_item.NodeName().c_str()); + GE_CHK_STATUS_RET(ProfilingReport(), "[Report][Profiling] of node[%s] failed.", node_item.NodeName().c_str()); } // release workspace @@ -296,8 +343,7 @@ Status NodeDoneCallback::OnNodeDone() { (void) LogOutputs(node_item, *context_); } - GE_CHK_STATUS_RET(context_->PropagateOutputs(), - "[%s] Failed to propagate outputs failed", + GE_CHK_STATUS_RET(context_->PropagateOutputs(), "[Propagate][Outputs] of [%s] failed.", node_item.NodeName().c_str()); 
RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[PropagateOutputs] End"); @@ -338,7 +384,8 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, const std::function &callback) { const auto &task = node_state.GetKernelTask(); if (task == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state.GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][KernelTask] of [%s] is null.", node_state.GetName().c_str()); + REPORT_INNER_ERROR("E19999", "GetKernelTask of %s is null.", node_state.GetName().c_str()); return INTERNAL_ERROR; } @@ -352,8 +399,7 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, auto executor = node_item.node_executor; GE_CHECK_NOTNULL(executor); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PrepareTask] Start"); - GE_CHK_STATUS_RET(executor->PrepareTask(*task, task_context), - "[%s] Failed to prepare task", + GE_CHK_STATUS_RET(executor->PrepareTask(*task, task_context), "[Prepare][Task] for [%s] failed.", node_state.GetName().c_str()); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PrepareTask] End"); GELOGD("[%s] Done task preparation successfully.", node_state.GetName().c_str()); @@ -365,7 +411,8 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, } } - GE_CHK_STATUS_RET(ValidateInputTensors(node_state, task_context), "Failed to validate input tensors."); + GE_CHK_STATUS_RET(ValidateInputTensors(node_state, task_context), "[Validate][InputTensors] for %s failed.", + node_state.GetName().c_str()); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ValidateInputTensors] End"); if (context.profiling_level > 0) { @@ -419,11 +466,10 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const input_tensor->GetSize()); } else { GELOGE(INTERNAL_ERROR, - "[%s] Input[%d]: tensor size mismatches. 
expected: %ld, but given %zu", - task_context.GetNodeName(), - i, - expected_size, - input_tensor->GetSize()); + "[Check][Size] for [%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu.", + task_context.GetNodeName(), i, expected_size, input_tensor->GetSize()); + REPORT_INNER_ERROR("E19999", "[%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu.", + task_context.GetNodeName(), i, expected_size, input_tensor->GetSize()); return INTERNAL_ERROR; } } @@ -436,8 +482,7 @@ Status ExecutionEngine::PropagateOutputs(const NodeItem &node_item, TaskContext &task_context, GraphExecutionContext &context) { if (node_item.shape_inference_type != DEPEND_COMPUTE) { - GE_CHK_STATUS_RET(task_context.PropagateOutputs(), - "[%s] Failed to propagate outputs.", + GE_CHK_STATUS_RET(task_context.PropagateOutputs(), "[Propagate][Outputs] for [%s] failed.", node_item.NodeName().c_str()); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PropagateOutputs] End"); GELOGD("[%s] Done propagating outputs successfully.", node_item.NodeName().c_str()); diff --git a/ge/hybrid/executor/worker/shape_inference_engine.cc b/ge/hybrid/executor/worker/shape_inference_engine.cc index bb6281e1..33e8fce6 100755 --- a/ge/hybrid/executor/worker/shape_inference_engine.cc +++ b/ge/hybrid/executor/worker/shape_inference_engine.cc @@ -41,7 +41,7 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { // Wait for "const input nodes" if node's shape inference function requires any. 
// Even if output shape is static, there are cases that the const-input will be used in OpTiling and Execution GE_CHK_STATUS_RET_NOLOG(AwaitDependentNodes(node_state)); - if (node_item.is_output_shape_static) { + if (node_item.is_output_shape_static && !node_item.is_need_force_infershape) { return SUCCESS; } @@ -70,7 +70,7 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { { RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] Start"); GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true), - "Invoke InferShapeAndType failed."); + "[Invoke][InferShapeAndType] for %s failed.", node_item.NodeName().c_str()); RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] End"); } @@ -172,8 +172,7 @@ Status ShapeInferenceEngine::InferShapeForSubgraph(const NodeItem &node_item, co GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndType(node)); GELOGD("[%s] Done invoking InferShapeAndType", node->GetName().c_str()); GE_CHK_STATUS_RET(UpdatePeerNodeShape(*node), - "[%s] Failed to update shapes of peer node.", - node->GetName().c_str()); + "[Update][PeerNodeShape] failed for [%s].", node->GetName().c_str()); } for (auto &it : fused_subgraph.output_mapping) { @@ -205,7 +204,8 @@ Status ShapeInferenceEngine::UpdatePeerNodeShape(const Node &node) { GE_CHECK_NOTNULL(peer_op_desc); auto peer_input_desc = peer_op_desc->MutableInputDesc(peer_anchor->GetIdx()); if (peer_input_desc == nullptr) { - GELOGE(GRAPH_FAILED, "peer_input_desc is nullptr"); + GELOGE(GRAPH_FAILED, "[Call][MutableInputDesc] for %s return nullptr.", peer_op_desc->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "%s call MutableInputDesc return nullptr.", peer_op_desc->GetName().c_str()); continue; } @@ -230,8 +230,11 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, const auto &tensor_shape = tensor_desc.MutableShape(); if (tensor_shape.IsUnknownShape()) { if 
(!fallback_with_range) { - GELOGE(INTERNAL_ERROR, "Output shape is still unknown after shape inference. shape = [%s]", + GELOGE(INTERNAL_ERROR, + "[Is][UnknownShape] Output shape is still unknown after shape inference. shape = [%s].", tensor_shape.ToString().c_str()); + REPORT_INNER_ERROR("E19999", "Output shape is still unknown after shape inference. shape = [%s].", + tensor_shape.ToString().c_str()); return INTERNAL_ERROR; } @@ -239,9 +242,10 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, std::vector> shape_range; GE_CHK_GRAPH_STATUS_RET(tensor_desc.GetShapeRange(shape_range), "Failed to get shape range"); if (shape_range.size() != shape.size()) { - GELOGE(INTERNAL_ERROR, "Number of shape ranges (%zu) mismatches that of dims (%zu)", - shape_range.size(), - shape.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size] Number of shape ranges (%zu) mismatches that of dims (%zu).", + shape_range.size(), shape.size()); + REPORT_INNER_ERROR("E19999", "Number of shape ranges (%zu) mismatches that of dims (%zu)", + shape_range.size(), shape.size()); return INTERNAL_ERROR; } @@ -265,23 +269,25 @@ Status ShapeInferenceEngine::CalcTensorSize(DataType data_type, GELOGD("To calc tensor size by shape = [%s]", GeShape(shape).ToString().c_str()); uint32_t type_size; if (!TypeUtils::GetDataTypeLength(data_type, type_size)) { - GELOGE(INTERNAL_ERROR, "Failed to get data type size"); + GELOGE(INTERNAL_ERROR, "[Get][DataTypeLength] failed for type:%s.", + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_CALL_ERROR("E19999", "GetDataTypeLength failed for type:%s.", + TypeUtils::DataTypeToSerialString(data_type).c_str()); return INTERNAL_ERROR; } tensor_size = type_size; for (const auto &dim : shape) { GE_CHECK_GE(dim, 0); - GE_CHK_STATUS_RET(Int64MulCheckOverflow(tensor_size, dim), - "Shape size overflow, shape = [%s]", + GE_CHK_STATUS_RET(Int64MulCheckOverflow(tensor_size, dim), "[Check][Overflow] Shape size overflow, shape = [%s]", 
GeShape(shape).ToString().c_str()); tensor_size *= dim; } GE_CHK_STATUS_RET(CheckInt64AddOverflow(tensor_size, kAlignment - 1), - "Tensor size is too large: %ld, shape = [%s]", - tensor_size, - GeShape(shape).ToString().c_str()); + "[Check][Overflow]Tensor size is too large:%ld, shape = [%s]" + "Shape size will overflow when add align.", + tensor_size, GeShape(shape).ToString().c_str()); tensor_size = (tensor_size + kAlignment - 1) / kAlignment * kAlignment; return SUCCESS; } @@ -294,16 +300,23 @@ Status ShapeInferenceEngine::CalcOutputTensorSizes(const NodeItem &node_item, bo const auto &shape = tensor_desc->MutableShape(); // modify on copy auto dims = shape.GetDims(); - GE_CHK_STATUS_RET(CanonicalizeShape(*tensor_desc, dims, fallback_with_range), - "[%s] Failed to canonicalize shape for output %zu", - node_item.NodeName().c_str(), - output_index); - + auto status_result = CanonicalizeShape(*tensor_desc, dims, fallback_with_range); + if (status_result != SUCCESS) { + REPORT_CALL_ERROR("E19999", "CanonicalizeShape failed, node:%s, output:%zu.", + node_item.NodeName().c_str(), output_index); + GELOGE(ge::FAILED, "[Canonicalize][Shape] failed for [%s], output %zu.", + node_item.NodeName().c_str(), output_index); + return status_result; + } int64_t tensor_size; - GE_CHK_STATUS_RET(CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size), - "[%s] Failed to calc tensor size for output %zu", - node_item.NodeName().c_str(), - output_index); + status_result = CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size); + if (status_result != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Invoke CalcTensorSize failed, node:%s, output:%zu.", + node_item.NodeName().c_str(), output_index); + GELOGE(ge::FAILED, "[Calc][TensorSize] failed for [%s], output %zu.", + node_item.NodeName().c_str(), output_index); + return status_result; + } GELOGD("[%s] Tensor size of output %zu = %ld", node_item.NodeName().c_str(), output_index, tensor_size); (void) 
TensorUtils::SetSize(*tensor_desc, tensor_size); } diff --git a/ge/hybrid/executor/worker/task_compile_engine.cc b/ge/hybrid/executor/worker/task_compile_engine.cc index f80374c6..f7da9acd 100755 --- a/ge/hybrid/executor/worker/task_compile_engine.cc +++ b/ge/hybrid/executor/worker/task_compile_engine.cc @@ -32,7 +32,7 @@ Status TaskCompileEngine::Compile(NodeState &node_state, GraphExecutionContext * shared_ptr kernel_task; auto ret = node_item.node_executor->CompileTask(*context->model, node_item.node, kernel_task); RECORD_COMPILE_EVENT(context, node_state.GetName().c_str(), "[Compile] End"); - GE_CHK_STATUS_RET(ret, "Failed to create task for node: %s", node_item.NodeName().c_str()); + GE_CHK_STATUS_RET(ret, "[Compile][Task] failed for node: %s.", node_item.NodeName().c_str()); node_state.SetKernelTask(kernel_task); GELOGI("Compiling node %s successfully", node_state.GetName().c_str()); return SUCCESS; diff --git a/ge/hybrid/hybrid_davinci_model.cc b/ge/hybrid/hybrid_davinci_model.cc index 430dfa85..0ad1c865 100755 --- a/ge/hybrid/hybrid_davinci_model.cc +++ b/ge/hybrid/hybrid_davinci_model.cc @@ -19,6 +19,7 @@ #include "hybrid/model/hybrid_model.h" #include "hybrid/executor/hybrid_model_async_executor.h" #include "hybrid/node_executor/node_executor.h" +#include "graph/manager/graph_manager_utils.h" namespace ge { namespace hybrid { @@ -32,9 +33,10 @@ class HybridDavinciModel::Impl { } Status Init() { - GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().EnsureInitialized(), "Failed to initialize executors"); - GE_CHK_STATUS_RET(model_.Init(), "Failed to init model.") - GE_CHK_STATUS_RET(executor_.Init(), "Failed to init model executor.") + GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().EnsureInitialized(), + "[Initialize][NodeExecutorManager] failed"); + GE_CHK_STATUS_RET(model_.Init(), "[Init][HybridModel] failed.") + GE_CHK_STATUS_RET(executor_.Init(), "[Init][HybridModelAsyncExecutor] failed.") return SUCCESS; } @@ -76,11 +78,16 @@ class 
HybridDavinciModel::Impl { executor_.SetDeviceId(device_id); } - void SetModelName(const string &model_name) { - model_.SetModelName(model_name); - executor_.SetModelName(model_name); + void SetOmName(const string &model_name) { + model_.SetOmName(model_name); } + uint32_t GetDeviceId() { + return model_.GetDeviceId(); + } + + const GraphExecutionContext * GeContext() { return executor_.GeContext(); } + uint64_t GetSessionId() { return model_.GetSessionId(); } @@ -108,6 +115,17 @@ class HybridDavinciModel::Impl { model_.SetModelDescVersion(is_new_model_desc); } + uint32_t GetDataInputerSize() { return executor_.GetDataInputerSize(); } + + bool GetRunningFlag() const { return executor_.GetRunningFlag(); } + + Status SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { + auto listener = dynamic_cast(listener_.get()); + GE_CHECK_NOTNULL(listener); + listener->SetCallback(callback); + return SUCCESS; + } + private: std::shared_ptr listener_; HybridModel model_; @@ -181,12 +199,17 @@ void HybridDavinciModel::SetDeviceId(uint32_t device_id) { } } -void HybridDavinciModel::SetModelName(const string &model_name) { +void HybridDavinciModel::SetOmName(const string &om_name) { if (impl_ != nullptr) { - impl_->SetModelName(model_name); + impl_->SetOmName(om_name); } } +uint32_t HybridDavinciModel::GetDeviceId() const { + GE_CHECK_NOTNULL(impl_); + return impl_->GetDeviceId(); +} + Status HybridDavinciModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { GE_CHECK_NOTNULL(impl_); return impl_->GetDynamicBatchInfo(batch_info, dynamic_type); @@ -222,5 +245,33 @@ uint64_t HybridDavinciModel::GetSessionId() { GE_CHECK_NOTNULL(impl_); return impl_->GetSessionId(); } + +uint32_t HybridDavinciModel::GetDataInputerSize() { + GE_CHECK_NOTNULL(impl_); + return impl_->GetDataInputerSize(); +} + +bool HybridDavinciModel::GetRunningFlag() const { return impl_->GetRunningFlag(); } + +Status HybridDavinciModel::SetRunAsyncListenerCallback(const 
RunAsyncCallback &callback) { + return impl_->SetRunAsyncListenerCallback(callback); +} + +bool HybridDavinciModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + if (impl_ == nullptr) { + return false; + } + auto context = impl_->GeContext(); + GE_CHECK_NOTNULL(context); + bool ret = context->exception_dumper.GetOpDescInfo(stream_id, task_id, op_desc_info); + if (!ret) { + for (const auto &iter : context->davinci_model) { + if (iter->GetOpDescInfo(stream_id, task_id, op_desc_info)) { + return true; + } + } + } + return ret; +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/hybrid_davinci_model.h b/ge/hybrid/hybrid_davinci_model.h index 74dca9ed..472fff17 100644 --- a/ge/hybrid/hybrid_davinci_model.h +++ b/ge/hybrid/hybrid_davinci_model.h @@ -57,10 +57,12 @@ class HybridDavinciModel { void SetDeviceId(uint32_t device_id); - void SetModelName(const string &model_name); + void SetOmName(const string &om_name); uint64_t GetSessionId(); + uint32_t GetDeviceId() const; + Status GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type); void GetUserDesignateShapeOrder(std::vector &user_input_shape_order); @@ -74,6 +76,14 @@ class HybridDavinciModel { void SetModelDescVersion(bool is_new_model_desc); + uint32_t GetDataInputerSize(); + + bool GetRunningFlag() const; + + Status SetRunAsyncListenerCallback(const RunAsyncCallback &callback); + + bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; + private: HybridDavinciModel() = default; class Impl; diff --git a/ge/hybrid/hybrid_davinci_model_stub.cc b/ge/hybrid/hybrid_davinci_model_stub.cc index 5b10fb7a..2d4fbe03 100644 --- a/ge/hybrid/hybrid_davinci_model_stub.cc +++ b/ge/hybrid/hybrid_davinci_model_stub.cc @@ -61,13 +61,21 @@ void HybridDavinciModel::SetModelId(uint32_t model_id) { void HybridDavinciModel::SetDeviceId(uint32_t device_id) { } -void HybridDavinciModel::SetModelName(const string &model_name) { +void 
HybridDavinciModel::SetOmName(const string &om_name) { } uint64_t HybridDavinciModel::GetSessionId() { return 0; } +uint32_t HybridDavinciModel::GetDataInputerSize() { + return 0; +} + +uint32_t HybridDavinciModel::GetDeviceId() const { + return 0; +} + Status HybridDavinciModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { return UNSUPPORTED; } @@ -87,5 +95,17 @@ Status HybridDavinciModel::GetInputOutputDescInfo(vector &i void HybridDavinciModel::SetModelDescVersion(bool is_new_model_desc) { } + +bool HybridDavinciModel::GetRunningFlag() const { + return false; +} + +Status HybridDavinciModel::SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { + return UNSUPPORTED; +} + +bool HybridDavinciModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + return true; +} } // namespace hybrid } // namespace ge \ No newline at end of file diff --git a/ge/hybrid/model/graph_item.cc b/ge/hybrid/model/graph_item.cc index 91f675a4..09e0a117 100644 --- a/ge/hybrid/model/graph_item.cc +++ b/ge/hybrid/model/graph_item.cc @@ -95,7 +95,8 @@ Status GraphItem::GroupNodes() { int group = node->group; if (group != last_group) { if (seen_groups.find(group) != seen_groups.end()) { - GELOGE(INTERNAL_ERROR, "Unordered node group found. node = %s, group = %d", node->NodeName().c_str(), group); + GELOGE(INTERNAL_ERROR, + "[Find][Group]Unordered node group found. 
node = %s, group = %d", node->NodeName().c_str(), group); return INTERNAL_ERROR; } else { last_group = group; diff --git a/ge/hybrid/model/hybrid_model.cc b/ge/hybrid/model/hybrid_model.cc index 77c9be2b..a669c06f 100644 --- a/ge/hybrid/model/hybrid_model.cc +++ b/ge/hybrid/model/hybrid_model.cc @@ -44,9 +44,9 @@ Status HybridModel::Init(bool is_single_op) { GELOGD("Start to init hybrid model."); is_single_op_ = is_single_op; if (is_single_op) { - GE_CHK_STATUS_RET(HybridModelBuilder(*this).BuildForSingleOp(), "Failed to build hybrid model."); + GE_CHK_STATUS_RET(HybridModelBuilder(*this).BuildForSingleOp(), "[Build][HybridModel] for SingleOp failed."); } else { - GE_CHK_STATUS_RET(HybridModelBuilder(*this).Build(), "Failed to build hybrid model."); + GE_CHK_STATUS_RET(HybridModelBuilder(*this).Build(), "[Build][HybridModel] failed."); } GELOGD("HybridModel initialized successfully."); return SUCCESS; @@ -106,7 +106,10 @@ const NodeItem *HybridModel::GetNodeItem(const NodePtr &node) const { GeModelPtr HybridModel::GetGeModel(const NodePtr &node) const { auto it = known_shape_sub_models_.find(node); if (it == known_shape_sub_models_.end()) { - GELOGE(INTERNAL_ERROR, "[%s] Failed to get GeModel for subgraph node.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Check][Param:node][%s] Failed to get GeModel for subgraph node," + "because node not in known_shape_sub_models_.", node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "%s Failed to get GeModel for subgraph node," + "because node not in known_shape_sub_models_.", node->GetName().c_str()); return nullptr; } @@ -130,7 +133,10 @@ const GraphItem *HybridModel::GetSubgraphItem(const std::string &graph_name) con const GraphItem *HybridModel::GetSubgraphItem(const ComputeGraphPtr &subgraph) const { if (subgraph == nullptr) { - GELOGE(PARAM_INVALID, "subgraph is nullptr"); + REPORT_INNER_ERROR("E19999", "Input param subgraph is nullptr, Graph:%s", + root_graph_item_->GetName().c_str()); + 
GELOGE(PARAM_INVALID, "[Check][Param]subgraph is nullptr. graph:%s", + root_graph_item_->GetName().c_str()); return nullptr; } @@ -164,19 +170,27 @@ Status HybridModel::GetInputOutputDescInfo(vector &input_de std::vector &output_formats) { auto node_item_list = root_graph_item_->GetInputNodes(); if (node_item_list.empty()) { - GELOGE(FAILED, "node item list is empty!"); + REPORT_INNER_ERROR("E19999", "node item list is empty!, graph:%s", + root_graph_item_->GetName().c_str()); + GELOGE(FAILED, "[Get][InputNodes]node item list is empty!, graph:%s", + root_graph_item_->GetName().c_str()); return FAILED; } GE_CHECK_NOTNULL(node_item_list[0]->node); GE_CHECK_NOTNULL(node_item_list[0]->node->GetOpDesc()); if (node_item_list[0]->node->GetOpDesc()->GetInputsSize() != 1) { - GELOGE(FAILED, "input size of op is not 1!"); + REPORT_INNER_ERROR("E19999", "Input size of op is not 1, op:%s, type:%s", + node_item_list[0]->node->GetName().c_str(), + node_item_list[0]->node->GetType().c_str()); + GELOGE(FAILED, "[Check][Size]input size of op is not 1! 
op:%s, type:%s", + node_item_list[0]->node->GetName().c_str(), + node_item_list[0]->node->GetType().c_str()); return FAILED; } - GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed"); - GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get ouput desc info failed"); + GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "[Get][InputDescInfo] failed."); + GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "[Get][OutputDescInfo] failed."); return SUCCESS; } @@ -225,23 +239,26 @@ Status HybridModel::GetInputDescInfo(vector &input_desc, st GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(0)); Format format = op_desc->GetInputDescPtr(0)->GetFormat(); - input.data_type = op_desc->GetInputDescPtr(0)->GetDataType(); + DataType data_type = op_desc->GetInputDescPtr(0)->GetDataType(); + input.data_type = static_cast(data_type); input.name = op_desc->GetName(); - - int64_t input_size = 0; - GE_CHK_STATUS_RET(TensorUtils::GetSize(*op_desc->GetInputDescPtr(0), input_size), "get input size failed."); - - // support dynamic shape - if (input_size < 0) { - GELOGD("dynamic shape scene, input size is unknown. " - "format=%d, data_type=%d, input_size=%ld", - format, input.data_type, input_size); - input_size = kMemSizeUnknownShape; // -1 + GeShape shape = op_desc->GetInputDescPtr(0)->GetShape(); + int64_t tensor_size = 0; + if (TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size) != GRAPH_SUCCESS) { + GELOGE(FAILED, "[Calculate][TensorMemSize] failed input0 desc in node:%s." 
+ "shape:%s, format:%s, datatype:%s.", op_desc->GetName().c_str(), + shape.ToString().c_str(), TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed for input0 desc in node:%s," + "shape:%s, format:%s, datatype:%s", op_desc->GetName().c_str(), + shape.ToString().c_str(), TypeUtils::FormatToSerialString(format).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); + return FAILED; } - - // not support dynamic shape input for now, so input_size here will be not less than zero. - input.size = input_size; - + if (tensor_size == kMemSizeUnknownShape) { + tensor_size = 0; + } + input.size = static_cast(tensor_size); CreateInputDimsInfo(op_desc, input); formats.push_back(format); @@ -253,7 +270,10 @@ Status HybridModel::GetInputDescInfo(vector &input_desc, st void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, InputOutputDescInfo &output_desc_info, uint32_t &format_result) { - GE_IF_BOOL_EXEC(output_desc == nullptr, GELOGE(FAILED, "output desc ptr is nullptr"); return ); + GE_IF_BOOL_EXEC(output_desc == nullptr, + REPORT_INNER_ERROR("E19999", "param output_desc is nullptr, check invalid."); + GELOGE(FAILED, "[Check][Param:output_desc]output desc ptr is nullptr"); + return ); Format format = output_desc->GetFormat(); GeShape shape = output_desc->GetShape(); std::vector> shape_ranges; @@ -284,6 +304,9 @@ void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, } int64_t tensor_size = 0; (void)TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size); + if (tensor_size == kMemSizeUnknownShape) { + tensor_size = 0; + } output_desc_info.size = static_cast(tensor_size); output_desc_info.data_type = output_desc->GetDataType(); } @@ -291,7 +314,9 @@ void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, Status HybridModel::GetOutputDescInfo(vector &output_desc, std::vector &formats) { std::vector output_desc_list; // 
output_desc_list contains vaild input desc - GE_CHK_STATUS_RET(root_graph_item_->GetOutputDescList(output_desc_list), "get output desc info failed"); + GE_CHK_STATUS_RET(root_graph_item_->GetOutputDescList(output_desc_list), + "[Invoke][GetOutputDescList]get output desc info failed, Graph:%s", + root_graph_item_->GetName().c_str()); vector out_node_names; (void)ge::AttrUtils::GetListStr(ge_root_model_->GetRootGraph(), ATTR_MODEL_OUT_NODES_NAME, out_node_names); @@ -301,8 +326,12 @@ Status HybridModel::GetOutputDescInfo(vector &output_desc, GE_CHECK_NOTNULL(op_desc); auto out_size = static_cast(op_desc->GetInputsSize()); - GE_CHK_BOOL_RET_STATUS(out_size == output_desc_list.size(), - FAILED, "output size[%u] not match output_desc_list size[%zu]", out_size, output_desc_list.size()); + GE_IF_BOOL_EXEC(out_size != output_desc_list.size(), + REPORT_INNER_ERROR("E19999", "output size[%u] not match output_desc_list size[%zu]", + out_size, output_desc_list.size()); + GELOGE(FAILED, "[Check][Size]output size[%u] not match output_desc_list size[%zu]", + out_size, output_desc_list.size()); + return FAILED;); for (uint32_t index = 0; index < out_size; ++index) { string output_name; @@ -330,7 +359,8 @@ Status HybridModel::GetOutputDescInfo(vector &output_desc, TensorValue *HybridModel::GetConstant(const NodePtr &node) const { if (node == nullptr) { - GELOGE(PARAM_INVALID, "Param is null"); + GELOGE(PARAM_INVALID, "[Check][Param:node]node is null."); + REPORT_INNER_ERROR("E19999", "param node is null, check invalid."); return nullptr; } @@ -348,7 +378,8 @@ TensorValue *HybridModel::GetConstant(const NodePtr &node) const { TensorValue *HybridModel::GetTensor(const NodePtr &node) const { if (node == nullptr) { - GELOGE(PARAM_INVALID, "Param is null"); + GELOGE(PARAM_INVALID, "[Check][Param:node]node is null."); + REPORT_INNER_ERROR("E19999", "param node is null, check invalid."); return nullptr; } @@ -358,5 +389,25 @@ TensorValue *HybridModel::GetTensor(const NodePtr &node) const 
{ return GetVariable(node->GetName()); } + +const map>> &HybridModel::GetHostTensors() const { + return host_tensors_; +} + +void *HybridModel::GetGlobalStep() const { + if (global_step_ == nullptr) { + return nullptr; + } + return global_step_->GetData(); +} + +TensorBuffer *HybridModel::GetModelWeight(const string &subgraph_name) const { + auto it = weight_buffer_map_.find(subgraph_name); + if (it == weight_buffer_map_.end()) { + GELOGD("Model weight not found, subgraph name = %s", subgraph_name.c_str()); + return nullptr; + } + return it->second.get(); +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/model/hybrid_model.h b/ge/hybrid/model/hybrid_model.h index 3e5bd635..18daed4f 100644 --- a/ge/hybrid/model/hybrid_model.h +++ b/ge/hybrid/model/hybrid_model.h @@ -45,6 +45,8 @@ class HybridModel { return root_runtime_param_.session_id; } + void *GetGlobalStep() const; + GeModelPtr GetGeModel(const NodePtr &node) const; NodeItem *MutableNodeItem(const NodePtr &node); @@ -69,8 +71,8 @@ class HybridModel { model_id_ = model_id; } - void SetModelName(const string &model_name) { - om_name_ = model_name; + void SetOmName(const string &om_name) { + om_name_ = om_name; } const std::string &GetOmName() const { @@ -91,6 +93,10 @@ class HybridModel { TensorValue* GetTensor(const NodePtr &node) const; + TensorBuffer* GetModelWeight(const std::string &subgraph_name) const; + + const std::map>> &GetHostTensors() const; + const std::vector* GetTaskDefs(const NodePtr &node) const; const GraphItem *GetRootGraphItem() const; @@ -135,6 +141,7 @@ class HybridModel { std::string model_name_; GeRootModelPtr ge_root_model_; std::map input_nodes_; + ComputeGraphPtr root_graph_; std::map device_variable_nodes_; //lint !e148 std::map host_variable_nodes_; //lint !e148 std::map> variable_tensors_; @@ -145,6 +152,7 @@ class HybridModel { std::unique_ptr root_graph_item_; std::map> subgraph_items_; std::map> node_items_; + std::map>> host_tensors_; bool is_new_model_desc_ = 
false; // support aipp bool is_single_op_ = false; @@ -153,9 +161,10 @@ class HybridModel { uint32_t device_id_ = 0; uint32_t model_id_ = 0; uint8_t *var_mem_base_ = nullptr; - std::unique_ptr weight_buffer_; + std::map> weight_buffer_map_; RuntimeParam root_runtime_param_; string om_name_; + std::unique_ptr global_step_; }; } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index ac57b2ea..9b3cb692 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -50,6 +50,7 @@ const char *const kProfilingBpNode = "ProfilingBpNode"; const char *const kProfilingEndNode = "ProfilingEndNode"; const char *const kProfilingArNode = "ProfilingAllReduceNode"; const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE"; +const char *const kForceInfershape = "_force_infershape_when_running"; Status SetOutputNameAttr(ComputeGraph &graph) { vector output_names; @@ -70,8 +71,11 @@ Status SetOutputNameAttr(ComputeGraph &graph) { } } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&graph, ATTR_MODEL_OUT_NODES_NAME, output_names), - GELOGE(FAILED, "SetListStr of ATTR_MODEL_OUT_NODES_NAME failed."); - return FAILED); + GELOGE(FAILED, "[Invoke][SetListStr] failed, graph:%s name:%s.", graph.GetName().c_str(), + ATTR_MODEL_OUT_NODES_NAME.c_str()); + REPORT_CALL_ERROR("E19999", "SetListStr failed, graph:%s name:%s.", graph.GetName().c_str(), + ATTR_MODEL_OUT_NODES_NAME.c_str()); + return FAILED); return SUCCESS; } @@ -107,11 +111,12 @@ Status CollectDependenciesForFusedGraph(NodeItem &node_item, std::set auto src_op_desc = src_node->GetOpDesc(); GE_CHECK_NOTNULL(src_op_desc); if (src_node->GetType() != DATA_TYPE) { - GELOGE(UNSUPPORTED, - "[%s::%s] Node in fused subgraph can only depend on Data nodes, but depend on %s", - node_item.NodeName().c_str(), - node->GetName().c_str(), + GELOGE(UNSUPPORTED, "[Check][NodeType][%s::%s] Node in fused subgraph can only depend on Data 
nodes," + "but depend on %s actually", node_item.NodeName().c_str(), node->GetName().c_str(), src_node->GetType().c_str()); + REPORT_INNER_ERROR("E19999", "[%s::%s] Node in fused subgraph can only depend on Data nodes," + "but depend on %s actually.", node_item.NodeName().c_str(), node->GetName().c_str(), + src_node->GetType().c_str()); return UNSUPPORTED; } @@ -128,37 +133,42 @@ HybridModelBuilder::HybridModelBuilder(HybridModel &hybrid_model) } Status HybridModelBuilder::Build() { - GE_CHK_STATUS_RET(ValidateParams(), "Failed to validate GeRootModel"); - hybrid_model_.model_name_ = ge_root_model_->GetRootGraph()->GetName(); + GE_CHK_STATUS_RET(ValidateParams(), "[Invoke][ValidateParams] failed, model_name_:[%s]", GetGraphName()); + hybrid_model_.model_name_ = ge_root_model_->GetModelName(); GELOGI("[%s] Start to build hybrid model.", GetGraphName()); - GE_CHK_STATUS_RET(InitRuntimeParams(), "[%s] Failed to InitRuntimeParams", GetGraphName()); - GE_CHK_STATUS_RET(RecoverGraphUnknownFlag(), "[%s] Failed to RecoverGraphUnknownFlag", GetGraphName()); - GE_CHK_STATUS_RET(IndexSpecialNodes(), "[%s] Failed to index nodes", GetGraphName()); - GE_CHK_STATUS_RET(IndexTaskDefs(), "[%s] Failed to index task defs", GetGraphName()); - GE_CHK_STATUS_RET(LoadGraph(), "[%s] Failed to load graph", GetGraphName()); - GE_CHK_STATUS_RET(AssignUninitializedConstantOps(), "[%s] Failed to assign uninitialized constants", GetGraphName()); - GE_CHK_STATUS_RET(TransAllVarData(), "[%s] Failed to trans all var data", GetGraphName()); - GE_CHK_STATUS_RET(CopyVarData(), "[%s] Failed to copy var data", GetGraphName()); - GE_CHK_STATUS_RET(InitModelMem(), "[%s] Failed to init memory", GetGraphName()); - GE_CHK_STATUS_RET(InitWeights(), "[%s] Failed to init weights", GetGraphName()); - GE_CHK_STATUS_RET(InitConstantOps(), "[%s] Failed to init constant op", GetGraphName()); - GE_CHK_STATUS_RET(InitVariableTensors(), "[%s] Failed to init variables", GetGraphName()); - GE_CHK_STATUS_RET(LoadTasks(), 
"[%s] Failed to load tasks", GetGraphName()); + GE_CHK_STATUS_RET(InitRuntimeParams(), "[Invoke][InitRuntimeParams] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(RecoverGraphUnknownFlag(), + "[Invoke][RecoverGraphUnknownFlag] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(IndexSpecialNodes(), "[Invoke][IndexSpecialNodes] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(IndexTaskDefs(), "[Invoke][IndexTaskDefs] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitWeights(), "[Invoke][InitWeights] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(LoadGraph(), "[Invoke][LoadGraph] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(AssignUninitializedConstantOps(), + "[Invoke][AssignUninitializedConstantOps] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(TransAllVarData(), "[Invoke][TransAllVarData] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(CopyVarData(), "[Invoke][CopyVarData] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitModelMem(), "[Invoke][InitModelMem] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitConstantOps(), "[Invoke][InitConstantOps] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitVariableTensors(), "[Invoke][InitVariableTensors], model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(LoadTasks(), "[Invoke][LoadTasks] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(OptimizeDependenciesForConstantInputs(), + "[Invoke][OptimizeDependenciesForConstantInputs] failed, model_name_:[%s]", + GetGraphName()); GELOGI("[%s] Done building hybrid model successfully.", GetGraphName()); return SUCCESS; } Status HybridModelBuilder::BuildForSingleOp() { - GE_CHK_STATUS_RET(ValidateParams(), "Failed to validate GeRootModel"); + GE_CHK_STATUS_RET(ValidateParams(), "[Invoke][ValidateParams] failed, model_name_:[%s]", GetGraphName()); hybrid_model_.model_name_ = 
ge_root_model_->GetRootGraph()->GetName(); GELOGI("[%s] Start to build hybrid model.", GetGraphName()); auto ret = ge_root_model_->GetSubgraphInstanceNameToModel(); const GeModelPtr ge_model = ret[ge_root_model_->GetRootGraph()->GetName()]; GE_CHK_STATUS_RET(IndexTaskDefs(ge_root_model_->GetRootGraph(), ge_model), - "[%s] Failed to index task defs", GetGraphName()); - GE_CHK_STATUS_RET(LoadGraph(), "[%s] Failed to load graph", GetGraphName()); - GE_CHK_STATUS_RET(InitWeights(), "[%s] Failed to init weights", GetGraphName()); - GE_CHK_STATUS_RET(LoadTasks(), "[%s] Failed to load tasks", GetGraphName()); + "[Invoke][IndexTaskDefs] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(LoadGraph(), "[Invoke][LoadGraph] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitWeights(), "[Invoke][InitWeights] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(LoadTasks(), "[Invoke][LoadTasks] failed, model_name_:[%s]", GetGraphName()); GELOGI("[%s] Done building hybrid model for single op successfully.", GetGraphName()); return SUCCESS; } @@ -171,16 +181,20 @@ Status HybridModelBuilder::ValidateParams() { Status HybridModelBuilder::BuildNodeItem(const NodePtr &node, NodeItem &node_item) { auto op_desc = node->GetOpDesc(); + GE_CHK_STATUS_RET(ParseForceInfershapeNodes(node, node_item), + "[Invoke][ParseForceInfershapeNodes]failed, node:[%s].", + node_item.NodeName().c_str()); vector dependencies = node->GetOpDesc()->GetOpInferDepends(); GE_CHK_STATUS_RET(ParseDependentInputNodes(node_item, dependencies), - "[%s] Failed to parse node dependencies.", + "[Invoke][ParseDependentInputNodes]failed, node:[%s].", node_item.NodeName().c_str()); node_item.outputs.resize(node_item.num_outputs); for (int i = 0; i < node_item.num_outputs; ++i) { auto out_data_anchor = node->GetOutDataAnchor(i); if (out_data_anchor == nullptr) { - GELOGE(INTERNAL_ERROR, "out anchor[%d] of node %s is nullptr", i, node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, 
"[Get][OutDataAnchor]out anchor[%d] of node %s is nullptr", i, node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "out anchor[%d] of node %s is nullptr.", i, node->GetName().c_str()); return INTERNAL_ERROR; } @@ -193,12 +207,10 @@ Status HybridModelBuilder::BuildNodeItem(const NodePtr &node, NodeItem &node_ite NodeItem *dst_node_item = nullptr; GE_CHK_STATUS_RET(GetOrCreateNodeItem(dst_node, &dst_node_item), - "[%s] Failed to get or create node item.", - dst_node->GetName().c_str()); + "[GetOrCreate][NodeItem] failed, dst_node:[%s].", dst_node->GetName().c_str()); int canonical_index; GE_CHK_STATUS_RET(dst_node_item->GetCanonicalInputIndex(dst_in_anchor->GetIdx(), canonical_index), - "[%s] Failed to canonical input index", - dst_node->GetName().c_str()); + "[Invoke][GetCanonicalInputIndex] failed, dst_node:[%s].", dst_node->GetName().c_str()); node_item.outputs[i].emplace_back(canonical_index, dst_node_item); } @@ -242,7 +254,7 @@ Status HybridModelBuilder::GetOrCreateNodeItem(const NodePtr &node, NodeItem **n } std::unique_ptr new_node; - GE_CHK_STATUS_RET(NodeItem::Create(node, new_node), "Failed to create node item"); + GE_CHK_STATUS_RET(NodeItem::Create(node, new_node), "[Invoke][Create] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET_NOLOG(NodeExecutorManager::GetInstance().GetExecutor(*node, &new_node->node_executor)); // we do not need L2 Buffer @@ -251,9 +263,7 @@ Status HybridModelBuilder::GetOrCreateNodeItem(const NodePtr &node, NodeItem **n (void) AttrUtils::SetBool(new_node->op_desc, kIsFirstNode, false); (void) AttrUtils::SetBool(new_node->op_desc, kIsLastNode, false); - new_node->node_id = node_index; - new_node->op_desc->SetId(node_index); - node_index += 1; + new_node->node_id = static_cast(new_node->op_desc->GetId()); NodeExecutorManager::ExecutorType executor_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node); new_node->is_profiling_report = (executor_type == NodeExecutorManager::ExecutorType::AICORE) || 
(executor_type == NodeExecutorManager::ExecutorType::AICPU_TF) || @@ -263,11 +273,22 @@ Status HybridModelBuilder::GetOrCreateNodeItem(const NodePtr &node, NodeItem **n return SUCCESS; } +Status HybridModelBuilder::ParseForceInfershapeNodes(const NodePtr &node, NodeItem &node_item) { + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + // not care result, if no this attr, stand for the op does not need force infershape + (void) AttrUtils::GetBool(op_desc, kForceInfershape, node_item.is_need_force_infershape); + GELOGD("node [%s] is need do infershape, flag is %d", + op_desc->GetName().c_str(), + node_item.is_need_force_infershape); + return SUCCESS; +} + Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const std::vector &dependencies) { - std::set dependent_input_nodes; + std::set dependent_for_shape_inference; + std::set dependent_for_execution; auto &ge_node = node_item.node; - bool is_hccl_op = - NodeExecutorManager::GetInstance().ResolveExecutorType(*ge_node) == NodeExecutorManager::ExecutorType::HCCL; + bool is_hccl_op = node_item.IsHcclOp(); // The input tensors become valid after computation is done for parent nodes of type DEPEND_COMPUTE. // Wait for these parent nodes before execution. 
@@ -282,29 +303,15 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s auto src_node_item = MutableNodeItem(src_node); GE_CHECK_NOTNULL(src_node_item); - if (is_hccl_op) { - GELOGD("[%s] Add input data dependent node [%s] due to engine type is HCCL", - node_item.NodeName().c_str(), - src_node_item->NodeName().c_str()); - src_node_item->has_observer = true; - node_item.dependents_for_execution.emplace_back(src_node); - node_item.has_observer = true; - for (auto &dst_node : ge_node->GetOutNodes()) { - if (dst_node == nullptr) { - continue; - } - - NodeItem *dst_node_item = nullptr; - GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(dst_node, &dst_node_item)); - dst_node_item->dependents_for_execution.emplace_back(ge_node); - } - } else if (src_node_item->shape_inference_type == DEPEND_COMPUTE) { - GELOGD("[%s] Add input data dependent node [%s] due to inference type = DEPEND_COMPUTE", - node_item.NodeName().c_str(), - src_node_item->NodeName().c_str()); - + if (src_node_item->shape_inference_type == DEPEND_COMPUTE || is_hccl_op || src_node_item->IsHcclOp()) { + GELOGD("[%s](%s) Add input data dependent node [%s](%s), shape inference type = %d", + ge_node->GetName().c_str(), + ge_node->GetType().c_str(), + src_node->GetName().c_str(), + src_node->GetType().c_str(), + static_cast(src_node_item->shape_inference_type)); src_node_item->has_observer = true; - node_item.dependents_for_execution.emplace_back(src_node); + dependent_for_execution.emplace(src_node); } if (src_node_item->shape_inference_type == DEPEND_SHAPE_RANGE) { @@ -312,22 +319,29 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s node_item.NodeName().c_str(), src_node_item->NodeName().c_str()); src_node_item->has_observer = true; - dependent_input_nodes.emplace(src_node); + dependent_for_shape_inference.emplace(src_node); + } + } + + for (const auto &src_node : ge_node->GetInControlNodes()) { + auto src_node_item = MutableNodeItem(src_node); + if 
((src_node_item != nullptr) && (is_hccl_op || src_node_item->IsHcclOp())) { + GELOGD("[%s](%s) Add input control dependent node [%s](%s)", + ge_node->GetName().c_str(), + ge_node->GetType().c_str(), + src_node->GetName().c_str(), + src_node->GetType().c_str()); + dependent_for_execution.emplace(src_node); } } // cond or branch need to be prepared before the execution of IF or CASE if (node_item.node_type == IF || node_item.node_type == STATELESSIF || node_item.node_type == CASE) { - const auto &in_anchor = ge_node->GetInDataAnchor(0); - GE_CHECK_NOTNULL(in_anchor); - const auto &peer_anchor = in_anchor->GetPeerOutAnchor(); - GE_CHECK_NOTNULL(peer_anchor); - auto src_node = peer_anchor->GetOwnerNode(); + auto src_node = NodeUtils::GetInDataNodeByIndex(*ge_node, 0); // cond input GE_CHECK_NOTNULL(src_node); auto src_node_item = MutableNodeItem(src_node); GE_CHECK_NOTNULL(src_node_item); - src_node_item->has_observer = true; - node_item.dependents_for_execution.emplace_back(src_node); + dependent_for_execution.emplace(src_node); GELOGD("[%s] Dependent added from %s for control op's cond/branch", node_item.NodeName().c_str(), src_node_item->NodeName().c_str()); @@ -336,10 +350,10 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s for (const auto &input_name : dependencies) { int input_index = node_item.op_desc->GetInputIndexByName(input_name); if (input_index < 0) { - GELOGE(INTERNAL_ERROR, - "[%s] Failed to get input index by name: %s", - node_item.NodeName().c_str(), - input_name.c_str()); + GELOGE(INTERNAL_ERROR, "[Get][InputIndex]failed, node:[%s] inputname: %s.", + node_item.NodeName().c_str(), input_name.c_str()); + REPORT_CALL_ERROR("E19999", "GetInputIndexByName failed, node:[%s] inputname: %s.", + node_item.NodeName().c_str(), input_name.c_str()); return INTERNAL_ERROR; } @@ -351,24 +365,33 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s GE_CHECK_NOTNULL(src_node); auto src_node_item = 
MutableNodeItem(src_node); src_node_item->to_const_output_id_list.emplace(peer_out_anchor->GetIdx()); - src_node_item->has_observer = true; - - dependent_input_nodes.emplace(src_node); + dependent_for_shape_inference.emplace(src_node); + host_input_value_dependencies_[&node_item].emplace_back(peer_out_anchor->GetIdx(), src_node_item); GELOGD("[%s] Dependent added from output of [%s:%d]", node_item.NodeName().c_str(), src_node_item->NodeName().c_str(), peer_out_anchor->GetIdx()); } - for (const auto &dep_node : dependent_input_nodes) { + GE_CHK_STATUS_RET(ParseDependentForFusedSubgraph(node_item, dependent_for_shape_inference)); + for (const auto &dep_node : dependent_for_shape_inference) { + auto src_node_item = MutableNodeItem(dep_node); + GE_CHECK_NOTNULL(src_node_item); + src_node_item->has_observer = true; node_item.dependents_for_shape_inference.emplace_back(dep_node); } - GE_CHK_STATUS_RET(ParseDependentForFusedSubgraph(node_item)); + for (const auto &dep_node : dependent_for_execution) { + auto src_node_item = MutableNodeItem(dep_node); + GE_CHECK_NOTNULL(src_node_item); + src_node_item->has_observer = true; + node_item.dependents_for_execution.emplace_back(dep_node); + } + return SUCCESS; } -Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item) { +Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item, std::set &dependencies) { if (node_item.fused_subgraph == nullptr) { return SUCCESS; } @@ -378,10 +401,10 @@ Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item) { for (auto &op_desc : data_ops) { uint32_t parent_index = 0; if (!AttrUtils::GetInt(*op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGE(INTERNAL_ERROR, - "[%s] Failed to get attr [%s]", - op_desc->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetInt] failed, node:[%s] attr:[%s]", + op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + 
REPORT_CALL_ERROR("E19999", "invoke GetInt failed, node:[%s] attr:[%s]", + op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return INTERNAL_ERROR; } @@ -398,17 +421,12 @@ Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item) { node_item.NodeName().c_str(), op_desc->GetName().c_str(), src_node_item->NodeName().c_str()); - src_node_item->has_observer = true; src_node_item->to_const_output_id_list.emplace(peer_out_anchor->GetIdx()); - - auto &depends = node_item.dependents_for_shape_inference; - if (std::find(depends.begin(), depends.end(), src_node) == depends.end()) { - depends.emplace_back(src_node); - GELOGD("[%s] Dependent added from output of [%s:%d]", - node_item.NodeName().c_str(), - src_node_item->NodeName().c_str(), - peer_out_anchor->GetIdx()); - } + dependencies.emplace(src_node); + GELOGD("[%s] Dependent added from output of [%s:%d]", + node_item.NodeName().c_str(), + src_node_item->NodeName().c_str(), + peer_out_anchor->GetIdx()); } return SUCCESS; @@ -416,24 +434,29 @@ Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item) { Status HybridModelBuilder::UpdateAnchorStatus(const NodePtr &node) { if (NodeUtils::SetAllAnchorStatus(node) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[%s] NodeUtils::SetAllAnchorStatus failed.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][SetAllAnchorStatus] failed, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[%s] NodeUtils::SetAllAnchorStatus failed.", node->GetName().c_str()); return INTERNAL_ERROR; } for (auto &anchor : node->GetAllInDataAnchors()) { auto peer_anchor = anchor->GetPeerOutAnchor(); if (peer_anchor == nullptr) { if (AnchorUtils::SetStatus(anchor, ANCHOR_SUSPEND) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[%s] AnchorUtils::SetStatus failed.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed to set ANCHOR_SUSPEND, node:[%s].", + node->GetName().c_str()); + 
REPORT_CALL_ERROR("E19999", "SetStatus failed to set ANCHOR_SUSPEND, node:[%s].", node->GetName().c_str()); return INTERNAL_ERROR; } } else if (peer_anchor->GetOwnerNode()->GetType() == CONSTANT) { if (AnchorUtils::SetStatus(anchor, ANCHOR_CONST) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[%s] AnchorUtils::SetStatus failed.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed to set ANCHOR_CONST, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "SetStatus failed to set ANCHOR_CONST, node:[%s].", node->GetName().c_str()); return INTERNAL_ERROR; } } else { if (AnchorUtils::SetStatus(anchor, ANCHOR_DATA) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[%s] AnchorUtils::SetStatus failed.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed to set ANCHOR_DATA, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "SetStatus failed to set ANCHOR_DATA, node:[%s].", node->GetName().c_str()); return INTERNAL_ERROR; } } @@ -444,11 +467,10 @@ Status HybridModelBuilder::UpdateAnchorStatus(const NodePtr &node) { Status HybridModelBuilder::DoUnlinkDataAnchors(const OutDataAnchorPtr &out_data_anchor, const InDataAnchorPtr &in_data_anchor) { - GE_CHK_GRAPH_STATUS_RET(out_data_anchor->Unlink(in_data_anchor), "Failed to unlink %s:%d from %s:%d", - out_data_anchor->GetOwnerNode()->GetName().c_str(), - out_data_anchor->GetIdx(), - in_data_anchor->GetOwnerNode()->GetName().c_str(), - in_data_anchor->GetIdx()); + GE_CHK_GRAPH_STATUS_RET(out_data_anchor->Unlink(in_data_anchor), + "[Invoke][Unlink] failed to unlink %s:%d from %s:%d", + out_data_anchor->GetOwnerNode()->GetName().c_str(), out_data_anchor->GetIdx(), + in_data_anchor->GetOwnerNode()->GetName().c_str(), in_data_anchor->GetIdx()); GELOGD("Succeeded in unlinking %s:%d from %s:%d", out_data_anchor->GetOwnerNode()->GetName().c_str(), @@ -459,7 +481,7 @@ Status HybridModelBuilder::DoUnlinkDataAnchors(const OutDataAnchorPtr &out_data_ } 
Status HybridModelBuilder::DoLinkDataAnchors(OutDataAnchorPtr &out_data_anchor, InDataAnchorPtr &in_data_anchor) { - GE_CHK_GRAPH_STATUS_RET(out_data_anchor->LinkTo(in_data_anchor), "Failed to link %s:%d to %s:%d", + GE_CHK_GRAPH_STATUS_RET(out_data_anchor->LinkTo(in_data_anchor), "[Invoke][LinkTo]Failed to link %s:%d to %s:%d", out_data_anchor->GetOwnerNode()->GetName().c_str(), out_data_anchor->GetIdx(), in_data_anchor->GetOwnerNode()->GetName().c_str(), @@ -491,10 +513,10 @@ Status HybridModelBuilder::MergeInputNodes(ComputeGraph &graph) { uint32_t parent_index = 0; if (!AttrUtils::GetInt(data_op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGE(FAILED, - "[%s] Failed to get attr [%s]", - data_op_desc->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(FAILED, "[Invoke][GetInt] failed, node:[%s] attr:[%s]", + data_op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "GetInt failed, node:[%s] attr:[%s]", + data_op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; } @@ -540,7 +562,7 @@ Status HybridModelBuilder::MergeNetOutputNode(ComputeGraph &graph) { const auto &parent_node = graph.GetParentNode(); const NodePtr &net_output_node = graph.FindFirstNodeMatchType(NETOUTPUT); if (net_output_node == nullptr) { - GELOGD("Graph has no netoutput no need to merge."); + GELOGD("Graph has no netoutput no need to merge"); return SUCCESS; } const auto &net_output_desc = net_output_node->GetOpDesc(); @@ -560,7 +582,9 @@ Status HybridModelBuilder::MergeNetOutputNode(ComputeGraph &graph) { auto index = in_data_anchor->GetIdx(); auto input_desc = net_output_desc->MutableInputDesc(index); if (input_desc == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] Failed to get input desc[%d]", net_output_desc->GetName().c_str(), index); + GELOGE(INTERNAL_ERROR, "[Invoke][MutableInputDesc][%s] Failed to get input desc[%d]", + net_output_desc->GetName().c_str(), index); + REPORT_CALL_ERROR("E19999", 
"[%s] Failed to get input desc[%d].", net_output_desc->GetName().c_str(), index); return INTERNAL_ERROR; } @@ -602,9 +626,10 @@ Status HybridModelBuilder::MergeNetOutputNode(ComputeGraph &graph) { return SUCCESS; } -Status HybridModelBuilder::UnfoldSubgraphs(ComputeGraph &root_graph, ComputeGraphPtr &merged_graph) { +Status HybridModelBuilder::UnfoldSubgraphs(ComputeGraphPtr &root_graph, ComputeGraphPtr &merged_graph) { merged_graph = MakeShared("MergedGraph"); - for (const auto &node : root_graph.GetDirectNode()) { + merged_graph->SetGraphUnknownFlag(root_graph->GetGraphUnknownFlag()); + for (const auto &node : root_graph->GetDirectNode()) { GE_CHECK_NOTNULL(node); auto op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); @@ -634,13 +659,14 @@ Status HybridModelBuilder::UnfoldSubgraphs(ComputeGraph &root_graph, ComputeGrap } } } - GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraph(root_graph, *merged_graph, *subgraph), - "[%s] Failed to merge subgraph.", + GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraph(root_graph, merged_graph, *subgraph), + "[Invoke][UnfoldSubgraph][%s] Failed to merge subgraph.", subgraph->GetName().c_str()); } // invoke before adding subgraphs. in case modify node id in known-shaped subgraphs. 
- GE_CHK_GRAPH_STATUS_RET(merged_graph->TopologicalSorting(), "Failed to invoke TopologicalSorting on merged graph."); + GE_CHK_GRAPH_STATUS_RET(merged_graph->TopologicalSorting(), + "[Invoke][TopologicalSorting]Failed to invoke TopologicalSorting on merged graph."); GE_DUMP(merged_graph, "hybrid_merged_graph_BeforeStageSort"); merged_graph->TopologicalSorting([](const NodePtr &a, const NodePtr &b) -> bool { uint32_t a_level = UINT32_MAX; @@ -650,29 +676,30 @@ Status HybridModelBuilder::UnfoldSubgraphs(ComputeGraph &root_graph, ComputeGrap return a_level < b_level; }); - for (auto &remained_subgraph : root_graph.GetAllSubgraphs()) { + for (auto &remained_subgraph : root_graph->GetAllSubgraphs()) { GELOGD("Adding subgraph [%s] to merged-graph.", remained_subgraph->GetName().c_str()); GE_CHK_GRAPH_STATUS_RET(merged_graph->AddSubgraph(remained_subgraph), - "Failed to add subgraph [%s]", + "[Invoke][AddSubgraph]Failed to add subgraph [%s]", remained_subgraph->GetName().c_str()); + remained_subgraph->SetParentGraph(merged_graph); } return SUCCESS; } -Status HybridModelBuilder::UnfoldSubgraph(ComputeGraph &root_graph, - ComputeGraph &parent_graph, +Status HybridModelBuilder::UnfoldSubgraph(ComputeGraphPtr &root_graph, + ComputeGraphPtr &parent_graph, ComputeGraph &sub_graph) { auto parent_node = sub_graph.GetParentNode(); GE_CHECK_NOTNULL(parent_node); GE_CHK_STATUS_RET(MergeInputNodes(sub_graph), - "[%s] Failed to merge data nodes for subgraph", + "[Invoke][MergeInputNodes][%s] Failed to merge data nodes for subgraph", sub_graph.GetName().c_str()); GE_CHK_STATUS_RET(MergeNetOutputNode(sub_graph), - "[%s] Failed to merge net output nodes for subgraph", + "[Invoke][MergeNetOutputNode][%s] Failed to merge net output nodes for subgraph", sub_graph.GetName().c_str()); - GELOGD("[%s] Done merging subgraph inputs and outputs successfully.", sub_graph.GetName().c_str()); + GELOGD("[%s] Done merging subgraph inputs and outputs successfully", sub_graph.GetName().c_str()); for 
(auto &sub_node : sub_graph.GetDirectNode()) { auto sub_op_type = sub_node->GetType(); @@ -684,21 +711,29 @@ Status HybridModelBuilder::UnfoldSubgraph(ComputeGraph &root_graph, GE_CHECK_NOTNULL(sub_sub_graph); if (sub_sub_graph->GetGraphUnknownFlag()) { GE_CHK_STATUS_RET(UnfoldSubgraph(root_graph, parent_graph, *sub_sub_graph), - "[%s] Failed to merge subgraph", + "[Invoke][UnfoldSubgraph][%s] Failed to merge subgraph", sub_sub_graph->GetName().c_str()); continue; } } - parent_graph.AddNode(sub_node); + if (!sub_node->GetOpDesc()->GetSubgraphInstanceNames().empty()) { + for (size_t i = 0; i < sub_node->GetOpDesc()->GetSubgraphInstanceNames().size(); ++i) { + auto sub_sub_graph = NodeUtils::GetSubgraph(*sub_node, i); + GE_CHECK_NOTNULL(sub_sub_graph); + sub_sub_graph->SetParentGraph(parent_graph); + } + } + parent_graph->AddNode(sub_node); GELOGD("[%s::%s] added to parent graph: [%s].", sub_graph.GetName().c_str(), sub_node->GetName().c_str(), - parent_graph.GetName().c_str()); + parent_graph->GetName().c_str()); + sub_node->SetOwnerComputeGraph(parent_graph); } - GELOGD("[%s] Done merging subgraph. remove it from root graph.", sub_graph.GetName().c_str()); - root_graph.RemoveSubgraph(sub_graph.GetName()); + GELOGD("[%s] Done merging subgraph. 
remove it from root graph", sub_graph.GetName().c_str()); + root_graph->RemoveSubgraph(sub_graph.GetName()); return SUCCESS; } @@ -750,17 +785,34 @@ Status HybridModelBuilder::LoadGraph() { GELOGI("Before merging subgraphs DirectNodesSize = %zu, GetAllNodesSize = %zu", root_graph->GetDirectNodesSize(), root_graph->GetAllNodesSize()); - GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraphs(*root_graph, merged_graph), "Failed to unfold subgraphs."); + GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraphs(root_graph, merged_graph), + "[Invoke][UnfoldSubgraphs]Failed to unfold subgraphs, model_name_:%s.", GetGraphName()); root_graph = std::move(merged_graph); GELOGI("After merging subgraphs DirectNodesSize = %zu, GetAllNodesSize = %zu", root_graph->GetDirectNodesSize(), root_graph->GetAllNodesSize()); - GE_DUMP(root_graph, "hybrid_merged_graph"); } - GE_CHK_STATUS_RET(LoadDynamicSubgraph(*root_graph, true), "Failed to load root graph."); + hybrid_model_.root_graph_ = root_graph; + // Reset node id by topological order across all subgraphs + int64_t index = 0; + for (const auto &node : root_graph->GetAllNodes()) { + GE_CHECK_NOTNULL(node); + auto parent_graph = node->GetOwnerComputeGraph(); + // No need to update nodes in known subgraph + if (parent_graph != nullptr && !parent_graph->GetGraphUnknownFlag()) { + continue; + } + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + op_desc->SetId(index++); + } + GE_DUMP(root_graph, "hybrid_merged_graph"); + GE_CHK_STATUS_RET(LoadDynamicSubgraph(*root_graph, true), + "[Invoke][LoadDynamicSubgraph]Failed to load root graph, model_name_:%s.", GetGraphName()); GELOGD("Done loading root graph successfully."); - GE_CHK_STATUS_RET(hybrid_model_.root_graph_item_->GroupNodes(), "Failed to group nodes for root graph"); + GE_CHK_STATUS_RET(hybrid_model_.root_graph_item_->GroupNodes(), + "[Invoke][GroupNodes]Failed to group nodes for root graph, model_name_:%s.", GetGraphName()); for (auto &sub_graph : root_graph->GetAllSubgraphs()) { 
GE_CHECK_NOTNULL(sub_graph); @@ -776,25 +828,28 @@ Status HybridModelBuilder::LoadGraph() { if (sub_graph->GetGraphUnknownFlag()) { GE_CHK_STATUS_RET(LoadDynamicSubgraph(*sub_graph, false), - "Failed to load subgraph: [%s]", + "[Invoke][LoadDynamicSubgraph]Failed to load subgraph: [%s]", sub_graph->GetName().c_str()); } else { GE_CHK_STATUS_RET(IdentifyVariableOutputs(*parent_node_item), - "[%s] Failed to identify ref outputs.", + "[Invoke][IdentifyVariableOutputs][%s] Failed to identify ref outputs.", parent_node_item->NodeName().c_str()); GE_CHK_STATUS_RET(IdentifySameInputs(*parent_node_item), - "[%s] Failed to identify same outputs.", + "[Invoke][IdentifySameInputs][%s] Failed to identify same outputs.", parent_node_item->NodeName().c_str()); // if parent is function control op. need add a virtual partitioned call if (parent_node_item->IsControlOp()) { GE_CHK_STATUS_RET(LoadKnownShapedSubgraph(*sub_graph, parent_node_item), - "Failed to load function control op subgraph [%s]", + "[Invoke][LoadKnownShapedSubgraph]Failed to load function control op subgraph [%s]", sub_graph->GetName().c_str()); } } } + GE_CHK_STATUS_RET(ParseDependentByParallelGroup(), + "[Invoke][ParseDependentByParallelGroup]Failed to establish dependencies for hccl ops," + "model_name_:%s.", GetGraphName()); GELOGI("Done loading all subgraphs successfully."); return SUCCESS; } @@ -812,7 +867,7 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_ auto tensor_desc = var_node->GetOpDesc()->MutableOutputDesc(0); uint8_t *var_logic = nullptr; GE_CHK_STATUS_RET(var_manager_->GetVarAddr(var_name, *tensor_desc, &var_logic), - "Failed to get var addr. var_name = %s, session_id = %ld", + "[Invoke][GetVarAddr]Failed to get var addr. 
var_name = %s, session_id = %ld", var_name.c_str(), hybrid_model_.GetSessionId()); @@ -823,10 +878,10 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_ } uint8_t *dev_mem = var_manager_->GetVarMemoryAddr(var_logic, memory_type); if (dev_mem == nullptr) { - GELOGE(INTERNAL_ERROR, - "Failed to copy var %s from device, cant not get " - "var addr from logic addr %p", - var_node->GetName().c_str(), var_logic); + GELOGE(INTERNAL_ERROR, "[Invoke][GetVarMemoryAddr]Failed to copy var %s from device," + "cant not get var addr from logic addr %p", var_node->GetName().c_str(), var_logic); + REPORT_CALL_ERROR("E19999", "GetVarMemoryAddr failed, Failed to copy var %s from device," + "cant not get var addr from logic addr %p", var_node->GetName().c_str(), var_logic); return INTERNAL_ERROR; } @@ -854,7 +909,7 @@ Status HybridModelBuilder::HandleDtString(const GeTensor &tensor, void *var_addr auto &mutable_tensor = const_cast(tensor); uint64_t *buff = reinterpret_cast(mutable_tensor.MutableData().data()); GE_CHK_BOOL_RET_STATUS(ge::CheckInt64Uint32MulOverflow(elem_num, kBytes * kStringHeadElems) == SUCCESS, FAILED, - "Shape size is invalid"); + "[Invoke][CheckInt64Uint32MulOverflow] failed because Shape size is invalid."); auto offset = static_cast(elem_num * kBytes * kStringHeadElems); auto hbm_raw_data_base_addr = static_cast(reinterpret_cast(var_addr) + offset); @@ -906,7 +961,7 @@ Status HybridModelBuilder::InitConstantOps() { auto op_desc = var_node->GetOpDesc(); auto v_weights = ModelUtils::GetWeights(op_desc); if (v_weights.empty()) { - GELOGE(INTERNAL_ERROR, "[%s] Constant no not have value", var_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Constant op has no weight", var_node->GetName().c_str()); return INTERNAL_ERROR; } auto *ge_tensor = const_cast(v_weights[0].get()); @@ -920,7 +975,8 @@ Status HybridModelBuilder::InitConstantOps() { GELOGD("Init tensor with host constant %s size = %zu", var_name.c_str(), 
aligned_tensor.MutableData().GetSize()); if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(aligned_tensor.GetAlignedPtr(), aligned_tensor.GetData().size()) == nullptr) { - GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed."); + GELOGE(MEMALLOC_FAILED, "[Malloc][HostMemory] for an existed GeTensor failed, model_name_:%s.", + GetGraphName()); return MEMALLOC_FAILED; } var_tensor.reset(new(std::nothrow)TensorValue(aligned_tensor.MutableData().data(), @@ -934,7 +990,8 @@ Status HybridModelBuilder::InitConstantOps() { if (ge_tensor->GetData().size() > 0) { GE_CHK_STATUS_RET_NOLOG(HandleDtString(*ge_tensor, v_output_addr)); - GELOGI("[IMAS]InitConstant memcpy graph_%u type[V] name[%s] output[%d] memaddr[%p] mem_size[%zu] datasize[%zu]", + GELOGI("[IMAS]InitConstant memcpy graph_%u type[V] name[%s] output[%d] memaddr[%p]" + "mem_size[%zu] datasize[%zu]", runtime_param_.graph_id, op_desc->GetName().c_str(), 0, v_output_addr, v_output_size, ge_tensor->GetData().size()); GE_CHK_RT_RET(rtMemcpy(v_output_addr, v_output_size, ge_tensor->GetData().data(), ge_tensor->GetData().size(), @@ -969,19 +1026,21 @@ Status HybridModelBuilder::InitVariableTensors() { GE_CHECK_NOTNULL(op_desc); GeTensorDesc output_tensor = op_desc->GetOutputDesc(0); int64_t tensor_size = 0; - if (TensorUtils::CalcTensorMemSize(output_tensor.GetShape(), output_tensor.GetFormat(), output_tensor.GetDataType(), - tensor_size) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Calculate variable size failed, node name:%s", it.first.c_str()); + if (TensorUtils::CalcTensorMemSize(output_tensor.GetShape(), output_tensor.GetFormat(), + output_tensor.GetDataType(), tensor_size) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed, node name:%s", it.first.c_str()); + GELOGE(INTERNAL_ERROR, "[Calculate][TensorMemSize] failed, node name:%s", it.first.c_str()); return INTERNAL_ERROR; } SharedMemInfo mem_info(it.first, tensor_size); if 
(HostMemManager::Instance().MallocSharedMemory(mem_info) != SUCCESS) { - GELOGE(GE_GRAPH_MALLOC_FAILED, "Host variable [%s] malloc failed.", it.first.c_str()); + GELOGE(GE_GRAPH_MALLOC_FAILED, "[Malloc][SharedMemory] failed, Host variable [%s].", it.first.c_str()); return GE_GRAPH_MALLOC_FAILED; } if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(mem_info.host_aligned_ptr, tensor_size) == nullptr) { - GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed."); + GELOGE(MEMALLOC_FAILED, "[Malloc][HostMem] for an existed GeTensor failed, Host variable [%s].", + it.first.c_str()); return MEMALLOC_FAILED; } GELOGD("Host variable [%s] malloc success, size=%ld.", it.first.c_str(), tensor_size); @@ -997,93 +1056,110 @@ Status HybridModelBuilder::InitVariableTensors() { Status HybridModelBuilder::InitWeights() { // For constant in root graph - const auto &root_graph = ge_root_model_->GetRootGraph(); - const auto &subgraph_models = ge_root_model_->GetSubgraphInstanceNameToModel(); - auto iter = subgraph_models.find(root_graph->GetName()); - if (iter == subgraph_models.end()) { - GELOGD("Root graph model not found"); - return SUCCESS; - } - - auto &root_model = iter->second; - const auto &weight_buffer = root_model->GetWeight(); - if (weight_buffer.GetSize() == 0) { - GELOGD("weight is empty"); - return SUCCESS; - } + for (const auto &subgraph_model : ge_root_model_->GetSubgraphInstanceNameToModel()) { + const auto &weight_buffer = subgraph_model.second->GetWeight(); + if (weight_buffer.GetSize() == 0) { + GELOGD("weight is empty"); + return SUCCESS; + } - auto allocator = NpuMemoryAllocator::GetAllocator(); - GE_CHECK_NOTNULL(allocator); - hybrid_model_.weight_buffer_ = TensorBuffer::Create(allocator, weight_buffer.size()); - GE_CHECK_NOTNULL(hybrid_model_.weight_buffer_); - auto weight_base = reinterpret_cast(hybrid_model_.weight_buffer_->GetData()); - GE_CHK_RT_RET(rtMemcpy(weight_base, - hybrid_model_.weight_buffer_->GetSize(), - 
weight_buffer.GetData(), - weight_buffer.GetSize(), - RT_MEMCPY_HOST_TO_DEVICE)); - - GELOGI("Init weight mem successfully, weight base %p, weight size = %zu", - weight_base, - hybrid_model_.weight_buffer_->GetSize()); - for (auto &node : root_graph->GetDirectNode()) { - if (node->GetType() != CONSTANT) { - continue; + auto allocator = NpuMemoryAllocator::GetAllocator(); + GE_CHECK_NOTNULL(allocator); + auto sub_weight_buffer = TensorBuffer::Create(allocator, weight_buffer.size()); + GE_CHECK_NOTNULL(sub_weight_buffer); + auto weight_base = reinterpret_cast(sub_weight_buffer->GetData()); + GE_CHK_RT_RET(rtMemcpy(weight_base, + sub_weight_buffer->GetSize(), + weight_buffer.GetData(), + weight_buffer.GetSize(), + RT_MEMCPY_HOST_TO_DEVICE)); + + GELOGI("Init weight mem successfully, weight base %p, weight size = %zu", + weight_base, + sub_weight_buffer->GetSize()); + auto subgraph = GraphUtils::GetComputeGraph(subgraph_model.second->GetGraph()); + if (subgraph != ge_root_model_->GetRootGraph()) { + subgraph = ge_root_model_->GetRootGraph()->GetSubgraph(subgraph_model.first); } + GE_CHECK_NOTNULL(subgraph); + hybrid_model_.weight_buffer_map_.emplace(subgraph->GetName(), std::move(sub_weight_buffer)); + for (auto &node : subgraph->GetDirectNode()) { + if (node->GetType() != CONSTANT) { + continue; + } - auto op_desc = node->GetOpDesc(); - auto v_weights = ModelUtils::GetWeights(op_desc); - if (v_weights.empty()) { - GELOGE(INTERNAL_ERROR, "[%s] Constant has no value", node->GetName().c_str()); - return INTERNAL_ERROR; + auto op_desc = node->GetOpDesc(); + auto v_weights = ModelUtils::GetWeights(op_desc); + if (v_weights.empty()) { + GELOGE(INTERNAL_ERROR, "[Invoke][GetWeights][%s] Constant has no value", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Constant has no value.", node->GetName().c_str()); + return INTERNAL_ERROR; + } + auto *ge_tensor = const_cast(v_weights[0].get()); + GE_CHECK_NOTNULL(ge_tensor); + const GeTensorDesc &tensor_desc = 
ge_tensor->GetTensorDesc(); + int64_t tensor_size = 0; + GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetSize(*op_desc->MutableOutputDesc(0), tensor_size), + "[Invoke][GetSize][%s] Failed to get output tensor size", + node->GetName().c_str()); + int64_t data_offset = 0; + GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetDataOffset(tensor_desc, data_offset), + "[Invoke][GetDataOffset][%s] Failed to get data offset", + node->GetName().c_str()); + GELOGD("[%s] Start to init Constant node [%s], size = %ld, offset = %ld", + GetGraphName(), + node->GetName().c_str(), + tensor_size, + data_offset); + + auto tensor_buffer = TensorBuffer::Create(weight_base + data_offset, tensor_size); + GE_CHECK_NOTNULL(tensor_buffer); + std::unique_ptr constant_tensor(new (std::nothrow)TensorValue(std::move(tensor_buffer))); + GE_CHECK_NOTNULL(constant_tensor); + constant_tensor->SetName("Constant_" + op_desc->GetName()); + hybrid_model_.constant_tensors_.emplace(node, std::move(constant_tensor)); + GELOGD("[%s] Constant node [%s] added, size = %ld", GetGraphName(), node->GetName().c_str(), tensor_size); } - auto *ge_tensor = const_cast(v_weights[0].get()); - GE_CHECK_NOTNULL(ge_tensor); - const GeTensorDesc &tensor_desc = ge_tensor->GetTensorDesc(); - int64_t tensor_size = 0; - GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetSize(*op_desc->MutableOutputDesc(0), tensor_size), - "[%s] Failed to get tensor size", - node->GetName().c_str()); - int64_t data_offset = 0; - GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetDataOffset(tensor_desc, data_offset), - "[%s] Failed to get data offset", - node->GetName().c_str()); - GELOGD("[%s] Start to init Constant node [%s], size = %ld, offset = %ld", - GetGraphName(), - node->GetName().c_str(), - tensor_size, - data_offset); - - auto tensor_buffer = TensorBuffer::Create(weight_base + data_offset, tensor_size); - GE_CHECK_NOTNULL(tensor_buffer); - std::unique_ptr constant_tensor(new (std::nothrow)TensorValue(std::move(tensor_buffer))); - GE_CHECK_NOTNULL(constant_tensor); - 
constant_tensor->SetName("Constant_" + op_desc->GetName()); - hybrid_model_.constant_tensors_.emplace(node, std::move(constant_tensor)); - GELOGD("[%s] Constant node [%s] added, size = %ld", GetGraphName(), node->GetName().c_str(), tensor_size); } return SUCCESS; } +Status HybridModelBuilder::LoadTask(NodeItem &node_item) { + auto &node_ptr = node_item.node; + GELOGD("[%s] Start to build kernel task", node_ptr->GetName().c_str()); + auto load_ret = node_item.node_executor->LoadTask(hybrid_model_, + node_ptr, + node_item.kernel_task); + if (load_ret != UNSUPPORTED && load_ret != SUCCESS) { + GELOGE(load_ret, "[Invoke][LoadTask][%s] Failed to load task", node_ptr->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Failed to load task", node_ptr->GetName().c_str()); + return load_ret; + } + + GELOGD("[%s] Done loading task successfully.", node_ptr->GetName().c_str()); + return SUCCESS; +} + Status HybridModelBuilder::LoadTasks() { - GE_CHK_STATUS_RET(CheckAicpuOpList(), "Check Aicpu op failed."); + GE_CHK_STATUS_RET(CheckAicpuOpList(), "[Check][AicpuOpList] failed."); + std::map> ordered_partitioned_calls; for (auto &it : hybrid_model_.node_items_) { auto &node_item = it.second; - auto &node_ptr = node_item->node; if (node_item->node_type == NETOUTPUT) { continue; } - - GELOGD("[%s] Start to build kernel task", node_ptr->GetName().c_str()); - auto load_ret = node_item->node_executor->LoadTask(hybrid_model_, - node_ptr, - node_item->kernel_task); - if (load_ret != UNSUPPORTED && load_ret != SUCCESS) { - GELOGE(load_ret, "[%s] Failed to load task", node_ptr->GetName().c_str()); - return load_ret; + if (node_item->node_type == PARTITIONEDCALL) { + ordered_partitioned_calls[node_item->node_id][node_item->node_name] = node_item.get(); + continue; } + GE_CHK_STATUS_RET_NOLOG(LoadTask(*node_item)); + } - GELOGD("[%s] Done loading task successfully.", node_ptr->GetName().c_str()); + // HCCL operators need to be loaded in the same order across different processes + for 
(auto &it : ordered_partitioned_calls) { + for (auto &it2 : it.second) { + GE_CHK_STATUS_RET_NOLOG(LoadTask(*it2.second)); + } } return SUCCESS; @@ -1138,11 +1214,12 @@ Status HybridModelBuilder::IndexTaskDefs(const ComputeGraphPtr &sub_graph, const GELOGD("Skip task type: %d", static_cast(task_type)); continue; } - GELOGD("op_index = %u, task_type = %d.", op_index, task_type); + GELOGD("op_index = %u, task_type = %d", op_index, task_type); auto iter = node_map.find(op_index); if (iter == node_map.end()) { - GELOGE(INTERNAL_ERROR, "Failed to get node by op_index = %u.", op_index); + GELOGE(INTERNAL_ERROR, "[Find][Node]Failed to get node by op_index = %u", op_index); + REPORT_INNER_ERROR("E19999", "Failed to get node by op_index = %u.", op_index); return INTERNAL_ERROR; } @@ -1151,7 +1228,7 @@ Status HybridModelBuilder::IndexTaskDefs(const ComputeGraphPtr &sub_graph, const ge_model->GetTBEKernelStore().LoadTBEKernelBinToOpDesc(node->GetOpDesc()); } - GELOGD("Task loaded for node: %s, task type = %d, op_index = %u.", node->GetName().c_str(), task_type, op_index); + GELOGD("Task loaded for node: %s, task type = %d, op_index = %u", node->GetName().c_str(), task_type, op_index); hybrid_model_.task_defs_[node].emplace_back(task_def); } @@ -1212,7 +1289,8 @@ Status HybridModelBuilder::IndexTaskDefs() { auto iter = node_map.find(op_index); if (iter == node_map.end()) { - GELOGE(INTERNAL_ERROR, "Failed to get node by index = %u", op_index); + GELOGE(INTERNAL_ERROR, "[Find][Node]Failed to get node by index = %u.", op_index); + REPORT_INNER_ERROR("E19999", "Failed to get node by index = %u.", op_index); return INTERNAL_ERROR; } @@ -1277,16 +1355,17 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, GELOGD("To get peer node of %s::%s", sub_graph->GetName().c_str(), data_node->GetName().c_str()); auto wrapped_node = data_node->GetOwnerComputeGraph()->GetParentNode(); if (wrapped_node == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] Node is in root 
graph.", data_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Node is in root graph.", data_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetParentNode][%s] Node is in root graph.", data_node->GetName().c_str()); return INTERNAL_ERROR; } auto data_op_desc = data_node->GetOpDesc(); uint32_t parent_index = 0; if (!AttrUtils::GetInt(data_op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGE(INTERNAL_ERROR, - "[%s] Failed to get attr [%s]", - data_op_desc->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Failed to get attr [%s].", data_op_desc->GetName().c_str(), + ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetInt][%s] Failed to get attr [%s]", + data_op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return INTERNAL_ERROR; } @@ -1294,7 +1373,9 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, GE_CHECK_NOTNULL(wrapped_node_in_anchor); auto src_out_anchor = wrapped_node_in_anchor->GetPeerOutAnchor(); if (src_out_anchor == nullptr || src_out_anchor->GetOwnerNode() == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] Parent node do not have peer anchor.", data_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Parent node do not have peer anchor.", data_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, + "[Check][ParentNode][%s] Parent node do not have peer anchor.", data_node->GetName().c_str()); return INTERNAL_ERROR; } @@ -1317,10 +1398,12 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, auto src_graph = NodeUtils::GetSubgraph(*src_wrapped_node, kSubgraphIndex); GE_CHECK_NOTNULL(src_graph); auto src_net_output_node = src_graph->FindFirstNodeMatchType(NETOUTPUT); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(src_net_output_node == nullptr, - return INTERNAL_ERROR, - "Failed to find NetOutput in subgraph: %s", - src_graph->GetName().c_str()); + if (src_net_output_node == nullptr) { + 
REPORT_INNER_ERROR("E19999", "Failed to find NetOutput in subgraph: %s", src_graph->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][FindFirstNodeMatchType]Failed to find NetOutput in subgraph: %s", + src_graph->GetName().c_str()); + return INTERNAL_ERROR; + } auto net_output_desc = src_net_output_node->GetOpDesc(); GE_CHECK_NOTNULL(net_output_desc); @@ -1356,17 +1439,18 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, } } - GELOGE(FAILED, - "Failed to find peer node for %s::%s", - sub_graph->GetName().c_str(), + GELOGE(FAILED, "[Get][PeerNode]Failed to find peer node for %s::%s", sub_graph->GetName().c_str(), data_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to find peer node for %s::%s.", + sub_graph->GetName().c_str(), data_node->GetName().c_str()); return FAILED; } Status HybridModelBuilder::InitRuntimeParams() { int64_t value = 0; bool ret = false; if (ge_root_model_->GetSubgraphInstanceNameToModel().empty()) { - GELOGE(INTERNAL_ERROR, "Root model has no sub model"); + GELOGE(INTERNAL_ERROR, "[Get][SubModel]Root model has no sub model, model:%s.", GetGraphName()); + REPORT_INNER_ERROR("E19999", "Root model has no sub model, model:%s.", GetGraphName()); return INTERNAL_ERROR; } @@ -1456,16 +1540,25 @@ Status HybridModelBuilder::IdentifyVariableOutputs(NodeItem &node_item) { in_data_anchor->GetIdx(), src_node->GetName().c_str(), src_op_type.c_str()); + uint32_t parent_index = 0; + GE_CHK_STATUS_RET_NOLOG(GetParentNodeOutputIndex(*net_output_desc, in_data_anchor->GetIdx(), parent_index)); + GELOGD("Got parent output index = %u", parent_index); + if (src_op_type == DATA) { + int ref_i = 0; + (void)AttrUtils::GetInt(src_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, ref_i); + node_item.reuse_inputs.emplace(static_cast(parent_index), ref_i); + GELOGD("[%s] output[%u] resues input[%d]", node_item.NodeName().c_str(), parent_index, ref_i); + } - if (src_op_type != CONSTANTOP && src_op_type != VARIABLE) { + if 
(src_op_type != CONSTANTOP && src_op_type != CONSTANT && src_op_type != VARIABLE) { continue; } - uint32_t parent_index = 0; - GE_CHK_STATUS_RET_NOLOG(GetParentNodeOutputIndex(*net_output_desc, in_data_anchor->GetIdx(), parent_index)); - GELOGD("Got parent output index = %u", parent_index); GE_CHECK_LE(parent_index, INT32_MAX); node_item.ref_outputs.emplace(static_cast(parent_index), src_node); + if (src_op_type == CONSTANTOP || src_op_type == CONSTANT) { + known_subgraph_constant_output_refs_[&node_item].emplace(parent_index, src_node); + } } // Data nodes marked with REF_VAR_SRC_VAR_NAME @@ -1509,8 +1602,10 @@ Status HybridModelBuilder::GetParentNodeOutputIndex(const OpDesc &op_desc, int i auto input_desc = op_desc.MutableInputDesc(index); GE_CHECK_NOTNULL(input_desc); if (!AttrUtils::GetInt(input_desc, ATTR_NAME_PARENT_NODE_INDEX, out_index)) { - GELOGE(INTERNAL_ERROR, "NetOutput input tensor %d, attr %s not found.", - index, ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetInt]NetOutput %s input tensor %d, attr %s not found.", + op_desc.GetName().c_str(), index, ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "NetOutput %s input tensor %d, attr %s not found.", + op_desc.GetName().c_str(), index, ATTR_NAME_PARENT_NODE_INDEX.c_str()); return INTERNAL_ERROR; } return SUCCESS; @@ -1526,11 +1621,15 @@ Status HybridModelBuilder::InitModelMem() { if (total_var_size > 0 && hybrid_model_.var_mem_base_ == nullptr) { GE_CHK_STATUS_RET(var_manager_->MallocVarMemory(total_var_size), - "Malloc Var Memory Fail."); + "[Malloc][VarMemory] failed, size:%zu.", total_var_size); hybrid_model_.var_mem_base_ = var_manager_->GetVarMemoryBase(RT_MEMORY_HBM); } runtime_param_.var_base = hybrid_model_.var_mem_base_; + auto allocator = NpuMemoryAllocator::GetAllocator(); + GE_CHECK_NOTNULL(allocator); + hybrid_model_.global_step_ = TensorBuffer::Create(allocator, sizeof(int64_t)); + GE_CHECK_NOTNULL(hybrid_model_.global_step_); return 
SUCCESS; } @@ -1539,7 +1638,8 @@ Status HybridModelBuilder::TransAllVarData() { rtContext_t ctx = nullptr; rtError_t rt_ret = rtCtxGetCurrent(&ctx); if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to get current context, error_code is: 0x%X.", rt_ret); + GELOGE(RT_FAILED, "[Invoke][rtCtxGetCurrent]Failed to get current context, error_code is: 0x%X.", rt_ret); + REPORT_CALL_ERROR("E19999", "rtCtxGetCurrent failed, error_code: 0x%X.", rt_ret); return RT_FAILED; } @@ -1553,7 +1653,7 @@ Status HybridModelBuilder::TransAllVarData() { runtime_param_.session_id, ctx, runtime_param_.graph_id), - "TransAllVarData failed."); + "[Invoke][TransAllVarData] failed."); GELOGI("TransAllVarData success."); return SUCCESS; @@ -1563,7 +1663,7 @@ Status HybridModelBuilder::CopyVarData() { GE_CHK_STATUS_RET(TransVarDataUtils::CopyVarData(ge_root_model_->GetRootGraph(), runtime_param_.session_id, hybrid_model_.device_id_), - "CopyVarData failed."); + "[Invoke][CopyVarData] failed."); GELOGI("CopyVarData success."); return SUCCESS; } @@ -1587,7 +1687,7 @@ Status HybridModelBuilder::LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem int32_t data_index = 0; if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, data_index)) { GELOGE(FAILED, - "[%s] Failed to get attr [%s]", + "[Invoke][GetInt][%s] Failed to get attr [%s]", node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; @@ -1604,7 +1704,7 @@ Status HybridModelBuilder::LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem } GE_CHK_GRAPH_STATUS_RET(wrapper_op_desc->AddOutputDesc(*output_desc), - "[%s] Failed to add output desc. output index = %d", + "[Invoke][AddOutputDesc][%s] Failed to add output desc. 
output index = %d", graph.GetName().c_str(), output_index); @@ -1616,6 +1716,7 @@ Status HybridModelBuilder::LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem auto temp_graph = MakeShared("temp"); GE_CHECK_NOTNULL(temp_graph); auto wrapper_node = temp_graph->AddNode(wrapper_op_desc); + wrapper_op_desc->SetId(parent_node_item->node_id); GeModelPtr ge_model = subgraph_models_[subgraph_name]; GE_CHECK_NOTNULL(ge_model); hybrid_model_.known_shape_sub_models_.emplace(wrapper_node, ge_model); @@ -1960,10 +2061,10 @@ Status HybridModelBuilder::BuildInputMapping(GraphItem &graph_item, data_op_index++; } else { if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, data_index)) { - GELOGE(FAILED, - "[%s] Failed to get attr [%s]", - node->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(FAILED, "[Invoke][GetInt][%s] Failed to get attr [%s]", + node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "call GetInt failed, [%s] Failed to get attr [%s]", + node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; } } @@ -1998,7 +2099,174 @@ Status HybridModelBuilder::CheckAicpuOpList() { aicpu_optype_list.assign(aicpu_optype_set.begin(), aicpu_optype_set.end()); aicpu_tf_optype_list.assign(aicpu_tf_optype_set.begin(), aicpu_tf_optype_set.end()); GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchKernelCheckAicpuOp(aicpu_optype_list, aicpu_tf_optype_list), - "Launch check aicpu op type failed."); + "[Launch][KernelCheckAicpuOp] failed."); + return SUCCESS; +} + +Status HybridModelBuilder::CollectParallelGroups(NodeItem *node_item) { + const auto &node = node_item->node; + auto executor_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node); + if (executor_type == NodeExecutorManager::ExecutorType::HCCL) { + std::string parallel_group; + if (AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, parallel_group)) { + GELOGD("[%s] Got parallel group = [%s]", 
node_item->NodeName().c_str(), parallel_group.c_str()); + parallel_group_to_nodes_[parallel_group].emplace(node_item); + std::set group{parallel_group}; + node_to_parallel_groups_[node_item].emplace(parallel_group); + } + } else if (executor_type == NodeExecutorManager::ExecutorType::COMPILED_SUBGRAPH) { + std::set parallel_groups; + GELOGD("[%s] To collect parallel group for known-shaped subgraph", node_item->NodeName().c_str()); + for (const auto &subgraph_name : node->GetOpDesc()->GetSubgraphInstanceNames()) { + GELOGD("[%s] Start to get parallel group from subgraph: %s", + node_item->NodeName().c_str(), + subgraph_name.c_str()); + auto subgraph = hybrid_model_.root_graph_->GetSubgraph(subgraph_name); + GE_CHECK_NOTNULL(subgraph); + for (const auto &sub_node : subgraph->GetAllNodes()) { + std::string parallel_group; + if (AttrUtils::GetStr(sub_node->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, parallel_group)) { + GELOGD("[%s::%s] Got parallel group = %s", + subgraph_name.c_str(), + sub_node->GetName().c_str(), + parallel_group.c_str()); + parallel_groups.emplace(parallel_group); + } + } + } + + if (!parallel_groups.empty()) { + for (const auto ¶llel_group : parallel_groups) { + parallel_group_to_nodes_[parallel_group].emplace(node_item); + GELOGD("[%s] has parallel group: %s", node_item->NodeName().c_str(), parallel_group.c_str()); + } + node_to_parallel_groups_.emplace(node_item, std::move(parallel_groups)); + } + } + + return SUCCESS; +} + +Status HybridModelBuilder::ParseDependentByParallelGroup() { + for (auto &it : hybrid_model_.node_items_) { + GE_CHK_STATUS_RET_NOLOG(CollectParallelGroups(it.second.get())); + } + for (const auto &it : node_to_parallel_groups_) { + auto node_item = it.first; + auto dst_executor_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node_item->node); + for (const auto ¶llel_group : it.second) { + auto &dependent_nodes = parallel_group_to_nodes_[parallel_group]; + NodeItem *nearest_dep_node = nullptr; + int max_id = -1; + 
for (auto &dep_node : dependent_nodes) { + if (dep_node->node_id < node_item->node_id && dep_node->node_id > max_id) { + nearest_dep_node = dep_node; + max_id = dep_node->node_id; + } + } + + if (nearest_dep_node != nullptr) { + GELOGD("[%s] Nearest node = [%s]", node_item->NodeName().c_str(), nearest_dep_node->NodeName().c_str()); + auto src_engine_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*nearest_dep_node->node); + if (src_engine_type == dst_executor_type) { + GELOGD("No need to add dependency for nodes with same executor type"); + continue; + } + auto &deps = node_item->dependents_for_execution; + if (std::find(deps.begin(), deps.end(), nearest_dep_node->node) != deps.end()) { + GELOGD("%s->%s Already has dependency, skip it", + nearest_dep_node->node->GetName().c_str(), + node_item->NodeName().c_str()); + continue; + } + nearest_dep_node->has_observer = true; + deps.emplace_back(nearest_dep_node->node); + GELOGD("Add dependency for nodes with the same parallel group[%s], src = [%s], dst = [%s]", + parallel_group.c_str(), + nearest_dep_node->NodeName().c_str(), + node_item->NodeName().c_str()); + } + } + } + return SUCCESS; +} + +Status HybridModelBuilder::OptimizeDependenciesForConstantInputs() { + std::map> converted; + for (auto &it : host_input_value_dependencies_) { + auto node_item = it.first; + std::map ref_counts; + bool changed = false; + for (auto output_idx_and_node : it.second) { + auto output_idx = output_idx_and_node.first; + auto src_node_item = output_idx_and_node.second; + ++ref_counts[src_node_item]; + NodePtr constant_node; + if (src_node_item->node_type == CONSTANT || src_node_item->node_type == CONSTANTOP) { + constant_node = src_node_item->node; + GELOGD("src node [%s] is a constant", src_node_item->NodeName().c_str()); + } else { + auto iter = known_subgraph_constant_output_refs_.find(src_node_item); + if (iter != known_subgraph_constant_output_refs_.end()) { + constant_node = iter->second[output_idx]; + if 
(constant_node != nullptr) { + GELOGD("Output[%u] of subgraph [%s] is a constant", output_idx, src_node_item->NodeName().c_str()); + } + } + } + if (constant_node == nullptr) { + GELOGD("Output[%u] of [%s] is not a constant", output_idx, src_node_item->NodeName().c_str()); + continue; + } + if (converted[constant_node].count(output_idx) == 0) { + GE_CHK_STATUS_RET(Convert2HostTensor(constant_node, src_node_item->node_id, output_idx), + "[%s] Failed to convert constant to host tensor", constant_node->GetName().c_str()); + converted[constant_node].emplace(output_idx); + } + src_node_item->to_const_output_id_list.erase(output_idx); + --ref_counts[src_node_item]; + changed = true; + } + if (changed) { + std::vector depends_to_keep; + for (auto &ref_count_it : ref_counts) { + if (ref_count_it.second == 0) { + GELOGD("[%s] no longer depends on [%s] for shape inference", + node_item->NodeName().c_str(), + ref_count_it.first->NodeName().c_str()); + } else { + depends_to_keep.emplace_back(ref_count_it.first->node); + } + } + node_item->dependents_for_shape_inference.swap(depends_to_keep); + } + } + + return SUCCESS; +} +Status HybridModelBuilder::Convert2HostTensor(const NodePtr &node, int node_id, uint32_t output_idx) { + auto tensor_value = hybrid_model_.GetTensor(node); + GE_CHECK_NOTNULL(tensor_value); + auto tensor_desc = node->GetOpDesc()->MutableOutputDesc(0); + GE_CHECK_NOTNULL(tensor_desc); + Tensor tensor(TensorAdapter::GeTensorDesc2TensorDesc(*tensor_desc)); + int64_t tensor_size = -1; + GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorSizeInBytes(*tensor_desc, tensor_size), + "[%s] Failed to get tensor size", node->GetName().c_str()); + if (tensor_size > 0) { + auto copy_size = static_cast(tensor_size); + GE_CHECK_GE(tensor_value->GetSize(), copy_size); + std::vector buffer(copy_size); + GE_CHK_RT_RET(rtMemcpy(buffer.data(), + copy_size, + tensor_value->GetData(), + copy_size, + RT_MEMCPY_DEVICE_TO_HOST)); + tensor.SetData(std::move(buffer)); + GELOGD("[%s] Copy 
constant tensor to host successfully, size = %zu", node->GetName().c_str(), copy_size); + } + + hybrid_model_.host_tensors_[node_id].emplace_back(output_idx, std::move(tensor)); return SUCCESS; } } // namespace hybrid diff --git a/ge/hybrid/model/hybrid_model_builder.h b/ge/hybrid/model/hybrid_model_builder.h index 71663a6e..3e467dc8 100644 --- a/ge/hybrid/model/hybrid_model_builder.h +++ b/ge/hybrid/model/hybrid_model_builder.h @@ -47,8 +47,8 @@ class HybridModelBuilder { static Status HandleDtString(const GeTensor &tensor, void *var_addr); static Status MergeInputNodes(ComputeGraph &compute_graph); static Status MergeNetOutputNode(ComputeGraph &compute_graph); - static Status UnfoldSubgraphs(ComputeGraph &root_graph, ComputeGraphPtr &merged_graph); - static Status UnfoldSubgraph(ComputeGraph &root_graph, ComputeGraph &parent_graph, ComputeGraph &sub_graph); + static Status UnfoldSubgraphs(ComputeGraphPtr &root_graph, ComputeGraphPtr &merged_graph); + static Status UnfoldSubgraph(ComputeGraphPtr &root_graph, ComputeGraphPtr &parent_graph, ComputeGraph &sub_graph); static Status BuildInputMapping(GraphItem &graph_item, std::vector &data_nodes, bool is_root_graph); @@ -57,13 +57,17 @@ class HybridModelBuilder { Status ValidateParams(); Status LoadGraph(); Status LoadGeModel(ComputeGraph &graph, const GeModelPtr &ge_model); + Status LoadTask(NodeItem &node_item); Status LoadTasks(); Status IdentifyVariableOutputs(NodeItem &node_item); Status IdentifySameInputs(NodeItem &node_item); Status BuildNodeItem(const NodePtr &node, NodeItem &node_item); Status GetOrCreateNodeItem(const NodePtr &node, NodeItem **node_item); + Status ParseForceInfershapeNodes(const NodePtr &node, NodeItem &node_item); + Status CollectParallelGroups(NodeItem *node_item); Status ParseDependentInputNodes(NodeItem &node_item, const std::vector &dependencies); - Status ParseDependentForFusedSubgraph(NodeItem &node_item); + Status ParseDependentForFusedSubgraph(NodeItem &node_item, std::set 
&dependencies); + Status ParseDependentByParallelGroup(); Status IndexTaskDefs(); Status IndexTaskDefs(const ComputeGraphPtr &sub_graph, const GeModelPtr &ge_model); Status IndexSpecialNodes(); @@ -87,6 +91,8 @@ class HybridModelBuilder { Status GenerateBpProfilingTask(const OpDescPtr &op_desc, vector &task_def_list); Status GenerateEndProfilingTask(const OpDescPtr &op_desc, vector &task_def_list); Status GenerateArProfilingTask(const OpDescPtr &op_desc, int64_t log_id, vector &task_def_list); + Status OptimizeDependenciesForConstantInputs(); + Status Convert2HostTensor(const NodePtr &node, int node_id, uint32_t output_idx); const char* GetGraphName() const { return hybrid_model_.model_name_.c_str(); @@ -98,13 +104,20 @@ class HybridModelBuilder { GeRootModelPtr ge_root_model_; std::map subgraph_models_; std::map constant_op_nodes_; + std::map> parallel_group_to_nodes_; + std::map> node_to_parallel_groups_; HybridModel &hybrid_model_; std::map>> node_ref_inputs_; - int node_index = 0; RuntimeParam &runtime_param_; VarManager *var_manager_ = nullptr; + + // map> + std::map> known_subgraph_constant_output_refs_; + + // map> + std::map>> host_input_value_dependencies_; }; } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/model/node_item.cc b/ge/hybrid/model/node_item.cc index 100530fc..8211dde3 100644 --- a/ge/hybrid/model/node_item.cc +++ b/ge/hybrid/model/node_item.cc @@ -36,10 +36,10 @@ std::set kControlOpTypes{ Status ParseInputMapping(Node &node, OpDesc &op_desc, FusedSubgraph &fused_subgraph) { uint32_t parent_index = 0; if (!AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGE(FAILED, - "[%s] Failed to get attr [%s]", - op_desc.GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(FAILED, "[Invoke][GetInt][%s] Failed to get attr [%s]", + op_desc.GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Failed to get attr [%s]", + op_desc.GetName().c_str(), 
ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; } @@ -58,10 +58,10 @@ Status ParseInputMapping(Node &node, OpDesc &op_desc, FusedSubgraph &fused_subgr Status ParseOutputMapping(const OpDescPtr &op_desc, FusedSubgraph &fused_subgraph) { uint32_t parent_index = 0; if (!AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGE(FAILED, - "[%s] Failed to get attr [%s]", - op_desc->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(FAILED, "[Invoke][GetInt][%s] Failed to get attr [%s]", + op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Failed to get attr [%s].", + op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; } @@ -122,7 +122,7 @@ Status NodeItem::Create(const NodePtr &node, std::unique_ptr &node_ite GE_CHECK_NOTNULL(node->GetOpDesc()); std::unique_ptr instance(new(std::nothrow)NodeItem(node)); GE_CHECK_NOTNULL(instance); - GE_CHK_STATUS_RET(instance->Init(), "Failed to init NodeItem [%s] .", node->GetName().c_str()); + GE_CHK_STATUS_RET(instance->Init(), "[Invoke][Init]Failed to init NodeItem [%s] .", node->GetName().c_str()); node_item = std::move(instance); return SUCCESS; } @@ -149,14 +149,16 @@ Status NodeItem::InitInputsAndOutputs() { if (AttrUtils::GetInt(op_desc, ::ge::ATTR_STAGE_LEVEL, group)) { GELOGD("[%s] Got stage level from op_desc = %d", op_desc->GetName().c_str(), group); } else { - if (AttrUtils::GetInt(node->GetOwnerComputeGraph(), ::ge::ATTR_STAGE_LEVEL, group)) { - GELOGD("[%s] Got stage level from parent graph = %d", op_desc->GetName().c_str(), group); - } else { - auto parent_node = node->GetOwnerComputeGraph()->GetParentNode(); - if ((parent_node != nullptr) && (AttrUtils::GetInt(parent_node->GetOpDesc(), ::ge::ATTR_STAGE_LEVEL, group))) { - GELOGD("[%s] Got stage level from parent node = %d", op_desc->GetName().c_str(), group); + if (node->GetOwnerComputeGraph() != nullptr) { + if 
(AttrUtils::GetInt(node->GetOwnerComputeGraph(), ::ge::ATTR_STAGE_LEVEL, group)) { + GELOGD("[%s] Got stage level from parent graph = %d", op_desc->GetName().c_str(), group); } else { - GELOGD("[%s] Node do not set stage level", op_desc->GetName().c_str()); + auto parent_node = node->GetOwnerComputeGraph()->GetParentNode(); + if ((parent_node != nullptr) && (AttrUtils::GetInt(parent_node->GetOpDesc(), ::ge::ATTR_STAGE_LEVEL, group))) { + GELOGD("[%s] Got stage level from parent node = %d", op_desc->GetName().c_str(), group); + } else { + GELOGD("[%s] Node do not set stage level", op_desc->GetName().c_str()); + } } } } @@ -166,10 +168,10 @@ Status NodeItem::InitInputsAndOutputs() { Status NodeItem::ResolveDynamicState() { (void) AttrUtils::GetBool(op_desc, ATTR_NAME_FORCE_UNKNOWN_SHAPE, is_dynamic); - GELOGD("node name = %s, is_dynamic = %d.", this->node_name.c_str(), is_dynamic); + GELOGD("Node name is %s, dynamic state is %d.", this->node_name.c_str(), is_dynamic); if (!is_dynamic) { GE_CHK_STATUS_RET(NodeUtils::GetNodeUnknownShapeStatus(*node, is_dynamic), - "[%s] Failed to get shape status.", + "[Invoke][GetNodeUnknownShapeStatus][%s] Failed to get shape status.", node->GetName().c_str()); } return SUCCESS; @@ -239,7 +241,8 @@ Status NodeItem::Init() { ResolveUnknownShapeType(); if (is_dynamic) { GE_CHK_STATUS_RET_NOLOG(ResolveStaticInputsAndOutputs()); - GE_CHK_STATUS_RET(ParseFusedSubgraph(*this), "[%s] Failed to parse fused subgraph", node_name.c_str()); + GE_CHK_STATUS_RET(ParseFusedSubgraph(*this), + "[Invoke][ParseFusedSubgraph][%s] Failed to parse fused subgraph", node_name.c_str()); } return SUCCESS; @@ -249,6 +252,10 @@ bool NodeItem::IsControlOp() const { return ge::hybrid::IsControlOp(op_desc->GetType()); } +bool NodeItem::IsHcclOp() const { + return NodeExecutorManager::GetInstance().ResolveExecutorType(*node) == NodeExecutorManager::ExecutorType::HCCL; +} + std::string NodeItem::DebugString() const { std::stringstream ss; ss << "Node: "; @@ -291,23 
+298,56 @@ void NodeItem::SetToDynamic() { } } -GeTensorDescPtr NodeItem::MutableInputDesc(int index) const { +GeTensorDescPtr NodeItem::DoGetInputDesc(int index) const { if (!has_optional_inputs) { return op_desc->MutableInputDesc(static_cast(index)); } if (index < 0 || index >= num_inputs) { - GELOGE(PARAM_INVALID, - "[%s] Invalid input index, num inputs = %d, index = %d", - node_name.c_str(), - num_inputs, - index); + GELOGE(PARAM_INVALID, "[Check][Param:index][%s] Invalid input index, num inputs = %d, index = %d", + node_name.c_str(), num_inputs, index); + REPORT_INNER_ERROR("E19999", "Invalid input index, node:%s num inputs = %d, index = %d", + node_name.c_str(), num_inputs, index); return nullptr; } return op_desc->MutableInputDesc(input_desc_indices_[index]); } +GeTensorDescPtr NodeItem::MutableInputDesc(int index) const { + std::lock_guard lk(mu_); + return DoGetInputDesc(index); +} + +Status NodeItem::GetInputDesc(int index, GeTensorDesc &tensor_desc) const { + std::lock_guard lk(mu_); + auto input_desc = DoGetInputDesc(index); + GE_CHECK_NOTNULL(input_desc); + tensor_desc = *input_desc; + return SUCCESS; +} + +Status NodeItem::GetOutputDesc(int index, GeTensorDesc &tensor_desc) const { + std::lock_guard lk(mu_); + auto output_desc = op_desc->MutableOutputDesc(static_cast(index)); + GE_CHECK_NOTNULL(output_desc); + tensor_desc = *output_desc; + return SUCCESS; +} + +GeTensorDescPtr NodeItem::MutableOutputDesc(int index) const { + std::lock_guard lk(mu_); + return op_desc->MutableOutputDesc(static_cast(index)); +} + +Status NodeItem::UpdateInputDesc(int index, const GeTensorDesc &tensor_desc) { + std::lock_guard lk(mu_); + auto input_desc = DoGetInputDesc(index); + GE_CHECK_NOTNULL(input_desc); + *input_desc = tensor_desc; + return SUCCESS; +} + Status NodeItem::GetCanonicalInputIndex(uint32_t index, int &canonical_index) const { if (!has_optional_inputs) { canonical_index = index; @@ -316,7 +356,11 @@ Status NodeItem::GetCanonicalInputIndex(uint32_t index, 
int &canonical_index) co auto iter = std::find(input_desc_indices_.begin(), input_desc_indices_.end(), index); if (iter == input_desc_indices_.end()) { - GELOGE(INTERNAL_ERROR, "[%s] Invalid input index: %u", node_name.c_str(), index); + GELOGE(INTERNAL_ERROR, + "[Check][Param:index]input index:%u not in input_desc_indices_, check Invalid, node:%s", + index, node_name.c_str()); + REPORT_INNER_ERROR("E19999", "input index:%u not in input_desc_indices_, check Invalid, node:%s", + index, node_name.c_str()); return INTERNAL_ERROR; } @@ -331,7 +375,9 @@ bool NodeItem::IsInputShapeStatic(int index) const { } if (static_cast(index) >= is_input_shape_static_.size()) { - GELOGE(PARAM_INVALID, "Input index(%d) out of range: [0, %zu)", index, is_input_shape_static_.size()); + GELOGE(PARAM_INVALID, "[Check][Param:index]Input index(%d) out of range: [0, %zu)", + index, is_input_shape_static_.size()); + REPORT_INNER_ERROR("E19999", "Input index(%d) out of range: [0, %zu).", index, is_input_shape_static_.size()); return false; } diff --git a/ge/hybrid/model/node_item.h b/ge/hybrid/model/node_item.h index 300744d1..54c5e938 100644 --- a/ge/hybrid/model/node_item.h +++ b/ge/hybrid/model/node_item.h @@ -17,6 +17,7 @@ #ifndef GE_HYBRID_MODEL_NODE_ITEM_H_ #define GE_HYBRID_MODEL_NODE_ITEM_H_ +#include #include #include "external/ge/ge_api_error_codes.h" #include "graph/node.h" @@ -57,16 +58,22 @@ struct NodeItem { bool IsInputShapeStatic(int index) const; - GeTensorDescPtr MutableOutputDesc(int index) const { - return op_desc->MutableOutputDesc(static_cast(index)); - } + GeTensorDescPtr MutableOutputDesc(int index) const; + + Status UpdateInputDesc(int index, const GeTensorDesc &tensor_desc); GeTensorDescPtr MutableInputDesc(int index) const; + Status GetInputDesc(int index, GeTensorDesc &tensor_desc) const; + + Status GetOutputDesc(int index, GeTensorDesc &tensor_desc) const; + Status GetCanonicalInputIndex(uint32_t index, int &canonical_index) const; bool IsControlOp() const; + bool 
IsHcclOp() const; + void SetToDynamic(); std::string DebugString() const; @@ -83,6 +90,7 @@ struct NodeItem { bool has_observer = false; bool has_optional_inputs = false; bool is_output_shape_static = true; + bool is_need_force_infershape = false; UnknowShapeOpType shape_inference_type = DEPEND_IN_SHAPE; std::string node_name; std::string node_type; @@ -110,9 +118,11 @@ struct NodeItem { Status ResolveDynamicState(); Status ResolveStaticInputsAndOutputs(); void ResolveUnknownShapeType(); + GeTensorDescPtr DoGetInputDesc(int index) const; std::vector is_input_shape_static_; std::vector input_desc_indices_; + mutable std::mutex mu_; }; } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc index 119db0af..29ae831c 100755 --- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc +++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc @@ -42,7 +42,7 @@ AiCoreNodeTask::AiCoreNodeTask(std::vector> &&task Status AiCoreNodeExecutor::Initialize() { compiler_ = TaskCompilerFactory::GetInstance().GetTaskCompiler(); if (compiler_ != nullptr) { - GE_CHK_STATUS_RET(compiler_->Initialize(), "Failed to init aicore task compiler."); + GE_CHK_STATUS_RET(compiler_->Initialize(), "[Init][TaskCompiler] failed."); } return SUCCESS; } @@ -60,8 +60,12 @@ Status AiCoreNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &nod node->GetName().c_str()); return SUCCESS; } else { - GELOGE(FAILED, "Task_defs is empty for node (%s) which 'support_dynamicshape' is true, failed.", + GELOGE(FAILED, "[Invoke][GetBool]Task_defs is empty for node (%s)" + "which 'support_dynamicshape' is true, check invalid", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Task_defs is empty for node (%s)" + "which 'support_dynamicshape' is true, check invalid", + node->GetName().c_str()); return FAILED; } } @@ -69,7 +73,7 @@ Status AiCoreNodeExecutor::LoadTask(const HybridModel 
&model, const NodePtr &nod AiCoreTaskBuilder builder(node->GetOpDesc(), *task_defs); std::unique_ptr node_task; GE_CHK_STATUS_RET(builder.BuildTask(node_task, true, is_single_op), - "[%s] Failed to build op tasks.", node->GetName().c_str()); + "[Invoke][BuildTask][%s] Failed to build op tasks.", node->GetName().c_str()); task = std::move(node_task); GELOGI("AiCoreNodeExecutor(%s) LoadTask End.", node->GetName().c_str()); return SUCCESS; @@ -105,7 +109,8 @@ bool AiCoreNodeTaskRegistry::AddTask(const std::string &node_key, const std::sha std::lock_guard lock(mutex_); auto iter = reg_node_tasks_.find(node_key); if (iter != reg_node_tasks_.end()) { - GELOGE(FAILED, "AiCoreNodeTaskRegistry(%s) AddTask failed, key already exist.", node_key.c_str()); + GELOGE(FAILED, "[Add][Task] failed, key:%s already exist.", node_key.c_str()); + REPORT_INNER_ERROR("E19999", "AddTask failed, key:%s already exist.", node_key.c_str()); return false; } auto ret = reg_node_tasks_.emplace(node_key, task); @@ -131,13 +136,14 @@ Status AiCoreNodeExecutor::CompileTask(const HybridModel &model, auto ori_node_name = node->GetName(); if (compiler_ == nullptr) { - GELOGE(FAILED, "[%s] Can not find any valid aicore task compiler.", ori_node_name.c_str()); + GELOGE(FAILED, "[Find][Compiler][%s] Can not find any valid aicore task compiler.", ori_node_name.c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Can not find any valid aicore task compiler.", ori_node_name.c_str()); return FAILED; } AiCoreNodeTaskRegistry ®istry = AiCoreNodeTaskRegistry::GetInstance(); std::string shape_key; - GE_CHK_STATUS_RET(GenNodeKey(node, shape_key), "GenNodeKey failed, op name = %s.", node->GetName().c_str()); + GE_CHK_STATUS_RET(GenNodeKey(node, shape_key), "[Generate][NodeKey] failed, op name = %s.", node->GetName().c_str()); auto node_key = std::to_string(model.GetModelId()) + "/" + shape_key; GELOGD("NodeKey for %s = %s", node->GetName().c_str(), node_key.c_str()); @@ -152,19 +158,21 @@ Status 
AiCoreNodeExecutor::CompileTask(const HybridModel &model, std::vector task_defs; op_desc->SetName(ori_node_name + "_" + shape_key); - GE_CHK_STATUS_RET(compiler_->CompileOp(node, task_defs), "Compile op(%s) failed.", ori_node_name.c_str()); + GE_CHK_STATUS_RET(compiler_->CompileOp(node, task_defs), "[Compile][Op:%s] failed.", ori_node_name.c_str()); op_desc->SetName(ori_node_name); GELOGD("successfully generated task_defs: %s", node->GetName().c_str()); AiCoreTaskBuilder builder(node->GetOpDesc(), task_defs); std::unique_ptr node_task; - GE_CHK_STATUS_RET(builder.BuildTask(node_task, false), "[%s] Failed to build op tasks.", node->GetName().c_str()); + GE_CHK_STATUS_RET(builder.BuildTask(node_task, false), + "[Invoke][BuildTask][%s] Failed to build op tasks.", node->GetName().c_str()); node_task->SetWorkspaceSizes(op_desc->GetWorkspaceBytes()); aicore_task = std::move(node_task); GELOGD("successfully created node task: %s", node->GetName().c_str()); if (!registry.AddTask(node_key, aicore_task)) { - GELOGE(INTERNAL_ERROR, "Add NodeTask failed, op name = %s.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Add][NodeTask] failed, op name = %s.", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "add task failed, op name = %s.", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -196,9 +204,12 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function uint32_t stream_id = 0; rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); // must be called after Launch kernel if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Get task_id and stream_id failed, ret: 0x%X.", rt_ret); + GELOGE(RT_FAILED, "[Invoke][rtGetTaskIdAndStreamID] failed, ret: 0x%X.", rt_ret); + REPORT_CALL_ERROR("E19999", "rtGetTaskIdAndStreamID failed, ret: 0x%X.", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } + context.SetTaskId(task_id); + context.SetStreamId(stream_id); GELOGD("Aicore node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, 
stream_id); (void)context.SaveProfilingTaskDescInfo(task_id, stream_id, kTaskTypeAicore, (*it)->GetBlockDim()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); @@ -271,7 +282,8 @@ Status AiCoreNodeTask::CheckOverflow(TaskContext &context) { GELOGW("Dynamic shape op %s is over flow", context.GetNodeName()); return SUCCESS; } else if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtstreamsynchronize failed"); + GELOGE(rt_ret, "[Invoke][rtstreamsynchronize] failed, ret:%d.", rt_ret); + REPORT_CALL_ERROR("E19999", "rtstreamsynchronize failed, ret:%d.", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } return SUCCESS; diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.cc b/ge/hybrid/node_executor/aicore/aicore_op_task.cc index 07c2ddb5..8bb871fb 100644 --- a/ge/hybrid/node_executor/aicore/aicore_op_task.cc +++ b/ge/hybrid/node_executor/aicore/aicore_op_task.cc @@ -22,6 +22,7 @@ #include "hybrid/node_executor/aicore/aicore_task_builder.h" #include "graph/load/model_manager/tbe_handle_store.h" #include "graph/types.h" +#include "single_op/task/build_task_utils.h" using optiling::OpRunInfo; @@ -31,6 +32,7 @@ namespace { constexpr char const *kAttrSupportDynamicShape = "support_dynamicshape"; constexpr char const *kAttrOpParamSize = "op_para_size"; constexpr char const *kAttrAtomicOpParamSize = "atomic_op_para_size"; +std::atomic log_id(0); } // namespace TbeHandleHolder::TbeHandleHolder(void *bin_handle) @@ -48,6 +50,12 @@ bool TbeHandleRegistry::AddHandle(std::unique_ptr &&holder) { } Status AiCoreOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def) { + log_name_ = op_desc.GetName() + "_tvmbin"; + log_id_ = log_id++; + auto op_desc_ptr = MakeShared(op_desc); + GE_CHECK_NOTNULL(op_desc_ptr); + auto task_info = BuildTaskUtils::GetTaskInfo(op_desc_ptr); + GELOGI("[TASK_INFO] %lu/%s %s.", log_id_, log_name_.c_str(), task_info.c_str()); GE_CHK_STATUS_RET_NOLOG(InitWithTaskDef(op_desc, task_def)); 
GE_CHK_STATUS_RET_NOLOG(InitTilingInfo(op_desc)); @@ -71,22 +79,22 @@ Status AiCoreOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def) } Status AiCoreOpTask::RegisterTbeHandle(const OpDesc &op_desc) { - auto op_desc_ptr = std::make_shared(op_desc); - GE_CHECK_NOTNULL(op_desc_ptr); - auto tbe_kernel = op_desc_ptr->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); - if (tbe_kernel == nullptr) { - GELOGE(INTERNAL_ERROR, "TBE: %s can't find tvm bin file!", op_desc_ptr->GetName().c_str()); - return INTERNAL_ERROR; - } - TBEHandleStore &kernel_store = TBEHandleStore::GetInstance(); rtError_t rt_ret = rtQueryFunctionRegistered(stub_name_.c_str()); if (rt_ret != RT_ERROR_NONE || is_single_op_) { + auto op_desc_ptr = MakeShared(op_desc); + GE_CHECK_NOTNULL(op_desc_ptr); + auto tbe_kernel = op_desc_ptr->TryGetExtAttr(GetKeyForTbeKernel(), TBEKernelPtr()); + if (tbe_kernel == nullptr) { + GELOGE(INTERNAL_ERROR, "TBE: %s can't find tvm bin file!", op_desc_ptr->GetName().c_str()); + return INTERNAL_ERROR; + } + TBEHandleStore &kernel_store = TBEHandleStore::GetInstance(); void *bin_handle = nullptr; if (!kernel_store.FindTBEHandle(stub_name_.c_str(), bin_handle)) { GELOGI("TBE: can't find the binfile_key[%s] in HandleMap", stub_name_.c_str()); rtDevBinary_t binary; std::string json_string; - GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, TVM_ATTR_NAME_MAGIC, json_string), + GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, GetKeyForTvmMagic(), json_string), GELOGI("Get original type of session_graph_id.")); if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AICPU") { binary.magic = RT_DEV_BINARY_MAGIC_ELF_AICPU; @@ -95,7 +103,12 @@ Status AiCoreOpTask::RegisterTbeHandle(const OpDesc &op_desc) { } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AIVEC") { binary.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC; } else { - GELOGE(PARAM_INVALID, "TBE: Invalid parameter magic number! 
json: %s", json_string.c_str()); + GELOGE(PARAM_INVALID, "[Check][JsonStr]Attr:%s in op:%s(%s), value:%s check invalid", + TVM_ATTR_NAME_MAGIC.c_str(), op_desc_ptr->GetName().c_str(), + op_desc_ptr->GetType().c_str(), json_string.c_str()); + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value:%s check invalid", + TVM_ATTR_NAME_MAGIC.c_str(), op_desc_ptr->GetName().c_str(), + op_desc_ptr->GetType().c_str(), json_string.c_str()); return PARAM_INVALID; } binary.version = 0; @@ -104,20 +117,22 @@ Status AiCoreOpTask::RegisterTbeHandle(const OpDesc &op_desc) { GELOGI("TBE: binary.length: %lu", binary.length); GE_CHK_RT_RET(rtDevBinaryRegister(&binary, &bin_handle)); std::string meta_data; - GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, TVM_ATTR_NAME_METADATA, meta_data), + GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, GetKeyForTvmMetaData(), meta_data), GELOGI("Get original type of json_string")); GELOGI("TBE: meta data: %s", meta_data.empty() ? "null" : meta_data.c_str()); - GE_IF_BOOL_EXEC(!meta_data.empty(), GE_CHK_RT_RET(rtMetadataRegister(bin_handle, meta_data.c_str()))); + GE_IF_BOOL_EXEC(!meta_data.empty(), + GE_CHK_RT_RET(rtMetadataRegister(bin_handle, meta_data.c_str()))); kernel_store.StoreTBEHandle(stub_name_.c_str(), bin_handle, tbe_kernel); } else { GELOGI("TBE: find the binfile_key[%s] in HandleMap", stub_name_.c_str()); kernel_store.ReferTBEHandle(stub_name_.c_str()); } std::string kernel_name; - GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, op_desc_ptr->GetName() + "_kernelname", kernel_name), + GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, GetKeyForKernelName(op_desc), kernel_name), GELOGI("Get original type of kernel_name")); GELOGI("TBE: binfile_key=%s, kernel_name=%s", stub_name_.c_str(), kernel_name.c_str()); - GE_CHK_RT_RET(rtFunctionRegister(bin_handle, stub_name_.c_str(), stub_name_.c_str(), kernel_name.c_str(), 0)); + GE_CHK_RT_RET(rtFunctionRegister(bin_handle, stub_name_.c_str(), + stub_name_.c_str(), kernel_name.c_str(), 0)); } 
return SUCCESS; } @@ -126,7 +141,9 @@ Status AiCoreOpTask::RegisterKernelHandle(const OpDesc &op_desc) { TbeHandleRegistry ®istry = TbeHandleRegistry::GetInstance(); auto tbe_kernel = op_desc.TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); if (tbe_kernel == nullptr) { - GELOGE(INTERNAL_ERROR, "TBE: %s can't find tvm bin file!", op_desc.GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][TryGetExtAttr]TBE: %s can't find tvm bin file!", + op_desc.GetName().c_str()); + REPORT_CALL_ERROR("E19999", "TBE: %s can't find tvm bin file.", op_desc.GetName().c_str()); return INTERNAL_ERROR; } @@ -143,7 +160,12 @@ Status AiCoreOpTask::RegisterKernelHandle(const OpDesc &op_desc) { } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AIVEC") { binary.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC; } else { - GELOGE(PARAM_INVALID, "TBE: Invalid parameter magic number! json: %s", json_string.c_str()); + GELOGE(PARAM_INVALID, "[Check][JsonStr]Attr:%s in op:%s(%s), value:%s check invalid", + TVM_ATTR_NAME_MAGIC.c_str(), op_desc.GetName().c_str(), + op_desc.GetType().c_str(), json_string.c_str()); + REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value:%s check invalid", + TVM_ATTR_NAME_MAGIC.c_str(), op_desc.GetName().c_str(), + op_desc.GetType().c_str(), json_string.c_str()); return PARAM_INVALID; } binary.version = 0; @@ -154,11 +176,15 @@ Status AiCoreOpTask::RegisterKernelHandle(const OpDesc &op_desc) { handle_ = bin_handle; auto holder = std::unique_ptr(new (std::nothrow) TbeHandleHolder(handle_)); if (holder == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "create HandleHodler failed."); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, + "[Create][TbeHandleHolder] failed, node name = %s", op_desc.GetName().c_str()); + REPORT_CALL_ERROR("E19999", "create TbeHandleHolder failed, node name = %s.", + op_desc.GetName().c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } if (!registry.AddHandle(std::move(holder))) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Add handle failed. 
node name = %s", op_desc.GetName().c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Add][Handle] failed. node name = %s", op_desc.GetName().c_str()); + REPORT_CALL_ERROR("E19999", "AddHandle failed, node name = %s.", op_desc.GetName().c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } return SUCCESS; @@ -176,39 +202,48 @@ Status AiCoreOpTask::InitWithKernelDef(const OpDesc &op_desc, const domi::TaskDe args_.reset(new(std::nothrow) uint8_t[args_size_]); GE_CHECK_NOTNULL(args_); if (kernel_def.args().size() < args_size_) { - GELOGE(INTERNAL_ERROR, "args size of kernel_def is smaller than args_size_"); + GELOGE(INTERNAL_ERROR, "[Check][Size]args size:%zu of kernel_def is smaller than args_size_:%u, op:%s op_type:%s", + kernel_def.args().size(), args_size_, op_desc.GetName().c_str(), op_desc.GetType().c_str()); + REPORT_INNER_ERROR("E19999", "args size:%zu of kernel_def is smaller than args_size_:%u op:%s op_type:%s.", + kernel_def.args().size(), args_size_, op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } errno_t err = memcpy_s(args_.get(), args_size_, kernel_def.args().data(), args_size_); if (err != EOK) { - GELOGE(INTERNAL_ERROR, "AiCoreTask memcpy args failed."); + GELOGE(INTERNAL_ERROR, "[Update][Date]AiCoreTask memcpy args failed, op:%s op_type:%s.", + op_desc.GetName().c_str(), op_desc.GetType().c_str()); + REPORT_INNER_ERROR("E19999", "AiCoreTask memcpy args failed, op:%s op_type:%s.", + op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } if (context.args_offset().size() < sizeof(uint16_t)) { - GELOGE(INTERNAL_ERROR, "Invalid args_offset, size = %zu.", context.args_offset().size()); + GELOGE(INTERNAL_ERROR, "[Check][Size]Invalid args_offset," + "size:%zu is smaller than size of uint16_t, op:%s op_type:%s", + context.args_offset().size(), op_desc.GetName().c_str(), op_desc.GetType().c_str()); + REPORT_INNER_ERROR("E19999", "Invalid args_offset, size:%zu is smaller than size of uint16_t, op:%s op_type:%s", 
+ context.args_offset().size(), op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } const auto *args_offset_buffer = reinterpret_cast(context.args_offset().data()); - uint32_t offset = *args_offset_buffer; - if (offset > args_size_) { - GELOGE(INTERNAL_ERROR, - "[%s] Arg offset out of range. offset = %u, arg size = %u", - GetName().c_str(), - offset, - args_size_); + offset_ = *args_offset_buffer; + if (offset_ > args_size_) { + GELOGE(INTERNAL_ERROR, "[Check][Offset][%s] Arg offset out of range. offset = %u," + "arg size = %u , op:%s op_type:%s", GetName().c_str(), offset_, args_size_, + op_desc.GetName().c_str(), op_desc.GetType().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Arg offset out of range. offset = %u, arg size = %u" + "op:%s op_type:%s", GetName().c_str(), offset_, args_size_, + op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } - arg_base_ = reinterpret_cast(args_.get() + offset); - max_arg_count_ = (args_size_ - offset) / sizeof(void *); - GELOGD("[%s] Done setting kernel args successfully. stub_func = %s, block_dim = %d, arg base = %p, arg size = %u", - op_desc.GetName().c_str(), - stub_name_.c_str(), - block_dim_, - arg_base_, - args_size_); + arg_base_ = reinterpret_cast(args_.get() + offset_); + max_arg_count_ = (args_size_ - offset_) / sizeof(void *); + GELOGD("[%s] Done setting kernel args successfully. stub_func = %s, block_dim = %d," + "arg base = %p, arg size = %u", + op_desc.GetName().c_str(), stub_name_.c_str(), + block_dim_, arg_base_, args_size_); return SUCCESS; } @@ -225,43 +260,61 @@ Status AiCoreOpTask::InitWithKernelDefWithHandle(const OpDesc &op_desc, const do args_.reset(new(std::nothrow) uint8_t[args_size_]); GE_CHECK_NOTNULL(args_); if (kernel_with_handle.args().size() < args_size_) { - GELOGE(INTERNAL_ERROR, "args size of kernel_def is smaller than args_size_"); + GELOGE(INTERNAL_ERROR, "[Check][Size]args size:%zu of kernel_def is smaller than args_size_:%u. 
op:%s op_type:%s", + kernel_with_handle.args().size(), args_size_, op_desc.GetName().c_str(), op_desc.GetType().c_str()); + REPORT_INNER_ERROR("E19999", "args size:%zu of kernel_def is smaller than args_size_:%u. op:%s op_type:%s", + kernel_with_handle.args().size(), args_size_, + op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } errno_t err = memcpy_s(args_.get(), args_size_, kernel_with_handle.args().data(), args_size_); if (err != EOK) { - GELOGE(INTERNAL_ERROR, "AiCoreTask memcpy args failed."); + GELOGE(INTERNAL_ERROR, "[Update][Date]AiCoreTask memcpy args failed. op:%s op_type:%s", + op_desc.GetName().c_str(), op_desc.GetType().c_str()); + REPORT_CALL_ERROR("E19999", "AiCoreTask memcpy args failed. op:%s op_type:%s", + op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } if (context.args_offset().size() < sizeof(uint16_t)) { - GELOGE(INTERNAL_ERROR, "Invalid args_offset, size = %zu.", context.args_offset().size()); + GELOGE(INTERNAL_ERROR, "[Check][Size]Invalid args_offset, size:%zu is smaller" + "than size of uint16_t. op:%s op_type:%s", context.args_offset().size(), + op_desc.GetName().c_str(), op_desc.GetType().c_str()); + REPORT_INNER_ERROR("E19999", "Invalid args_offset, size:%zu is smaller" + "than size of uint16_t. op:%s op_type:%s", context.args_offset().size(), + op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } const auto *args_offset_buffer = reinterpret_cast(context.args_offset().data()); - uint32_t offset = *args_offset_buffer; - if (offset > args_size_) { - GELOGE(INTERNAL_ERROR, - "[%s] Arg offset out of range. offset = %u, arg size = %u", - GetName().c_str(), - offset, - args_size_); + offset_ = *args_offset_buffer; + if (offset_ > args_size_) { + GELOGE(INTERNAL_ERROR, "[Check][Offset][%s] Arg offset out of range. 
offset = %u, arg size = %u" + "op:%s op_type:%s", GetName().c_str(), offset_, args_size_, + op_desc.GetName().c_str(), op_desc.GetType().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Arg offset out of range. offset = %u, arg size = %u" + "op:%s op_type:%s", GetName().c_str(), offset_, args_size_, + op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } - arg_base_ = reinterpret_cast(args_.get() + offset); - max_arg_count_ = (args_size_ - offset) / sizeof(void *); + arg_base_ = reinterpret_cast(args_.get() + offset_); + max_arg_count_ = (args_size_ - offset_) / sizeof(void *); return SUCCESS; } Status AiCoreOpTask::InitWithTaskDef(const OpDesc &op_desc, const domi::TaskDef &task_def) { - GE_CHK_STATUS_RET(ValidateTaskDef(task_def), - "[%s] Failed to validate task def: [%s]", - op_desc.GetName().c_str(), - task_def.DebugString().c_str()); - + + auto rt_ret = ValidateTaskDef(task_def); + if (rt_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "op:%s(op_type:%s) failed to validate task def:%s", + op_desc.GetName().c_str(), op_desc.GetType().c_str(), task_def.DebugString().c_str()); + GELOGE(rt_ret, "[Invoke][ValidateTaskDef]failed for op:%s(op_type:%s) to validate task def:%s", + op_desc.GetName().c_str(), op_desc.GetType().c_str(), task_def.DebugString().c_str()); + return rt_ret; + } + if (task_def.type() != RT_MODEL_TASK_ALL_KERNEL) { GE_CHK_STATUS_RET(InitWithKernelDef(op_desc, task_def)); } else { @@ -273,14 +326,18 @@ Status AiCoreOpTask::InitWithTaskDef(const OpDesc &op_desc, const domi::TaskDef Status AiCoreOpTask::ValidateTaskDef(const domi::TaskDef &task_def) { auto task_type = static_cast(task_def.type()); if (task_type != RT_MODEL_TASK_KERNEL && task_type != RT_MODEL_TASK_ALL_KERNEL) { - GELOGE(INTERNAL_ERROR, "Invalid task type (%d) in AiCore CreateTask.", static_cast(task_type)); + GELOGE(INTERNAL_ERROR, + "[Check][TaskType]Invalid task type (%d) in AiCore CreateTask.", static_cast(task_type)); return INTERNAL_ERROR; } const auto 
&context = task_type == RT_MODEL_TASK_KERNEL ? task_def.kernel().context() : task_def.kernel_with_handle().context(); auto kernel_type = static_cast(context.kernel_type()); if (kernel_type != ccKernelType::TE) { - GELOGE(INTERNAL_ERROR, "Invalid kernel type(%d) in AiCore TaskDef.", static_cast(kernel_type)); + GELOGE(INTERNAL_ERROR, + "[Check][TaskType]Invalid kernel type(%d) in AiCore TaskDef.", static_cast(kernel_type)); + REPORT_INNER_ERROR("E19999", "Invalid kernel type(%d) in AiCore TaskDef.", + static_cast(kernel_type)); return INTERNAL_ERROR; } @@ -307,11 +364,9 @@ Status AiCoreOpTask::UpdateTilingInfo(TaskContext &context) { auto execution_context = context.GetExecutionContext(); - GetContext().SetSessionId(execution_context->context_id); RECORD_EXECUTION_EVENT(execution_context, context.GetNodeName(), "[CalcTilingInfo] Start"); GE_CHK_STATUS_RET(CalcTilingInfo(node, tiling_info)); RECORD_EXECUTION_EVENT(execution_context, context.GetNodeName(), "[CalcTilingInfo] End"); - GetContext().SetSessionId(execution_context->session_id); // update op args by tiling info block_dim_ = static_cast(tiling_info.block_dim); @@ -326,13 +381,22 @@ Status AiCoreOpTask::UpdateTilingInfo(TaskContext &context) { return SUCCESS; } if (tiling_buffer_ == nullptr) { - GELOGE(INTERNAL_ERROR, "tiling_buffer is nullptr while tiling_data is not empty!"); + GELOGE(INTERNAL_ERROR, "[Check][Buffer] %s tiling_buffer is nullptr while tiling_data is not empty!", + op_desc->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "%s tiling_buffer is nullptr while tiling_data is not empty.", + op_desc->GetName().c_str()); return INTERNAL_ERROR; } if (tiling_data_.size() > tiling_buffer_->GetSize()) { - GELOGE(INTERNAL_ERROR, "[%s] Tiling data size now (%zu) shouldn't larger than we alloc before (%zu).", - stub_name_.c_str(), tiling_data_.size(), tiling_buffer_->GetSize()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Tiling data size now (%zu)" + "shouldn't larger than we alloc before (%zu). 
op:%s op_type:%s", + stub_name_.c_str(), tiling_data_.size(), tiling_buffer_->GetSize(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Tiling data size now (%zu)" + "shouldn't larger than we alloc before (%zu). op:%s op_type:%s", + stub_name_.c_str(), tiling_data_.size(), tiling_buffer_->GetSize(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return INTERNAL_ERROR; } @@ -349,28 +413,34 @@ Status AiCoreOpTask::UpdateTilingInfo(TaskContext &context) { Status AiCoreOpTask::CalcTilingInfo(const NodePtr &node, OpRunInfo &tiling_info) { GELOGD("[%s] Start to invoke OpParaCalculate.", node->GetName().c_str()); GE_CHK_STATUS_RET(OpParaCalculate(*node, tiling_info), - "Failed calc tiling data of node %s.", + "[Invoke][OpParaCalculate]Failed calc tiling data of node %s.", node->GetName().c_str()); - if (is_single_op_) { - tiling_info.clear_atomic = false; - } GELOGD("[%s] Done invoking OpParaCalculate successfully.", node->GetName().c_str()); return SUCCESS; } Status AiCoreOpTask::UpdateArgs(TaskContext &task_context) { - size_t expected_arg_count = task_context.NumInputs() + task_context.NumOutputs() + task_context.NumWorkspaces() + size_t expected_arg_count = task_context.NumInputs() + task_context.NumOutputs() + + task_context.NumWorkspaces() - output_indices_to_skip_.size(); if (tiling_buffer_ != nullptr) { ++expected_arg_count; } if (expected_arg_count > max_arg_count_) { - GELOGE(INTERNAL_ERROR, - "[%s] Invalid arg memory, max arg count = %u, but expect = %zu", - GetName().c_str(), - max_arg_count_, - expected_arg_count); - return INTERNAL_ERROR; + GELOGD("Need to reset size of args_ from %u to %zu.", max_arg_count_, expected_arg_count); + auto length = expected_arg_count * sizeof(uintptr_t) + offset_; + std::unique_ptr new_args(new(std::nothrow) uint8_t[length]); + GE_CHECK_NOTNULL(new_args); + if (memcpy_s(new_args.get(), length, args_.get(), offset_) != EOK) { + 
GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][new_args]failed, dst length is %zu, src length is %u.", + length, offset_); + REPORT_INNER_ERROR("E19999", "update kernel args failed of %s.", task_context.GetNodeName()); + return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; + } + args_ = std::move(new_args); + max_arg_count_ = static_cast(expected_arg_count); + args_size_ = static_cast(length); + arg_base_ = reinterpret_cast(args_.get() + offset_); } int index = 0; @@ -383,7 +453,8 @@ Status AiCoreOpTask::UpdateArgs(TaskContext &task_context) { for (int i = 0; i < task_context.NumOutputs(); ++i) { const auto output = task_context.GetOutput(i); GE_CHECK_NOTNULL(output); - if (find(output_indices_to_skip_.begin(), output_indices_to_skip_.end(), i) != output_indices_to_skip_.end()) { + if (find(output_indices_to_skip_.begin(), output_indices_to_skip_.end(), i) != + output_indices_to_skip_.end()) { GELOGD("Node:%s output[%d] is an optional, the address don't need to be saved.", task_context.GetNodeName(), i); continue; @@ -415,17 +486,18 @@ Status AiCoreOpTask::LaunchKernel(rtStream_t stream) { if (handle_ != nullptr) { std::string dev_func = original_kernel_key_ + std::to_string(tiling_key_); std::string kernel_info = node_info_ + std::to_string(tiling_key_); - GELOGD("AiCoreOpTask rtKernelLaunchWithHandle Start (dev_func = %s, block_dim = %u).", dev_func.c_str(), - block_dim_); - GE_CHK_RT_RET(rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, args_.get(), args_size_, nullptr, - stream, kernel_info.c_str())); - GELOGD("AiCoreOpTask rtKernelLaunchWithHandle End (dev_func = %s, block_dim = %u).", dev_func.c_str(), - block_dim_); + GELOGD("AiCoreOpTask rtKernelLaunchWithHandle Start (dev_func = %s, block_dim = %u).", + dev_func.c_str(), block_dim_); + GE_CHK_RT_RET(rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, args_.get(), + args_size_, nullptr, stream, kernel_info.c_str())); + GELOGD("AiCoreOpTask rtKernelLaunchWithHandle End (dev_func = %s, 
block_dim = %u).", + dev_func.c_str(), block_dim_); } else { GELOGD("AiCoreOpTask LaunchKernel Start (task = %s, block_dim = %u).", stub_name_.c_str(), block_dim_); GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), args_size_, nullptr, stream)); GELOGD("AiCoreOpTask LaunchKernel End (task = %s, block_dim = %u).", stub_name_.c_str(), block_dim_); } + GELOGI("[TASK_INFO] %lu/%s", log_id_, log_name_.c_str()); return SUCCESS; } @@ -441,7 +513,8 @@ Status AiCoreOpTask::InitTilingInfo(const OpDesc &op_desc) { (void) AttrUtils::GetInt(op_desc, GetKeyForOpParamSize(), max_size); GELOGD("Got op param size by key: %s, ret = %ld", GetKeyForOpParamSize().c_str(), max_size); if (max_size < 0) { - GELOGE(PARAM_INVALID, "[%s] Invalid op_param_size: %ld.", op_desc.GetName().c_str(), max_size); + GELOGE(PARAM_INVALID, "[Check][Size][%s] Invalid op_param_size: %ld.", op_desc.GetName().c_str(), max_size); + REPORT_INNER_ERROR("E19999", "[%s] Invalid op_param_size: %ld.", op_desc.GetName().c_str(), max_size); return PARAM_INVALID; } @@ -470,6 +543,22 @@ std::string AiCoreOpTask::GetKeyForOpParamSize() const { return kAttrOpParamSize; } +std::string AiCoreOpTask::GetKeyForTbeKernel() const { + return OP_EXTATTR_NAME_TBE_KERNEL; +} + +std::string AiCoreOpTask::GetKeyForTvmMagic() const { + return TVM_ATTR_NAME_MAGIC; +} + +std::string AiCoreOpTask::GetKeyForTvmMetaData() const { + return TVM_ATTR_NAME_METADATA; +} + +std::string AiCoreOpTask::GetKeyForKernelName(const OpDesc &op_desc) const { + return op_desc.GetName() + "_kernelname"; +} + Status AtomicAddrCleanOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def) { GE_CHK_STATUS_RET_NOLOG(AiCoreOpTask::Init(op_desc, task_def)); return InitAtomicAddrCleanIndices(op_desc); @@ -483,8 +572,10 @@ Status AtomicAddrCleanOpTask::InitAtomicAddrCleanIndices(const OpDesc &op_desc) workspace_info = op_desc.TryGetExtAttr(EXT_ATTR_ATOMIC_WORKSPACE_INFO, workspace_info); if (atomic_output_indices.empty() && 
workspace_info.empty()) { GELOGE(INTERNAL_ERROR, - "[%s] Neither ATOMIC_ATTR_OUTPUT_INDEX nor EXT_ATTR_ATOMIC_WORKSPACE_INFO is empty.", + "[Check][Size][%s] ATOMIC_ATTR_OUTPUT_INDEX and EXT_ATTR_ATOMIC_WORKSPACE_INFO is empty. check invalid", op_desc.GetName().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] ATOMIC_ATTR_OUTPUT_INDEX and EXT_ATTR_ATOMIC_WORKSPACE_INFO" + "is empty. check invalid", op_desc.GetName().c_str()); return INTERNAL_ERROR; } @@ -511,11 +602,10 @@ Status AtomicAddrCleanOpTask::InitAtomicAddrCleanIndices(const OpDesc &op_desc) } if (arg_count > max_arg_count_) { - GELOGE(INTERNAL_ERROR, - "[%s] Invalid arg memory, max arg count = %u, but expect = %zu", - GetName().c_str(), - max_arg_count_, - arg_count); + GELOGE(INTERNAL_ERROR, "[Check][arg_count][%s] Invalid arg memory, max arg count = %u," + "but expect = %zu", GetName().c_str(), max_arg_count_, arg_count); + REPORT_INNER_ERROR("E19999", "[%s] Invalid arg memory, max arg count = %u, but expect = %zu", + GetName().c_str(), max_arg_count_, arg_count); return INTERNAL_ERROR; } @@ -526,10 +616,26 @@ std::string AtomicAddrCleanOpTask::GetKeyForOpParamSize() const { return kAttrAtomicOpParamSize; } +std::string AtomicAddrCleanOpTask::GetKeyForTbeKernel() const { + return EXT_ATTR_ATOMIC_TBE_KERNEL; +} + +std::string AtomicAddrCleanOpTask::GetKeyForTvmMagic() const { + return ATOMIC_ATTR_TVM_MAGIC; +} + +std::string AtomicAddrCleanOpTask::GetKeyForTvmMetaData() const { + return ATOMIC_ATTR_TVM_METADATA; +} + +std::string AtomicAddrCleanOpTask::GetKeyForKernelName(const OpDesc &op_desc) const { + return op_desc.GetName() + "_atomic_kernelname"; +} + Status AtomicAddrCleanOpTask::CalcTilingInfo(const NodePtr &node, OpRunInfo &tiling_info) { GELOGD("[%s] Start to invoke OpAtomicCalculate.", node->GetName().c_str()); GE_CHK_STATUS_RET(OpAtomicCalculate(*node, tiling_info), - "Failed calc tiling data of node %s.", + "[Invoke][OpAtomicCalculate]Failed calc tiling data of node %s.", 
node->GetName().c_str()); GELOGD("[%s] Done invoking OpAtomicCalculate successfully.", node->GetName().c_str()); return SUCCESS; diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.h b/ge/hybrid/node_executor/aicore/aicore_op_task.h index 97df2335..8d7b7f1e 100755 --- a/ge/hybrid/node_executor/aicore/aicore_op_task.h +++ b/ge/hybrid/node_executor/aicore/aicore_op_task.h @@ -81,6 +81,10 @@ class AiCoreOpTask { protected: Status UpdateTilingInfo(TaskContext &context); virtual std::string GetKeyForOpParamSize() const; + virtual std::string GetKeyForTbeKernel() const; + virtual std::string GetKeyForTvmMagic() const; + virtual std::string GetKeyForTvmMetaData() const; + virtual std::string GetKeyForKernelName(const OpDesc &op_desc) const; virtual Status CalcTilingInfo(const NodePtr &node, optiling::OpRunInfo &tiling_info); std::unique_ptr tiling_buffer_ = nullptr; @@ -110,6 +114,9 @@ class AiCoreOpTask { uint32_t tiling_key_ = 0; void *handle_ = nullptr; bool is_dynamic_ = false; + uint64_t log_id_ = 0; + std::string log_name_; + uint32_t offset_ = 0; }; class AtomicAddrCleanOpTask : public AiCoreOpTask { @@ -119,6 +126,10 @@ class AtomicAddrCleanOpTask : public AiCoreOpTask { protected: std::string GetKeyForOpParamSize() const override; + std::string GetKeyForTbeKernel() const override; + std::string GetKeyForTvmMagic() const override; + std::string GetKeyForTvmMetaData() const override; + std::string GetKeyForKernelName(const OpDesc &op_desc) const override; Status CalcTilingInfo(const NodePtr &node, optiling::OpRunInfo &tiling_info) override; private: diff --git a/ge/hybrid/node_executor/aicore/aicore_task_builder.cc b/ge/hybrid/node_executor/aicore/aicore_task_builder.cc index 966e0910..114451b3 100755 --- a/ge/hybrid/node_executor/aicore/aicore_task_builder.cc +++ b/ge/hybrid/node_executor/aicore/aicore_task_builder.cc @@ -42,10 +42,10 @@ Status AiCoreTaskBuilder::BuildTask(std::unique_ptr &node_task, bool is_single_op) { GE_CHECK_NOTNULL(op_desc_); if 
(task_defs_.size() > kNumTaskWithAtomicAddrCleanTask) { - GELOGE(INTERNAL_ERROR, - "[%s] At most 2 task was supported, but got %zu", - op_desc_->GetName().c_str(), - task_defs_.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] At most %zu task was supported, but got %zu", + op_desc_->GetName().c_str(), kNumTaskWithAtomicAddrCleanTask, task_defs_.size()); + REPORT_INNER_ERROR("E19999", "[%s] At most %zu task was supported, but got %zu, check invalid.", + op_desc_->GetName().c_str(), kNumTaskWithAtomicAddrCleanTask, task_defs_.size()); return INTERNAL_ERROR; } @@ -58,10 +58,10 @@ Status AiCoreTaskBuilder::BuildTask(std::unique_ptr &node_task, task_defs_.size()); return SUCCESS; } else { - GELOGE(INTERNAL_ERROR, - "[%s] AtomicAddrClean task was expected, but got %zu task_defs", - op_desc_->GetName().c_str(), - task_defs_.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] AtomicAddrClean task was expected:%zu, but got %zu task_defs", + op_desc_->GetName().c_str(), kNumTaskWithAtomicAddrCleanTask, task_defs_.size()); + REPORT_INNER_ERROR("E19999", "[%s] AtomicAddrClean task was expected:%zu, but got %zu task_defs,", + op_desc_->GetName().c_str(), kNumTaskWithAtomicAddrCleanTask, task_defs_.size()); return INTERNAL_ERROR; } } @@ -70,8 +70,9 @@ Status AiCoreTaskBuilder::BuildTask(std::unique_ptr &node_task, auto atomic_task = std::unique_ptr(new(std::nothrow)AtomicAddrCleanOpTask()); GE_CHECK_NOTNULL(atomic_task); + atomic_task->SetSingleOp(is_single_op); GE_CHK_STATUS_RET(atomic_task->Init(*op_desc_, task_defs_.front()), - "[%s] Failed to init task for AtomicAddrClean", + "[Invoke][AtomicAddrCleanOpTask::Init] failed for [%s].", op_desc_->GetName().c_str()); op_tasks.emplace_back(std::move(atomic_task)); } @@ -81,7 +82,7 @@ Status AiCoreTaskBuilder::BuildTask(std::unique_ptr &node_task, GE_CHECK_NOTNULL(aicore_task); aicore_task->SetSingleOp(is_single_op); GE_CHK_STATUS_RET(aicore_task->Init(*op_desc_, task_defs_.back()), - "[%s] Failed to init task for 
AtomicAddrClean", + "[Invoke][AiCoreOpTask::Init] failed for [%s].", op_desc_->GetName().c_str()); op_tasks.emplace_back(std::move(aicore_task)); diff --git a/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc b/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc index 069c8699..742b3ca2 100755 --- a/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc +++ b/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc @@ -34,7 +34,8 @@ Status AiCoreTaskCompiler::Initialize() { auto ge_lib = GELib::GetInstance(); GE_CHECK_NOTNULL(ge_lib); if (!ge_lib->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge_lib is uninitialized, failed."); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Check][State] failed, because Ge_lib is uninitialized."); + REPORT_INNER_ERROR("E19999", "Initialize failed, because Ge_lib is uninitialized."); return GE_CLI_GE_NOT_INITIALIZED; } auto &kernel_manager = ge_lib->OpsKernelManagerObj(); @@ -49,11 +50,9 @@ Status AiCoreTaskCompiler::DoCompileOp(const NodePtr &node) const { vector node_vec; node_vec.emplace_back(node); GE_CHK_STATUS_RET(aic_kernel_store_->CompileOpRun(node_vec), - "Failed to execute CompileOp, node = %s", - node->GetName().c_str()); + "[Invoke][CompileOpRun] Failed, node = %s", node->GetName().c_str()); GE_CHK_STATUS_RET(OpsKernelBuilderManager::Instance().CalcOpRunningParam(*node), - "Failed to execute CalcOpRunningParam, node = %s", - node->GetName().c_str()); + "[Invoke][CalcOpRunningParam] Failed, node = %s", node->GetName().c_str()); return SUCCESS; } @@ -102,7 +101,7 @@ Status AiCoreTaskCompiler::DoGenerateTask(const Node &node, ret = OpsKernelBuilderManager::Instance().GenerateTask(node, context, tasks); } - GE_CHK_STATUS(ret, "Failed to execute GenerateTask, node = %s", node.GetName().c_str()); + GE_CHK_STATUS(ret, "[Invoke][GenerateTask] Failed, node = %s", node.GetName().c_str()); GE_CHK_RT(rtModelUnbindStream(rt_model_, stream)); GE_CHK_RT(rtModelDestroy(rt_model_)); return ret; diff --git 
a/ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc b/ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc index 71a60f2f..b6c48157 100644 --- a/ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc +++ b/ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc @@ -29,8 +29,9 @@ constexpr int64_t kDimEndFlag = INT64_MIN; Status AicpuExtInfoHandler::Parse(const std::string &ext_info) { GELOGI("Node[%s] parse ext info start.", node_name_.c_str()); if (ext_info.empty()) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Node[%s] parse ext info failed as ext info is empty.", - node_name_.c_str()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][Param:ext_info]Node[%s] parse ext info failed as ext info is empty.", node_name_.c_str()); + REPORT_INNER_ERROR("E19999", "Node[%s] parse ext info failed as ext info is empty.", node_name_.c_str()); return ACL_ERROR_GE_PARAM_INVALID; } @@ -39,7 +40,8 @@ Status AicpuExtInfoHandler::Parse(const std::string &ext_info) { GE_CHECK_NOTNULL(ext_info_); if (memcpy_s(ext_info_.get(), ext_info_len_, ext_info.c_str(), ext_info.size()) != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[%s] Failed to coy ext info", node_name_.c_str()); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][ext_info_][%s] Failed to copy ext info", node_name_.c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Failed to copy ext info.", node_name_.c_str()); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } @@ -53,22 +55,22 @@ Status AicpuExtInfoHandler::Parse(const std::string &ext_info) { GELOGD("Ext infoType=%d, infoLen=%u.", aicpu_ext_info->infoType, aicpu_ext_info->infoLen); switch (aicpu_ext_info->infoType) { case aicpu::FWKAdapter::FWK_ADPT_EXT_SHAPE_TYPE: - GE_CHK_STATUS_RET(ParseExtShapeType(aicpu_ext_info), "Parse ext shape type failed."); + GE_CHK_STATUS_RET(ParseExtShapeType(aicpu_ext_info), "[Parse][ExtShapeType] failed."); break; case aicpu::FWKAdapter::FWK_ADPT_EXT_INPUT_SHAPE: - GE_CHK_STATUS_RET(ParseExtInputShape(aicpu_ext_info), "Parse ext input shape failed."); + 
GE_CHK_STATUS_RET(ParseExtInputShape(aicpu_ext_info), "[Parse][ExtInputShape] failed."); break; case aicpu::FWKAdapter::FWK_ADPT_EXT_OUTPUT_SHAPE: - GE_CHK_STATUS_RET(ParseExtOutputShape(aicpu_ext_info), "Parse ext output shape failed."); + GE_CHK_STATUS_RET(ParseExtOutputShape(aicpu_ext_info), "[Parse][ExtOutputShape] failed."); break; case aicpu::FWKAdapter::FWK_ADPT_EXT_SESSION_INFO: - GE_CHK_STATUS_RET(ParseExtSessionInfo(aicpu_ext_info), "Parse ext session info failed."); + GE_CHK_STATUS_RET(ParseExtSessionInfo(aicpu_ext_info), "[Parse][ExtSessionInfo] failed."); break; case aicpu::FWKAdapter::FWK_ADPT_EXT_BITMAP: - GE_CHK_STATUS_RET(ParseExtBitMap(aicpu_ext_info), "Parse ext bit map failed."); + GE_CHK_STATUS_RET(ParseExtBitMap(aicpu_ext_info), "[Parse][ExtBitMap] failed."); break; case aicpu::FWKAdapter::FWK_ADPT_EXT_UPDATE_ADDR: - GE_CHK_STATUS_RET(ParseExtUpdateAddr(aicpu_ext_info), "Parse ext update_addr failed."); + GE_CHK_STATUS_RET(ParseExtUpdateAddr(aicpu_ext_info), "[Parse][ExtUpdateAddr] failed."); break; default: GELOGD("Node[%s] ignore infoType=%d, infoLen=%u.", @@ -79,33 +81,51 @@ Status AicpuExtInfoHandler::Parse(const std::string &ext_info) { offset += aicpu_ext_info->infoLen; } - GE_CHK_BOOL_RET_STATUS(offset == ext_info_len_, ACL_ERROR_GE_PARAM_INVALID, - "Node[%s] ext_info format error, parse not reach end, offset=%zu, ext_info_len=%zu.", + GE_IF_BOOL_EXEC(offset != ext_info_len_, + REPORT_INNER_ERROR("E19999", "Node[%s] ext_info format error, parse not reach end," + "offset=%zu, ext_info_len=%zu.", node_name_.c_str(), offset, ext_info_len_); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]Node[%s] ext_info format error," + "parse not reach end, offset=%zu, ext_info_len=%zu.", node_name_.c_str(), offset, ext_info_len_); + return ACL_ERROR_GE_PARAM_INVALID;); GELOGI("Node[%s] parse ext info end.", node_name_.c_str()); return SUCCESS; } Status AicpuExtInfoHandler::ParseExtShapeType(AicpuExtInfo *aicpu_ext_info) { - 
GE_CHK_BOOL_RET_STATUS(aicpu_ext_info->infoLen == sizeof(int32_t), ACL_ERROR_GE_PARAM_INVALID, - "Node[%s] parse ext shape type failed as infoLen must be %zu but %u.", + GE_IF_BOOL_EXEC(aicpu_ext_info->infoLen != sizeof(int32_t), + REPORT_INNER_ERROR("E19999", "Node[%s] parse ext shape type failed as infoLen must be %zu but %u.", + node_name_.c_str(), sizeof(int32_t), aicpu_ext_info->infoLen); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][Size]Node[%s] parse ext shape type failed as infoLen must be %zu but %u.", node_name_.c_str(), sizeof(int32_t), aicpu_ext_info->infoLen); + return ACL_ERROR_GE_PARAM_INVALID;); auto type = reinterpret_cast(aicpu_ext_info->infoMsg); - GE_CHK_BOOL_RET_STATUS(*type == unknown_type_, ACL_ERROR_GE_PARAM_INVALID, - "Node[%s] parse ext shape type failed as need %d but %d.", + GE_IF_BOOL_EXEC(*type != unknown_type_, + REPORT_INNER_ERROR("E19999", "Node[%s] parse ext shape type failed as need %d but %d.", + node_name_.c_str(), unknown_type_, *type); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][Type]Node[%s] parse ext shape type failed as need %d but %d.", node_name_.c_str(), unknown_type_, *type); + return ACL_ERROR_GE_PARAM_INVALID;); GELOGI("Node[%s] parse ext shape type success infoLen=%u.", node_name_.c_str(), aicpu_ext_info->infoLen); return SUCCESS; } Status AicpuExtInfoHandler::ParseExtInputShape(AicpuExtInfo *aicpu_ext_info) { auto need_len = input_num_ * sizeof(AicpuShapeAndType); - GE_CHK_BOOL_RET_STATUS(aicpu_ext_info->infoLen == need_len, ACL_ERROR_GE_PARAM_INVALID, - "Node[%s] parse ext input shape failed as infoLen must be " + GE_IF_BOOL_EXEC(aicpu_ext_info->infoLen != need_len, + REPORT_INNER_ERROR("E19999", "Node[%s] parse ext input shape failed as infoLen must be " + "input_num[%u]*sizeof(ShapeAndType)[%zu] but %u.", + node_name_.c_str(), input_num_, sizeof(AicpuShapeAndType), + aicpu_ext_info->infoLen); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][DataLen]Node[%s] parse ext input shape failed as infoLen must be " 
"input_num[%u]*sizeof(ShapeAndType)[%zu] but %u.", node_name_.c_str(), input_num_, sizeof(AicpuShapeAndType), aicpu_ext_info->infoLen); + return ACL_ERROR_GE_PARAM_INVALID;); auto input = reinterpret_cast(aicpu_ext_info->infoMsg); @@ -123,10 +143,16 @@ Status AicpuExtInfoHandler::ParseExtOutputShape(AicpuExtInfo *aicpu_ext_info) { return SUCCESS; } auto need_len = output_num_ * sizeof(AicpuShapeAndType); - GE_CHK_BOOL_RET_STATUS(aicpu_ext_info->infoLen == need_len, ACL_ERROR_GE_PARAM_INVALID, - "Node[%s] parse ext output shape failed as infoLen must be " + GE_IF_BOOL_EXEC(aicpu_ext_info->infoLen != need_len, + REPORT_INNER_ERROR("E19999", "Node[%s] parse ext output shape failed as infoLen must be " + "output_num[%u]*sizeof(ShapeAndType)[%zu] but %u.", + node_name_.c_str(), output_num_, sizeof(AicpuShapeAndType), + aicpu_ext_info->infoLen); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][DataLen]Node[%s] parse ext output shape failed as infoLen must be " "output_num[%u]*sizeof(ShapeAndType)[%zu] but %u.", node_name_.c_str(), output_num_, sizeof(AicpuShapeAndType), aicpu_ext_info->infoLen); + return ACL_ERROR_GE_PARAM_INVALID;); auto output = reinterpret_cast(aicpu_ext_info->infoMsg); for (uint32_t index = 0; index < output_num_; ++index) { @@ -137,9 +163,14 @@ Status AicpuExtInfoHandler::ParseExtOutputShape(AicpuExtInfo *aicpu_ext_info) { } Status AicpuExtInfoHandler::ParseExtSessionInfo(AicpuExtInfo *aicpu_ext_info) { - GE_CHK_BOOL_RET_STATUS(aicpu_ext_info->infoLen == sizeof(AicpuSessionInfo), ACL_ERROR_GE_PARAM_INVALID, - "Node[%s] parse ext session info failed as infoLen must be %zu but %u.", + GE_IF_BOOL_EXEC(aicpu_ext_info->infoLen != sizeof(AicpuSessionInfo), + REPORT_INNER_ERROR("E19999", + "Node[%s] parse ext session info failed as infoLen must be %zu but %u.", + node_name_.c_str(), sizeof(SessionInfo), aicpu_ext_info->infoLen); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][DataLen]Node[%s] parse ext session info failed as infoLen must be %zu but %u.", 
node_name_.c_str(), sizeof(SessionInfo), aicpu_ext_info->infoLen); + return ACL_ERROR_GE_PARAM_INVALID;); session_info_ = reinterpret_cast(aicpu_ext_info->infoMsg); GELOGI("Node[%s] parse session info success infoLen=%u.", node_name_.c_str(), aicpu_ext_info->infoLen); @@ -147,9 +178,14 @@ Status AicpuExtInfoHandler::ParseExtSessionInfo(AicpuExtInfo *aicpu_ext_info) { } Status AicpuExtInfoHandler::ParseExtBitMap(AicpuExtInfo *aicpu_ext_info) { - GE_CHK_BOOL_RET_STATUS(aicpu_ext_info->infoLen == sizeof(uint64_t), PARAM_INVALID, - "Node[%s] parse bit_map info failed as infoLen must be %zu but %u.", + GE_IF_BOOL_EXEC(aicpu_ext_info->infoLen != sizeof(uint64_t), + REPORT_INNER_ERROR("E19999", + "Node[%s] parse bit_map info failed as infoLen must be %zu but %u.", + node_name_.c_str(), sizeof(uint64_t), aicpu_ext_info->infoLen); + GELOGE(PARAM_INVALID, + "[Check][DataLen]Node[%s] parse bit_map info failed as infoLen must be %zu but %u.", node_name_.c_str(), sizeof(uint64_t), aicpu_ext_info->infoLen); + return PARAM_INVALID;); bit_map_ = reinterpret_cast(aicpu_ext_info->infoMsg); GELOGI("Node[%s] bit_map info success infoLen=%u.", node_name_.c_str(), aicpu_ext_info->infoLen); @@ -157,9 +193,14 @@ Status AicpuExtInfoHandler::ParseExtBitMap(AicpuExtInfo *aicpu_ext_info) { } Status AicpuExtInfoHandler::ParseExtUpdateAddr(AicpuExtInfo *aicpu_ext_info) { - GE_CHK_BOOL_RET_STATUS(aicpu_ext_info->infoLen == sizeof(uint32_t), PARAM_INVALID, - "Node[%s] parse update_addr info failed as infoLen must be %zu but %u.", + GE_IF_BOOL_EXEC(aicpu_ext_info->infoLen != sizeof(uint32_t), + REPORT_INNER_ERROR("E19999", + "Node[%s] parse update_addr info failed as infoLen must be %zu but %u.", + node_name_.c_str(), sizeof(uint32_t), aicpu_ext_info->infoLen); + GELOGE(PARAM_INVALID, + "[Check][DataLen]Node[%s] parse update_addr info failed as infoLen must be %zu but %u.", node_name_.c_str(), sizeof(uint32_t), aicpu_ext_info->infoLen); + return PARAM_INVALID;); update_addr_ = 
reinterpret_cast(aicpu_ext_info->infoMsg); GELOGI("Node[%s] update_addr info success infoLen=%u.", node_name_.c_str(), aicpu_ext_info->infoLen); @@ -207,15 +248,19 @@ Status AicpuExtInfoHandler::UpdateInputShapeAndType(uint32_t input_index, const const auto &shape = input_desc.GetShape(); GE_CHK_STATUS_RET(UpdateShapeAndType(shape, input_desc.GetDataType(), input_shape_and_type_[input_index]), - "Node[%s] input[%u] update input shape and type failed.", + "[Update][ShapeAndType] failed, Node[%s] input[%u] .", node_name_.c_str(), input_index); return SUCCESS; } Status AicpuExtInfoHandler::UpdateOutputShapeAndType(uint32_t output_index, const GeTensorDesc &output_desc) { - GE_CHK_BOOL_RET_STATUS((unknown_type_ != DEPEND_COMPUTE), ACL_ERROR_GE_INTERNAL_ERROR, - "Node[%s] is depend compute is no need update output shape and type by ext.", + GE_IF_BOOL_EXEC((unknown_type_ == DEPEND_COMPUTE), + REPORT_INNER_ERROR("E19999", "Node[%s] is depend compute is no need update output shape" + "and type by ext.", node_name_.c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, + "[Check][Type]Node[%s] is depend compute is no need update output shape and type by ext.", node_name_.c_str()); + return ACL_ERROR_GE_INTERNAL_ERROR;); GE_CHECK_LE(output_index, output_num_); auto shape = output_desc.GetShape(); @@ -223,9 +268,13 @@ Status AicpuExtInfoHandler::UpdateOutputShapeAndType(uint32_t output_index, cons if (unknown_type_ == DEPEND_SHAPE_RANGE) { std::vector> range; auto range_ret = output_desc.GetShapeRange(range); - GE_CHK_BOOL_RET_STATUS(range_ret == GRAPH_SUCCESS, ACL_ERROR_GE_INTERNAL_ERROR, - "Node[%s] is shape range type but get GetShapeRange failed, ret=%u.", + GE_IF_BOOL_EXEC(range_ret != GRAPH_SUCCESS, + REPORT_INNER_ERROR("E19999", "Node[%s] is shape range type but get GetShapeRange failed, ret=%u", + node_name_.c_str(), range_ret); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, + "[Invoke][GetShapeRange]Node[%s] is shape range type but get GetShapeRange failed, ret=%u", 
node_name_.c_str(), range_ret); + return ACL_ERROR_GE_INTERNAL_ERROR;); for (size_t k = 0; k < range.size(); ++k) { if (shape.GetDim(k) < 0 && k < range.size()) { GELOGD("Node[%s] output[%u] update dim[%zu] from %ld to range max %ld.", @@ -239,9 +288,14 @@ Status AicpuExtInfoHandler::UpdateOutputShapeAndType(uint32_t output_index, cons } Status AicpuExtInfoHandler::GetOutputShapeAndType(uint32_t output_index, GeShape &shape, DataType &data_type) { - GE_CHK_BOOL_RET_STATUS((unknown_type_ != DEPEND_COMPUTE), INTERNAL_ERROR, - "Node[%s] is depend compute type can not get output shape and type by ext.", + GE_IF_BOOL_EXEC((unknown_type_ == DEPEND_COMPUTE), + REPORT_INNER_ERROR("E19999", + "Node[%s] is depend compute type can not get output shape and type by ext.", + node_name_.c_str()); + GELOGE(INTERNAL_ERROR, + "[Check][Type]Node[%s] is depend compute type can not get output shape and type by ext.", node_name_.c_str()); + return INTERNAL_ERROR;); GetShapeAndType(output_shape_and_type_[output_index], shape, data_type); return SUCCESS; } @@ -254,8 +308,11 @@ Status AicpuExtInfoHandler::UpdateShapeAndType(const GeShape &shape, DataType da AicpuShapeAndType *shape_and_type) { auto dim_num = shape.GetDimNum(); if (dim_num > aicpu::FWKAdapter::kMaxShapeDims) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Update shape and type failed, as dim_num %zu is over max shape dims %u.", + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][DimNum]Update shape and type failed, as dim_num %zu is over max shape dims %u.", dim_num, aicpu::FWKAdapter::kMaxShapeDims); + REPORT_INNER_ERROR("E19999", "Update shape and type failed, as dim_num %zu is over max shape dims %u.", + dim_num, aicpu::FWKAdapter::kMaxShapeDims); return ACL_ERROR_GE_PARAM_INVALID; } size_t index = 0; diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc index 1e2fbfe8..c2ebf654 100755 --- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc +++ 
b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc @@ -45,7 +45,9 @@ Status AicpuNodeTaskBase::InitExtInfo(const std::string &kernel_ext_info, int64_ if (kernel_ext_info.empty()) { if (node_item_->is_dynamic) { // dynamic node must have ext info - GELOGE(PARAM_INVALID, "Node[%s] parse ext info failed as ext info is empty.", node_name_.c_str()); + REPORT_INNER_ERROR("E19999", "Node[%s] parse ext info failed as ext info is empty.", node_name_.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param:kernel_ext_info]Node[%s] parse ext info failed as ext info is empty.", + node_name_.c_str()); return PARAM_INVALID; } else { // if no ext info no need copy to device. @@ -56,18 +58,19 @@ Status AicpuNodeTaskBase::InitExtInfo(const std::string &kernel_ext_info, int64_ } GE_CHK_STATUS_RET(aicpu_ext_handle_.Parse(kernel_ext_info), - "Node[%s] parse kernel ext info failed, kernel_ext_info_size=%zu.", + "[Invoke][Parse]Node[%s] parse kernel ext info failed, kernel_ext_info_size=%zu.", node_name_.c_str(), kernel_ext_info.size()); - GELOGD("To update aicpu_task ext_info session_info session_id to %lu", session_id); + GELOGD("To update aicpu_task ext_info session_info session_id to %ld", session_id); GE_CHK_STATUS_RET(aicpu_ext_handle_.UpdateSessionInfoSessionId(session_id), - "UpdateSessionInfoSessionId failed."); + "[Update][SessionInfoSessionId] failed, session_id:%ld.", session_id); bool execute_mode = !aicpu_ext_handle_.IsNeedRefreshIOAddr() && !node_item_->is_dynamic; - GE_CHK_STATUS_RET(aicpu_ext_handle_.UpdateExecuteMode(execute_mode), "UpdateExecuteMode failed."); + GE_CHK_STATUS_RET(aicpu_ext_handle_.UpdateExecuteMode(execute_mode), + "[Update][ExecuteMode] failed, node:%s.", node_name_.c_str()); // copy task args buf GE_CHK_STATUS_RET(AllocTensorBuffer(aicpu_ext_handle_.GetExtInfoLen(), ext_info_addr_dev_), - "Node[%s] alloc kernel_ext_info buf failed, size=%zu", + "[Invoke][AllocTensorBuffer]Node[%s] alloc kernel_ext_info buf failed, size=%zu", node_name_.c_str(), 
aicpu_ext_handle_.GetExtInfoLen()); // copy default ext info to device @@ -96,7 +99,7 @@ Status AicpuNodeTaskBase::UpdateOutputShapeFromExtInfo(TaskContext &task_context DataType data_type; aicpu_ext_handle_.GetOutputShapeAndType(i, shape, data_type); GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(task_context, shape, i), - "Update node %s [%d]th output shape failed.", + "[Invoke][UpdateShapeToOutputDesc]Update node %s [%d]th output shape failed.", node_name_.c_str(), i); } return SUCCESS; @@ -123,11 +126,11 @@ Status AicpuNodeTaskBase::UpdateShapeToOutputDesc(TaskContext &task_context, auto trans_ret = formats::TransShape(format, shape_new.GetDims(), output_desc->GetDataType(), origin_format, origin_dims_new); GE_CHK_STATUS_RET(trans_ret, - "Node[%s] out[%d] originFormat[%d] is not same as format[%d], but TransShape failed, shape=%s.", + "[Trans][Shape] failed for Node[%s] out[%d] originFormat[%d] is not same as format[%d], shape=%s.", node_name_.c_str(), output_index, origin_format, format, shape_new.ToString().c_str()); auto origin_shape_new = GeShape(origin_dims_new); GE_CHK_STATUS_RET(task_context.GetNodeState()->UpdateOutputShapes(output_index, shape_new, origin_shape_new), - "Node[%s] failed to update update shape, index = %d", node_name_.c_str(), output_index); + "[Update][OutputShapes] failed for Node[%s], index = %d", node_name_.c_str(), output_index); GELOGD("Node[%s] out[%d] originFormat[%d] is not same as format[%d], need update from %s ro %s.", node_name_.c_str(), output_index, origin_format, format, origin_shape_old.ToString().c_str(), origin_shape_new.ToString().c_str()); @@ -145,8 +148,7 @@ Status AicpuNodeTaskBase::UpdateExtInfo() { auto input_desc = node_item_->MutableInputDesc(i); GE_CHECK_NOTNULL(input_desc); GE_CHK_STATUS_RET(aicpu_ext_handle_.UpdateInputShapeAndType(i, *input_desc), - "Node[%s] input[%d] update input shape failed.", - node_name_.c_str(), i); + "[Update][InputShapeAndType] failed for Node[%s] input[%d].", node_name_.c_str(), i); } 
if (unknown_type_ != DEPEND_COMPUTE) { @@ -155,8 +157,7 @@ Status AicpuNodeTaskBase::UpdateExtInfo() { GE_CHECK_NOTNULL(output_desc); GE_CHK_STATUS_RET(aicpu_ext_handle_.UpdateOutputShapeAndType(j, *output_desc), - "Node[%s] output[%d] UpdateOutputShapeAndType failed.", - node_name_.c_str(), j); + "[Update][OutputShapeAndType] failed for Node[%s] output[%d].", node_name_.c_str(), j); } } @@ -179,13 +180,13 @@ Status AicpuNodeTaskBase::UpdateArgs(TaskContext &context) { return SUCCESS; } - GE_CHK_STATUS_RET(UpdateIoAddr(context), "Node[%s] update io addr failed.", node_name_.c_str()); + GE_CHK_STATUS_RET(UpdateIoAddr(context), "[Update][IoAddr] failed for Node[%s].", node_name_.c_str()); bool all_shape = false; const OpDescPtr op_desc = node_item_->GetOpDesc(); (void)AttrUtils::GetBool(op_desc, kAicpuAllshape, all_shape); if (node_item_->is_dynamic || all_shape) { // dynamic node and all_shape kernel need update ext info. - GE_CHK_STATUS_RET(UpdateExtInfo(), "Node[%s] update ext info failed.", node_name_.c_str()); + GE_CHK_STATUS_RET(UpdateExtInfo(), "[Update][ExtInfo] failed for Node[%s].", node_name_.c_str()); } GELOGD("Node[%s] update args end.", node_name_.c_str()); @@ -196,16 +197,19 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::functionnum_outputs; ++i) { GE_CHK_STATUS_RET(AllocTensorBuffer(result_summary_size, output_summary_[i]), - "Node[%s] alloc buffer for result summary info failed, size=%zu.", + "[Alloc][TensorBuffer] failed for Node[%s] to copy result summary info, size=%zu.", node_name_.c_str(), result_summary_size); } output_summary_host_.resize(node_item_->num_outputs); @@ -250,21 +254,21 @@ Status AicpuTfNodeTask::InitForDependComputeTask() { // copy task need copy output_data and output_shape, max len is 2 * output_num const size_t copy_input_buf_len = node_item_->num_outputs * 2 * sizeof(uint64_t); GE_CHK_STATUS_RET(AllocTensorBuffer(copy_input_buf_len, copy_input_release_flag_dev_), - "Node[%s] alloc copy task input 
release_flag failed, size=%zu", + "[Alloc][TensorBuffer] failed for Node[%s] to copy task input release_flag, size=%zu", node_name_.c_str(), copy_input_buf_len); GE_CHK_STATUS_RET(AllocTensorBuffer(copy_input_buf_len, copy_input_data_size_dev_), - "Node[%s] alloc copy task input data_size failed, size=%zu", + "[Alloc][TensorBuffer] failed for Node[%s] to copy task input data_size, size=%zu", node_name_.c_str(), copy_input_buf_len); GE_CHK_STATUS_RET(AllocTensorBuffer(copy_input_buf_len, copy_input_src_dev_), - "Node[%s] alloc copy task input src failed, size=%zu", + "[Alloc][TensorBuffer] failed for Node[%s] to copy task input src, size=%zu", node_name_.c_str(), copy_input_buf_len); GE_CHK_STATUS_RET(AllocTensorBuffer(copy_input_buf_len, copy_input_dst_dev_), - "Node[%s] alloc copy task input dst failed, size=%zu", + "[Alloc][TensorBuffer] failed for Node[%s] to copy task input dst, size=%zu", node_name_.c_str(), copy_input_buf_len); // copy task args buf GE_CHK_STATUS_RET(AllocTensorBuffer(sizeof(STR_FWK_OP_KERNEL), copy_task_args_buf_), - "Node[%s] alloc copy task args buf failed, size=%zu", + "[Alloc][TensorBuffer] failed for Node[%s] to copy task args, size=%zu", node_name_.c_str(), sizeof(STR_FWK_OP_KERNEL)); std::vector copy_io_addr; @@ -278,7 +282,7 @@ Status AicpuTfNodeTask::InitForDependComputeTask() { // can alloc in init, it can reuse GE_CHK_STATUS_RET(AllocTensorBuffer(copy_io_addr_size, copy_ioaddr_dev_), - "Node[%s] alloc copy task io buf failed, size=%zu", + "[Alloc][TensorBuffer] failed for Node[%s] to copy task ioaddr, size=%zu", node_name_.c_str(), copy_io_addr_size); GE_CHK_RT_RET(rtMemcpy(copy_ioaddr_dev_->GetData(), copy_io_addr_size, @@ -289,14 +293,17 @@ Status AicpuTfNodeTask::InitForDependComputeTask() { Status AicpuTfNodeTask::Init(const HybridModel &model) { GELOGI("Node[%s] init start.", node_name_.c_str()); - GE_CHK_BOOL_RET_STATUS(task_def_.has_kernel_ex(), FAILED, - "Node[%s] is tf node but task def does not has kernel ex.", + 
GE_IF_BOOL_EXEC(!task_def_.has_kernel_ex(), + REPORT_INNER_ERROR("E19999", "[Check][TaskDef]Node[%s] is tf node" + "but task def does not has kernel ex.", node_name_.c_str()); + GELOGE(FAILED, "[Check][TaskDef]Node[%s] is tf node but task def does not has kernel ex.", node_name_.c_str()); + return FAILED;); auto &kernel_ex_def = task_def_.kernel_ex(); auto kernel_workspace_size = kernel_ex_def.task_info().size(); GE_CHK_STATUS_RET(AllocTensorBuffer(kernel_workspace_size, kernel_workspace_), - "Node[%s] alloc buffer for kernel workspace failed, size=%zu.", + "[Alloc][TensorBuffer] failed for Node[%s] to copy kernel workspace, size=%zu.", node_name_.c_str(), kernel_workspace_size); GE_CHK_RT_RET(rtMemcpy(kernel_workspace_->GetData(), kernel_workspace_size, @@ -306,30 +313,38 @@ Status AicpuTfNodeTask::Init(const HybridModel &model) { auto input_output_size = (node_item_->num_inputs + node_item_->num_outputs) * sizeof(uint64_t); // alloc input output addr buf, allow alloc size 0 GE_CHK_STATUS_RET(AllocTensorBuffer(input_output_size, input_output_addr_), - "Node[%s] alloc buffer for io addr failed, size=%zu.", + "[Alloc][TensorBuffer] for Node[%s] to copy io addr, size=%zu.", node_name_.c_str(), input_output_size); auto &kernel_ext_info = kernel_ex_def.kernel_ext_info(); auto kernel_ext_info_size = kernel_ex_def.kernel_ext_info_size(); - GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED, - "Node[%s] task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", + GE_IF_BOOL_EXEC(kernel_ext_info.size() != kernel_ext_info_size, + REPORT_INNER_ERROR("E19999", "[Check][Size]Node[%s] task def kernel_ext_info.size=%zu," + "but kernel_ext_info_size=%u.", + node_name_.c_str(), kernel_ext_info.size(), kernel_ext_info_size); + GELOGE(FAILED, "[Check][Size]Node[%s] task def kernel_ext_info.size=%zu," + "but kernel_ext_info_size=%u.", node_name_.c_str(), kernel_ext_info.size(), kernel_ext_info_size); + return FAILED;); // init ext info uint64_t 
ext_session_id = model.GetSessionId(); - GE_CHK_STATUS_RET(InitExtInfo(kernel_ext_info, ext_session_id), "Node[%s] init ext info failed.", node_name_.c_str()); - GE_CHK_STATUS_RET(InitForDependComputeTask(), "Node[%s] init for depend compute task failed.", node_name_.c_str()); + GE_CHK_STATUS_RET(InitExtInfo(kernel_ext_info, ext_session_id), "[Init][ExtInfo] failed for Node[%s].", + node_name_.c_str()); + GE_CHK_STATUS_RET(InitForDependComputeTask(), "[Init][DependComputeTask] failed for Node[%s].", node_name_.c_str()); // build fwk_op_kernel. - GE_CHK_BOOL_RET_STATUS(sizeof(STR_FWK_OP_KERNEL) >= kernel_ex_def.args_size(), FAILED, - "Node[%s] sizeof STR_FWK_OP_KERNEL is: %zu, but args_size is: %u", + GE_IF_BOOL_EXEC(sizeof(STR_FWK_OP_KERNEL) < kernel_ex_def.args_size(), + REPORT_INNER_ERROR("E19999", "Node[%s] sizeof STR_FWK_OP_KERNEL is: %zu, but args_size is: %u", + node_name_.c_str(), sizeof(STR_FWK_OP_KERNEL), kernel_ex_def.args_size()); + GELOGE(FAILED, "[Check][Size]Node[%s] sizeof STR_FWK_OP_KERNEL is: %zu, but args_size is: %u", node_name_.c_str(), sizeof(STR_FWK_OP_KERNEL), kernel_ex_def.args_size()); - + return FAILED;); STR_FWK_OP_KERNEL fwk_op_kernel = {0}; errno_t sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), kernel_ex_def.args().data(), kernel_ex_def.args_size()); GE_CHK_BOOL_RET_STATUS(sec_ret == EOK, INTERNAL_ERROR, - "Node[%s] memcpy fwk_op_kernel failed, ret: %d.", node_name_.c_str(), sec_ret); + "[Update][fwk_op_kernel] failed for Node[%s], ret: %d.", node_name_.c_str(), sec_ret); fwk_op_kernel.fwkKernelBase.fwk_kernel.workspaceBaseAddr = reinterpret_cast(kernel_workspace_->GetData()); fwk_op_kernel.fwkKernelBase.fwk_kernel.inputOutputAddr = reinterpret_cast(input_output_addr_->GetData()); @@ -343,12 +358,13 @@ Status AicpuTfNodeTask::Init(const HybridModel &model) { fwk_op_kernel.fwkKernelBase.fwk_kernel.stepIDAddr = GetStepIdAddr(model); auto session_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID; - 
GE_CHK_STATUS_RET(EnsureSessionCreated(session_id), "Node[%s] create session id %lu failed.", + GE_CHK_STATUS_RET(EnsureSessionCreated(session_id), + "[Invoke][EnsureSessionCreated]Node[%s] create session id %lu failed.", node_name_.c_str(), session_id); // alloc kernel_buf_ and copy to device. GE_CHK_STATUS_RET(AllocTensorBuffer(sizeof(STR_FWK_OP_KERNEL), kernel_buf_), - "Node[%s] alloc buffer for kernel buf failed, size=%zu.", + "[Alloc][TensorBuffer] for Node[%s] to copy kernel_buf, size=%zu.", node_name_.c_str(), sizeof(STR_FWK_OP_KERNEL)); GE_CHK_RT_RET(rtMemcpy(kernel_buf_->GetData(), sizeof(STR_FWK_OP_KERNEL), @@ -378,20 +394,23 @@ Status AicpuTfNodeTask::SetMemCopyTask(const domi::TaskDef &task_def) { GELOGD("Start to set memcpy task for node[%s].", node_name_.c_str()); const domi::KernelExDef &kernel_def = task_def.kernel_ex(); if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) { - GELOGE(PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", + GELOGE(PARAM_INVALID, "[Check][Size]sizeof STR_FWK_OP_KERNEL is:%lu, but args_size:%d is bigger", sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size()); + REPORT_INNER_ERROR("E19999", "sizeof STR_FWK_OP_KERNEL is:%lu, but args_size:%d is bigger.", + sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size()); return PARAM_INVALID; } STR_FWK_OP_KERNEL aicpu_task = {0}; auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL), kernel_def.args().data(), kernel_def.args_size()); if (sec_ret != EOK) { - GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); + GELOGE(FAILED, "[Update][aicpu_task] failed, ret: %d", sec_ret); + REPORT_CALL_ERROR("E19999", "update aicpu_task failed, ret: %d.", sec_ret); return FAILED; } GE_CHK_STATUS_RET(AllocTensorBuffer(kernel_def.task_info_size(), copy_workspace_buf_), - "Node[%s] alloc copy task workspace buf failed, size=%u.", + "[Alloc][TensorBuffer] for Node[%s] to copy task workspace buf, size=%u.", node_name_.c_str(), kernel_def.task_info_size()); 
GE_CHK_RT_RET(rtMemcpy(copy_workspace_buf_->GetData(), kernel_def.task_info_size(), @@ -422,7 +441,7 @@ Status AicpuTfNodeTask::EnsureSessionCreated(uint64_t session_id) { auto model_manager = ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); GE_CHK_STATUS_RET(model_manager->CreateAicpuSession(session_id), - "Create aicpu session %lu failed", session_id); + "[Create][AicpuSession] failed, session_id:%lu", session_id); return SUCCESS; } @@ -437,15 +456,15 @@ Status AicpuTfNodeTask::ReadResultSummaryAndPrepareMemory(TaskContext &context, auto raw_data_size = result_summary.raw_data_size; std::unique_ptr tensor_buffer; GE_CHK_STATUS_RET(AllocTensorBuffer(raw_data_size, tensor_buffer), - "Node[%s] out[%d] alloc tensor buffer failed, raw_data_size=%lu", + "[Alloc][TensorBuffer] failed for Node[%s] out[%d] to copy tensor buffer, raw_data_size:%lu", node_name_.c_str(), i, raw_data_size); auto status = context.SetOutput(i, TensorValue(std::shared_ptr(tensor_buffer.release()))); - GE_CHK_STATUS_RET(status, "Node[%s] set output %d failed.", node_name_.c_str(), i); + GE_CHK_STATUS_RET(status, "[Set][Output] failed for Node[%s], output:%d.", node_name_.c_str(), i); auto shape_data_size = result_summary.shape_data_size; std::unique_ptr shape_buffer; GE_CHK_STATUS_RET(AllocTensorBuffer(shape_data_size, shape_buffer), - "Node[%s] out[%d] alloc shape buffer failed, shape_data_size=%lu", + "[Alloc][TensorBuffer] failed for Node[%s] out[%d] to copy shape buffer, shape_data_size:%lu", node_name_.c_str(), i, shape_data_size); out_shape_hbm.emplace_back(std::move(shape_buffer)); } @@ -456,7 +475,7 @@ Status AicpuTfNodeTask::CopyDataToHbm(TaskContext &context, const std::vector> &out_shape_hbm) { GE_CHK_BOOL_RET_STATUS(out_shape_hbm.size() == static_cast(node_item_->num_outputs), INTERNAL_ERROR, - "Node[%s] has %d outputs but out shape is %zu.", + "[Check][Size]Node[%s] has %d outputs but out shape is %zu not equal.", node_name_.c_str(), node_item_->num_outputs, 
out_shape_hbm.size()); GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(context, out_shape_hbm)); @@ -525,7 +544,7 @@ Status AicpuTfNodeTask::UpdateShapeByHbmBuffer(TaskContext &context, if (result_summary.shape_data_size > 0) { const auto &shape_hbm = out_shape_hbm[i]; GE_CHK_BOOL_RET_STATUS((result_summary.shape_data_size % sizeof(int64_t) == 0), INTERNAL_ERROR, - "Node[%s] [%d]th output shape data size is %lu is not divided by int64_t.", + "[Check][Size]Node[%s] [%d]th output shape data size is %lu is not divided by int64_t.", node_name_.c_str(), i, result_summary.shape_data_size); uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t); GELOGD("Node[%s] [%d]th output dim num=%u.", node_name_.c_str(), i, dim_num); @@ -539,7 +558,7 @@ Status AicpuTfNodeTask::UpdateShapeByHbmBuffer(TaskContext &context, } } GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(context, GeShape(shape_dims), i), - "Node[%s] update [%d]th output shape failed.", + "[Invoke][UpdateShapeToOutputDesc]Node[%s] update [%d]th output shape failed.", node_name_.c_str(), i); } return SUCCESS; @@ -550,20 +569,20 @@ Status AicpuTfNodeTask::UpdateShapeAndDataByResultSummary(TaskContext &context) std::vector> out_shape_hbm; GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(context, out_shape_hbm), - "Node[%s] read ResultSummary and update output shape failed.", + "[Invoke][ReadResultSummaryAndPrepareMemory] failed for Node[%s].", node_name_.c_str()); RECORD_CALLBACK_EVENT(context.GetExecutionContext(), node_name_.c_str(), "[ReadResultSummaryAndPrepareMemory] End"); GE_CHK_STATUS_RET(CopyDataToHbm(context, out_shape_hbm), - "Node[%s] copy data to output failed.", + "[Invoke][CopyDataToHbm] failed for Node[%s] copy data to output.", node_name_.c_str()); RECORD_CALLBACK_EVENT(context.GetExecutionContext(), node_name_.c_str(), "[CopyDataToHbm] End"); GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(context, out_shape_hbm), - "Node[%s] update shape by hbm buffer failed.", + "[Update][ShapeByHbmBuffer] failed for 
Node[%s].", node_name_.c_str()); GELOGD("Node[%s] update shape and data by result summary end.", node_name_.c_str()); @@ -598,7 +617,7 @@ Status AicpuTfNodeTask::UpdateIoAddr(TaskContext &context) { GELOGD("Node[%s] is depend compute node, use result summary as out addr.", node_name_.c_str()); GE_CHK_BOOL_RET_STATUS(output_summary_.size() == static_cast(node_item_->num_outputs), INTERNAL_ERROR, - "Node[%s] has %d output but %zu output summary.", + "[Check][Size]Node[%s] has %d output but %zu output summary not equal.", node_name_.c_str(), node_item_->num_outputs, output_summary_.size()); for (auto j = 0; j < node_item_->num_outputs; ++j) { @@ -655,10 +674,11 @@ Status AicpuNodeTask::Init(const HybridModel &model) { GELOGD("Node[%s] init start.", node_name.c_str()); GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED, - "Node[%s] unknown type[%d] is depend compute, it's not supported now.", + "[Check][Type]Node[%s] unknown type[%d] is depend compute, it's not supported now.", node_name.c_str(), unknown_type_); - GE_CHK_BOOL_RET_STATUS(task_def_.has_kernel(), FAILED, "Node[%s] task def does not has kernel.", node_name.c_str()); + GE_CHK_BOOL_RET_STATUS(task_def_.has_kernel(), FAILED, + "[Check][task_def_]Node[%s] task def does not has kernel.", node_name.c_str()); auto &kernel_def = task_def_.kernel(); auto &args = kernel_def.args(); @@ -671,52 +691,80 @@ Status AicpuNodeTask::Init(const HybridModel &model) { if (kernel_type == ccKernelType::CUST_AI_CPU) { bool loaded = false; GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name, loaded), - "load cust aicpu so failed."); + "[Load][CustAicpuSo] failed, op:%s, so:%s.", op_desc->GetName().c_str(), so_name.c_str()); if (!loaded) { - GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed."); + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), + "[Launch][CustAicpuSo] failed, node:%s.", node_name_.c_str()); } } - 
GE_CHK_BOOL_RET_STATUS(args.size() == args_size_, FAILED, - "Node[%s] task def args.size=%zu, but args_size=%u.", + GE_IF_BOOL_EXEC(args.size() != args_size_, + REPORT_INNER_ERROR("E19999", "Node[%s] task def args.size=%zu, but args_size=%u not equal.", + node_name.c_str(), args.size(), args_size_); + GELOGE(FAILED, "[Check][Size]Node[%s] task def args.size=%zu, but args_size=%u not equal.", node_name.c_str(), args.size(), args_size_); - - GE_CHK_BOOL_RET_STATUS(args_size_ >= sizeof(aicpu::AicpuParamHead), FAILED, - "Node[%s] task def args_size=%u is less than aicpu param head len=%zu.", + return FAILED;); + + GE_IF_BOOL_EXEC(args_size_ < sizeof(aicpu::AicpuParamHead), + REPORT_INNER_ERROR("E19999", + "Node[%s] task def args_size=%u is less than aicpu param head len=%zu.", + node_name.c_str(), args_size_, sizeof(aicpu::AicpuParamHead)); + GELOGE(FAILED, + "[Check][Size]Node[%s] task def args_size=%u is less than aicpu param head len=%zu.", node_name.c_str(), args_size_, sizeof(aicpu::AicpuParamHead)); + return FAILED;); args_.reset(new(std::nothrow) uint8_t[args_size_]()); - GE_CHK_BOOL_RET_STATUS(args_ != nullptr, FAILED, - "Node[%s] malloc args mem failed, args_size_=%u.", + GE_IF_BOOL_EXEC(args_ == nullptr, + REPORT_INNER_ERROR("E19999", "new memory failed for Node[%s], args_size_=%u.", + node_name.c_str(), args_size_); + GELOGE(FAILED, "[Malloc][Memory] failed for Node[%s], args_size_=%u.", node_name.c_str(), args_size_); + return FAILED;); errno_t sec_ret = memcpy_s(args_.get(), args_size_, args.c_str(), args.size()); - GE_CHK_BOOL_RET_STATUS(sec_ret == EOK, INTERNAL_ERROR, - "Node[%s] copy args failed, ret: %d", node_name_.c_str(), sec_ret); + GE_IF_BOOL_EXEC(sec_ret != EOK, + REPORT_INNER_ERROR("E19999", + "memcpy_s argc_ failed for Node[%s], ret: %d", node_name_.c_str(), sec_ret); + GELOGE(INTERNAL_ERROR, + "[Update][args] failed for Node[%s], ret: %d", node_name_.c_str(), sec_ret); + return sec_ret;); auto aicpu_param_head = reinterpret_cast(args_.get()); 
auto io_num = node_item_->num_inputs + node_item_->num_outputs; // check AicpuParamHead ioAddrNum is right. - GE_CHK_BOOL_RET_STATUS((aicpu_param_head->ioAddrNum == static_cast(io_num)), PARAM_INVALID, - "Node[%s] param head ioAddrNum=%u, but node has %d inputs and %d outputs.", + GE_IF_BOOL_EXEC((aicpu_param_head->ioAddrNum != static_cast(io_num)), + REPORT_INNER_ERROR("E19999", + "Node[%s] param head ioAddrNum=%u, but node has %d inputs and %d outputs.", + node_name.c_str(), aicpu_param_head->ioAddrNum, + node_item_->num_inputs, node_item_->num_outputs); + GELOGE(PARAM_INVALID, + "[Check][IoAddrNum]Node[%s] param head ioAddrNum=%u, but node has %d inputs and %d outputs.", node_name.c_str(), aicpu_param_head->ioAddrNum, node_item_->num_inputs, node_item_->num_outputs); + return PARAM_INVALID;); auto mini_len = sizeof(aicpu::AicpuParamHead) + io_num * sizeof(uint64_t); // check args len must over mini len. GE_CHK_BOOL_RET_STATUS((mini_len <= aicpu_param_head->length), PARAM_INVALID, - "Node[%s] param head length=%u, but min len need %zu.", + "[Check][DataLen]Node[%s] param head length=%u, but min len need %zu.", node_name.c_str(), aicpu_param_head->length, mini_len); auto &kernel_ext_info = kernel_def.kernel_ext_info(); auto kernel_ext_info_size = kernel_def.kernel_ext_info_size(); - GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED, - "Node[%s] task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", + GE_IF_BOOL_EXEC(kernel_ext_info.size() != kernel_ext_info_size, + REPORT_INNER_ERROR("E19999", + "Node[%s] task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", + node_name.c_str(), kernel_ext_info.size(), kernel_ext_info_size); + GELOGE(FAILED, + "[Check][Size]Node[%s] task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u", node_name.c_str(), kernel_ext_info.size(), kernel_ext_info_size); + return FAILED;); uint64_t ext_session_id = model.GetSessionId(); - GE_CHK_STATUS_RET(InitExtInfo(kernel_ext_info, 
ext_session_id), "Node[%s] init ext info failed.", node_name.c_str()); + GE_CHK_STATUS_RET(InitExtInfo(kernel_ext_info, ext_session_id), + "[Init][ExtInfo] failed for Node[%s].", node_name.c_str()); if (ext_info_addr_dev_ == nullptr) { aicpu_param_head->extInfoLength = 0; @@ -754,9 +802,14 @@ Status AicpuNodeTask::UpdateIoAddr(TaskContext &context) { // if has input and output, need copy to ioaddr int cpy_ret = memcpy_s(io_addr, args_size_ - sizeof(aicpu::AicpuParamHead), &io_addrs[0], sizeof(uint64_t) * io_addrs.size()); - GE_CHK_BOOL_RET_STATUS(cpy_ret == 0, INTERNAL_ERROR, - "Node[%s] memcpy io addr to AicpuParamHead failed, ret=%d, args_size=%u, io nums=%zu.", + GE_IF_BOOL_EXEC(cpy_ret != 0, + REPORT_INNER_ERROR("E19999", "Node[%s] memcpy io addr to AicpuParamHead failed," + "ret=%d, args_size=%u, io nums=%zu.", + node_name_.c_str(), cpy_ret, args_size_, io_addrs.size()); + GELOGE(INTERNAL_ERROR, "[Update][io_addr]Node[%s] memcpy io addr to AicpuParamHead failed," + "ret=%d, args_size=%u, io nums=%zu.", node_name_.c_str(), cpy_ret, args_size_, io_addrs.size()); + return INTERNAL_ERROR;); return SUCCESS; } @@ -815,12 +868,12 @@ Status AiCpuNodeExecutor::LoadTask(const HybridModel &model, auto task_defs = model.GetTaskDefs(node); GE_CHECK_NOTNULL(task_defs); if (node_item->shape_inference_type != DEPEND_COMPUTE) { - GE_CHK_BOOL_RET_STATUS((*task_defs).size() == 1, PARAM_INVALID, - "Node[%s] task_def num[%zu] != 1", node->GetName().c_str(), (*task_defs).size()); + GE_CHK_BOOL_RET_STATUS((*task_defs).size() == 1, PARAM_INVALID, "[Check][Size]Node[%s] task_def num[%zu] != 1", + node->GetName().c_str(), (*task_defs).size()); } else { // The number of tasks of the fourth type operator must be 2 GE_CHK_BOOL_RET_STATUS((*task_defs).size() == 2, PARAM_INVALID, - "Node[%s] DEPEND_COMPUTE task_def num[%zu] != 2", + "[Check][Size]Node[%s] DEPEND_COMPUTE task_def num[%zu] != 2", node->GetName().c_str(), (*task_defs).size()); } const auto &task_def = (*task_defs)[0]; @@ 
-832,15 +885,20 @@ Status AiCpuNodeExecutor::LoadTask(const HybridModel &model, GELOGI("Node[%s] task type=%u is AicpuNodeTask.", node->GetName().c_str(), task_def.type()); aicpu_task = MakeShared(node_item, task_def); } else { - GELOGE(UNSUPPORTED, "Node[%s] task type=%u is not supported by aicpu node executor.", + GELOGE(UNSUPPORTED, "[Check][Type]Node[%s] task type=%u is not supported by aicpu node executor," + "RT_MODEL_TASK_KERNEL_EX or RT_MODEL_TASK_KERNEL is supported.", node->GetName().c_str(), task_def.type()); + REPORT_INNER_ERROR("E19999", "Node[%s] task type=%u is not supported by aicpu node executor," + "RT_MODEL_TASK_KERNEL_EX or RT_MODEL_TASK_KERNEL is supported.", + node->GetName().c_str(), task_def.type()); return UNSUPPORTED; } GE_CHK_BOOL_RET_STATUS(aicpu_task != nullptr, MEMALLOC_FAILED, - "Load task for node %s failed.", node->GetName().c_str()); + "[Check][State]Load task for node %s failed.", node->GetName().c_str()); - GE_CHK_STATUS_RET(aicpu_task->Init(model), "Node[%s] task init failed.", node->GetName().c_str()); + GE_CHK_STATUS_RET(aicpu_task->Init(model), + "[Init][AicpuNodeTaskBase] failed for Node[%s].", node->GetName().c_str()); task = std::move(aicpu_task); GELOGD("Node[%s] load task end.", node->GetName().c_str()); diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc index cf5ac851..48c3ab9e 100755 --- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc +++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc @@ -18,6 +18,7 @@ #include "cce/aicpu_engine_struct.h" #include "framework/common/debug/ge_log.h" #include "framework/common/fmk_error_codes.h" +#include "common/dump/dump_manager.h" #include "common/ge/ge_util.h" #include "graph/attr_value.h" #include "graph/debug/ge_attr_define.h" @@ -29,7 +30,7 @@ namespace ge { namespace hybrid { 
REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::COMPILED_SUBGRAPH, KnownNodeExecutor); -Status KnownNodeTask:: ExecuteAsync(TaskContext &context, std::function done_callback) { +Status KnownNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTaskExecuteAsync] Start"); GELOGD("[%s] KnownNodeTask::ExecuteAsync in.", context.GetNodeName()); if (davinci_model_->GetTaskList().empty()) { @@ -55,7 +56,9 @@ Status KnownNodeTask:: ExecuteAsync(TaskContext &context, std::functionGetRtModelHandle(), context.GetStream(), 0); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, - GELOGE(rt_ret, "rtModelExecute error, ret: hybrid_model_executorOx%X", rt_ret); return FAILED;); + REPORT_CALL_ERROR("E19999", "rtModelExecute error, ret:Ox%X", rt_ret); + GELOGE(rt_ret, "[Invoke][rtModelExecute] error, ret:Ox%X", rt_ret); + return FAILED;); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodertModelExecute] End"); GE_CHK_STATUS_RET_NOLOG(context.RegisterCallback(done_callback)); @@ -86,7 +89,7 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) { } GE_CHK_STATUS_RET(davinci_model_->UpdateKnownNodeArgs(inputs, outputs), - "known node task update known node args failed."); + "[Update][KnownNodeArgs] failed for %s.", context.GetNodeName()); GELOGD("[%s] KnownNodeExecutor::UpdateArgs success, task_size = %zu", context.GetNodeName(), davinci_model_->GetTaskList().size()); return SUCCESS; @@ -94,71 +97,82 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) { Status KnownNodeTask::Init(TaskContext &context) { // allocate output mem - GE_CHK_STATUS_RET(context.AllocateOutputs(), "known node task allocate output failed."); - - // init davinicmodel - if (!load_flag_) { - davinci_model_->InitRuntimeParams(); - GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), "init variable mem failed."); - } - + 
GE_CHK_STATUS_RET(context.AllocateOutputs(), "[Allocate][Outputs] failed for %s.", context.GetNodeName()); // allocate mem base void *buffer = nullptr; if (davinci_model_->TotalMemSize() != 0) { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTask_AllocateWorkspace] Start"); - GE_CHK_STATUS_RET( - context.AllocateWorkspace(davinci_model_->TotalMemSize(), &buffer, davinci_model_->GetRuntimeParam().mem_base), - "known node task allocate workspace failed."); + GE_CHK_STATUS_RET(context.AllocateWorkspace(davinci_model_->TotalMemSize(), &buffer, + davinci_model_->GetRuntimeParam().mem_base), + "[Allocate][Workspace] failed for %s.", context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTask_AllocateWorkspace] End, size %zu", davinci_model_->TotalMemSize()); - bool addr_not_changed = false; - if (davinci_model_->GetRuntimeParam().mem_base == buffer) { - addr_not_changed = true; - } - davinci_model_->SetKnownNodeAddrNotChanged(addr_not_changed); // update mem base davinci_model_->UpdateMemBase(static_cast(buffer)); GELOGI("KnownNodeTask::Init mem base is %p, size %lu.", davinci_model_->GetRuntimeParam().mem_base, davinci_model_->GetRuntimeParam().mem_size); } + GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(), + davinci_model_->Id(), + davinci_model_->SubModelId()), + "[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u", + davinci_model_->GetSessionId(), davinci_model_->Id(), davinci_model_->SubModelId()); if (!load_flag_) { - auto dump_properties = context.GetDumpProperties(); - if (dump_properties.IsDumpOpen() || dump_properties.IsOpDebugOpen()) { - davinci_model_->SetDumpProperties(dump_properties); - void *global_step = nullptr; - TensorValue *varible_global_step = context.GetVariable(NODE_NAME_GLOBAL_STEP); - if (varible_global_step != nullptr) { - global_step = varible_global_step->MutableData(); - 
} - davinci_model_->SetKnownShapeGlobalStep(global_step); - } - int32_t device_id = 0; - rtError_t rt_ret = rtGetDevice(&device_id); - if (rt_ret != RT_ERROR_NONE || device_id < 0) { - GELOGE(rt_ret, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); - return RT_ERROR_TO_GE_STATUS(rt_ret); - } - davinci_model_->SetDeviceId(device_id); - GE_CHK_STATUS_RET(davinci_model_->Init(), "KnownNodeExecutor::InitDavinciModel failed."); + auto execution_context = const_cast(context.GetExecutionContext()); + GE_CHECK_NOTNULL(execution_context); + auto &davinci_model = execution_context->davinci_model; + davinci_model.emplace_back(davinci_model_); load_flag_ = true; - } else { - GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(), - davinci_model_->Id(), davinci_model_->SubModelId()), "KnownNodeTask::Init destroy aicpu kernel failed."); } + GELOGI("[%s] KnownNodeExecutor::Init success.", context.GetNodeName()); return SUCCESS; } +Status KnownNodeTask::InitDavinciModel(const HybridModel &model, TensorBuffer *weight_buffer) { + GELOGD("[Init][DavinciModel] start"); + davinci_model_->InitRuntimeParams(); + GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), + "[Init][VariableMem] failed"); + int32_t device_id = 0; + GE_CHK_RT_RET(rtGetDevice(&device_id)); + davinci_model_->SetDeviceId(static_cast(device_id)); + + auto dump_properties = DumpManager::GetInstance().GetDumpProperties(model.GetSessionId()); + if (dump_properties.IsDumpOpen() || dump_properties.IsOpDebugOpen()) { + davinci_model_->SetDumpProperties(dump_properties); + void *global_step = model.GetGlobalStep(); + davinci_model_->SetKnownShapeGlobalStep(global_step); + } + + void *weight = nullptr; + size_t weight_size = 0; + if (weight_buffer != nullptr) { + weight = weight_buffer->GetData(); + weight_size = weight_buffer->GetSize(); + } + GELOGD("Start to init davinci model, weight size = %zu", weight_size); + 
GE_CHK_STATUS_RET(DoInitDavinciModel(weight, weight_size), "[Init][Model] Failed to init davinci model."); + GELOGD("[Init][Model] success"); + return SUCCESS; +} + +Status KnownNodeTask::DoInitDavinciModel(void *weight, size_t weight_size) { + return davinci_model_->Init(nullptr, 0, weight, weight_size); +} + Status KnownNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { GELOGD("[%s] KnownNodeExecutor::PrepareTask in.", context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] Start"); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorTaskInit] Start"); - GE_CHK_STATUS_RET(task.Init(context), "known node init davinci model failed."); + GE_CHK_STATUS_RET(task.Init(context), "[Invoke][Init] %s known node init davinci model failed.", + context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorTaskInit] End"); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorUpdateArgs] Start"); - GE_CHK_STATUS_RET(task.UpdateArgs(context), "known node task update args failed."); + GE_CHK_STATUS_RET(task.UpdateArgs(context), "[Invoke][UpdateArgs] %s known node task update args failed.", + context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorUpdateArgs] End"); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] End"); GELOGD("[%s] KnownNodeExecutor::PrepareTask success.", context.GetNodeName()); @@ -173,30 +187,38 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node const GeModelPtr ge_model = model.GetGeModel(node); GE_CHECK_NOTNULL(ge_model); + AscendString graph_name; + GE_CHK_GRAPH_STATUS_RET(ge_model->GetGraph().GetName(graph_name), "Failed to get graph name"); + auto weight_buffer 
= model.GetModelWeight(graph_name.GetString()); + std::shared_ptr davinci_model = MakeShared(0, nullptr); GE_CHECK_NOTNULL(davinci_model); // set known node flag as true davinci_model->SetKnownNode(true); davinci_model->SetId(model.GetModelId()); + davinci_model->SetDumpModelName(model.GetModelName()); davinci_model->SetOmName(model.GetOmName()); // set model id as root node's node id davinci_model->SetSubModelId(node->GetOpDesc()->GetId()); GELOGD("KnownNodeExecutor::LoadTask node id %ld.", node->GetOpDesc()->GetId()); - GE_CHK_STATUS_RET(davinci_model->Assign(ge_model), "KnownNodeExecutor::LoadTask davincimodel assign failed."); + GE_CHK_STATUS_RET(davinci_model->Assign(ge_model), + "[Invoke][Assign]KnownNodeExecutor::LoadTask davincimodel assign failed for node:%s.", + node->GetName().c_str()); - task = MakeShared(davinci_model); - GE_CHECK_NOTNULL(task); + auto known_node_task = MakeShared(davinci_model); + GE_CHECK_NOTNULL(known_node_task); + GE_CHK_STATUS_RET_NOLOG(known_node_task->InitDavinciModel(model, weight_buffer)); GELOGI("[%s] KnownNodeExecutor::LoadTask success.", node->GetName().c_str()); + task = std::move(known_node_task); return SUCCESS; } Status KnownNodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function &callback) const { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorExecuteTask] Start"); - GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), - "Failed to execute task. node = %s", + GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), "[Invoke][ExecuteAsync]Failed to execute task. 
node = %s", context.GetNodeItem().NodeName().c_str()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorExecuteTask] End"); return SUCCESS; diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h index 6e9740ad..629cb543 100644 --- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h +++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h @@ -31,11 +31,15 @@ class KnownNodeTask : public NodeTask { : davinci_model_(davinci_model) {} - ~KnownNodeTask() {} + ~KnownNodeTask() = default; Status UpdateArgs(TaskContext &context) override; Status ExecuteAsync(TaskContext &context, std::function done_callback) override; Status Init(TaskContext &context) override; + Status InitDavinciModel(const HybridModel &model, TensorBuffer *weight_buffer); + + protected: + virtual Status DoInitDavinciModel(void *weight, size_t weight_size); private: std::shared_ptr davinci_model_ = nullptr; bool load_flag_ = false; @@ -47,8 +51,6 @@ class KnownNodeExecutor : public NodeExecutor { Status PrepareTask(NodeTask &task, TaskContext &context) const; Status ExecuteTask(NodeTask &task, TaskContext &context, const std::function &callback) const; ~KnownNodeExecutor() {} - private: - std::shared_ptr davinci_model_ = nullptr; }; } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/node_executor/controlop/control_op_executor.cc b/ge/hybrid/node_executor/controlop/control_op_executor.cc index 74920b22..df7da661 100644 --- a/ge/hybrid/node_executor/controlop/control_op_executor.cc +++ b/ge/hybrid/node_executor/controlop/control_op_executor.cc @@ -43,8 +43,7 @@ Status ControlOpNodeTask::ExecuteSubgraph(const GraphItem *subgraph, auto executor = MakeShared(subgraph, execution_context); GE_CHECK_NOTNULL(executor); GE_CHK_STATUS_RET(executor->ExecuteAsync(task_context), - "[%s] Failed to execute partitioned call.", - 
subgraph->GetName().c_str()); + "[Invoke][ExecuteAsync][%s] Failed to execute partitioned call.", subgraph->GetName().c_str()); auto callback = [executor, done_callback]() mutable { if (done_callback != nullptr) { @@ -127,7 +126,7 @@ Status IfOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::functi auto cond_tensor = task_context.GetInput(kIfCondIndex); GE_CHECK_NOTNULL(cond_tensor); GE_CHK_STATUS_RET(ToBool(*cond_tensor, data_type, cond_val), - "[%s] Failed to get cond value.", + "[Invoke][ToBool][%s] Failed to get cond value.", task_context.GetNodeName()); } else { // true if num elements is non-zero @@ -141,9 +140,7 @@ Status IfOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::functi auto subgraph = cond_val ? then_ : else_; GELOGD("[%s] Taking subgraph [%s] by cond = [%d]", task_context.GetNodeName(), subgraph->GetName().c_str(), cond_val); GE_CHK_STATUS_RET(ExecuteSubgraph(subgraph, task_context, done_callback), - "[%s] Failed to execute subgraph. cond = %d", - task_context.GetNodeName(), - cond_val); + "[Execute][Subgraph] failed for [%s]. 
cond = %d", task_context.GetNodeName(), cond_val); GELOGD("[%s] Done executing with cond = %d successfully.", task_context.GetNodeName(), cond_val); return SUCCESS; @@ -201,8 +198,7 @@ Status CaseOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::func } GE_CHK_STATUS_RET(ExecuteSubgraph(subgraph, task_context, done_callback), - "[%s] Failed to execute else-subgraph.", - task_context.GetNodeName()); + "[Execute][Subgraph] failed for [%s].", task_context.GetNodeName()); GELOGD("[%s] Done executing subgraph[%d] successfully.", task_context.GetNodeName(), branch_index); return SUCCESS; @@ -228,18 +224,18 @@ Status WhileOpNodeTask::Init(const NodePtr &node, const HybridModel &model) { Status WhileOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::function &done_callback) const { if (task_context.NumInputs() != task_context.NumOutputs()) { + REPORT_INNER_ERROR("E19999", + "[%s] Invalid while args. num_inputs = %d not equal num_outputs = %d", + task_context.GetNodeName(), task_context.NumInputs(), task_context.NumOutputs()); GELOGE(INTERNAL_ERROR, - "[%s] Invalid while args. num_inputs = %d, num_outputs = %d", - task_context.GetNodeName(), - task_context.NumInputs(), - task_context.NumOutputs()); + "[Check][Param:task_context][%s] Invalid while args. 
num_inputs = %d, num_outputs = %d", + task_context.GetNodeName(), task_context.NumInputs(), task_context.NumOutputs()); return INTERNAL_ERROR; } bool is_continue = false; - GE_CHK_STATUS_RET(ExecuteOneLoop(task_context, is_continue), - "[%s] Failed to execute iteration 0.", - task_context.GetNodeName()); + GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue), + "[Execute][Cond] failed for [%s]", task_context.GetNodeName()); if (!is_continue) { for (int i = 0; i < task_context.NumInputs(); ++i) { auto input_tensor = task_context.GetInput(i); @@ -259,42 +255,27 @@ Status WhileOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::fun } // backup original input tensor desc - std::vector ori_input_desc; + std::vector ori_input_desc(task_context.NumInputs()); for (int i = 0; i < task_context.NumInputs(); ++i) { - auto tensor_desc = task_context.GetInputDesc(i); - GE_CHECK_NOTNULL(tensor_desc); - ori_input_desc.emplace_back(*tensor_desc); + GE_CHK_STATUS_RET_NOLOG(task_context.GetInputDesc(i, ori_input_desc[i])); } - int iteration = 1; - while (true) { + int iteration = 0; + while (is_continue) { + ++iteration; GELOGD("[%s] Start to execute, iteration = %d", task_context.GetNodeName(), iteration); GE_CHK_STATUS_RET(ExecuteOneLoop(task_context, is_continue), - "[%s] Failed to execute iteration %d.", - task_context.GetNodeName(), - iteration); - - if (!is_continue) { - GELOGD("[%s] Quit from loop. 
current iteration = %d", task_context.GetNodeName(), iteration); - break; - } - - ++iteration; + "[Invoke][ExecuteOneLoop][%s] Failed to execute iteration %d.", + task_context.GetNodeName(), iteration); } - - for (int i = 0; i < task_context.NumInputs(); ++i) { - auto input_tensor = task_context.GetInput(i); - auto tensor_desc = task_context.MutableInputDesc(i); - GE_CHECK_NOTNULL(input_tensor); - GE_CHECK_NOTNULL(tensor_desc); - // restore original input tensor desc - *tensor_desc = std::move(ori_input_desc[i]); - GE_CHK_STATUS_RET_NOLOG(task_context.SetOutput(i, *input_tensor)); - } - + GELOGD("[%s] Quit from loop. current iteration = %d", task_context.GetNodeName(), iteration); if (done_callback) { done_callback(); } + + for (int i = 0; i < task_context.NumInputs(); ++i) { + GE_CHK_STATUS_RET_NOLOG(task_context.UpdateInputDesc(i, ori_input_desc[i])); + } return SUCCESS; } @@ -313,24 +294,27 @@ Status WhileOpNodeTask::ExecuteCond(TaskContext &task_context, bool &is_continue auto executor = MakeShared(cond_, execution_context, task_context.IsForceInferShape()); GE_CHECK_NOTNULL(executor); GELOGD("[%s] Start to execute cond-subgraph.", task_context.GetNodeName()); - GE_CHK_STATUS_RET(executor->ExecuteAsync(inputs, input_desc), "Failed to execute partitioned call."); + GE_CHK_STATUS_RET(executor->ExecuteAsync(inputs, input_desc), + "[Invoke][ExecuteAsync] %s Failed to execute partitioned call.", task_context.GetNodeName()); GELOGD("[%s] Done executing cond-subgraph successfully.", cond_->GetName().c_str()); GE_CHK_STATUS_RET_NOLOG(task_context.RegisterCallback([executor]() mutable { executor.reset(); })); // get cond output - GE_CHK_STATUS_RET(executor->Synchronize(), "[%s] Failed to sync cond-subgraph result.", cond_->GetName().c_str()); + GE_CHK_STATUS_RET(executor->Synchronize(), + "[Invoke][Synchronize][%s] Failed to sync cond-subgraph result.", cond_->GetName().c_str()); std::vector cond_outputs; std::vector cond_output_desc_list; 
GE_CHK_STATUS_RET(executor->GetOutputs(cond_outputs, cond_output_desc_list), - "[%s] Failed to get cond-output.", - cond_->GetName().c_str()); + "[Invoke][GetOutputs][%s] Failed to get cond-output.", cond_->GetName().c_str()); if (cond_outputs.size() != kCondOutputSize || cond_output_desc_list.size() != kCondOutputSize) { + REPORT_INNER_ERROR("E19999", "[%s] Number of cond outputs(%zu) or size of cond output desc(%zu)" + "not equal %zu, check invalid", task_context.GetNodeName(), cond_outputs.size(), + cond_output_desc_list.size(), kCondOutputSize); GELOGE(INTERNAL_ERROR, - "[%s] Number of cond outputs is invalid. number = %zu", - task_context.GetNodeName(), - cond_outputs.size()); + "[Check][Size][%s] Number of cond outputs(%zu) or Number of cond output desc(%zu) not equal %zu", + task_context.GetNodeName(), cond_outputs.size(), cond_output_desc_list.size(), kCondOutputSize); return INTERNAL_ERROR; } @@ -339,8 +323,7 @@ Status WhileOpNodeTask::ExecuteCond(TaskContext &task_context, bool &is_continue if (shape.IsScalar()) { auto data_type = cond_tensor_desc->GetDataType(); GE_CHK_STATUS_RET(ToBool(cond_outputs[0], data_type, is_continue), - "[%s] Failed to get cond value.", - task_context.GetNodeName()); + "[Invoke][ToBool][%s] Failed to get cond value.", task_context.GetNodeName()); } else { // true if num elements is non-zero is_continue = shape.GetShapeSize() > 0; @@ -379,23 +362,25 @@ Status WhileOpNodeTask::MoveOutputs2Inputs(TaskContext &task_context) { } Status WhileOpNodeTask::ExecuteOneLoop(TaskContext &task_context, bool &is_continue) const { - GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue), - "[%s] Failed to execute cond-subgraph", - task_context.GetNodeName()); - if (!is_continue) { - return SUCCESS; - } - GELOGD("[%s] Start to execute body-subgraph.", task_context.GetNodeName()); GE_CHK_STATUS_RET(ExecuteSubgraph(body_, task_context, nullptr), - "[%s] Failed to execute cond-subgraph", task_context.GetNodeName()); + "[Execute][Subgraph] failed 
for [%s]", task_context.GetNodeName()); GELOGD("[%s] Done executing body-subgraph successfully.", task_context.GetNodeName()); // set outputs to inputs for next iteration GE_CHK_STATUS_RET(MoveOutputs2Inputs(task_context), - "[%s] Failed to move outputs to inputs", - task_context.GetNodeName()); + "[Move][Outputs2Inputs] failed for [%s]", task_context.GetNodeName()); + + GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue), + "[Invoke][ExecuteCond][%s] Failed to execute cond-subgraph", task_context.GetNodeName()); + if (!is_continue) { + for (int i = 0; i < task_context.NumInputs(); ++i) { + auto input_desc = task_context.GetInput(i); + GE_CHECK_NOTNULL(input_desc); + GE_CHK_STATUS_RET_NOLOG(task_context.SetOutput(i, *input_desc)); + } + } return SUCCESS; } @@ -414,12 +399,14 @@ Status ControlOpNodeExecutor::LoadTask(const HybridModel &model, } else if (node_type == WHILE || node_type == STATELESSWHILE) { node_task.reset(new(std::nothrow) WhileOpNodeTask()); } else { - GELOGE(PARAM_INVALID, "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str()); + GELOGE(PARAM_INVALID, "[Check][NodeType][%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str()); return PARAM_INVALID; } GE_CHECK_NOTNULL(node_task); - GE_CHK_STATUS_RET(node_task->Init(node, model), "[%s] Failed to init ControlOpNodeTask.", node->GetName().c_str()); + GE_CHK_STATUS_RET(node_task->Init(node, model), + "[Invoke][Init][%s] Failed to init ControlOpNodeTask.", node->GetName().c_str()); task = std::move(node_task); return SUCCESS; diff --git a/ge/hybrid/node_executor/controlop/control_op_executor.h b/ge/hybrid/node_executor/controlop/control_op_executor.h index 3becfaaa..fd02bd25 100644 --- a/ge/hybrid/node_executor/controlop/control_op_executor.h +++ b/ge/hybrid/node_executor/controlop/control_op_executor.h @@ -80,7 +80,6 @@ class WhileOpNodeTask : public 
ControlOpNodeTask { Status ExecuteCond(TaskContext &task_context, bool &is_continue) const; static Status MoveOutputs2Inputs(TaskContext &task_context); - Status ExecuteOneLoop(TaskContext &task_context, bool &is_continue) const; private: diff --git a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc index 3d2e3084..43a4ca84 100755 --- a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc +++ b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc @@ -33,6 +33,7 @@ const std::map> {RESHAPE, {}}, {EXPANDDIMS, {}}, {SQUEEZE, {}}, + {UNSQUEEZE, {}}, {BROADCASTGRADIENTARGS, {}} }; @@ -46,7 +47,9 @@ Status RefInputTask::UpdateArgs(TaskContext &) { Status RefInputTask::Execute(TaskContext &context) { auto iter = out_ref_input_index_.find(node_type_); if (iter == out_ref_input_index_.end()) { - GELOGE(UNSUPPORTED, "node %s type %s can not use RefInputTask.", + REPORT_INNER_ERROR("E19999", "node %s type %s can not use RefInputTask.", + node_name_.c_str(), node_type_.c_str()); + GELOGE(UNSUPPORTED, "[Find][Node]node %s type %s can not use RefInputTask.", node_name_.c_str(), node_type_.c_str()); return UNSUPPORTED; } @@ -64,7 +67,9 @@ Status RefInputTask::RefOneByOne(TaskContext &context) { int input_num = context.NumInputs(); int output_num = context.NumOutputs(); if (output_num > input_num) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only %d inputs, can't ref one by one.", + REPORT_INNER_ERROR("E19999", "node %s type %s has %d outputs but only %d inputs, can't ref one by one.", + node_name_.c_str(), node_type_.c_str(), output_num, input_num); + GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d outputs but only %d inputs, can't ref one by one.", node_name_.c_str(), node_type_.c_str(), output_num, input_num); return INTERNAL_ERROR; } @@ -83,7 +88,9 @@ Status RefInputTask::RefByOrder(const std::vector &ref_order, TaskCont GELOGI("node %s type %s ref input by order 
begin.", node_name_.c_str(), node_type_.c_str()); int32_t output_num = context.NumOutputs(); if (ref_order.size() != static_cast(output_num)) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only has %zu out ref index.", + REPORT_INNER_ERROR("E19999", "node %s type %s has %d outputs but only has %zu out ref index.", + node_name_.c_str(), node_type_.c_str(), output_num, ref_order.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d outputs but only has %zu out ref index.", node_name_.c_str(), node_type_.c_str(), output_num, ref_order.size()); return INTERNAL_ERROR; } @@ -101,7 +108,7 @@ Status RefInputTask::RefByOrder(const std::vector &ref_order, TaskCont Status RefInputTask::ExecuteAsync(TaskContext &context, std::function done_callback) { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[RefInputTaskExecuteAsync] Start"); - GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s ref input task execute failed", + GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute]node:%s type:%s ref input task execute failed", node_name_.c_str(), node_type_.c_str()); if (done_callback != nullptr) { // host cpu no need register callback, call it directly. 
@@ -125,20 +132,26 @@ Status DependInputShapeTask::Execute(TaskContext &context) { std::string node_type = node_->GetType(); auto kernel = factory.Create(node_type); if (kernel == nullptr) { - GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.", + REPORT_CALL_ERROR("E19999", "create failed for node %s type %s is not supported by host kernel.", + node_->GetName().c_str(), node_type.c_str()); + GELOGE(UNSUPPORTED, "[Invoke][Create]node %s type %s is not supported by host kernel.", node_->GetName().c_str(), node_type.c_str()); return UNSUPPORTED; } std::vector outputs; Status compute_ret = kernel->Compute(node_, outputs); if (compute_ret != SUCCESS) { - GELOGE(compute_ret, "node %s type %s compute failed or not imply.", + REPORT_CALL_ERROR("E19999", "node %s type %s compute failed.", node_->GetName().c_str(), node_type.c_str()); + GELOGE(compute_ret, "[Invoke][Compute]node %s type %s compute failed or not imply.", node_->GetName().c_str(), node_type.c_str()); return compute_ret; } int32_t output_num = context.NumOutputs(); if (static_cast(output_num) != outputs.size()) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %d output, but kernel compute only has %zu output.", + REPORT_INNER_ERROR("E19999", "node %s type %s has %d output," + "but kernel compute only has %zu output. 
check invalid", + node_->GetName().c_str(), node_type.c_str(), output_num, outputs.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d output, but kernel compute only has %zu output.", node_->GetName().c_str(), node_type.c_str(), output_num, outputs.size()); return INTERNAL_ERROR; } @@ -154,7 +167,11 @@ Status DependInputShapeTask::Execute(TaskContext &context) { auto tensor_value = context.MutableOutput(i); GE_CHECK_NOTNULL(tensor_value); if (tensor_data.GetSize() > tensor_value->GetSize()) { - GELOGE(INTERNAL_ERROR, "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.", + REPORT_INNER_ERROR("E19999", "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu." + "check invalid", + node_->GetName().c_str(), node_type.c_str(), i, + tensor_data.GetSize(), tensor_value->GetSize()); + GELOGE(INTERNAL_ERROR, "[Check][Size]node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.", node_->GetName().c_str(), node_type.c_str(), i, tensor_data.GetSize(), tensor_value->GetSize()); return INTERNAL_ERROR; } @@ -179,7 +196,7 @@ Status DependInputShapeTask::Execute(TaskContext &context) { Status DependInputShapeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[DependInputShapeTaskExecuteAsync] Start"); - GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s depend input shape task execute failed", + GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute]node:%s type:%s depend input shape task execute failed", node_->GetName().c_str(), node_->GetType().c_str()); if (done_callback != nullptr) { // host cpu no need register callback, call it directly. 
@@ -212,7 +229,8 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model, node->GetName().c_str(), node_type.c_str()); task = MakeShared(node); if (task == nullptr) { - GELOGE(MEMALLOC_FAILED, "create RefInputTask for node %s failed.", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Create RefInputTask failed for node %s.", node->GetName().c_str()); + GELOGE(MEMALLOC_FAILED, "[Create][RefInputTask] failed for node %s.", node->GetName().c_str()); return MEMALLOC_FAILED; } } else if (DependInputShapeTask::IsBelong(node_type)) { @@ -220,7 +238,9 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model, node->GetName().c_str(), node_type.c_str()); task = MakeShared(node); if (task == nullptr) { - GELOGE(MEMALLOC_FAILED, "create DependInputShapeTask for node %s type %s failed.", + REPORT_CALL_ERROR("E19999", "Create DependInputShapeTask failed for node %s type %s.", + node->GetName().c_str(), node_type.c_str()); + GELOGE(MEMALLOC_FAILED, "[Create][DependInputShapeTask]failed for node %s type %s.", node->GetName().c_str(), node_type.c_str()); return MEMALLOC_FAILED; } @@ -228,7 +248,8 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model, GELOGI("node %s type %s, use ConstantNodeTask.", node->GetName().c_str(), node_type.c_str()); auto tensor = model.GetTensor(node); if (tensor == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to get tensor by name: %s", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "GetTensor failed for name: %s", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][Tensor] failed for name: %s", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -250,7 +271,7 @@ Status ConstantNodeTask::UpdateArgs(TaskContext &context) { Status ConstantNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { GELOGD("[%s] Start execute.", context.GetNodeName()); - GE_CHK_STATUS_RET(context.SetOutput(0, *tensor_), "[%s] Failed to set output.", context.GetNodeName()); + 
GE_CHK_STATUS_RET(context.SetOutput(0, *tensor_), "[Set][Output] failed for [%s].", context.GetNodeName()); if (done_callback) { GELOGD("[%s] Start invoke callback.", context.GetNodeName()); done_callback(); diff --git a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc index 48b5fe9a..20684194 100644 --- a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc +++ b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc @@ -43,13 +43,15 @@ REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::HCCL, HcclNode Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { GELOGI("[%s] HcclNodeTask::ExecuteAsync in.", context.GetNodeName()); if (context.handle_ == nullptr) { - GELOGE(FAILED, "hccl handle is nullptr! "); + REPORT_INNER_ERROR("E19999", " %s invalid, hccl handle is nullptr!", context.GetNodeName()); + GELOGE(FAILED, "[Check][Param:context] %s hccl handle is nullptr!", context.GetNodeName()); return FAILED; } auto HcomExecEnqueueOperation = (HcclResult(*)(HcomOperation, std::function))dlsym( context.handle_, "HcomExecEnqueueOperation"); if (HcomExecEnqueueOperation == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExecEnqueueOperation hcom unknown node function."); + GELOGE(FAILED, "[Invoke][HcomExecEnqueueOperation] failed for %s hcom unknown node function.", + context.GetNodeName()); if (dlclose(context.handle_) != 0) { GELOGW("Failed to close handle %s", dlerror()); } @@ -83,24 +85,35 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do ge::DataType src_data_type = input_desc->GetDataType(); auto iter = kConstOpHcclDataType.find(static_cast(src_data_type)); if (iter == kConstOpHcclDataType.end()) { - GELOGE(PARAM_INVALID, "kConstOpHcclDataType find failed."); + REPORT_INNER_ERROR("E19999", "%s inputdesc0 datatype:%s not support.", + op_desc->GetName().c_str(), + TypeUtils::DataTypeToSerialString(src_data_type).c_str()); + GELOGE(PARAM_INVALID, 
"[Find][DataType]%s inputdesc0 datatype:%s not support.", + op_desc->GetName().c_str(), + TypeUtils::DataTypeToSerialString(src_data_type).c_str()); return PARAM_INVALID; } op_info.dataType = iter->second; HcclReduceOp op_type = HCCL_REDUCE_SUM; if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HCOMREDUCESCATTER || op_desc->GetType() == HVDCALLBACKALLREDUCE || op_desc->GetType() == HCOMREDUCE) { - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), "GetHcclOperationType failed"); + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), + "[Get][HcclOperationType] failed for %s type:%s", op_desc->GetName().c_str(), + op_desc->GetType().c_str()); op_info.opType = op_type; } int64_t root_id = 0; if (op_desc->GetType() == HCOMBROADCAST) { - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id), "GetHcclRootId failed"); + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id), + "[Get][HcclRootId] failed for %s type:%s", op_desc->GetName().c_str(), + op_desc->GetType().c_str()); } op_info.root = root_id; auto callback = [op_desc, done_callback](HcclResult status) { if (status != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "node %s call HcomExecEnqueueOperation failed, ret: 0x%X", + REPORT_CALL_ERROR("E19999", "call HcomExecEnqueueOperation failed for node %s, ret: 0x%X", + op_desc->GetName().c_str(), status); + GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueOperation] failed for node %s, ret: 0x%X", op_desc->GetName().c_str(), status); } @@ -110,14 +123,18 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do int32_t count = 0; GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomCount(op_desc, static_cast(op_info.dataType), op_desc->GetType() == HCOMALLGATHER, count), - "GetHcomCount failed"); + "[Get][HcomCount] failed for %s type:%s", op_desc->GetName().c_str(), + op_desc->GetType().c_str()); GELOGI("[%s] HcclNodeTask::ExecuteAsync hccl_type %s, count %d, data_type %d, op_type %d, root %d.", 
context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root); op_info.count = count; HcclResult hccl_ret = HcomExecEnqueueOperation(op_info, callback); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); + REPORT_CALL_ERROR("E19999", "Call HcomExecEnqueueOperation failed for node:%s(%s), ret: 0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), hccl_ret); + GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueOperation] failed for node:%s(%s), ret: 0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), hccl_ret); return HCCL_E_INTERNAL; } @@ -173,13 +190,23 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vectorGetTensor(offset_index_.first, offset_index_.second, offset_tensor)) if (static_cast(offset_tensor.GetSize() / GetSizeByDataType(data_type)) != row_num) { - GELOGE(PARAM_INVALID, "num of offset and remote addr mismatch, offset size=%zu, remote_addr size=%ld, dtype=%s", + REPORT_INNER_ERROR("E19999", "num of offset and remote addr mismatch, check invalid" + "offset size=%zu, remote_addr size=%ld, dtype=%s", offset_tensor.GetSize(), row_num, + TypeUtils::DataTypeToSerialString(data_type).c_str()); + GELOGE(PARAM_INVALID, "[Check][Size]num of offset and remote addr mismatch," + "offset size=%zu, remote_addr size=%ld, dtype=%s", offset_tensor.GetSize(), row_num, TypeUtils::DataTypeToSerialString(data_type).c_str()); return PARAM_INVALID; } @@ -244,7 +275,9 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector(reinterpret_cast(tv->MutableData())); auto device_len = tv->GetSize() / row_num; if (device_len <= 0 || device_len > data[kVarTableIdxLen]) { - GELOGE(FAILED, "Local embedding length is out of range, expect %ld, but %ld exactly.", + REPORT_INNER_ERROR("E19999", "Local embedding length is out of range, expect %ld, but %ld exactly.", + data[kVarTableIdxLen], device_len); + GELOGE(FAILED, "[Check][Size]Local embedding 
length is out of range, expect %ld, but %ld exactly.", data[kVarTableIdxLen], device_len); return FAILED; } @@ -267,7 +300,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function do (HcclResult(*)(const string &, const vector &, std::function))dlsym(context.handle_, "HcomExecEnqueueRemoteAccess"); if (HcomExecEnqueueRemoteAccess == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExecEnqueueRemoteAccess hcom unknown node function."); + GELOGE(FAILED, "[Invoke][HcomExecEnqueueRemoteAccess] failed for node:%s(%s) hcom unknown node function.", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str()); if (dlclose(context.handle_) != 0) { GELOGW("Failed to close handle %s", dlerror()); } @@ -283,7 +317,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function do TaskContext *p_ctx = &context; auto callback = [p_ctx, done_callback](HcclResult status) { if (status != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", status); + GELOGE(HCCL_E_INTERNAL, "[Call][HcomExcutorInitialize] failed for node:%s(%s), ret: 0x%X", + p_ctx->GetNodeName(), p_ctx->GetNodeItem().NodeType().c_str(), status); p_ctx->SetStatus(FAILED); } done_callback(); @@ -296,7 +331,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function do } HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueRemoteAccess] failed for node:%s(%s), ret: 0x%X", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str(), hccl_ret); return HCCL_E_INTERNAL; } @@ -314,13 +350,17 @@ Status HcclNodeTask::Init(TaskContext &context) { Status HcclNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { GELOGI("[%s] HcclNodeExecutor::PrepareTask in.", context.GetNodeName()); - 
GE_CHK_STATUS_RET(task.Init(context), "hccl node load hccl so failed."); + GE_CHK_STATUS_RET(task.Init(context), "[Invoke][Init]hccl node %s(%s) load hccl so failed.", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str()); // allocate output mem, output mem or remote read will be calculated when node execute. if (kRdmaReadTypes.count(context.GetNodeItem().NodeType()) == 0) { - GE_CHK_STATUS_RET(context.AllocateOutputs(), "hccl node task allocate output failed."); + GE_CHK_STATUS_RET(context.AllocateOutputs(), + "[Invoke][AllocateOutputs]hccl node %s(%s) task allocate output failed.", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str()); } - GE_CHK_STATUS_RET(task.UpdateArgs(context), "hccl node task update args failed."); + GE_CHK_STATUS_RET(task.UpdateArgs(context), "[Update][Args] failed for hccl node %s(%s).", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str()); GELOGI("[%s] HcclNodeExecutor::PrepareTask success.", context.GetNodeName()); return SUCCESS; } @@ -341,8 +381,9 @@ Status HcclNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, Status HcclNodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function &callback) const { context.handle_ = handle_; - GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), "Failed to execute task. node = %s", - context.GetNodeItem().NodeName().c_str()); + GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), + "[Invoke][ExecuteAsync] failed to execute task. node:%s(%s)", + context.GetNodeItem().NodeName().c_str(), context.GetNodeItem().NodeType().c_str()); return SUCCESS; } @@ -359,12 +400,13 @@ Status HcclNodeExecutor::Initialize() { GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str()); handle_ = dlopen(canonical_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (handle_ == nullptr) { - GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror()); + REPORT_CALL_ERROR("E19999", "Open SoFile %s failed, error:%s! 
", canonical_path.c_str(), dlerror()); + GELOGE(GE_PLGMGR_SO_NOT_EXIST, "[Open][SoFile] %s failed, error:%s! ", canonical_path.c_str(), dlerror()); return FAILED; } auto HcomExecInitialize = (HcclResult(*)())dlsym(handle_, "HcomExecInitialize"); if (HcomExecInitialize == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExecInitialize hcom unknown node function."); + GELOGE(FAILED, "[Invoke][HcomExecInitialize] Failed for hcom unknown node function."); return FAILED; } HcclResult hccl_ret = HcomExecInitialize(); @@ -373,7 +415,7 @@ Status HcclNodeExecutor::Initialize() { } else if (hccl_ret == HCCL_SUCCESS) { GELOGI("Hcom executor initialize success."); } else { - GELOGE(FAILED, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(FAILED, "[Call][HcomExecInitialize] failed, ret: 0x%X", hccl_ret); return FAILED; } return SUCCESS; @@ -382,12 +424,12 @@ Status HcclNodeExecutor::Initialize() { Status HcclNodeExecutor::Finalize() { auto HcomExecFinalize = (HcclResult(*)())dlsym(handle_, "HcomExecFinalize"); if (HcomExecFinalize == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExecFinalize hcom unknown node function."); + GELOGE(FAILED, "[Invoke][HcomExecFinalize] failed for hcom unknown node function."); return FAILED; } HcclResult hccl_ret = HcomExecFinalize(); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(FAILED, "Call HcomExecFinalize failed, ret: 0x%X", hccl_ret); + GELOGE(FAILED, "[Call][HcomExecFinalize] failed, ret: 0x%X", hccl_ret); return FAILED; } // dlclose file handle diff --git a/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc b/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc index 0cc635e4..6e8a1eb9 100755 --- a/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc +++ b/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc @@ -33,7 +33,7 @@ Status HostNodeTaskBase::UpdateArgs(TaskContext &) { Status HostNodeTaskBase::ExecuteAsync(TaskContext &context, std::function done_callback) { GELOGD("[%s] Start execute.", 
context.GetNodeName()); - GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s, task execute failed.", + GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute] failed for node:%s type:%s.", node_->GetName().c_str(), node_->GetType().c_str()) if (done_callback) { GELOGD("[%s] Start invoke callback.", context.GetNodeName()); @@ -70,7 +70,8 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) { AllocationAttr attr; attr.SetMemType(HOST_DDR); if (context.AllocateOutput(i, output_desc, nullptr, &attr) != SUCCESS) { - GELOGE(FAILED, "node:%s Failed to allocate output %d", context.GetNodeName(), i); + REPORT_CALL_ERROR("E19999", "node:%s Failed to allocate output %d", context.GetNodeName(), i); + GELOGE(FAILED, "[Invoke][AllocateOutput]node:%s Failed to allocate output %d", context.GetNodeName(), i); return FAILED; } auto tensor = context.GetOutput(i); @@ -92,14 +93,18 @@ Status HostCpuNodeTask::Execute(TaskContext &context) { RunContext run_context; auto host_kernel = hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node_); if (host_kernel == nullptr) { - GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.", + REPORT_CALL_ERROR("E19999", "CreateKernel failed for node %s type %s is not supported by host kernel.", + node_->GetName().c_str(), node_->GetType().c_str()); + GELOGE(UNSUPPORTED, "[Create][Kernel]node %s type %s is not supported by host kernel.", node_->GetName().c_str(), node_->GetType().c_str()); return UNSUPPORTED; } Status compute_ret = host_kernel->Compute(context); if (compute_ret != SUCCESS) { - GELOGE(compute_ret, "node %s type %s compute failed or not imply.", + REPORT_CALL_ERROR("E19999", "node %s type %s compute failed.", + node_->GetName().c_str(), node_->GetType().c_str()); + GELOGE(compute_ret, "[Invoke][Compute]node %s type %s compute failed or not imply.", node_->GetName().c_str(), node_->GetType().c_str()); return compute_ret; } @@ -131,7 +136,10 @@ Status HostCpuNodeExecutor::LoadTask(const HybridModel &model, 
const NodePtr &no task = MakeShared(node); GE_CHECK_NOTNULL(task); } else { - GELOGE(UNSUPPORTED, "node %s type %s is not support in HostCpuNodeExecutor now.", name.c_str(), type.c_str()); + REPORT_INNER_ERROR("E19999", "Create NodeTask failed for node %s type %s.", + name.c_str(), type.c_str()); + GELOGE(UNSUPPORTED, "[Create][NodeTask]node %s type %s is not support in HostCpuNodeExecutor now.", + name.c_str(), type.c_str()); return UNSUPPORTED; } return SUCCESS; diff --git a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc index d54195d6..370bb286 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc @@ -34,7 +34,9 @@ Status AssignKernel::Compute(TaskContext& context) { const auto value_tensor = context.GetInput(kAssignValueInputIndex); GE_CHECK_NOTNULL(value_tensor); if (value_tensor->GetSize() > ref_tensor->GetSize()) { - GELOGE(INTERNAL_ERROR, "[%s] value_input_size=%zu, but ref_input_size=%zu.", + REPORT_INNER_ERROR("E19999", "[%s] value_input_size=%zu bigger than ref_input_size=%zu. 
check invalid", + node_->GetName().c_str(), value_tensor->GetSize(), ref_tensor->GetSize()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] value_input_size=%zu, but ref_input_size=%zu.", node_->GetName().c_str(), value_tensor->GetSize(), ref_tensor->GetSize()); return INTERNAL_ERROR; } @@ -46,7 +48,7 @@ Status AssignKernel::Compute(TaskContext& context) { value_tensor->GetSize(), RT_MEMCPY_HOST_TO_HOST)); } GE_CHK_STATUS_RET(context.SetOutput(kAssignRefOutputIndex, *ref_tensor), - "[%s] Failed to set output.", context.GetNodeName()); + "[Set][Output] failed for[%s].", context.GetNodeName()); GELOGD("[%s] compute success.", node_->GetName().c_str()); return SUCCESS; diff --git a/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc index e34f601a..8bf50096 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc @@ -30,7 +30,8 @@ namespace host_cpu { Status DataKernel::Compute(TaskContext& context) { auto input = context.MutableInput(kDataInputIndex); GE_CHECK_NOTNULL(input); - GE_CHK_STATUS_RET(context.SetOutput(kDataOutputIndex, *input), "[%s] Failed to set output.", context.GetNodeName()) + GE_CHK_STATUS_RET(context.SetOutput(kDataOutputIndex, *input), + "[Set][Output] failed for [%s].", context.GetNodeName()) GELOGD("[%s] compute success.", node_->GetName().c_str()); return SUCCESS; } diff --git a/ge/hybrid/node_executor/host_cpu/kernel/no_op_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/no_op_kernel.cc index b1b4e68c..1d450166 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel/no_op_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/no_op_kernel.cc @@ -28,6 +28,7 @@ Status NoOpKernel::Compute(TaskContext& context) { } REGISTER_KERNEL_CREATOR(NoOp, NoOpKernel); +REGISTER_KERNEL_CREATOR(NetOutput, NoOpKernel); } // namespace host_cpu } // namespace hybrid } // namespace ge diff --git 
a/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc index 52d48821..17692f36 100755 --- a/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc @@ -36,31 +36,41 @@ Status RandomUniformKernel::Compute(TaskContext& context) { (void)AttrUtils::GetInt(node_->GetOpDesc(), "seed2", seed2); DataType data_type = DT_FLOAT; if (!AttrUtils::GetDataType(node_->GetOpDesc(), kAttrDtype, data_type)) { - GELOGE(PARAM_INVALID, "[%s] get attr dtype failed.", node_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "GetDataType failed for [%s].", node_->GetName().c_str()); + GELOGE(PARAM_INVALID, "[Get][DataType] failed for [%s].", node_->GetName().c_str()); return PARAM_INVALID; } switch (data_type) { case DT_FLOAT16: if (GenerateFP16(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) { - GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_FLOAT"); + GELOGE(FAILED, "[Invoke][GenerateFP16]Generate random_distribution failed for %s, data_type=DT_FLOAT16", + node_->GetName().c_str()); return FAILED; } break; case DT_FLOAT: if (Generate(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) { - GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_FLOAT"); + GELOGE(FAILED, "[Invoke][Generate]Generate random_distribution failed for %s, data_type=DT_FLOAT", + node_->GetName().c_str()); return FAILED; } break; case DT_DOUBLE: if (Generate(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) { - GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_DOUBLE"); + GELOGE(FAILED, "[Invoke][Generate]Generate random_distribution failed for %s, data_type=DT_DOUBLE", + node_->GetName().c_str()); return FAILED; } break; default: - GELOGE(UNSUPPORTED, "Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE, but data_type=%s", - TypeUtils::DataTypeToSerialString(data_type).c_str()); + 
REPORT_INNER_ERROR("E19999", "[Check][DataType]Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE," + "but data_type=%s, node:%s", + TypeUtils::DataTypeToSerialString(data_type).c_str(), + node_->GetName().c_str()); + GELOGE(UNSUPPORTED, "[Check][DataType]Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE," + "but data_type=%s, node:%s", + TypeUtils::DataTypeToSerialString(data_type).c_str(), + node_->GetName().c_str()); return UNSUPPORTED; } @@ -79,7 +89,7 @@ Status RandomUniformKernel::Generate(const ge::OpDescPtr &op_desc_ptr, int64_t s auto tensor_size = data_num * sizeof(T); TensorValue tensor; GE_CHK_STATUS_RET(context.AllocateTensor(tensor_size, tensor, &attr), - "[%s] Failed to allocate output of size %zu", + "[Invoke][AllocateTensor][%s] Failed to allocate output of size %zu", context.GetNodeName(), tensor_size); @@ -101,7 +111,7 @@ Status RandomUniformKernel::Generate(const ge::OpDescPtr &op_desc_ptr, int64_t s *(buf + i) = distribution(gen); } - GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[%s] Failed to set output.", context.GetNodeName()); + GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[Set][Output] failed for [%s].", context.GetNodeName()); return SUCCESS; } @@ -115,7 +125,7 @@ Status RandomUniformKernel::GenerateFP16(const ge::OpDescPtr &op_desc_ptr, int64 auto tensor_size = data_num * sizeof(fp16_t); TensorValue tensor; GE_CHK_STATUS_RET(context.AllocateTensor(tensor_size, tensor, &attr), - "[%s] Failed to allocate output of size %zu", + "[Invoke][AllocateTensor][%s] Failed to allocate output of size %zu", context.GetNodeName(), tensor_size); @@ -137,7 +147,7 @@ Status RandomUniformKernel::GenerateFP16(const ge::OpDescPtr &op_desc_ptr, int64 *(buf + i) = static_cast(distribution(gen)); } - GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[%s] Failed to set output.", context.GetNodeName()); + GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[Set][Output]failed for [%s].", context.GetNodeName()); return SUCCESS; } diff --git 
a/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc index 16738c2a..902a07c2 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc @@ -25,11 +25,12 @@ namespace host_cpu { Status VariableKernel::Compute(TaskContext& context) { auto tensor = context.GetVariable(node_->GetName()); if (tensor == nullptr) { - GELOGE(PARAM_INVALID, "tensor is NULL."); + REPORT_INNER_ERROR("E19999", "Get Variable from task context for node:%s failed.", context.GetNodeName()); + GELOGE(PARAM_INVALID, "[Check][Param]Get Variable from task context for node:%s failed.", context.GetNodeName()); return PARAM_INVALID; } // Constant & Variable Op has and only has one output - GE_CHK_STATUS_RET(context.SetOutput(0, *tensor), "[%s] Failed to set output.", context.GetNodeName()); + GE_CHK_STATUS_RET(context.SetOutput(0, *tensor), "[Set][Output] failed for [%s].", context.GetNodeName()); GELOGD("[%s] compute success.", node_->GetName().c_str()); return SUCCESS; } diff --git a/ge/hybrid/node_executor/host_cpu/kernel_factory.cc b/ge/hybrid/node_executor/host_cpu/kernel_factory.cc index aabae999..7d3ef703 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel_factory.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel_factory.cc @@ -34,7 +34,10 @@ std::shared_ptr KernelFactory::CreateKernel(const NodePtr &node) { if (iter != kernel_creator_map_.end()) { return iter->second(node); } - GELOGE(FAILED, "Not supported, type = %s, name = %s", node->GetType().c_str(), node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Not supported because kernel_creator_map_ not contain type:%s, name = %s", + node->GetType().c_str(), node->GetName().c_str()); + GELOGE(FAILED, "[Find][NodeType]Not supported because kernel_creator_map_ not contain type = %s, name = %s", + node->GetType().c_str(), node->GetName().c_str()); return nullptr; } diff --git 
a/ge/hybrid/node_executor/node_executor.cc b/ge/hybrid/node_executor/node_executor.cc index e74256f2..d5d868ab 100755 --- a/ge/hybrid/node_executor/node_executor.cc +++ b/ge/hybrid/node_executor/node_executor.cc @@ -45,8 +45,7 @@ Status NodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { Status NodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function &callback) const { HYBRID_CHK_STATUS_RET(task.ExecuteAsync(context, callback), - "Failed to execute task. node = %s", - context.GetNodeItem().NodeName().c_str()); + "[Execute][Task] failed. node = %s", context.GetNodeItem().NodeName().c_str()); return SUCCESS; } @@ -106,7 +105,10 @@ NodeExecutorManager::ExecutorType NodeExecutorManager::ResolveExecutorType(Node const auto &lib_name = op_desc->GetOpKernelLibName(); auto it = engine_mapping_.find(lib_name); if (it == engine_mapping_.end()) { - GELOGE(UNSUPPORTED, "KernelLib not supported. node = %s, lib_name = %s", node.GetName().c_str(), lib_name.c_str()); + REPORT_INNER_ERROR("E19999", "Failed to get ExecutorType by lib_name:%s, node:%s", + lib_name.c_str(), node.GetName().c_str()); + GELOGE(UNSUPPORTED, "[Find][ExecutorType]Failed to get ExecutorType by lib_name:%s, node:%s", + lib_name.c_str(), node.GetName().c_str()); return ExecutorType::RESERVED; } @@ -117,7 +119,10 @@ Status NodeExecutorManager::GetExecutor(Node &node, const NodeExecutor **executo auto executor_type = ResolveExecutorType(node); const auto it = executors_.find(executor_type); if (it == executors_.end()) { - GELOGE(INTERNAL_ERROR, "Failed to get executor by type: %d.", static_cast(executor_type)); + REPORT_INNER_ERROR("E19999", "Failed to get executor by type: %d.", + static_cast(executor_type)); + GELOGE(INTERNAL_ERROR, "[Check][ExecutorType]Failed to get executor by type: %d.", + static_cast(executor_type)); return INTERNAL_ERROR; } @@ -155,16 +160,16 @@ Status NodeExecutorManager::CalcOpRunningParam(Node &node) const { GeShape output_shape = 
output_tensor.GetShape(); int64_t output_mem_size = 0; GE_CHK_STATUS_RET(TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size), - "hccl calc tensor mem size failed."); + "[Calc][TensorMemSize] failed, node:%s.", node.GetName().c_str()); GE_CHK_STATUS_RET(CheckInt64AddOverflow(output_mem_size, MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1), - "[%s] Invalid output mem size: %ld", + "[Check][Overflow][%s] Invalid output mem size: %ld", node.GetName().c_str(), output_mem_size); output_mem_size = ((output_mem_size + MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1) / MEMORY_ALIGN_SIZE) * MEMORY_ALIGN_SIZE; TensorUtils::SetSize(output_tensor, output_mem_size); GE_CHK_STATUS_RET(op_desc->UpdateOutputDesc(static_cast(i), output_tensor), - "hccl update output size failed."); + "[Update][OutputDesc] failed, node:%s.", node.GetName().c_str()); GELOGD("%s output desc[%zu], dim_size: %zu, mem_size: %ld.", node.GetName().c_str(), i, output_tensor.GetShape().GetDimNum(), output_mem_size); } @@ -189,14 +194,17 @@ Status NodeExecutorManager::InitializeExecutors() { GE_CHECK_NOTNULL(build_fn); auto executor = std::unique_ptr(build_fn()); if (executor == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to create executor for engine type = %d", static_cast(engine_type)); + REPORT_CALL_ERROR("E19999", "Create NodeExecutor failed for engine type = %d", + static_cast(engine_type)); + GELOGE(INTERNAL_ERROR, "[Create][NodeExecutor] failed for engine type = %d", static_cast(engine_type)); return INTERNAL_ERROR; } GELOGD("Executor of engine type = %d was created successfully", static_cast(engine_type)); auto ret = executor->Initialize(); if (ret != SUCCESS) { - GELOGE(ret, "Failed to initialize NodeExecutor of type = %d, clear executors", static_cast(engine_type)); + REPORT_CALL_ERROR("E19999", "Initialize NodeExecutor failed for type = %d", static_cast(engine_type)); + GELOGE(ret, "[Initialize][NodeExecutor] failed for type = %d", static_cast(engine_type)); for (auto 
&executor_it : executors_) { executor_it.second->Finalize(); } diff --git a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc index f01cb21e..28a5dea1 100755 --- a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc +++ b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc @@ -38,15 +38,14 @@ Status PartitionedCallNodeTask::Init(TaskContext &context) { Status PartitionedCallNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { GE_CHK_STATUS_RET(subgraph_executor_->ExecuteAsync(context), - "[%s] Failed to set inputs", graph_item_->GetName().c_str()); + "[Invoke][ExecuteAsync] failed for[%s]", graph_item_->GetName().c_str()); auto callback = [=]() { Callback(done_callback); }; GE_CHK_STATUS_RET(context.RegisterCallback(callback), - "[%s] Failed to register callback", - graph_item_->GetName().c_str()); + "[Register][Callback] failed for [%s]", graph_item_->GetName().c_str()); GELOGD("[%s] Done executing subgraph successfully.", graph_item_->GetName().c_str()); return SUCCESS; } @@ -83,7 +82,7 @@ Status PartitionedCallNodeExecutor::LoadTask(const ge::hybrid::HybridModel &mode Status PartitionedCallNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[PartitionedCallPrepareTask] Start"); - GE_CHK_STATUS_RET(task.Init(context), "[%s] Failed to init task.", context.GetNodeName()); + GE_CHK_STATUS_RET(task.Init(context), "[Init][Task] failed for [%s].", context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[PartitionedCallPrepareTask] End"); return SUCCESS; } diff --git a/ge/hybrid/node_executor/task_context.cc b/ge/hybrid/node_executor/task_context.cc index 84dd8fd8..7fa89196 100644 --- a/ge/hybrid/node_executor/task_context.cc +++ 
b/ge/hybrid/node_executor/task_context.cc @@ -63,17 +63,22 @@ std::unique_ptr TaskContext::Create(NodeState *node_state, node_item.output_start, node_item.num_outputs); if (node_item.input_start < 0 || node_item.output_start < 0) { + REPORT_INNER_ERROR("E19999", "NodeItem:%s(%s) not property initialized." + "input_start:%d or output_start:%d less than 0", + node_item.NodeName().c_str(), node_item.NodeType().c_str(), + node_item.input_start, node_item.output_start); GELOGE(INTERNAL_ERROR, - "NodeItem not property initialized. input_start = %d, output_start = %d", - node_item.input_start, - node_item.output_start); + "[Check][Param]NodeItem:%s(%s) not property initialized. input_start = %d, output_start = %d", + node_item.NodeName().c_str(), node_item.NodeType().c_str(), + node_item.input_start, node_item.output_start); return nullptr; } auto task_context = std::unique_ptr( new(std::nothrow)TaskContext(execution_context, node_state, subgraph_context)); if (task_context == nullptr) { - GELOGE(MEMALLOC_FAILED, "[%s] Failed to create instance of TaskContext.", node_item.NodeName().c_str()); + REPORT_CALL_ERROR("E19999", "Create TaskContext failed for [%s].", node_item.NodeName().c_str()); + GELOGE(MEMALLOC_FAILED, "[Create][TaskContext] failed for [%s].", node_item.NodeName().c_str()); return nullptr; } @@ -94,7 +99,12 @@ int TaskContext::NumOutputs() const { TensorValue *TaskContext::MutableInput(int index) { if (index < 0 || index >= node_item_->num_inputs) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_inputs = %d", index, node_item_->num_inputs); + REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_inputs = %d, node:%s(%s)", + index, node_item_->num_inputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. 
index = %d, num_inputs = %d, node:%s(%s)", + index, node_item_->num_inputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -103,7 +113,12 @@ TensorValue *TaskContext::MutableInput(int index) { const TensorValue *TaskContext::GetOutput(int index) const { if (index < 0 || index >= node_item_->num_outputs) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_outputs = %d", index, node_item_->num_outputs); + REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_outputs = %d, node:%s(%s)", + index, node_item_->num_outputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_outputs = %d, node:%s(%s)", + index, node_item_->num_outputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -112,7 +127,12 @@ const TensorValue *TaskContext::GetOutput(int index) const { TensorValue *TaskContext::MutableOutput(int index) { if (index < 0 || index >= node_item_->num_outputs) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_outputs = %d", index, node_item_->num_outputs); + REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_outputs = %d, node:%s(%s)", + index, node_item_->num_outputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_outputs = %d, node:%s(%s)", + index, node_item_->num_outputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -125,7 +145,10 @@ std::size_t TaskContext::NumWorkspaces() const { void *TaskContext::MutableWorkspace(int index) { if (index < 0 || static_cast(index) >= workspaces_.size()) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_workspaces = %d", index, node_item_->num_outputs); + REPORT_INNER_ERROR("E19999", "Index:%d out of range, check invalid. 
number:%zu of workspaces_, node:%s(%s)", + index, workspaces_.size(), node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index:%d out of range. number:%zu of workspaces_, node:%s(%s)", + index, workspaces_.size(), node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -134,7 +157,11 @@ void *TaskContext::MutableWorkspace(int index) { const TensorValue *TaskContext::GetInput(int index) const { if (index < 0 || index >= node_item_->num_inputs) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_inputs = %d", index, node_item_->num_inputs); + REPORT_INNER_ERROR("E19999", "Index:%d out of range, check invalid. num_inputs:%d node:%s(%s)", + index, node_item_->num_inputs, node_item_->NodeName().c_str(), + node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index:%d out of range. num_inputs:%d node:%s(%s)", + index, node_item_->num_inputs, node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -146,7 +173,10 @@ Status TaskContext::AllocateWorkspaces() { for (auto size : workspace_sizes) { void *workspace = execution_context_->allocator->Allocate(size); if (workspace == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to allocate workspace of size: %ld", size); + REPORT_CALL_ERROR("E19999", "node:%s(%s) Allocate workspace failed, size: %ld", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size); + GELOGE(MEMALLOC_FAILED, "[Allocate][workspace] failed for node:%s(%s), size: %ld", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size); return MEMALLOC_FAILED; } @@ -162,7 +192,8 @@ Status TaskContext::RegisterCallback(const std::function &callback_fun) } auto ret = execution_context_->callback_manager->RegisterCallback(GetStream(), callback_fun); if (ret != SUCCESS) { - GELOGE(ret, "[%s] Failed to register callback", GetNodeName()); + REPORT_CALL_ERROR("E19999", "RegisterCallback failed for [%s]", 
GetNodeName()); + GELOGE(ret, "[Register][Callback] failed for [%s]", GetNodeName()); execution_context_->callback_manager->Destroy(); return ret; } @@ -187,7 +218,8 @@ string TaskContext::TensorDesc2String(const GeTensorDesc &desc) { Status TaskContext::AllocateTensor(const GeTensorDesc &tensor_desc, TensorValue &tensor, AllocationAttr *attr) { int64_t size = 0; if (ge::TensorUtils::GetSize(tensor_desc, size) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to get tensor size"); + REPORT_CALL_ERROR("E19999", "Get TensorSize failed, tensor:%s", tensor_desc.GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][TensorSize] failed, tensor:%s", tensor_desc.GetName().c_str()); return INTERNAL_ERROR; } @@ -211,7 +243,12 @@ Status TaskContext::AllocateOutput(int index, TensorDesc2String(tensor_desc).c_str()); if (index < 0 || index >= node_item_->num_outputs) { - GELOGE(PARAM_INVALID, "output index out of range. num_output = %d, index = %d", node_item_->num_outputs, index); + REPORT_INNER_ERROR("E19999", "%s(%s) output index out of range check invalid. num_output = %d, index = %d", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), + node_item_->num_outputs, index); + GELOGE(PARAM_INVALID, "[Check][Param] %s(%s) output index out of range. 
num_output = %d, index = %d", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), + node_item_->num_outputs, index); return PARAM_INVALID; } @@ -236,7 +273,7 @@ Status TaskContext::AllocateOutput(int index, ref_node->GetName().c_str(), ref_node->GetType().c_str()); - TensorValue *ref_tensor = execution_context_->model->GetVariable(ref_node->GetName()); + TensorValue *ref_tensor = execution_context_->model->GetTensor(ref_node); GE_CHECK_NOTNULL(ref_tensor); outputs_start_[index] = *ref_tensor; } else { @@ -289,7 +326,10 @@ Status TaskContext::AllocateOutputs(AllocationAttr *attr) { Status TaskContext::AllocateTensor(size_t size, TensorValue &tensor, AllocationAttr *attr) { auto buffer = TensorBuffer::Create(execution_context_->allocator, size, attr); if (buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to allocate buffer of size: %zu", size); + REPORT_CALL_ERROR("E19999", "%s(%s) Allocate buffer failed, size: %zu", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size); + GELOGE(MEMALLOC_FAILED, "[Allocate][buffer] failed for %s(%s), size: %zu", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size); return MEMALLOC_FAILED; } @@ -303,7 +343,12 @@ const NodeItem &TaskContext::GetNodeItem() const { Status TaskContext::SetOutput(int index, const TensorValue &tensor) { if (index < 0 || index >= node_item_->num_outputs) { - GELOGE(PARAM_INVALID, "output index out of range. num_output = %d, index = %d", node_item_->num_outputs, index); + REPORT_INNER_ERROR("E19999", "%s(%s) output index out of range check invalid. num_output = %d, index = %d", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), + node_item_->num_outputs, index); + GELOGE(PARAM_INVALID, "[Check][Param]%s(%s) output index out of range. 
num_output = %d, index = %d", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), + node_item_->num_outputs, index); return PARAM_INVALID; } @@ -368,7 +413,8 @@ Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr } if (*buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to allocate workspace of size = %zu", size); + REPORT_CALL_ERROR("E19999", "Allocate Workspace failed, size = %zu", size); + GELOGE(MEMALLOC_FAILED, "[Allocate][Workspace] failed, size = %zu", size); return MEMALLOC_FAILED; } @@ -400,11 +446,11 @@ Status TaskContext::PropagateOutputs() { input_offset); if (subgraph_context_->all_inputs_.size() <= static_cast(input_offset)) { - GELOGE(INTERNAL_ERROR, - "[%s] input index out of range. index = %d, total input num = %zu", - GetNodeName(), - input_offset, - subgraph_context_->all_inputs_.size()); + REPORT_INNER_ERROR("E19999", + "[%s] input index out of range check invalid. index = %d, total input num = %zu", + GetNodeName(), input_offset, subgraph_context_->all_inputs_.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] input index out of range. 
index = %d, total input num = %zu", + GetNodeName(), input_offset, subgraph_context_->all_inputs_.size()); return INTERNAL_ERROR; } @@ -554,5 +600,16 @@ NodeState *TaskContext::GetNodeState() const { return node_state_; } +Status TaskContext::GetInputDesc(int index, GeTensorDesc &tensor_desc) const { + return node_item_->GetInputDesc(index, tensor_desc); +} + +Status TaskContext::UpdateInputDesc(int index, const GeTensorDesc &tensor_desc) { + return const_cast(node_item_)->UpdateInputDesc(index, tensor_desc); +} + +Status TaskContext::GetOutputDesc(int index, GeTensorDesc &tensor_desc) const { + return node_item_->GetOutputDesc(index, tensor_desc); +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/node_executor/task_context.h b/ge/hybrid/node_executor/task_context.h index e00c5048..ba4c62e6 100644 --- a/ge/hybrid/node_executor/task_context.h +++ b/ge/hybrid/node_executor/task_context.h @@ -50,9 +50,12 @@ class TaskContext { const char *GetNodeName() const; TensorValue *MutableInput(int index); ConstGeTensorDescPtr GetInputDesc(int index) const; + Status GetInputDesc(int index, GeTensorDesc &tensor_desc) const; ConstGeTensorDescPtr GetOutputDesc(int index) const; + Status GetOutputDesc(int index, GeTensorDesc &tensor_desc) const; GeTensorDescPtr MutableInputDesc(int index) const; GeTensorDescPtr MutableOutputDesc(int index) const; + Status UpdateInputDesc(int index, const GeTensorDesc &tensor_desc); void ReleaseInputsAndOutputs(); bool NeedCallback(); void ReleaseInput(int index); diff --git a/ge/init/gelib.cc b/ge/init/gelib.cc index 17e257c0..ab7fbb29 100644 --- a/ge/init/gelib.cc +++ b/ge/init/gelib.cc @@ -68,7 +68,8 @@ Status GELib::Initialize(const map &options) { // Multiple initializations are not allowed instancePtr_ = MakeShared(); if (instancePtr_ == nullptr) { - GELOGE(GE_CLI_INIT_FAILED, "GeLib initialize failed, malloc shared_ptr failed."); + GELOGE(GE_CLI_INIT_FAILED, "[Create][GELib]GeLib initialize failed, malloc shared_ptr failed."); 
+ REPORT_INNER_ERROR("E19999", "GELib Init failed for new GeLib failed."); return GE_CLI_INIT_FAILED; } @@ -76,13 +77,15 @@ Status GELib::Initialize(const map &options) { map new_options; Status ret = instancePtr_->SetRTSocVersion(options, new_options); if (ret != SUCCESS) { - GELOGE(ret, "GeLib initial failed."); + GELOGE(ret, "[Set][RTSocVersion]GeLib initial: SetRTSocVersion failed."); + REPORT_CALL_ERROR("E19999", "SetRTSocVersion failed."); return ret; } ret = instancePtr_->SetAiCoreNum(new_options); if (ret != SUCCESS) { - GELOGE(ret, "GeLib initial: SetAiCoreNum failed."); + GELOGE(ret, "[Set][AiCoreNum]GeLib initial: SetAiCoreNum failed."); + REPORT_CALL_ERROR("E19999", "SetAiCoreNum failed."); return ret; } @@ -97,7 +100,8 @@ Status GELib::Initialize(const map &options) { GE_TIMESTAMP_START(Init); ret = instancePtr_->InnerInitialize(new_options); if (ret != SUCCESS) { - GELOGE(ret, "GeLib initial failed."); + GELOGE(ret, "[Init][GeLib]GeLib initial failed."); + REPORT_CALL_ERROR("E19999", "GELib::InnerInitialize failed."); instancePtr_ = nullptr; return ret; } @@ -118,7 +122,7 @@ Status GELib::InnerInitialize(const map &options) { Status initSystemStatus = SystemInitialize(options); GE_TIMESTAMP_END(SystemInitialize, "InnerInitialize::SystemInitialize"); if (initSystemStatus != SUCCESS) { - GELOGE(initSystemStatus, "GE system initial failed."); + GELOGE(initSystemStatus, "[Init][GESystem]GE system initial failed."); RollbackInit(); return initSystemStatus; } @@ -129,7 +133,8 @@ Status GELib::InnerInitialize(const map &options) { Status initEmStatus = engineManager_.Initialize(options); GE_TIMESTAMP_END(EngineInitialize, "InnerInitialize::EngineInitialize"); if (initEmStatus != SUCCESS) { - GELOGE(initEmStatus, "GE engine manager initial failed."); + GELOGE(initEmStatus, "[Init][EngineManager]GE engine manager initial failed."); + REPORT_CALL_ERROR("E19999", "EngineManager initialize failed."); RollbackInit(); return initEmStatus; } @@ -140,7 +145,8 @@ 
Status GELib::InnerInitialize(const map &options) { Status initOpsStatus = opsManager_.Initialize(options); GE_TIMESTAMP_END(OpsManagerInitialize, "InnerInitialize::OpsManagerInitialize"); if (initOpsStatus != SUCCESS) { - GELOGE(initOpsStatus, "GE ops manager initial failed."); + GELOGE(initOpsStatus, "[Init][OpsManager]GE ops manager initial failed."); + REPORT_CALL_ERROR("E19999", "OpsManager initialize failed."); RollbackInit(); return initOpsStatus; } @@ -151,7 +157,8 @@ Status GELib::InnerInitialize(const map &options) { Status initOpsBuilderStatus = OpsKernelBuilderManager::Instance().Initialize(options); GE_TIMESTAMP_END(OpsKernelBuilderManagerInitialize, "InnerInitialize::OpsKernelBuilderManager"); if (initOpsBuilderStatus != SUCCESS) { - GELOGE(initOpsBuilderStatus, "GE ops builder manager initial failed."); + GELOGE(initOpsBuilderStatus, "[Init][OpsKernelBuilderManager]GE ops builder manager initial failed."); + REPORT_CALL_ERROR("E19999", "OpsBuilderManager initialize failed."); RollbackInit(); return initOpsBuilderStatus; } @@ -162,7 +169,8 @@ Status GELib::InnerInitialize(const map &options) { Status initSmStatus = sessionManager_.Initialize(options); GE_TIMESTAMP_END(SessionManagerInitialize, "InnerInitialize::SessionManagerInitialize"); if (initSmStatus != SUCCESS) { - GELOGE(initSmStatus, "GE session manager initial failed."); + GELOGE(initSmStatus, "[Init][SessionManager] GE session manager initial failed."); + REPORT_CALL_ERROR("E19999", "SessionManager initialize failed."); RollbackInit(); return initSmStatus; } @@ -172,7 +180,8 @@ Status GELib::InnerInitialize(const map &options) { Status initHostCpuEngineStatus = HostCpuEngine::GetInstance().Initialize(); GE_TIMESTAMP_END(HostCpuEngineInitialize, "InnerInitialize::HostCpuEngineInitialize"); if (initHostCpuEngineStatus != SUCCESS) { - GELOGE(initHostCpuEngineStatus, "Failed to initialize HostCpuEngine"); + GELOGE(initHostCpuEngineStatus, "[Init][HostCpuEngine]Failed to initialize 
HostCpuEngine."); + REPORT_CALL_ERROR("E19999", "HostCpuEngine initialize failed."); RollbackInit(); return initHostCpuEngineStatus; } @@ -180,7 +189,8 @@ Status GELib::InnerInitialize(const map &options) { GELOGI("Start to init Analyzer!"); Status init_analyzer_status = ge::Analyzer::GetInstance()->Initialize(); if (init_analyzer_status != SUCCESS) { - GELOGE(init_analyzer_status, "Failed to initialize HostCpuEngine"); + GELOGE(init_analyzer_status, "[Init][Analyzer]Failed to initialize Analyzer."); + REPORT_CALL_ERROR("E19999", "ge::Analyzer initialize failed."); RollbackInit(); return init_analyzer_status; } @@ -205,7 +215,8 @@ Status GELib::SystemInitialize(const map &options) { auto model_manager = ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); GE_IF_BOOL_EXEC(model_manager->EnableExceptionDump(options) != SUCCESS, - GELOGE(FAILED, "Enable exception dump failed"); + REPORT_CALL_ERROR("E19999", "ModelManager EnableExceptionDump failed."); + GELOGE(FAILED, "[Enable][ExceptionDump] failed."); return FAILED); // 1.`is_train_mode_` means case: train // 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer @@ -259,7 +270,10 @@ Status GELib::SetRTSocVersion(const map &options, map &options) { options.emplace(std::make_pair(AICORE_NUM, std::to_string(aicore_num))); return SUCCESS; } - GELOGE(FAILED, "rtGetAiCoreCount failed."); + GELOGE(FAILED, "[Get][AiCoreCount]rtGetAiCoreCount failed."); + REPORT_CALL_ERROR("E19999", "rtGetAiCoreCount failed."); return FAILED; } @@ -355,7 +370,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt mem_type.push_back(RT_MEMORY_P2P_DDR); Status initMmStatus = MemManager::Instance().Initialize(mem_type); if (initMmStatus != SUCCESS) { - GELOGE(initMmStatus, "[Initialize] MemoryAllocatorManager initialize failed."); + GELOGE(initMmStatus, "[Init][MemManager] MemoryAllocatorManager initialize failed."); + REPORT_CALL_ERROR("E19999", "MemManager 
initialize failed."); return initMmStatus; } @@ -363,7 +379,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt // Update CSA file CsaInteract::GetInstance().Init(options.device_id, GetContext().TraceId()); Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_RUNNING, JOBSUBSTATE_ENV_INIT); - GE_LOGE_IF(ret != SUCCESS, "write job state failed, ret:%u", ret); + GE_LOGE_IF(ret != SUCCESS, "[Write][JobState] failed, ret:%u ", ret); // set device id GELOGI("set logical device id:%u", options.device_id); @@ -394,7 +410,7 @@ Status GELib::SystemShutdownWithOptions(const Options &options) { // Update CSA file Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_SUCCEED); - GE_LOGE_IF(ret != SUCCESS, "write job state failed, ret:%u", ret); + GE_LOGE_IF(ret != SUCCESS, "[Write][JobState] failed, ret:%u ", ret); is_system_inited = false; is_shutdown = true; @@ -410,7 +426,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithout mem_type.push_back(RT_MEMORY_P2P_DDR); Status initMmStatus = MemManager::Instance().Initialize(mem_type); if (initMmStatus != SUCCESS) { - GELOGE(initMmStatus, "[Initialize] MemoryAllocatorManager initialize failed."); + GELOGE(initMmStatus, "[Init][MemoryManager] initialize failed."); + REPORT_CALL_ERROR("E19999", "MemManager initialize failed."); return initMmStatus; } GE_CHK_STATUS_RET(HostMemManager::Instance().Initialize()); @@ -506,7 +523,8 @@ Status GELib::Finalize() { instancePtr_ = nullptr; init_flag_ = false; if (final_state != SUCCESS) { - GELOGE(FAILED, "finalization failed."); + GELOGE(FAILED, "[Check][State]finalization failed."); + REPORT_INNER_ERROR("E19999", "GELib::Finalize failed."); return final_state; } GELOGI("finalization success."); diff --git a/ge/ir_build/atc_ir_common.cc b/ge/ir_build/atc_ir_common.cc index 42a78dde..6ce6ce7b 100755 --- a/ge/ir_build/atc_ir_common.cc +++ b/ge/ir_build/atc_ir_common.cc @@ -19,7 +19,9 @@ #include 
"framework/common/string_util.h" #include "framework/common/types.h" #include "framework/common/util.h" +#include "graph/compute_graph.h" #include "graph/utils/type_utils.h" +#include "graph/utils/tensor_utils.h" using std::pair; using std::string; @@ -32,6 +34,8 @@ const int64_t kDynamicImageSizeNum = 2; const size_t kMaxDynamicDimNum = 100; const size_t kMaxNDDimNum = 4; const size_t kMinNDDimNum = 1; +const size_t kSquareBracketsSize = 2; +const size_t kRangePairSize = 2; // datatype/formats from user to GE, Unified to util interface file later const std::map kOutputTypeSupportDatatype = { {"FP32", ge::DT_FLOAT}, {"FP16", ge::DT_FLOAT16}, {"UINT8", ge::DT_UINT8}}; @@ -52,6 +56,11 @@ const char *const kCompressWeightError = "it must be appointed when appoint para const char *const kSelectImplmodeError = "only support high_performance, high_precision"; const char *const kDynamicBatchSizeError = "It can only contains digit, \",\", \" \""; const char *const kKeepDtypeError = "file not found"; +const char *const kInputShapeRangeInvalid = "format of shape range is invalid"; +const char *const kShapeRangeValueConvertError = "transfer from string to int64 error"; +const char *const kInputShapeRangeSample1 = "\"input_name1:[n1~n2,c1,h1,w1]\""; +const char *const kInputShapeRangeSample2 = "\"[1~20]\""; +const char *const kInputShapeRangeSample3 = "\"[1~20,3,3~6,-1]\""; vector SplitInputShape(const std::string &input_shape) { vector shape_pair_vec; @@ -71,7 +80,7 @@ Status CheckInputFormat(const string &input_format) { if (!ge::TypeUtils::IsFormatValid(input_format.c_str())) { ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_format", input_format, "input format is invalid!"}); - GELOGE(ge::PARAM_INVALID, "input format [%s] is invalid!", input_format.c_str()); + GELOGE(ge::PARAM_INVALID, "[Check][InputFormat] --input_format[%s] is invalid!", input_format.c_str()); return ge::PARAM_INVALID; } return ge::SUCCESS; @@ -84,7 
+93,8 @@ bool CheckDynamicBatchSizeInputShapeValid(map> shape_map vector shape = iter->second; if (shape.empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10012"); - GELOGE(ge::PARAM_INVALID, "--input_shape's shape size can not be less than 1 when set --dynamic_batch_size."); + GELOGE(ge::PARAM_INVALID, + "[Check][DynamicBatchSizeInputShape] shape size can not be less than 1 when set --dynamic_batch_size."); return false; } @@ -100,7 +110,8 @@ bool CheckDynamicBatchSizeInputShapeValid(map> shape_map if (size == 0) { ErrorManager::GetInstance().ATCReportErrMessage("E10031"); - GELOGE(ge::PARAM_INVALID, "At least one batch n must be equal to -1 when set --dynamic_batch_size."); + GELOGE(ge::PARAM_INVALID, + "[Check][DynamicBatchSizeInputShape]At least one batch n must be equal to -1 when set dynamic_batch_size."); return false; } @@ -108,8 +119,8 @@ bool CheckDynamicBatchSizeInputShapeValid(map> shape_map if (!isdigit(c) && (c != ',') && (c != ' ')) { ErrorManager::GetInstance().ATCReportErrMessage( "E10033", {"value", "reason"}, {dynamic_batch_size, kDynamicBatchSizeError}); - GELOGE(ge::PARAM_INVALID, "Input parameter[--dynamic_batch_size]'s value[%s] is invalid. reason: %s", - dynamic_batch_size.c_str(), kDynamicBatchSizeError); + GELOGE(ge::PARAM_INVALID, "[Check][DynamicBatchSizeInputShape] --dynamic_batch_size:%s is invalid. 
reason: %s", + dynamic_batch_size.c_str(), kDynamicBatchSizeError); return false; } } @@ -122,7 +133,9 @@ bool CheckDynamicBatchSizeInputShapeValid(map> shape_map bool CheckDynamicImagesizeInputShapeValid(map> shape_map, const std::string input_format, std::string &dynamic_image_size) { if (!input_format.empty() && !ge::TypeUtils::IsFormatValid(input_format.c_str())) { - GELOGE(ge::PARAM_INVALID, "user input format [%s] is not found!", input_format.c_str()); + GELOGE(ge::PARAM_INVALID, + "[Check][DynamicImagesizeInputShape] input_format [%s] invalid, can not support now.", input_format.c_str()); + REPORT_INPUT_ERROR("E10414", std::vector({"input_format"}), std::vector({input_format})); return false; } int32_t size = 0; @@ -132,8 +145,8 @@ bool CheckDynamicImagesizeInputShapeValid(map> shape_map if (shape.size() != DIM_DEFAULT_SIZE) { if (std::count(shape.begin(), shape.end(), kDynamicInputDim) > 0) { ErrorManager::GetInstance().ATCReportErrMessage("E10019"); - GELOGE(ge::PARAM_INVALID, - "--input_shape's shape is invalid, only height and width can be -1 when set --dynamic_image_size."); + GELOGE(ge::PARAM_INVALID, "[Check][DynamicImagesizeInputShape] --input_shape invalid," + " only height and width can be -1 when set --dynamic_image_size."); return false; } continue; @@ -151,8 +164,8 @@ bool CheckDynamicImagesizeInputShapeValid(map> shape_map } if (size == 0) { ErrorManager::GetInstance().ATCReportErrMessage("E10019"); - GELOGE(ge::PARAM_INVALID, - "--input_shape's shape is invalid, only height and width can be -1 when set --dynamic_image_size."); + GELOGE(ge::PARAM_INVALID, "[Check][DynamicImagesizeInputShape]--input shape invalid, " + "only height and width can be -1 when set --dynamic_image_size."); return false; } @@ -167,9 +180,8 @@ bool CheckDynamicImagesizeInputShapeValid(map> shape_map ErrorManager::GetInstance().ATCReportErrMessage("E10020", {"DynamicImageSizeNum"}, {std::to_string(kDynamicImageSizeNum)}); GELOGE(ge::PARAM_INVALID, - 
"--dynamic_image_size's number of dimensions of each " - "group must be %ld.", - kDynamicImageSizeNum); + "[Check][DynamicImagesizeInputShape] invalid value:%s number of dimensions of each group must be %ld.", + dynamic_image_size.c_str(), kDynamicImageSizeNum); return false; } } @@ -183,7 +195,7 @@ bool CheckDynamicDimsInputShapeValid(const map> &shape_m ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_format", input_format.c_str(), "input_format must be ND when set dynamic_dims"}); - GELOGE(ge::PARAM_INVALID, "input_format must be ND when set dynamic_dims."); + GELOGE(ge::PARAM_INVALID, "[Check][DynamicDimsInputShape]--input_format must be ND when set dynamic_dims."); return false; } @@ -194,7 +206,8 @@ bool CheckDynamicDimsInputShapeValid(const map> &shape_m ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_shape's dim", std::to_string(shapes.size()), "Dim num must within [1, 4] when set dynamic_dims"}); - GELOGE(ge::PARAM_INVALID, "Dim num must within [%zu, %zu] when set dynamic_dims.", kMinNDDimNum, kMaxNDDimNum); + GELOGE(ge::PARAM_INVALID, "[Check][DynamicDimsInputShape]Dim num must within [%zu, %zu] when set dynamic_dims.", + kMinNDDimNum, kMaxNDDimNum); return false; } dynamic_dim += std::count(shapes.begin(), shapes.end(), kDynamicInputDim); @@ -203,12 +216,14 @@ bool CheckDynamicDimsInputShapeValid(const map> &shape_m ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_shape's dynamic dim num", "0", "at least one dim should be -1 when set dynamic_dims"}); - GELOGE(ge::PARAM_INVALID, "input_shape's shape is invalid, at least one dim should be -1 when set dynamic_dims."); + GELOGE(ge::PARAM_INVALID, + "[Check][DynamicDimsInputShape]--input_shape invalid," + "at least one dim should be -1 when set dynamic_dims."); return false; } if (!CheckAndParseDynamicDims(dynamic_dim, dynamic_dims)) { - 
GELOGE(ge::PARAM_INVALID, "Check and parse dynamic dims: %s failed.", dynamic_dims.c_str()); + GELOGE(ge::PARAM_INVALID, "[CheckAndParse][DynamicDims]failed, %s invalid.", dynamic_dims.c_str()); return false; } @@ -221,7 +236,7 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--dynamic_dims", dynamic_dims.c_str(), "dynamic_dims can not be empty"}); - GELOGE(ge::PARAM_INVALID, "dynamic_dims can not be empty."); + GELOGE(ge::PARAM_INVALID, "[CheckAndParse][DynamicDims]--dynamic_dims can not be empty."); return false; } // Different parameter sets are split by ';' @@ -229,7 +244,8 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims if (split_set.size() > kMaxDynamicDimNum) { ErrorManager::GetInstance().ATCReportErrMessage( "E10042", {"parameter", "reason"}, {"dynamic_dims", "dynamic_dims's num of parameter set can not exceed 100"}); - GELOGE(ge::PARAM_INVALID, "dynamic_dims's num of parameter set can not exceed %zu.", kMaxDynamicDimNum); + GELOGE(ge::PARAM_INVALID, + "[CheckAndParse][DynamicDims]dynamic_dims's num of parameter set can not exceed %zu.", kMaxDynamicDimNum); return false; } for (auto split_dim : split_set) { @@ -238,8 +254,9 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims ErrorManager::GetInstance().ATCReportErrMessage( "E10042", {"parameter", "reason"}, {"dynamic_dims", "Each gear setting needs to be consistent with the number of -1 in the inputshape"}); - GELOGE(ge::PARAM_INVALID, "Input parameter --dynamic_dims parse failed, " - "reason: Each gear setting needs to be consistent with the number of -1 in the inputshape."); + GELOGE(ge::PARAM_INVALID, "[CheckAndParse][DynamicDims] --dynamic_dims:%s invalid. 
" + "reason: Each gear setting needs to be consistent with the number of -1 in the inputshape.", + dynamic_dims.c_str()); return false; } for (auto dim : one_set) { @@ -248,7 +265,9 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--dynamic_dims's parameter", dim.c_str(), "must be positive integer"}); - GELOGE(ge::PARAM_INVALID, "dynamic_dims's parameter must be positive integer."); + GELOGE(ge::PARAM_INVALID, + "[CheckAndParse][DynamicDims]--dynamic_dims:%s parameter must be positive integer.", + dynamic_dims.c_str()); return false; } } @@ -257,18 +276,146 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims return true; } +bool StringToLongNoThrow(const string &str, long &val) { + try { + val = std::stol(str); + return true; + } catch (const std::invalid_argument) { + ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, + {str, kShapeRangeValueConvertError, kInputShapeRangeSample3}); + GELOGE(PARAM_INVALID, "[Parse][Parameter] str:%s invalid, reason: %s, correct sample is %s.", + str.c_str(), kShapeRangeValueConvertError, kInputShapeRangeSample3); + } catch (const std::out_of_range) { + ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, + {str, kShapeRangeValueConvertError, kInputShapeRangeSample3}); + GELOGE(PARAM_INVALID, "[Parse][Parameter] str:%s invalid, reason: %s, correct sample is %s.", + str.c_str(), kShapeRangeValueConvertError, kInputShapeRangeSample3); + } + return false; +} + +bool ParseSingleShapeRange(std::string &shape_range, vector> &shape_range_vec) { + vector square_brackets; + for (auto ch : shape_range) { + if (ch == '[' || ch == ']') { + square_brackets.push_back(ch); + } + } + + bool is_square_brackets = (square_brackets.size() == kSquareBracketsSize) && + (square_brackets[0] == '[') && 
(square_brackets[1] == ']'); + if (!is_square_brackets) { + ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, + {shape_range, kInputShapeRangeInvalid, kInputShapeRangeSample2}); + GELOGE(PARAM_INVALID, "[Parse][Parameter] shape_range:%s invalid, reason: %s, correct sample is %s.", + shape_range.c_str(), kInputShapeRangeInvalid, kInputShapeRangeSample2); + return false; + } + // trim start bytes, after that, single input should be "1~20,3,3~6,-1" + if (ge::StringUtils::StartWith(shape_range, "[")) { + shape_range = shape_range.substr(1, shape_range.size() - 1); + } + // parse shape_range of single input. eg. "1~20,3,3~6,-1" + vector dim_range_set = ge::StringUtils::Split(shape_range, ','); + for (const auto &range_pair_str : dim_range_set) { + vector range_pair_set = ge::StringUtils::Split(range_pair_str, '~'); + pair range_pair; + if (range_pair_set.size() == 1) { + long range_value = 0; + if (!StringToLongNoThrow(range_pair_set.at(0), range_value)) { + return false; + } + if (range_value < 0) { + range_pair = std::make_pair(1, range_value); + } else { + range_pair = std::make_pair(range_value, range_value); + } + } else if (range_pair_set.size() == kRangePairSize) { + // unknown dim, should get range. 
+ long range_left = 0; + if (!StringToLongNoThrow(range_pair_set.at(0), range_left)) { + return false; + } + long range_right = 0; + if (!StringToLongNoThrow(range_pair_set.at(1), range_right)) { + return false; + } + if (range_left < 0 || (range_right < 0)) { + ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, + {shape_range, kInputShapeRangeInvalid, kInputShapeRangeSample3}); + GELOGE(PARAM_INVALID, + "[Parse][InputParameter] [--input_shape_range]'s shape range[%s] failed," + "reason: %s, correct sample is %s.", + shape_range.c_str(), kInputShapeRangeInvalid, kInputShapeRangeSample3); + return false; + } + range_pair = std::make_pair(range_left, range_right); + } else { + ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, + {shape_range, kInputShapeRangeInvalid, kInputShapeRangeSample3}); + GELOGE(PARAM_INVALID, "[Parse][Parameter]shape_range:%s invalid, reason: %s, correct sample is %s.", + shape_range.c_str(), kInputShapeRangeInvalid, kInputShapeRangeSample3); + return false; + } + shape_range_vec.emplace_back(range_pair); + } + return true; +} + +bool ParseInputShapeRange(const std::string &shape_range, + std::map>> &shape_range_map) { + GELOGD("Input shape range %s", shape_range.c_str()); + + vector shape_range_vec = StringUtils::Split(shape_range, ';'); + const int DEFAULT_SHAPE_RANGE_PAIR_SIZE = 2; + for (const auto &shape_range_item : shape_range_vec) { + vector shape_range_pair_vec = SplitInputShape(shape_range_item); + if (shape_range_pair_vec.size() != DEFAULT_SHAPE_RANGE_PAIR_SIZE) { + ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, + {shape_range, kSplitError1, kInputShapeRangeSample1}); + GELOGE(PARAM_INVALID, "[Parse][Parameter]--input shape_range:%s invalid, reason: %s, correct sample is %s.", + shape_range.c_str(), kSplitError1, kInputShapeRangeSample1); + return false; + } + if (shape_range_pair_vec[1].empty()) { 
+ ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape", "reason", "sample"}, + {shape_range, kEmptyError, kInputShapeRangeSample1}); + GELOGE(PARAM_INVALID, "[Parse][Parameter]shape_range:%s invalid,reason: %s, correct sample is %s.", + shape_range.c_str(), kEmptyError, kInputShapeRangeSample1); + return false; + } + + string shape_range_str = shape_range_pair_vec[1]; + vector> shape_range_val; + if (!ParseSingleShapeRange(shape_range_str, shape_range_val)) { + GELOGE(PARAM_INVALID, "[Parse][Parameter] shape_range_str: %s invalid.", shape_range_str.c_str()); + return false; + } + shape_range_map.emplace(make_pair(StringUtils::Trim(shape_range_pair_vec[0]), shape_range_val)); + } + return true; +} + Status CheckDynamicInputParamValid(string &dynamic_batch_size, string &dynamic_image_size, string &dynamic_dims, - const string input_shape, const string input_format, bool &is_dynamic_input) { + const string input_shape, const string input_shape_range, const string input_format, bool &is_dynamic_input) { int32_t param_size = static_cast(!dynamic_batch_size.empty()) + - static_cast(!dynamic_image_size.empty()) + static_cast(!dynamic_dims.empty()); + static_cast(!dynamic_image_size.empty()) + static_cast(!dynamic_dims.empty()); if (param_size > 1) { ErrorManager::GetInstance().ATCReportErrMessage("E10009", {"parameter0", "parameter1", "parameter2"}, {"dynamic_batch_size", "dynamic_image_size", "dynamic_dims"}); - GELOGE(ge::PARAM_INVALID, "dynamic_batch_size, dynamic_image_size and dynamic_dims can only be set one"); + GELOGE(ge::PARAM_INVALID, + "[Parse][Parameter]dynamic_batch_size, dynamic_image_size and dynamic_dims can only be set one"); return ge::PARAM_INVALID; } if (param_size == 0) { + if (!input_shape_range.empty()) { + std::map>> shape_range_map; + if (!ParseInputShapeRange(input_shape_range, shape_range_map)) { + GELOGE(ge::PARAM_INVALID, "[Parse][InputShapeRange] failed, range: %s", input_shape_range.c_str()); + return ge::PARAM_INVALID; + } + } 
return ge::SUCCESS; } @@ -277,32 +424,34 @@ Status CheckDynamicInputParamValid(string &dynamic_batch_size, string &dynamic_i is_dynamic_input = true; if (input_shape.empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"parameter"}, {"input_shape"}); - GELOGE(ge::PARAM_INVALID, "The input_shape can not be empty in dynamic input size scenario."); + GELOGE(ge::PARAM_INVALID, + "[Check][Parameter:input_shape]The input_shape can not be empty in dynamic input size scenario."); return ge::PARAM_INVALID; } if (!ParseInputShape(input_shape, shape_map, user_shape_map, is_dynamic_input)) { - GELOGE(ge::PARAM_INVALID, "Failed to parse input shape: %s", input_shape.c_str()); + GELOGE(ge::PARAM_INVALID, "[Parse][InputShape]input_shape: %s invalid.", input_shape.c_str()); return ge::PARAM_INVALID; } if (!dynamic_batch_size.empty()) { if (!CheckDynamicBatchSizeInputShapeValid(shape_map, dynamic_batch_size)) { - GELOGE(ge::PARAM_INVALID, "Check dynamic batch size input shape failed: %s", input_shape.c_str()); + GELOGE(ge::PARAM_INVALID, "[Check][DynamicBatchSizeInputShape] input_shape: %s invalid.", input_shape.c_str()); return ge::PARAM_INVALID; } } if (!dynamic_image_size.empty()) { if (!CheckDynamicImagesizeInputShapeValid(shape_map, input_format, dynamic_image_size)) { - GELOGE(ge::PARAM_INVALID, "Check dynamic image size input shape failed: %s", input_shape.c_str()); + GELOGE(ge::PARAM_INVALID, "[Check][DynamicImagesizeInputShape] %s invalid. 
dynamic_image_size:%s ", + input_shape.c_str(), dynamic_image_size.c_str()); return ge::PARAM_INVALID; } } if (!dynamic_dims.empty()) { if (!CheckDynamicDimsInputShapeValid(shape_map, input_format, dynamic_dims)) { - GELOGE(ge::PARAM_INVALID, "Check dynamic dims: %s of input shape: %s failed.", dynamic_dims.c_str(), + GELOGE(ge::PARAM_INVALID, "[Check][DynamicDimsInputShape]: %s of input shape: %s failed.", dynamic_dims.c_str(), input_shape.c_str()); return ge::PARAM_INVALID; } @@ -354,7 +503,8 @@ bool ParseInputShape(const string &input_shape, map> &sh if (!isdigit(c)) { ErrorManager::GetInstance().ATCReportErrMessage("E10002", {"shape", "reason", "sample"}, {shape, kDigitError, kInputShapeSample2}); - GELOGE(PARAM_INVALID, "--input_shape's shape value[%s] is not digit", shape_value_str.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]--input_shape's shape value[%s] is not digit", + shape_value_str.c_str()); return false; } } @@ -377,7 +527,8 @@ bool ParseInputShape(const string &input_shape, map> &sh int64_t result = left_result; // - 1 is not currently supported if (!is_dynamic_input && result <= 0) { - ErrorManager::GetInstance().ATCReportErrMessage("E10011", {"shape", "result"}, {shape, std::to_string(result)}); + ErrorManager::GetInstance().ATCReportErrMessage("E10011", {"shape", "result"}, + {shape, std::to_string(result)}); GELOGW( "Input parameter[--input_shape]’s shape value[%s] is invalid, " "expect positive integer, but value is %ld.", @@ -396,10 +547,10 @@ bool ParseInputShape(const string &input_shape, map> &sh Status CheckOutputTypeParamValid(const std::string output_type) { if ((!output_type.empty()) && (kOutputTypeSupportDatatype.find(output_type) == kOutputTypeSupportDatatype.end())) { - ErrorManager::GetInstance().ATCReportErrMessage( - "E10001", {"parameter", "value", "reason"}, {"--output_type", output_type, kOutputTypeSupport}); + ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, + {"--output_type", 
output_type, kOutputTypeSupport}); GELOGE(ge::PARAM_INVALID, - "Invalid value for --output_type[%s], %s.", output_type.c_str(), kOutputTypeSupport); + "[Check][Param]Invalid value for --output_type[%s], %s.", output_type.c_str(), kOutputTypeSupport); return ge::PARAM_INVALID; } return ge::SUCCESS; @@ -408,33 +559,37 @@ Status CheckOutputTypeParamValid(const std::string output_type) { Status CheckBufferOptimizeParamValid(const std::string buffer_optimize) { if ((!buffer_optimize.empty()) && (kBufferOptimizeSupportOption.find(buffer_optimize) == kBufferOptimizeSupportOption.end())) { - ErrorManager::GetInstance().ATCReportErrMessage( - "E10001", {"parameter", "value", "reason"}, {"--buffer_optimize", buffer_optimize, kBufferOptimizeSupport}); + ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, + {"--buffer_optimize", buffer_optimize, kBufferOptimizeSupport}); GELOGE(ge::PARAM_INVALID, - "Invalid value for --buffer_optimize[%s], %s.", buffer_optimize.c_str(), kBufferOptimizeSupport); + "[Check][BufferOptimize]Invalid value for [%s], %s.", buffer_optimize.c_str(), kBufferOptimizeSupport); return ge::PARAM_INVALID; } return ge::SUCCESS; } -Status CheckCompressWeightParamValid(const std::string enable_compress_weight, const std::string compress_weight_conf) { +Status CheckCompressWeightParamValid(const std::string enable_compress_weight, + const std::string compress_weight_conf) { if ((!compress_weight_conf.empty()) && (!CheckInputPathValid(compress_weight_conf, "--compress_weight_conf"))) { - GELOGE(ge::PARAM_INVALID, "compress weight config file not found, file_name:%s", compress_weight_conf.c_str()); + GELOGE(ge::PARAM_INVALID, "[Check][InputPath]compress weight config file not found, file_name:%s", + compress_weight_conf.c_str()); return ge::PARAM_INVALID; } if ((enable_compress_weight != "") && (enable_compress_weight != "true") && (enable_compress_weight != "false")) { - ErrorManager::GetInstance().ATCReportErrMessage( - 
"E10005", {"parameter", "value"}, {"enable_compress_weight", enable_compress_weight}); - GELOGE(ge::PARAM_INVALID, - "Input parameter[--enable_compress_weight]'s value[%s] must be true or false.", enable_compress_weight.c_str()); + ErrorManager::GetInstance().ATCReportErrMessage("E10005", {"parameter", "value"}, + {"enable_compress_weight", enable_compress_weight}); + GELOGE(ge::PARAM_INVALID, "[Check][Param:enable_compress_weight]" + "Input parameter[--enable_compress_weight]'s value:%s must be true or false.", + enable_compress_weight.c_str()); return ge::PARAM_INVALID; } if ((enable_compress_weight == "true") && (!compress_weight_conf.empty())) { ErrorManager::GetInstance().ATCReportErrMessage("E10047", {"parameter0", "parameter1"}, {"enable_compress_weight", "compress_weight_conf"}); - GELOGE(ge::PARAM_INVALID, "enable_compress_weight and compress_weight_conf can not both exist!!"); + GELOGE(ge::PARAM_INVALID, + "[Check][CompressWeight]enable_compress_weight and compress_weight_conf can not both exist!!"); return ge::PARAM_INVALID; } return ge::SUCCESS; @@ -442,9 +597,9 @@ Status CheckCompressWeightParamValid(const std::string enable_compress_weight, c Status CheckKeepTypeParamValid(const std::string &keep_dtype) { if ((!keep_dtype.empty()) && (!CheckInputPathValid(keep_dtype, "--keep_dtype"))) { - ErrorManager::GetInstance().ATCReportErrMessage( - "E10001", {"parameter", "value", "reason"}, {"--keep_dtype", keep_dtype, kKeepDtypeError}); - GELOGE(ge::PARAM_INVALID, "keep dtype config file not found, file_name:%s", keep_dtype.c_str()); + ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, + {"--keep_dtype", keep_dtype, kKeepDtypeError}); + GELOGE(ge::PARAM_INVALID, "[Check][InputPath::--keep_dtype] file not found, file_name:%s", keep_dtype.c_str()); return ge::PARAM_INVALID; } @@ -466,11 +621,15 @@ int CheckLogParamValidAndSetLogLevel(const std::string log) { } else if (log == "error") { ret = dlog_setlevel(-1, 
DLOG_ERROR, 1); } else { - GELOGE(ge::PARAM_INVALID, "invalid value for log:%s, only support debug, info, warning, error, null", log.c_str()); + GELOGE(ge::PARAM_INVALID, + "[Check][LogParam]log:%s invalid, only support debug, info, warning, error, null", log.c_str()); + REPORT_INPUT_ERROR("E10417", std::vector({"loglevel"}), std::vector({log})); return ret; } if (ret != 0) { - GELOGE(ge::PARAM_INVALID, "Log setlevel fail !"); + GELOGE(ge::PARAM_INVALID, "[Set][LogLevel] fail, level:%s.", log.c_str()); + REPORT_INPUT_ERROR("E10417", std::vector({"loglevel"}), std::vector({log})); + } return ret; } @@ -478,7 +637,7 @@ int CheckLogParamValidAndSetLogLevel(const std::string log) { Status CheckInsertOpConfParamValid(const std::string insert_op_conf) { if ((!insert_op_conf.empty()) && (!CheckInputPathValid(insert_op_conf, "--insert_op_conf"))) { - GELOGE(ge::PARAM_INVALID, "insert op config file not found: %s", insert_op_conf.c_str()); + GELOGE(ge::PARAM_INVALID, "[Check][InputPath]file not found: %s", insert_op_conf.c_str()); return ge::PARAM_INVALID; } return ge::SUCCESS; @@ -487,7 +646,7 @@ Status CheckInsertOpConfParamValid(const std::string insert_op_conf) { Status CheckDisableReuseMemoryParamValid(const std::string disable_reuse_memory) { if ((disable_reuse_memory != "") && (disable_reuse_memory != "0") && (disable_reuse_memory != "1")) { ErrorManager::GetInstance().ATCReportErrMessage("E10006", {"parameter"}, {"disable_reuse_memory"}); - GELOGE(ge::PARAM_INVALID, "Input parameter[--disable_reuse_memory]'s value must be 1 or 0."); + GELOGE(ge::PARAM_INVALID, "[Check][DisableReuseMemory]disable_reuse_memory must be 1 or 0."); return ge::PARAM_INVALID; } return ge::SUCCESS; @@ -495,9 +654,9 @@ Status CheckDisableReuseMemoryParamValid(const std::string disable_reuse_memory) Status CheckEnableSingleStreamParamValid(const std::string enable_single_stream) { if ((enable_single_stream != "") && (enable_single_stream != "true") && (enable_single_stream != "false")) { - 
ErrorManager::GetInstance().ATCReportErrMessage( - "E10005", {"parameter", "value"}, {"enable_single_stream", enable_single_stream}); - GELOGE(ge::PARAM_INVALID, "Input parameter[--enable_single_stream]'s value[%s] must be true or false.", + ErrorManager::GetInstance().ATCReportErrMessage("E10005", {"parameter", "value"}, + {"enable_single_stream", enable_single_stream}); + GELOGE(ge::PARAM_INVALID, "[Check][Param:--enable_single_stream] value:%s must be true or false.", enable_single_stream.c_str()); return ge::PARAM_INVALID; } @@ -508,9 +667,10 @@ Status CheckImplmodeParamValid(const std::string &optypelist_for_implmode, std:: // only appointed op_select_implmode, can user appoint optypelist_for_implmode if (optypelist_for_implmode != "" && op_select_implmode == "") { ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, - {"--op_select_implmode", op_select_implmode.c_str(), kCompressWeightError}); - GELOGE(ge::PARAM_INVALID, "Invalid value for --op_select_implmode[%s], %s.", - op_select_implmode.c_str(), kCompressWeightError); + {"--op_select_implmode", op_select_implmode.c_str(), + kCompressWeightError}); + GELOGE(ge::PARAM_INVALID, "[Check][Param:--op_select_implmode]value:%s invalid, %s.", + op_select_implmode.c_str(), kCompressWeightError); return ge::PARAM_INVALID; } // op_select_implmode default value is high_performance @@ -520,9 +680,10 @@ Status CheckImplmodeParamValid(const std::string &optypelist_for_implmode, std:: if (op_select_implmode != IR_OPTION_OP_SELECT_IMPLMODE_DEFAULT && op_select_implmode != IR_OPTION_OP_SELECT_IMPLMODE_PRECISON) { ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, - {"--op_select_implmode", op_select_implmode.c_str(), kSelectImplmodeError}); - GELOGE(ge::PARAM_INVALID, "Invalid value for --op_select_implmode[%s], %s.", - op_select_implmode.c_str(), kSelectImplmodeError); + {"--op_select_implmode", op_select_implmode.c_str(), + 
kSelectImplmodeError}); + GELOGE(ge::PARAM_INVALID, "[Check][Implmode]Invalid value for --op_select_implmode[%s], %s.", + op_select_implmode.c_str(), kSelectImplmodeError); return ge::PARAM_INVALID; } } @@ -546,4 +707,122 @@ void EraseEndSemicolon(string ¶m) { param.erase(param.end() - 1); } } + +Status UpdateDataOpShape(const OpDescPtr &op, map> &shape_map) { + GE_CHECK_NOTNULL(op); + if (shape_map.empty()) { + GELOGI("Shape map of data op [%s] is empty, no need to update.", op->GetName().c_str()); + return SUCCESS; + } + + auto tensor_input = op->MutableInputDesc(0); + auto tensor_output = op->MutableOutputDesc(0); + GE_CHECK_NOTNULL(tensor_input); + GE_CHECK_NOTNULL(tensor_output); + string data_op_name = op->GetName(); + auto iter = shape_map.find(data_op_name); + if (iter != shape_map.end()) { + tensor_input->SetShape(ge::GeShape(iter->second)); + tensor_output->SetShape(ge::GeShape(iter->second)); + GELOGI("Update input [%s] shape info", data_op_name.c_str()); + } else { + GELOGI("No need update input [%s] attr because not found from input_shape.", data_op_name.c_str()); + } + + return SUCCESS; +} + +Status UpdateDataOpShapeRange(const OpDescPtr &op, + map>> &shape_range_map) { + GE_CHECK_NOTNULL(op); + if (shape_range_map.empty()) { + GELOGI("Shape range map of data op [%s] is empty.", op->GetName().c_str()); + return SUCCESS; + } + + auto tensor_input = op->MutableInputDesc(0); + auto tensor_output = op->MutableOutputDesc(0); + GE_CHECK_NOTNULL(tensor_input); + GE_CHECK_NOTNULL(tensor_output); + string data_op_name = op->GetName(); + auto origin_shape = tensor_input->GetShape(); + auto iter = shape_range_map.find(data_op_name); + if (iter != shape_range_map.end()) { + auto cur_shape_range = iter->second; + if (TensorUtils::CheckShapeByShapeRange(origin_shape, cur_shape_range) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Check][OpDescPtr] Check shape by shape range failed for op:%s.", data_op_name.c_str()); + return PARAM_INVALID; + } + for (size_t idx = 0; idx < 
cur_shape_range.size(); idx++) { + auto left_range = cur_shape_range[idx].first; + auto right_range = cur_shape_range[idx].second; + if (left_range != right_range) { + origin_shape.SetDim(idx, UNKNOWN_DIM); + } + } + tensor_input->SetShape(origin_shape); + tensor_input->SetShapeRange(cur_shape_range); + tensor_output->SetShape(origin_shape); + tensor_output->SetShapeRange(cur_shape_range); + GELOGI("Update input [%s] shape range info", data_op_name.c_str()); + } else { + GELOGI("No need to update input [%s] attr because not found from input_shape_range.", data_op_name.c_str()); + } + + return SUCCESS; +} + +static Status CheckInputShapeRangeNode(const ComputeGraphPtr &compute_graph, + const map>> &shape_range_map) { + for (const auto &it : shape_range_map) { + std::string node_name = it.first; + ge::NodePtr node = compute_graph->FindNode(node_name); + if (node == nullptr) { + REPORT_INPUT_ERROR("E10016", std::vector({"parameter", "opname"}), + std::vector({"input_shape_range", node_name})); + GELOGE(PARAM_INVALID, "[Check][InputNode]Input parameter[--input_shape_range]'s opname[%s] is not exist in model", + node_name.c_str()); + return PARAM_INVALID; + } + if (node->GetType() != DATA) { + REPORT_INPUT_ERROR("E10017", std::vector({"parameter", "opname"}), + std::vector({"input_shape_range", node_name})); + GELOGE(PARAM_INVALID, "[Check][InputNode]Input parameter[--input_shape_range]'s opname[%s] is not a input opname", + node_name.c_str()); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +Status UpdateDynamicInputShapeRange(const ge::ComputeGraphPtr &compute_graph, const string &input_shape_range) { + if (input_shape_range.empty()) { + return SUCCESS; + } + GE_CHECK_NOTNULL(compute_graph); + + map>> shape_range_map; + if (!ParseInputShapeRange(input_shape_range, shape_range_map)) { + GELOGE(PARAM_INVALID, "[Parse][InputShapeRange] input_shape_range:%s invalid.", input_shape_range.c_str()); + return PARAM_INVALID; + } + + if 
(CheckInputShapeRangeNode(compute_graph, shape_range_map) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Check][InputShapeRange]check input shape range:%s failed.", input_shape_range.c_str()); + return PARAM_INVALID; + } + + for (NodePtr &input_node : compute_graph->GetDirectNode()) { + GE_CHECK_NOTNULL(input_node); + OpDescPtr op = input_node->GetOpDesc(); + GE_CHECK_NOTNULL(op); + if (op->GetType() == DATA) { + if (UpdateDataOpShapeRange(op, shape_range_map) != SUCCESS) { + GELOGE(FAILED, "[Update][InputShapeRange] fail for op:%s.", op->GetName().c_str()); + return FAILED; + } + } + } + return SUCCESS; +} } // namespace ge diff --git a/ge/ir_build/atc_ir_common.h b/ge/ir_build/atc_ir_common.h index 2ad4efa8..6ff40547 100644 --- a/ge/ir_build/atc_ir_common.h +++ b/ge/ir_build/atc_ir_common.h @@ -31,7 +31,7 @@ namespace ge { static std::set caffe_support_input_format = {"NCHW", "ND"}; static std::set tf_support_input_format = {"NCHW", "NHWC", "ND", "NCDHW", "NDHWC"}; -static std::set onnx_support_input_format = {"NCHW", "ND"}; +static std::set onnx_support_input_format = {"NCHW", "ND", "NCDHW"}; static std::map input_format_str_to_geformat = { {"ND", domi::DOMI_TENSOR_ND}, @@ -59,10 +59,13 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims Status CheckDynamicInputParamValid(std::string &dynamic_batch_size, std::string &dynamic_image_size, std::string &dynamic_dims, const std::string input_shape, - const std::string input_format, bool &is_dynamic_input); + const std::string input_shape_range, const std::string input_format, + bool &is_dynamic_input); bool ParseInputShape(const std::string &input_shape, std::map> &shape_map, std::vector>> &user_shape_map, bool is_dynamic_input = false); +bool ParseInputShapeRange(const std::string &shape_range, + std::map>> &shape_range_map); Status CheckOutputTypeParamValid(const std::string output_type); Status CheckBufferOptimizeParamValid(const std::string buffer_optimize); @@ -76,5 +79,9 @@ Status 
CheckInputFormat(const string &input_format); Status CheckKeepTypeParamValid(const std::string &keep_dtype); void PrintOptionMap(std::map &options, std::string tips); void EraseEndSemicolon(std::string ¶m); +Status UpdateDataOpShape(const OpDescPtr &op, std::map> &shape_map); +Status UpdateDataOpShapeRange(const OpDescPtr &op, + std::map>> &shape_range_map); +Status UpdateDynamicInputShapeRange(const ge::ComputeGraphPtr &compute_graph, const string &input_shape_range); } #endif // FRAMEWORK_DOMI_ATC_IR_COMMON_H_ diff --git a/ge/ir_build/attr_options/keep_dtype_option.cc b/ge/ir_build/attr_options/keep_dtype_option.cc index 26954b82..c2d87d51 100644 --- a/ge/ir_build/attr_options/keep_dtype_option.cc +++ b/ge/ir_build/attr_options/keep_dtype_option.cc @@ -60,12 +60,14 @@ graphStatus KeepDtypeFunc(ComputeGraphPtr &graph, const std::string &cfg_path) { } std::string real_path = RealPath(cfg_path.c_str()); if (real_path.empty()) { - GELOGE(GRAPH_PARAM_INVALID, "Can not get real path for %s.", cfg_path.c_str()); + GELOGE(GRAPH_PARAM_INVALID, "[Get][Path]Can not get real path for %s.", cfg_path.c_str()); + REPORT_INPUT_ERROR("E10410", std::vector({"cfgpath"}), std::vector({cfg_path})); return GRAPH_PARAM_INVALID; } std::ifstream ifs(real_path); if (!ifs.is_open()) { - GELOGE(GRAPH_FAILED, "Open file %s failed", cfg_path.c_str()); + GELOGE(GRAPH_FAILED, "[Open][File] %s failed.", cfg_path.c_str()); + REPORT_INNER_ERROR("E19999", "open file:%s failed.", cfg_path.c_str()); return GRAPH_FAILED; } diff --git a/ge/ir_build/attr_options/weight_compress_option.cc b/ge/ir_build/attr_options/weight_compress_option.cc index 0b8af37e..3c057d04 100644 --- a/ge/ir_build/attr_options/weight_compress_option.cc +++ b/ge/ir_build/attr_options/weight_compress_option.cc @@ -30,12 +30,14 @@ graphStatus WeightCompressFunc(ComputeGraphPtr &graph, const string &cfg_path) { } std::string real_path = RealPath(cfg_path.c_str()); if (real_path.empty()) { - GELOGE(GRAPH_PARAM_INVALID, "Can not get 
real path for %s.", cfg_path.c_str()); + GELOGE(GRAPH_PARAM_INVALID, "[Get][Path]Can not get real path for %s.", cfg_path.c_str()); + REPORT_INPUT_ERROR("E10410", std::vector({"cfgpath"}), std::vector({cfg_path})); return GRAPH_PARAM_INVALID; } std::ifstream ifs(real_path); if (!ifs.is_open()) { - GELOGE(GRAPH_FAILED, "Open file %s failed", cfg_path.c_str()); + GELOGE(GRAPH_FAILED, "[Open][File] %s failed", cfg_path.c_str()); + REPORT_INNER_ERROR("E19999", "open file:%s failed.", cfg_path.c_str()); return GRAPH_FAILED; } @@ -55,7 +57,8 @@ graphStatus WeightCompressFunc(ComputeGraphPtr &graph, const string &cfg_path) { if ((op_desc->GetName() == compress_node_vec[i]) || IsOriginalOpFind(op_desc, compress_node_vec[i])) { is_find = true; if (!ge::AttrUtils::SetBool(op_desc, ge::ATTR_NAME_COMPRESS_WEIGHT, true)) { - GELOGE(GRAPH_FAILED, "node %s SetBool failed.", compress_node_vec[i].c_str()); + GELOGE(GRAPH_FAILED, "[Set][Bool] failed, node:%s.", compress_node_vec[i].c_str()); + REPORT_CALL_ERROR("E19999", "SetBool failed, node:%s.", compress_node_vec[i].c_str()); return GRAPH_FAILED; } } diff --git a/ge/ir_build/ge_ir_build.cc b/ge/ir_build/ge_ir_build.cc index 62684e3a..336102d4 100644 --- a/ge/ir_build/ge_ir_build.cc +++ b/ge/ir_build/ge_ir_build.cc @@ -55,6 +55,7 @@ const std::string IR_OPTION_DISABLE_REUSE_MEMORY_DEFAULT = "0"; const std::string IR_OPTION_ENABLE_COMPRESS_WEIGHT_DEFAULT = "false"; const std::string KEEP_DTYPE_OPTION = "keep_dtype"; const std::string kInputShape = "input_shape"; +const std::string kInputShapeRange = "input_shape_range"; const std::string kInputFormat = "input_format"; /** @@ -84,21 +85,21 @@ static graphStatus CheckGlobalOptions(std::map &global ? 
IR_OPTION_DISABLE_REUSE_MEMORY_DEFAULT : global_options[ge::ir_option::EXEC_DISABLE_REUSED_MEMORY]; GE_CHK_BOOL_EXEC(ge::CheckDisableReuseMemoryParamValid(disable_reuse_memory) == ge::SUCCESS, - return ge::GRAPH_PARAM_INVALID, "check disable_reuse_memory failed!"); + return ge::GRAPH_PARAM_INVALID, "[Check][DisableReuseMemory] failed!"); global_options[ge::ir_option::EXEC_DISABLE_REUSED_MEMORY] = disable_reuse_memory; // check buffer_optimize std::string buffer_optimize = global_options.find(ge::ir_option::BUFFER_OPTIMIZE) == global_options.end() ? IR_OPTION_BUFFER_OPTIMIZE_DEFAULT : global_options[ge::ir_option::BUFFER_OPTIMIZE]; GE_CHK_BOOL_EXEC(ge::CheckBufferOptimizeParamValid(buffer_optimize) == ge::SUCCESS, - return ge::GRAPH_PARAM_INVALID, "check buffer optimize failed!"); + return ge::GRAPH_PARAM_INVALID, "[Check][BufferOptimize] failed!"); global_options[ge::ir_option::BUFFER_OPTIMIZE] = buffer_optimize; // check enable_single_stream std::string enable_single_stream = global_options.find(ge::ir_option::ENABLE_SINGLE_STREAM) == global_options.end() ? "" : global_options[ge::ir_option::ENABLE_SINGLE_STREAM]; GE_CHK_BOOL_EXEC(ge::CheckEnableSingleStreamParamValid(enable_single_stream) == ge::SUCCESS, - return ge::GRAPH_PARAM_INVALID, "check enable single stream failed!"); + return ge::GRAPH_PARAM_INVALID, "[Check][EnableSingleStream] failed!"); // check compress_weight std::string enable_compress_weight = global_options.find(ge::ir_option::ENABLE_COMPRESS_WEIGHT) == global_options.end() @@ -108,7 +109,7 @@ static graphStatus CheckGlobalOptions(std::map &global ? "" : global_options[ge::ir_option::COMPRESS_WEIGHT_CONF]; GE_CHK_BOOL_EXEC(ge::CheckCompressWeightParamValid(enable_compress_weight, compress_weight_conf) == ge::SUCCESS, - return ge::GRAPH_PARAM_INVALID, "check compress weight failed!"); + return ge::GRAPH_PARAM_INVALID, "[Check][CompressWeight] failed!"); global_options[ge::ir_option::ENABLE_COMPRESS_WEIGHT] = (enable_compress_weight == "true") ? 
ge::kEnableCompressWeightTrue : ge::kEnableCompressWeightFalse; @@ -123,7 +124,7 @@ static graphStatus CheckGlobalOptions(std::map &global : global_options[ge::ir_option::OP_SELECT_IMPL_MODE]; GE_CHK_BOOL_EXEC( ge::CheckImplmodeParamValid(optypelist_for_implmode, op_select_implmode) == ge::SUCCESS, - return ge::GRAPH_PARAM_INVALID, "check optypelist_for_implmode and op_select_implmode failed!"); + return ge::GRAPH_PARAM_INVALID, "[Check][Implmode] failed!"); global_options[ge::ir_option::OP_SELECT_IMPL_MODE] = op_select_implmode; // set precision mode default value @@ -143,7 +144,7 @@ static void GetOpsProtoPath(string &opsproto_path) { string path = path_env; string file_path = RealPath(path.c_str()); if (file_path.empty()) { - GELOGE(FAILED, "File path %s is invalid.", path.c_str()); + GELOGE(FAILED, "[Check][Path] %s is invalid.", path.c_str()); return; } opsproto_path = (path + "/op_proto/custom/" + ":") + (path + "/op_proto/built-in/"); @@ -171,7 +172,7 @@ graphStatus aclgrphBuildInitializeImpl(std::map &globa GELOGD("Enter aclgrphInitialize start!"); // check global options if (CheckGlobalOptions(global_options) != GRAPH_SUCCESS) { - GELOGE(GRAPH_PARAM_INVALID, "Check global options falied!"); + GELOGE(GRAPH_PARAM_INVALID, "[Check][Global Options] falied!"); return GRAPH_PARAM_INVALID; } @@ -185,7 +186,7 @@ graphStatus aclgrphBuildInitializeImpl(std::map &globa GELOGI("aclgrphInitialize start!"); auto ret = ge::GELib::Initialize(global_options); if (ret != ge::SUCCESS) { - GELOGE(ret, "GE initialize failed!"); + GELOGE(ret, "[Init][GELib] failed!"); return GRAPH_FAILED; } } @@ -210,7 +211,7 @@ graphStatus aclgrphBuildInitialize(std::map &global_ std::map tmp_global_options; for (auto &option : global_options) { if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) { - GELOGE(GRAPH_FAILED, "AclgrphBuildInitialize option is nullptr."); + GELOGE(GRAPH_FAILED, "[Check][Options]AclgrphBuildInitialize option is nullptr."); return 
GRAPH_FAILED; } std::string key = option.first.GetString(); @@ -280,7 +281,7 @@ graphStatus Impl::InferShapePrepare(const ComputeGraphPtr &compute_graph) { auto ret = prepare_infershape.Run(compute_graph); if ((ret != SUCCESS) && (ret != NOT_CHANGED)) { - GELOGE(ret, "Prepair for infershape failed, ret:%d", ret); + GELOGE(ret, "[Prepair][InferShape] failed, ret:%d", ret); return ret; } GELOGD("Prepair for infershape success!"); @@ -289,13 +290,20 @@ graphStatus Impl::InferShapePrepare(const ComputeGraphPtr &compute_graph) { graphStatus Impl::UpdateDataOpAttr(const Graph &graph) { GELOGD("Enter Update Data Attr Process!"); - if (options_.find(kInputShape) == options_.end()) { - return GRAPH_SUCCESS; - } + std::string input_shape = (options_.find(kInputShape) == options_.end()) ? "" : options_[kInputShape]; + std::string input_shape_range = (options_.find(kInputShapeRange) == options_.end()) ? "" : options_[kInputShapeRange]; + map> shape_map; vector>> user_shape_map; - GE_CHK_BOOL_EXEC(ParseInputShape(options_[kInputShape], shape_map, user_shape_map, true), - return GRAPH_PARAM_INVALID, "parse input shape failed!"); + if (!input_shape.empty()) { + GE_CHK_BOOL_EXEC(ParseInputShape(input_shape, shape_map, user_shape_map, true), + return GRAPH_PARAM_INVALID, "[Parse][InputShape] failed!"); + } + std::map>> shape_range_map; + if (!input_shape_range.empty()) { + GE_CHK_BOOL_EXEC(ParseInputShapeRange(input_shape_range, shape_range_map), + return GRAPH_PARAM_INVALID, "[Parse][InputShapeRange] failed."); + } auto compute_graph = ge::GraphUtils::GetComputeGraph(graph); GE_CHECK_NOTNULL(compute_graph); for (ge::NodePtr &input_node : compute_graph->GetDirectNode()) { @@ -303,21 +311,17 @@ graphStatus Impl::UpdateDataOpAttr(const Graph &graph) { ge::OpDescPtr op = input_node->GetOpDesc(); GE_CHECK_NOTNULL(op); if (op->GetType() == DATA) { - auto tensor_input = op->MutableInputDesc(0); - auto tensor_output = op->MutableOutputDesc(0); - GE_CHECK_NOTNULL(tensor_input); - 
GE_CHECK_NOTNULL(tensor_output); - string data_op_name = op->GetName(); - auto iter = shape_map.find(data_op_name); - if (iter != shape_map.end()) { - tensor_input->SetShape(ge::GeShape(iter->second)); - tensor_output->SetShape(ge::GeShape(iter->second)); - GELOGD("update input [%s] shape info", data_op_name.c_str()); - } else { - GELOGI("no need update input [%s] attr because not found from input_shape.", data_op_name.c_str()); + if (UpdateDataOpShape(op, shape_map) != SUCCESS) { + GELOGE(GRAPH_FAILED, "[Update][DataOpShape] fail for op:%s.", op->GetName().c_str()); + return GRAPH_FAILED; } + if (UpdateDataOpShapeRange(op, shape_range_map) != SUCCESS) { + GELOGE(GRAPH_FAILED, "[Update][DataOpShapeRange] fail for op:%s.", op->GetName().c_str()); + return GRAPH_FAILED; + } } } + return GRAPH_SUCCESS; } @@ -327,8 +331,8 @@ graphStatus Impl::CheckOptions(const std::map &options if (it == ge::ir_option::ir_builder_suppported_options.end()) { auto it_lx_fusion = ir_builder_supported_options_for_lx_fusion.find(ele.first); if (it_lx_fusion == ir_builder_supported_options_for_lx_fusion.end()) { - GELOGE(GRAPH_PARAM_INVALID, "input options include unsupported option(%s).Please check!", - ele.first.c_str()); + GELOGE(GRAPH_PARAM_INVALID, "[Check][Options] unsupported option(%s), Please check!", + ele.first.c_str()); return GRAPH_PARAM_INVALID; } } @@ -339,7 +343,7 @@ graphStatus Impl::CheckOptions(const std::map &options auto it = options_.find(BUILD_MODE); if (it != options_.end() && !(it->second.empty())) { if (build_mode_options.find(it->second) == build_mode_options.end()) { - GELOGE(GRAPH_PARAM_INVALID, "Build mode:%s is unsupported. Please check!", it->second.c_str()); + GELOGE(GRAPH_PARAM_INVALID, "[Check][BuildMode]:%s is unsupported. 
Please check!", it->second.c_str()); return GRAPH_PARAM_INVALID; } build_mode = it->second; @@ -347,12 +351,12 @@ graphStatus Impl::CheckOptions(const std::map &options it = options_.find(BUILD_STEP); if (it != options_.end() && !(it->second.empty())) { if (build_step_options.find(it->second) == build_step_options.end()) { - GELOGE(GRAPH_PARAM_INVALID, "Build step:%s is unsupported. Please check!", it->second.c_str()); + GELOGE(GRAPH_PARAM_INVALID, "[Check][BuildStep]:%s is unsupported. Please check!", it->second.c_str()); return GRAPH_PARAM_INVALID; } } else { if (build_mode == BUILD_MODE_TUNING) { - GELOGE(GRAPH_PARAM_INVALID, "Build mode tuning must specify build step. Please check!"); + GELOGE(GRAPH_PARAM_INVALID, "[Check][BuildMode] tuning must specify build step. Please check!"); return GRAPH_PARAM_INVALID; } } @@ -372,7 +376,7 @@ graphStatus Impl::Init(const Graph &graph, const std::map(string(IR_OPTION_MODE), to_string(0))); @@ -442,7 +448,7 @@ graphStatus Impl::Init(const Graph &graph, const std::map> shape_range; + if (tensor.GetShapeRange(shape_range) != GRAPH_SUCCESS) { + GELOGE(FAILED, "[Creat][Input] Data op [%s] get shape range failed.", data_op_name.c_str()); + return FAILED; + } ge::GeTensor inputTensor; ge::GeTensorDesc desc(data_shape, ge::Format(data_format), data_type); + if (desc.SetShapeRange(shape_range) != GRAPH_SUCCESS) { + GELOGE(FAILED, "[Creat][Input] Data op [%s] set shape range failed.", data_op_name.c_str()); + return FAILED; + } inputTensor.SetTensorDesc(desc); inputs.push_back(inputTensor); } @@ -511,7 +526,7 @@ graphStatus Impl::BuildModel(const Graph &graph, const std::mapsecond; } else { - GELOGE(GRAPH_PARAM_INVALID, "Input format %s not support , expect ND/NCHW/NHWC/CHWN/NC1HWC0/NHWC1C0.", + GELOGE(GRAPH_PARAM_INVALID, + "[Check][Param:InputForamt] %s not support , expect ND/NCHW/NHWC/CHWN/NC1HWC0/NHWC1C0.", input_format.c_str()); return GRAPH_PARAM_INVALID; } @@ -558,7 +574,7 @@ graphStatus Impl::InitDomiOmgContext(const 
string &input_shape, const string &in } if (!ParseInputShape(input_shape, omg_context_.input_dims, omg_context_.user_input_dims, is_dynamic_input)) { - GELOGE(GRAPH_PARAM_INVALID, "Failed to parse input shape: %s", input_shape.c_str()); + GELOGE(GRAPH_PARAM_INVALID, "[Parse][InputShape:input_shape] Failed, shape: %s", input_shape.c_str()); return GRAPH_PARAM_INVALID; } return GRAPH_SUCCESS; @@ -579,7 +595,7 @@ graphStatus aclgrphBuildModel(const ge::Graph &graph, const std::map tmp_build_options; for (auto &option : build_options) { if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) { - GELOGE(GRAPH_FAILED, "AclgrphBuildInitialize option is nullptr."); + GELOGE(GRAPH_FAILED, "[Check][Options]AclgrphBuildInitialize option is nullptr."); return GRAPH_FAILED; } std::string key = option.first.GetString(); @@ -595,7 +611,7 @@ graphStatus aclgrphSaveModel(const string &output_file, const ModelBufferData &m ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); GELOGD("Enter aclmdlSaveModel process!"); if (model.data.get() == nullptr || model.length == 0) { - GELOGE(GRAPH_PARAM_INVALID, "input model is illegal"); + GELOGE(GRAPH_PARAM_INVALID, "[Check][ModelBufferData] model is illegal"); return GRAPH_PARAM_INVALID; } return FileSaver::SaveToFile((output_file + ".om"), reinterpret_cast(model.data.get()), @@ -606,11 +622,11 @@ graphStatus aclgrphSaveModel(const char *output_file, const ModelBufferData &mod ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); GELOGD("Enter aclmdlSaveModel process!"); if (model.data.get() == nullptr || model.length == 0) { - GELOGE(GRAPH_PARAM_INVALID, "Input model is illegal"); + GELOGE(GRAPH_PARAM_INVALID, "[Check][ModelBufferData]model is illegal"); return GRAPH_PARAM_INVALID; } if (output_file == nullptr) { - GELOGE(GRAPH_PARAM_INVALID, "Output file is nullptr."); + GELOGE(GRAPH_PARAM_INVALID, "[Check][OutputFile]file is nullptr."); return 
GRAPH_PARAM_INVALID; } std::string str_output_file = output_file; @@ -635,7 +651,7 @@ graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const siz GE_CHECK_NOTNULL(file); if (len > PATH_MAX || len != strlen(file) || strlen(file) == 0) { - GELOGE(GRAPH_PARAM_INVALID, "File path invalid."); + GELOGE(GRAPH_PARAM_INVALID, "[Check][FilePath]file invalid."); return GRAPH_PARAM_INVALID; } @@ -669,7 +685,7 @@ graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const siz char path[PATH_MAX] = {0}; if (realpath(file_path.c_str(), path) == nullptr) { - GELOGE(GRAPH_PARAM_INVALID, "Dump file path:%s is invalid.", file); + GELOGE(GRAPH_PARAM_INVALID, "[Check][DumpFile] path:%s is invalid.", file); return GRAPH_PARAM_INVALID; } @@ -704,7 +720,7 @@ graphStatus aclgrphGenerateForOp(const AscendString &op_type, const vectorAddInputDesc(tensor_desc) != ge::GRAPH_SUCCESS) { - GELOGE(ge::FAILED, "AddInputDesc fail."); + GELOGE(ge::FAILED, "[Add][InputDesc] fail."); return ge::FAILED; } input_tensors.emplace_back(tensor_desc); @@ -728,7 +744,7 @@ graphStatus aclgrphGenerateForOp(const AscendString &op_type, const vectorsecond; + return SUCCESS; + } + GELOGW("No session id were found with model id [%u].", model_id); + return INTERNAL_ERROR; +} } // namespace ge diff --git a/ge/model/ge_model.h b/ge/model/ge_model.h index 5676c3b6..08db8cc3 100755 --- a/ge/model/ge_model.h +++ b/ge/model/ge_model.h @@ -71,6 +71,11 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder void SetModelId(uint32_t model_id) { model_id_ = model_id; } uint32_t GetModelId() const { return model_id_; } + Status GetSessionId(uint32_t model_id, uint64_t &session_id) const; + void InsertSessionMap(uint32_t model_id, uint64_t session_id) { + model_id_to_session_id_map_.insert({model_id, session_id}); + } + protected: ConstProtoAttrMapHelper GetAttrMap() const override; @@ -90,6 +95,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : 
public AttrHolder std::string platform_version_; uint8_t platform_type_ = {0}; uint32_t model_id_ = INVALID_MODEL_ID; + std::map model_id_to_session_id_map_; }; } // namespace ge using GeModelPtr = std::shared_ptr; diff --git a/ge/model/ge_root_model.h b/ge/model/ge_root_model.h index aa5a4d47..8c44272d 100755 --- a/ge/model/ge_root_model.h +++ b/ge/model/ge_root_model.h @@ -32,15 +32,36 @@ class GeRootModel { return subgraph_instance_name_to_model_; }; - const ComputeGraphPtr &GetRootGraph() const { return root_graph_; }; - void SetModelId(uint32_t model_id) { model_id_ = model_id; } + const ComputeGraphPtr &GetRootGraph() const { return root_graph_; } + void SetModelId(uint32_t model_id) { + model_id_ = model_id; + // cached for removement + model_ids_.emplace_back(model_id); + } uint32_t GetModelId() const { return model_id_; } + + std::vector GetAllModelId() const { return model_ids_; } + + void SetModelName(const std::string &model_name) { model_name_ = model_name; } + + const std::string &GetModelName() const { return model_name_; } + Status CheckIsUnknownShape(bool &is_dynamic_shape); + void SetRootGraph(ComputeGraphPtr graph) { root_graph_ = graph; } + + void SetTrainFlag(bool flag) { train_flag_ = flag; } + + bool GetTrainFlag() const { return train_flag_; } + private: ComputeGraphPtr root_graph_ = nullptr; std::map subgraph_instance_name_to_model_; uint32_t model_id_ = 0; + // In multithread online secenario, same graph can owns different davinci_model for for concurrency + std::vector model_ids_; + bool train_flag_ = false; + std::string model_name_; }; } // namespace ge using GeRootModelPtr = std::shared_ptr; diff --git a/ge/offline/main.cc b/ge/offline/main.cc index 28df9969..54a1d8fb 100755 --- a/ge/offline/main.cc +++ b/ge/offline/main.cc @@ -70,7 +70,7 @@ const char *const kModeSupport = "only support 0(model to framework model), " const char *const kModelToJsonSupport = "only support 0(Caffe) 3(TensorFlow) 5(Onnx)"; const char *const 
kCaffeFormatSupport = "only support NCHW, ND in Caffe model"; const char *const kTFFormatSupport = "only support NCHW, NHWC, ND, NCDHW, NDHWC in TF model"; -const char *const kONNXFormatSupport = "only support NCHW, ND in ONNX model"; +const char *const kONNXFormatSupport = "only support NCHW, ND, NCDHW in ONNX model"; // limit available mem size 2G const long kMinAvailableMem = 2097152; // 2 * 1024 * 1024 } // namespace @@ -84,6 +84,10 @@ DEFINE_string(input_shape, "", "Optional; shape of input data. Required when framework is caffe " "or TensorFLow or MindSpore or Onnx. " "Format: \"input_name1:n1,c1,h1,w1;input_name2:n2,c2,h2,w2\""); +DEFINE_string(input_shape_range, "", + "Optional; shape range of input data. Required when framework is caffe " + "or TensorFLow or Onnx. " + "Format: \"input_name1:[n1~n2,c1,h1,w1];input_name2:[n2~n3,c2,h2,w2]\""); DEFINE_bool(h, false, "show this help message"); DEFINE_string(cal_conf, "", "Optional; the calibration config file."); @@ -212,6 +216,10 @@ DEFINE_string(op_bank_path, "", "Optional; op bank path"); DEFINE_string(display_model_info, "0", "Optional; display model info"); +DEFINE_string(performance_mode, "", "Optional; express high compile performance or high execute performance." + "normal: no need to compile, used saved .o files directly;" + "high: need to recompile, high execute performance mode."); + class GFlagUtils { public: /** @@ -242,6 +250,9 @@ class GFlagUtils { " --input_shape Shape of input data. Separate multiple nodes with semicolons (;). " "Use double quotation marks (\") to enclose each argument.\n" " E.g.: \"input_name1:n1,c1,h1,w1;input_name2:n2,c2,h2,w2\"\n" + " --input_shape_range Shape range of input data. Separate multiple nodes with semicolons (;)." + "Use double quotation marks (\") to enclose each argument.\n" + " E.g.: \"input_name1:[n1~n2,c1,h1,w1];input_name2:[n2,c2~c3,h2,w2]\"\n" " --dynamic_batch_size Set dynamic batch size. 
E.g.: \"batchsize1,batchsize2,batchsize3\"\n" " --dynamic_image_size Set dynamic image size. Separate multiple nodes with semicolons (;). " "Use double quotation marks (\") to enclose each argument.\n" @@ -323,7 +334,8 @@ class GFlagUtils { "Default value: $HOME/atc_data\n" " --op_compiler_cache_mode Set the operator compilation cache mode." "Options are disable(default), enable and force(force to refresh the cache)\n" - " --display_model_info enable for display model info; 0(default): close display, 1: open display"); + " --display_model_info enable for display model info; 0(default): close display, 1: open display.\n" + " --performance_mode Set high performance mode of compile or execute."); gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true); // Using gflags to analyze input parameters @@ -334,10 +346,10 @@ class GFlagUtils { static Status CheckDumpInfershapeJsonFlags() { Status ret = CheckFrameWorkValid(FLAGS_framework, FLAGS_weight); GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, - "check custom aicpu run so failed!"); + "[Check][Param:FrameWork]%d value is invalid.", FLAGS_framework); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_weight != "" && !ge::CheckInputPathValid(FLAGS_weight, "--weight"), - return domi::FAILED, "Input parameter[--weight]'s value[%s] is invalid!", + return domi::FAILED, "[Check][Param:weight]value:%s: is invalid, path can not reach.", FLAGS_weight.c_str()); return domi::SUCCESS; } @@ -348,34 +360,34 @@ class GFlagUtils { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_model == "", ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"parameter"}, {"model"}); - ret = ge::FAILED, "Input parameter[--model]'s value is empty!"); + ret = ge::FAILED, "[Check][Param]Input parameter[--model]'s value is empty!"); // check param disable_reuse_memory GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ge::CheckDisableReuseMemoryParamValid(to_string(FLAGS_disable_reuse_memory)) != ge::SUCCESS, - ret = ge::FAILED, "check disable_reuse_memory failed!"); + ret = 
ge::FAILED, "[Check][DisableReuseMemory]failed!"); // check optypelist_for_implmode and op_select_implmode GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ge::CheckImplmodeParamValid(FLAGS_optypelist_for_implmode, FLAGS_op_select_implmode) != ge::SUCCESS, - ret = ge::FAILED, "check optypelist_for_implmode and op_select_implmode failed!"); + ret = ge::FAILED, "[Check][ImplMode]check optypelist_for_implmode and op_select_implmode failed!"); // No output file information passed in GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_mode == GEN_OM_MODEL && FLAGS_output == "", ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"parameter"}, {"output"}); - ret = ge::FAILED, "Input parameter[--output]'s value is empty!"); + ret = ge::FAILED, "[Check][Param]Input parameter[--output]'s value is empty!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( CheckFrameWorkValid(FLAGS_framework, FLAGS_weight) != ge::SUCCESS, ret = ge::FAILED, - "CheckFrameWorkValid failed"); + "[Check][FrameWork] failed for input --FLAGS_framework and --FLAGS_weight invalid."); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ge::CheckDynamicInputParamValid(FLAGS_dynamic_batch_size, FLAGS_dynamic_image_size, - FLAGS_dynamic_dims, FLAGS_input_shape, + FLAGS_dynamic_dims, FLAGS_input_shape, FLAGS_input_shape_range, FLAGS_input_format, is_dynamic_input) != ge::SUCCESS, - ret = ge::FAILED, "check dynamic size(batch size, image size or dims) failed!"); + ret = ge::FAILED, "[Check][DynamicInput]dynamic size(batch size, image size or dims) invalid!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( !FLAGS_insert_op_conf.empty() && !FLAGS_dynamic_dims.empty(), @@ -383,26 +395,26 @@ class GFlagUtils { {"parameter", "value", "reason"}, {"--insert_op_conf", FLAGS_insert_op_conf, "dynamic dims function does not support aipp"}); - ret = ge::FAILED, "dynamic dims function does not support aipp"); + ret = ge::FAILED, "[Check][Param]dynamic dims function does not support aipp"); #if !defined(__ANDROID__) && !defined(ANDROID) 
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(!CheckEncryptModeValid(FLAGS_encrypt_mode), ret = ge::FAILED, - "encrypt_mode %d not valid!!", FLAGS_encrypt_mode); + "[Check][EncryptMode]value %d not valid!!", FLAGS_encrypt_mode); if (FLAGS_encrypt_mode == 0) { // Encryption mode GELOGI("ge will run with encrypt!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(!ge::CheckInputPathValid(FLAGS_encrypt_key), ret = ge::FAILED, - "encrypt_key file not found!!"); + "[Check][InputPath]encrypt_key file not found!!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(!ge::CheckInputPathValid(FLAGS_certificate), ret = ge::FAILED, - "certificate file not found!!"); + "[Check][InputPath]certificate file not found!!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(!ge::CheckInputPathValid(FLAGS_hardware_key), ret = ge::FAILED, - "hardware_key file not found!!"); + "[Check][InputPath]hardware_key file not found!!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(!ge::CheckInputPathValid(FLAGS_private_key), ret = ge::FAILED, - "private_key file not found!!"); + "[Check][InputPath]private_key file not found!!"); } else { // No encryption GELOGI("ge will run without encrypt!"); } @@ -413,41 +425,41 @@ class GFlagUtils { */ GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_model != "" && !ge::CheckInputPathValid(FLAGS_model, "--model"), ret = ge::FAILED, - "model file %s not found!!", FLAGS_model.c_str()); + "[Check][InputPath]model file %s not found!!", FLAGS_model.c_str()); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_weight != "" && !ge::CheckInputPathValid(FLAGS_weight, "--weight"), - ret = ge::FAILED, "weight file %s not found!!", + ret = ge::FAILED, "[Check][InputPath]weight file %s not found!!", FLAGS_weight.c_str()); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_cal_conf != "" && !ge::CheckInputPathValid(FLAGS_cal_conf, "--cal_conf"), - ret = ge::FAILED, "calibration config file %s not found!!", + ret = ge::FAILED, "[Check][InputPath]calibration config file %s not found!!", FLAGS_cal_conf.c_str()); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_op_name_map != "" && 
!ge::CheckInputPathValid(FLAGS_op_name_map, "--op_name_map"), - ret = ge::FAILED, "op config file %s not found!!", + ret = ge::FAILED, "[Check][InputPath]op config file %s not found!!", FLAGS_op_name_map.c_str()); GE_CHK_BOOL_EXEC(ge::CheckInsertOpConfParamValid(std::string(FLAGS_insert_op_conf)) == ge::SUCCESS, - ret = ge::FAILED, "check insert op conf failed!"); + ret = ge::FAILED, "[Check][InsertOpConf]failed!"); GE_CHK_BOOL_EXEC(ge::CheckCompressWeightParamValid( FLAGS_enable_compress_weight, FLAGS_compress_weight_conf) == ge::SUCCESS, - ret = ge::FAILED, "check compress weight failed!"); + ret = ge::FAILED, "[Check][CompressWeight]failed!"); GE_CHK_BOOL_EXEC(ge::CheckKeepTypeParamValid(FLAGS_keep_dtype) == ge::SUCCESS, - ret = ge::FAILED, "check keep dtype failed!"); + ret = ge::FAILED, "[Check][KeepType]failed!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( !ge::CheckOutputPathValid(FLAGS_check_report, "--check_report"), ret = ge::FAILED, - "check_report file %s not found!!", FLAGS_check_report.c_str()); + "[Check][OutputPath]]check_report file %s not found!!", FLAGS_check_report.c_str()); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_mode == GEN_OM_MODEL && FLAGS_output != "" && (!ge::CheckOutputPathValid(FLAGS_output, "--output") || !CheckPathWithName(FLAGS_output)), - ret = ge::FAILED, "output path %s is not valid!!", FLAGS_output.c_str()); + ret = ge::FAILED, "[Check][OutputPath]output path %s is not valid!!", FLAGS_output.c_str()); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_save_original_model != "" && @@ -456,18 +468,18 @@ class GFlagUtils { ErrorManager::GetInstance().ATCReportErrMessage( "E10005", {"parameter", "value"}, {"save_original_model", FLAGS_save_original_model}); ret = ge::FAILED, - "Input parameter[--save_original_model]'s value[%s] must be true or false.", + "[Check][Parameter]Input parameter[--save_original_model]'s value[%s] must be true or false.", FLAGS_save_original_model.c_str()); GE_CHK_BOOL_EXEC(ge::CheckBufferOptimizeParamValid(FLAGS_buffer_optimize) == 
ge::SUCCESS, - ret = ge::FAILED, "check output type failed!"); + ret = ge::FAILED, "[Check][BufferOptimize]check output type failed!"); GE_CHK_BOOL_EXEC( ge::CheckEnableSingleStreamParamValid(std::string(FLAGS_enable_single_stream)) == ge::SUCCESS, - ret = ge::FAILED, "check enable single stream failed!"); + ret = ge::FAILED, "[Check][EnableSingleStream]failed!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((FLAGS_display_model_info != "0") && (FLAGS_display_model_info != "1"), ErrorManager::GetInstance().ATCReportErrMessage("E10006", {"parameter"}, {"display_model_info"}); - ret = ge::FAILED, "Input parameter[--display_model_info]'s value must be 1 or 0."); + ret = ge::FAILED, "[Check][Parameter]Input parameter[--display_model_info]'s value must be 1 or 0."); return ret; } @@ -484,25 +496,25 @@ class GFlagUtils { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(FLAGS_om == "", ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"parameter"}, {"om"}); ret = ge::FAILED, - "Input parameter[--om]'s value is empty!!"); + "[Check][Parameter]Input parameter[--om]'s value is empty!!"); // JSON path not passed in GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(FLAGS_json == "", ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"parameter"}, {"json"}); ret = ge::FAILED, - "Input parameter[--json]'s value is empty!!"); + "[Check][Parameter]Input parameter[--json]'s value is empty!!"); // Check if the model path is valid GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_om != "" && !ge::CheckInputPathValid(FLAGS_om, "--om"), ret = ge::FAILED, - "model file path is invalid: %s.", FLAGS_om.c_str()); + "[Check][InputPath]model file path is invalid: %s.", FLAGS_om.c_str()); // Check whether the JSON path is valid GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_json != "" && !ge::CheckOutputPathValid(FLAGS_json, "--json"), ret = ge::FAILED, - "json file path is invalid: %s.", FLAGS_json.c_str()); + "[Check][OutputPath]json file path is invalid: %s.", FLAGS_json.c_str()); return ret; } @@ -567,7 +579,8 @@ class GFlagUtils { if 
(fileName.size() > static_cast(PATH_MAX)) { ErrorManager::GetInstance().ATCReportErrMessage( "E10021", {"parameter", "size"}, {"output", std::to_string(PATH_MAX)}); - GELOGE(ge::FAILED, "Input parameter[--output]'s path is too long, it must be less than %d", PATH_MAX); + GELOGE(ge::FAILED, + "[Check][Path]Input parameter[--output]'s path is too long, it must be less than %d", PATH_MAX); return false; } @@ -625,8 +638,8 @@ static bool CheckInputFormat() { // only support NCHW ND ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kCaffeFormatSupport}); - GELOGE(ge::FAILED, - "Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kCaffeFormatSupport); + GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", + FLAGS_input_format.c_str(), kCaffeFormatSupport); return false; } else if ((FLAGS_framework == static_cast(domi::TENSORFLOW))) { // tf if (ge::tf_support_input_format.find(FLAGS_input_format) != ge::tf_support_input_format.end()) { @@ -635,8 +648,8 @@ static bool CheckInputFormat() { // only support NCHW NHWC ND NCDHW NDHWC ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kTFFormatSupport}); - GELOGE(ge::FAILED, - "Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kTFFormatSupport); + GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", + FLAGS_input_format.c_str(), kTFFormatSupport); return false; } else if (FLAGS_framework == static_cast(domi::ONNX)) { if (ge::onnx_support_input_format.find(FLAGS_input_format) != ge::onnx_support_input_format.end()) { @@ -645,8 +658,8 @@ static bool CheckInputFormat() { // only support NCHW ND ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kONNXFormatSupport}); - GELOGE(ge::FAILED, - 
"Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kONNXFormatSupport); + GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", + FLAGS_input_format.c_str(), kONNXFormatSupport); return false; } return true; @@ -839,11 +852,11 @@ Status CreateInputsForInference(const ge::Graph &graph, vector &in domi::Status GenerateInfershapeJson() { if (!CheckInputFormat()) { - GELOGE(ge::FAILED, "Check input_format failed"); + GELOGE(ge::FAILED, "[Check][InputFormat] failed."); return domi::FAILED; } Status ret = GFlagUtils::CheckDumpInfershapeJsonFlags(); - GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, "Check flags failed!"); + GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, "[Check][DumpInfershapeJsonFlags] failed!"); ge::GeGenerator ge_generator; std::map options; @@ -890,13 +903,14 @@ static Status ConvertModelToJson(int fwk_type, const string &model_file, const s ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--framework", std::to_string(fwk_type), kModelToJsonSupport}); - GELOGE(ge::FAILED, "Invalid value for --framework[%d], %s.", fwk_type, kModelToJsonSupport); + GELOGE(ge::FAILED, "[Convert][ModelToJson]Invalid value for --framework[%d], %s.", + fwk_type, kModelToJsonSupport); ret = ge::FAILED; } if (FLAGS_dump_mode != "0" && FLAGS_dump_mode != "1") { ErrorManager::GetInstance().ATCReportErrMessage("E10006", {"parameter"}, {"dump_mode"}); - GELOGE(ge::FAILED, "Input parameter[--dump_mode]'s value must be 1 or 0."); + GELOGE(ge::FAILED, "[Convert][ModelToJson] Input parameter[--dump_mode]'s value must be 1 or 0."); ret = ge::FAILED; } @@ -971,12 +985,13 @@ domi::Status GenerateModel(std::map &options, std::string output graph = load_model.GetGraph(); GE_CHK_STATUS_EXEC(ge::InitDomiOmgContext(FLAGS_input_shape, FLAGS_input_format, "", is_dynamic_input), - GELOGE(ge::FAILED, "ATC Generate call InitDomiOmgContext ret fail"); + GELOGE(ge::FAILED, 
"[Init][DomiOmgContext]ATC Generate call InitDomiOmgContext ret fail"); (void)ge_generator.Finalize(); (void)ge::GELib::GetInstance()->Finalize(); return domi::FAILED); Status ret = CreateInputsForInference(graph, inputs); if (ret != ge::SUCCESS) { - GELOGE(ge::FAILED, "create inputs for inference failed."); + GELOGE(ge::FAILED, "[Create][InputsForInference] failed."); + REPORT_CALL_ERROR("E19999", "CreateInputsForInference failed for input --graph and --inputs."); (void)ge_generator.Finalize(); (void)ge::GELib::GetInstance()->Finalize(); return domi::FAILED; @@ -985,6 +1000,7 @@ domi::Status GenerateModel(std::map &options, std::string output } else { std::map atc_params; atc_params.insert(std::pair("input_shape", FLAGS_input_shape)); + atc_params.insert(std::pair(ge::INPUT_SHAPE_RANGE, FLAGS_input_shape_range)); atc_params.insert(std::pair("out_nodes", FLAGS_out_nodes)); atc_params.insert(std::pair("input_format", FLAGS_input_format)); atc_params.insert(std::pair("check_report", FLAGS_check_report)); @@ -1067,6 +1083,7 @@ static void SetEnvForSingleOp(std::map &options) { options.emplace(ge::OP_COMPILER_CACHE_MODE, FLAGS_op_compiler_cache_mode); options.emplace(ge::MDL_BANK_PATH_FLAG, FLAGS_mdl_bank_path); options.emplace(ge::OP_BANK_PATH_FLAG, FLAGS_op_bank_path); + options.emplace(ge::PERFORMANCE_MODE, FLAGS_performance_mode); } domi::Status GenerateSingleOp(const std::string& json_file_path) { @@ -1077,7 +1094,7 @@ domi::Status GenerateSingleOp(const std::string& json_file_path) { // check optypelist_for_implmode and op_select_implmode GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ge::CheckImplmodeParamValid(FLAGS_optypelist_for_implmode, FLAGS_op_select_implmode) != ge::SUCCESS, - return ge::FAILED, "check optypelist_for_implmode and op_select_implmode failed!"); + return ge::FAILED, "[Check][ImplmodeParam] fail for input optypelist_for_implmode and op_select_implmode."); std::map options; // need to be changed when ge.ini plan is done @@ -1113,7 +1130,7 @@ domi::Status 
GenerateSingleOp(const std::string& json_file_path) { output_path = FLAGS_output + "/"; } output_path += param.file_name; - ret = generator.BuildSingleOpModel(param.op_desc, param.inputs, param.outputs, output_path); + ret = generator.BuildSingleOpModel(param.op_desc, param.inputs, param.outputs, output_path, param.compile_flag); if (ret != SUCCESS) { DOMI_LOGE("Compile op failed. ge ret = %u, op index = %d", ret, index); ret = domi::FAILED; @@ -1130,12 +1147,12 @@ domi::Status GenerateSingleOp(const std::string& json_file_path) { domi::Status GenerateOmModel() { if (!CheckInputFormat()) { - GELOGE(ge::FAILED, "Check input_format failed"); + GELOGE(ge::FAILED, "[Check][InputFormat]failed."); return domi::FAILED; } Status ret = GFlagUtils::CheckFlags(); GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, - "Check flags failed! Please check whether some atc params that include semicolons[;] use double " + "[Check][Flags] failed! Please check whether some atc params that include semicolons[;] use double " "quotation marks (\") to enclose each argument such as out_nodes, input_shape, dynamic_image_size"); #if !defined(__ANDROID__) && !defined(ANDROID) // Load custom operator Library @@ -1143,7 +1160,7 @@ domi::Status GenerateOmModel() { SaveCustomCaffeProtoPath(); - GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, "check custom aicpu run so failed!"); + GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, "[Check][Flags]check custom aicpu run so failed!"); #endif const int f_stream_num = 1; @@ -1218,6 +1235,8 @@ domi::Status GenerateOmModel() { options.insert(std::pair(string(ge::OP_BANK_PATH_FLAG), FLAGS_op_bank_path)); options.insert(std::pair(string(ge::DISPLAY_MODEL_INFO), FLAGS_display_model_info)); + + options.insert(std::pair(string(ge::PERFORMANCE_MODE), FLAGS_performance_mode)); // set enable scope fusion passes SetEnableScopeFusionPasses(FLAGS_enable_scope_fusion_passes); // print atc option map @@ -1242,7 +1261,7 @@ domi::Status 
GenerateOmModel() { domi::Status ConvertModelToJson() { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); Status ret = GFlagUtils::CheckConverJsonParamFlags(); - GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, "Check convert json params flags failed!"); + GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, "[CheckConver][JsonParamFlags] failed!"); ret = ConvertModelToJson(FLAGS_framework, FLAGS_om, FLAGS_json); @@ -1256,13 +1275,13 @@ domi::Status DisplayModelInfo() { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(FLAGS_om == "", ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"parameter"}, {"om"}); return ge::FAILED, - "Input parameter[--om]'s value is empty!!"); + "[Check][Parameter]Input parameter[--om]'s value is empty!!"); // Check if the model path is valid GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( FLAGS_om != "" && !ge::CheckInputPathValid(FLAGS_om, "--om"), return ge::FAILED, - "model file path is invalid: %s.", FLAGS_om.c_str()); + "[Check][InputPath]model file path is invalid: %s.", FLAGS_om.c_str()); if (FLAGS_framework == -1) { return ge::ConvertOm(FLAGS_om.c_str(), "", false); @@ -1303,13 +1322,15 @@ domi::Status ConvertPbtxtToJson() { ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); Status ret = GFlagUtils::CheckConverJsonParamFlags(); if (ret != domi::SUCCESS) { - GELOGE(ge::FAILED, "Check convert json params flags failed!"); + GELOGE(ge::FAILED, "[CheckConver][JsonParamFlags] failed!"); return domi::FAILED; } ret = ge::ConvertPbtxtToJson(FLAGS_om.c_str(), FLAGS_json.c_str()); if (ret != domi::SUCCESS) { - GELOGE(ge::FAILED, "ConvertPbtxtToJson fail."); + GELOGE(ge::FAILED, "[Convert][PbtxtToJson] fail."); + REPORT_CALL_ERROR("E19999", "ConvertPbtxtToJson failed, FLAGS_om:%s, FLAGS_json:%s.", + FLAGS_om.c_str(), FLAGS_json.c_str()); return domi::FAILED; } @@ -1378,8 +1399,8 @@ bool CheckMemInfo() { GELOGI("Get mem available [%lu kB].", current_mem_available); 
std::cout << "Current available mem is " << current_mem_available << "kB." << std::endl; if ((current_mem_available > 0) && (current_mem_available < kMinAvailableMem)) { - GELOGE(ge::PARAM_INVALID, "Current available mem [%lu kB] can not be smaller than [%lu kB] .", - current_mem_available, kMinAvailableMem); + GELOGE(ge::PARAM_INVALID, "[Check][MemSize]Current available mem [%lu kB] can not be smaller than [%lu kB] .", + current_mem_available, kMinAvailableMem); ErrorManager::GetInstance().ATCReportErrMessage("E10044", {"value", "min_value"}, {to_string(current_mem_available), to_string(kMinAvailableMem)}); return false; @@ -1399,7 +1420,7 @@ int main(int argc, char* argv[]) { } do { if (!CheckMemInfo()) { - GELOGE(ge::PARAM_INVALID, "Current available mem is too small"); + GELOGE(ge::PARAM_INVALID, "[Check][MemInfo]Current available mem is too small."); ret = domi::FAILED; break; } @@ -1413,17 +1434,17 @@ int main(int argc, char* argv[]) { GE_IF_BOOL_EXEC(GenerateOmModel() != domi::SUCCESS, ret = domi::FAILED; break); } else if (MODEL_TO_JSON == FLAGS_mode) { // Mode 1, transfer model to JSON GE_CHK_BOOL_EXEC(ConvertModelToJson() == domi::SUCCESS, ret = domi::FAILED; - break, "ATC ConvertJson execute failed!!"); + break, "[Convert][ModelToJson]ATC ConvertJson execute failed!!"); } else if (FLAGS_mode == ge::RunMode::PBTXT_TO_JSON) { GE_CHK_BOOL_EXEC(ConvertPbtxtToJson() == domi::SUCCESS, ret = domi::FAILED; - break, "ATC convert pbtxt to json execute failed!!"); + break, "[Convert][PbtxtToJson]ATC convert pbtxt to json execute failed!!"); } else if (FLAGS_mode == ge::RunMode::DISPLAY_OM_INFO) { GE_CHK_BOOL_EXEC(DisplayModelInfo() == domi::SUCCESS, ret = domi::FAILED; - break, "ATC DisplayModelInfo failed!!"); + break, "[Display][ModelInfo]ATC DisplayModelInfo failed!!"); } else { ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--mode", std::to_string(FLAGS_mode), kModeSupport}); - GELOGE(ge::PARAM_INVALID, "Invalid 
value for --mode[%d], %s.", FLAGS_mode, kModeSupport); + GELOGE(ge::PARAM_INVALID, "[Check][Parameter]Invalid value for --mode[%d], %s.", FLAGS_mode, kModeSupport); ret = domi::FAILED; break; } diff --git a/ge/offline/proto/insert_op.proto b/ge/offline/proto/insert_op.proto index bf918b20..7d708865 100644 --- a/ge/offline/proto/insert_op.proto +++ b/ge/offline/proto/insert_op.proto @@ -88,6 +88,7 @@ message AippOpParams { int32 right_padding_size = 69; int32 top_padding_size = 70; int32 bottom_padding_size = 71; + float padding_value = 72; int32 mean_chn_0 = 10; int32 mean_chn_1 = 11; diff --git a/ge/offline/single_op_parser.cc b/ge/offline/single_op_parser.cc index b1e0da6d..ce9448d5 100644 --- a/ge/offline/single_op_parser.cc +++ b/ge/offline/single_op_parser.cc @@ -53,6 +53,7 @@ constexpr char const *kKeyOriginFormat = "origin_format"; constexpr char const *kFileSuffix = ".om"; constexpr char const *kKeyDynamicInput = "dynamic_input"; constexpr char const *kKeyDynamicOutput = "dynamic_output"; +constexpr char const *kKeyCompileFlag = "compile_flag"; constexpr int kDumpJsonIndent = 2; constexpr int kShapeRangePairSize = 2; constexpr int kShapeRangeLow = 0; @@ -217,7 +218,10 @@ void from_json(const Json &j, SingleOpAttr &attr) { attr.type = j.at(kKeyType).get(); auto it = kAttrTypeDict.find(attr.type); if (it == kAttrTypeDict.end()) { - GELOGE(UNSUPPORTED, "Parse attr[%s] failed. Unsupported type: %s", attr.name.c_str(), attr.type.c_str()); + GELOGE(UNSUPPORTED, "[Find][JsonAttr] name=%s, type=%s failed for Unsupported type.", + attr.name.c_str(), attr.type.c_str()); + REPORT_INNER_ERROR("E19999", "Find jsonattr name=%s, type=%s failed for Unsupported type.", + attr.name.c_str(), attr.type.c_str()); return; } @@ -253,13 +257,19 @@ void from_json(const Json &j, SingleOpAttr &attr) { SetAttrValue(j, attr); break; default: - GELOGE(UNSUPPORTED, "Parse attr[%s] failed. 
Unsupported type: %s", attr.name.c_str(), attr.type.c_str()); + GELOGE(UNSUPPORTED, "[Find][JsonAttr] name=%s, type=%s failed for Unsupported type.", + attr.name.c_str(), attr.type.c_str()); + REPORT_INNER_ERROR("E19999", "Find jsonattr name=%s, type=%s failed for Unsupported type.", + attr.name.c_str(), attr.type.c_str()); break; } } void from_json(const Json &j, SingleOpDesc &desc) { - desc.op = j.at(kKeyOp).get(); + auto op = j.find(kKeyOp); + if (op != j.end()) { + desc.op = j.at(kKeyOp).get(); + } auto input_desc = j.find(kKeyInputDesc); if (input_desc != j.end()) { @@ -275,28 +285,34 @@ void from_json(const Json &j, SingleOpDesc &desc) { if (attr_field != j.end()) { desc.attrs = attr_field->get>(); } + + auto compile_flag = j.find(kKeyCompileFlag); + if (compile_flag != j.end()) { + desc.compile_flag = compile_flag->get(); + } } Status SingleOpParser::ReadJsonFile(const std::string &file, Json &json_obj) { std::string real_path = RealPath(file.c_str()); if (real_path.empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10023", {"value"}, {file}); - GELOGE(FAILED, "Input parameter[--singleop]'s value[%s] is not a valid path.", file.c_str()); + GELOGE(FAILED, "[Read][JsonFile]Input parameter[--singleop]'s value[%s] is not a valid path.", file.c_str()); return INTERNAL_ERROR; } std::ifstream ifs(real_path); if (!ifs.is_open()) { ErrorManager::GetInstance().ATCReportErrMessage("E10024", {"value"}, {file}); - GELOGE(FAILED, "Open file[%s] provided in input parameter[--singleop] failed.", file.c_str()); + GELOGE(FAILED, "[Open][JsonFile] failed for file[%s] provided in input parameter[--singleop].", file.c_str()); return FAILED; } try { ifs >> json_obj; } catch (const std::exception &e) { ErrorManager::GetInstance().ATCReportErrMessage("E10025", {"realpath", "errmsg"}, {real_path, e.what()}); - GELOGE(PARAM_INVALID, "Parse file[%s] provided in input parameter[--singleop] failed, exception = %s.", - real_path.c_str(), e.what()); + GELOGE(PARAM_INVALID, + 
"[Parse][JsonFile] fail for file[%s] provided in input parameter[--singleop], exception = %s.", + real_path.c_str(), e.what()); return PARAM_INVALID; } @@ -307,7 +323,7 @@ Status SingleOpParser::ReadJsonFile(const std::string &file, Json &json_obj) { bool SingleOpParser::Validate(const SingleOpDesc &op_desc) { if (op_desc.op.empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10026"); - GELOGE(PARAM_INVALID, "Op name is empty"); + GELOGE(PARAM_INVALID, "[Check][Param] fail for name of input SingleOpDesc is empty."); return false; } @@ -316,14 +332,15 @@ bool SingleOpParser::Validate(const SingleOpDesc &op_desc) { if (!tensor_desc.GetValidFlag()) { ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, {"intput", "datatype or format", std::to_string(index)}); - GELOGE(PARAM_INVALID, "Input's dataType or format is invalid when the index is %d", index); + GELOGE(PARAM_INVALID, + "[Check][Param] fail for Input's dataType or format is invalid when the index is %d", index); return false; } if ((tensor_desc.type == DT_UNDEFINED && tensor_desc.format != FORMAT_RESERVED) || (tensor_desc.type != DT_UNDEFINED && tensor_desc.format == FORMAT_RESERVED)){ ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, {"intput", "datatype or format", std::to_string(index)}); - GELOGE(PARAM_INVALID, "Input's dataType or format is invalid when the index is %d", index); + GELOGE(PARAM_INVALID, "[Check][Param]Input's dataType or format is invalid when the index is %d", index); return false; } ++index; @@ -334,20 +351,20 @@ bool SingleOpParser::Validate(const SingleOpDesc &op_desc) { if (!tensor_desc.GetValidFlag()) { ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, {"output", "datatype", std::to_string(index)}); - GELOGE(PARAM_INVALID, "Output's dataType is invalid when the index is %d", index); + GELOGE(PARAM_INVALID, "[Check][Param]fail for Output's dataType is invalid when the index 
is %d", index); return false; } if (tensor_desc.type == DT_UNDEFINED) { ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, {"output", "datatype", std::to_string(index)}); - GELOGE(PARAM_INVALID, "Output's dataType is invalid when the index is %d", index); + GELOGE(PARAM_INVALID, "[Check][Param]Output's dataType is invalid when the index is %d", index); return false; } if (tensor_desc.format == FORMAT_RESERVED) { ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"}, {"output", "format", std::to_string(index)}); - GELOGE(PARAM_INVALID, "Output's format is invalid when the index is %d", index); + GELOGE(PARAM_INVALID, "[Check][Param]Output's format is invalid when the index is %d", index); return false; } ++index; @@ -356,13 +373,13 @@ bool SingleOpParser::Validate(const SingleOpDesc &op_desc) { for (auto &attr : op_desc.attrs) { if (attr.name.empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10029"); - GELOGE(PARAM_INVALID, "attr name is empty"); + GELOGE(PARAM_INVALID, "[Parse][Attr]attr name is empty"); return false; } if (attr.value.IsEmpty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10030", {"attrname"}, {attr.name}); - GELOGE(PARAM_INVALID, "Parse attr \"%s\" failed. ", attr.name.c_str()); + GELOGE(PARAM_INVALID, "[Parse][Attr] fail for vale of attr name:\"%s\" is empty. 
", attr.name.c_str()); return false; } } @@ -442,7 +459,7 @@ Status SingleOpParser::ConvertToBuildParam(int index, } if (VerifyOpInputOutputSizeByIr(*op_desc) != SUCCESS) { - GELOGE(PARAM_INVALID, "Verify op [%s] input or output size failed.", op_desc->GetType().c_str()); + GELOGE(PARAM_INVALID, "[Verify][OpInputOutputSize] fail for input op [%s] invalid.", op_desc->GetType().c_str()); return PARAM_INVALID; } @@ -462,8 +479,9 @@ Status SingleOpParser::VerifyOpInputOutputSizeByIr(const OpDesc ¤t_op_desc string reason = "is smaller than the ir needed input size " + std::to_string(ir_opdesc_inputs_num); ErrorManager::GetInstance().ATCReportErrMessage("E19014", {"opname", "value", "reason"}, {current_op_desc.GetName(), "input size " + std::to_string(current_opdesc_inputs_num), reason}); - GELOGE(PARAM_INVALID, "This op [%s] input size %zu is smaller than the ir needed input size %zu", - current_op_desc.GetName().c_str(), current_opdesc_inputs_num, ir_opdesc_inputs_num); + GELOGE(PARAM_INVALID, + "[Verify][OpInputOutputSize]This op:%s input size %zu is smaller than the ir needed input size %zu", + current_op_desc.GetName().c_str(), current_opdesc_inputs_num, ir_opdesc_inputs_num); return PARAM_INVALID; } size_t current_opdesc_outputs_num = current_op_desc.GetOutputsSize(); @@ -472,8 +490,9 @@ Status SingleOpParser::VerifyOpInputOutputSizeByIr(const OpDesc ¤t_op_desc string reason = "is smaller than the ir needed output size " + std::to_string(ir_opdesc_outputs_num); ErrorManager::GetInstance().ATCReportErrMessage("E19014", {"opname", "value", "reason"}, {current_op_desc.GetName(), "output size " + std::to_string(current_opdesc_outputs_num), reason}); - GELOGE(PARAM_INVALID, "This op [%s] output size %zu is smaller than the ir needed output size %zu", - current_op_desc.GetName().c_str(), current_opdesc_outputs_num, ir_opdesc_outputs_num); + GELOGE(PARAM_INVALID, + "[Verify][OpInputOutputSize]This op:%s output size %zu is smaller than the ir needed output size %zu", + 
current_op_desc.GetName().c_str(), current_opdesc_outputs_num, ir_opdesc_outputs_num); return PARAM_INVALID; } } @@ -492,7 +511,8 @@ Status SingleOpParser::SetShapeRange(const std::string &op_name, {op_name, "shape", "has unknown rank but dim size is not one"}); - GELOGE(PARAM_INVALID, "Invalid tensor shape: [%s]", ge_tensor_desc.MutableShape().ToString().c_str()); + GELOGE(PARAM_INVALID, "[Set][ShapeRange]Invalid tensor shape:%s.", + ge_tensor_desc.MutableShape().ToString().c_str()); return PARAM_INVALID; } if (!tensor_desc.dim_ranges.empty()) { @@ -500,7 +520,7 @@ Status SingleOpParser::SetShapeRange(const std::string &op_name, {op_name, "shape range", "is not needed while the rank the shape is unknown"}); - GELOGE(PARAM_INVALID, "Shape range is not needed while the rank the shape is unknown"); + GELOGE(PARAM_INVALID, "[Set][ShapeRange]Shape range is not needed while the rank the shape is unknown."); return PARAM_INVALID; } @@ -522,7 +542,7 @@ Status SingleOpParser::SetShapeRange(const std::string &op_name, {op_name, "shape range size " + std::to_string(num_shape_ranges), reason}); - GELOGE(PARAM_INVALID, "The number of shape_range mismatches that of unknown dims."); + GELOGE(PARAM_INVALID, "[Set][ShapeRange]The number of shape_range mismatches that of unknown dims."); return PARAM_INVALID; } @@ -533,7 +553,8 @@ Status SingleOpParser::SetShapeRange(const std::string &op_name, {op_name, "shape range " + std::to_string(range_index), reason}); - GELOGE(PARAM_INVALID, "Invalid shape range entry. index = %zu, size = %zu", range_index, range.size()); + GELOGE(PARAM_INVALID, "[Set][ShapeRange]Invalid shape range entry. 
index = %zu, size = %zu", + range_index, range.size()); return PARAM_INVALID; } @@ -550,9 +571,8 @@ Status SingleOpParser::SetShapeRange(const std::string &op_name, "shape range size " + std::to_string(num_shape_ranges), reason}); GELOGE(PARAM_INVALID, - "The number of shape_range(%zu) mismatches that of unknown dims(%zu).", - num_shape_ranges, - range_index); + "[Set][ShapeRange]The number of shape_range(%zu) mismatches that of unknown dims(%zu).", + num_shape_ranges, range_index); return PARAM_INVALID; } @@ -572,17 +592,25 @@ Status SingleOpParser::ParseSingleOpList(const std::string &file, std::vector input_desc; std::vector output_desc; std::vector attrs; + int32_t compile_flag = 0; }; struct SingleOpBuildParam { @@ -62,6 +63,7 @@ struct SingleOpBuildParam { std::vector inputs; std::vector outputs; std::string file_name; + int32_t compile_flag = 0; }; void from_json(const nlohmann::json &json, SingleOpTensorDesc &desc); diff --git a/ge/omm/csa_interact.cc b/ge/omm/csa_interact.cc index 1b33ddbd..15bca075 100644 --- a/ge/omm/csa_interact.cc +++ b/ge/omm/csa_interact.cc @@ -78,7 +78,8 @@ void CsaInteract::Init(int32_t dev_index, int64_t job_id) { Status CsaInteract::WriteJobState(JobState job_state, JobSubState job_sub_state, uint32_t module_ret_errcode, ErrorModule error_module) { if (!is_init_) { - GELOGE(INTERNAL_ERROR, "CsaInteract has not init, can't WriteJobState"); + GELOGE(INTERNAL_ERROR, "[Init][CsaInteract] obj has not init, can't WriteJobState"); + REPORT_INNER_ERROR("E19999", "WriteJobState failed before init. 
"); return INTERNAL_ERROR; } if ((curr_state_ == JOBSTATE_FAILED) || (curr_state_ == JOBSTATE_KILLED)) { @@ -107,7 +108,10 @@ Status CsaInteract::WriteJobState(JobState job_state, JobSubState job_sub_state, content = content_json.dump(); } catch (const nlohmann::json::exception &e) { - GELOGE(INTERNAL_ERROR, "build jobstate content json string failed, exception:%s job_state:%u", e.what(), job_state); + GELOGE(INTERNAL_ERROR, "[Create][JsonObject] exception:%s job_state:%u job_sub_state:%u.", + e.what(), job_state, job_sub_state); + REPORT_INNER_ERROR("E19999", "Create json object failed. exception:%s job_state:%u job_sub_state:%u.", + e.what(), job_state, job_sub_state); return INTERNAL_ERROR; } @@ -168,7 +172,8 @@ void CsaInteract::WriteInternalErrorCode() { /// Status CsaInteract::WriteHcomDetection(const std::string &content) { if (!is_init_) { - GELOGE(INTERNAL_ERROR, "CsaInteract has not init, can't WriteJobState"); + GELOGE(INTERNAL_ERROR, "[Init][CsaInteract] obj has not init, can't WriteJobState"); + REPORT_INNER_ERROR("E19999", "WriteHcomDetection failed before init."); return INTERNAL_ERROR; } @@ -192,28 +197,33 @@ Status CsaInteract::WriteFile(const std::string &file_name, const std::string &c int32_t fd = mmOpen2(file_name.c_str(), flags, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD); if (fd == EN_ERROR) { if (MakePath(file_name) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "csainteract create file path fail, errno is %d", errno); + GELOGE(INTERNAL_ERROR, "[Create][File Path] errno is %d", errno); + REPORT_CALL_ERROR("E19999", "MakePath failed. errno is %d", errno); return INTERNAL_ERROR; } fd = mmOpen2(file_name.c_str(), flags, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD); if (fd == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "open file fail, errno is %d", errno); + GELOGE(INTERNAL_ERROR, "[Open][File] errno is %d file_name: %s", errno, file_name.c_str()); + REPORT_CALL_ERROR("E19999", "mmOpen2 failed. 
errno is %d file_name: %s", errno, file_name.c_str()); return INTERNAL_ERROR; } } mmSsize_t ret = mmWrite(fd, reinterpret_cast(const_cast(content.c_str())), content.length()); if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "write file fail, errno is %d", errno); + GELOGE(INTERNAL_ERROR, "[Write][File] errno is %d", errno); + REPORT_CALL_ERROR("E19999", "mmWrite failed. errno is %d", errno); ret = mmClose(fd); if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "close file fail, error is %d", errno); + GELOGE(INTERNAL_ERROR, "[Close][File] error is %d", errno); + REPORT_CALL_ERROR("E19999", "mmClose failed. error is %d", errno); } return INTERNAL_ERROR; } ret = mmClose(fd); if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "close file fail, error is %d", errno); + GELOGE(INTERNAL_ERROR, "[Close][File] error is %d", errno); + REPORT_CALL_ERROR("E19999", "mmClose failed. error is %d", errno); return INTERNAL_ERROR; } @@ -242,7 +252,8 @@ Status CsaInteract::MakePath(const std::string &file_name) { std::string pre_path = file_path.substr(0, found + 1); if (mmAccess(pre_path.c_str()) != EN_OK) { if (mmMkdir(pre_path.c_str(), M_IRWXU) != EN_OK) { - GELOGE(INTERNAL_ERROR, "csainteract mkdir fail, errno is %d", errno); + GELOGE(INTERNAL_ERROR, "[Create][FileDir] fail, errno is %d, pre_path:%s", errno, pre_path.c_str()); + REPORT_CALL_ERROR("E19999", "mmMkdir failed. 
errno is %d pre_path:%s", errno, pre_path.c_str()); return INTERNAL_ERROR; } } diff --git a/ge/opskernel_manager/ops_kernel_builder_manager.cc b/ge/opskernel_manager/ops_kernel_builder_manager.cc index 37bdcf7a..04262e1b 100644 --- a/ge/opskernel_manager/ops_kernel_builder_manager.cc +++ b/ge/opskernel_manager/ops_kernel_builder_manager.cc @@ -50,19 +50,19 @@ Status OpsKernelBuilderManager::Initialize(const map & GE_CHK_STATUS_RET_NOLOG(GetLibPaths(options, lib_paths)); plugin_manager_.reset(new (std::nothrow)PluginManager()); GE_CHECK_NOTNULL(plugin_manager_); - GE_CHK_STATUS_RET(plugin_manager_->LoadSo(lib_paths), "Failed to load libs"); + GE_CHK_STATUS_RET(plugin_manager_->LoadSo(lib_paths), + "[Load][Libs]Failed, lib_paths=%s.", lib_paths.c_str()); } auto &kernel_builders = OpsKernelBuilderRegistry::GetInstance().GetAll(); - GELOGI("Number of OpBuild = %zu", kernel_builders.size()); + GELOGI("[Show][OpsKernelBuilderNum]Number of OpBuild = %zu", kernel_builders.size()); for (const auto &it : kernel_builders) { const std::string &kernel_lib_name = it.first; GELOGI("Initialize ops kernel util for %s", kernel_lib_name.c_str()); GE_CHECK_NOTNULL(it.second); GE_CHK_STATUS_RET(it.second->Initialize(options), - "Failed to invoke Initialize, kernel lib name = %s", - kernel_lib_name.c_str()); + "[Invoke][Initialize]failed, kernel lib name = %s", kernel_lib_name.c_str()); ops_kernel_builders_.emplace(kernel_lib_name, it.second); } @@ -100,7 +100,8 @@ OpsKernelBuilderPtr OpsKernelBuilderManager::GetOpsKernelBuilder(const string &n return nullptr; } -Status OpsKernelBuilderManager::GetLibPaths(const std::map &options, std::string &lib_paths) { +Status OpsKernelBuilderManager::GetLibPaths(const std::map &options, std::string &lib_paths) { GELOGD("Start to execute GetLibPaths"); std::string path_base = PluginManager::GetPath(); std::string so_path = "plugin/opskernel/"; @@ -128,18 +129,17 @@ Status OpsKernelBuilderManager::CalcOpRunningParam(Node &node) const { const 
std::string &lib_name = op_desc->GetOpKernelLibName(); auto it = ops_kernel_builders_.find(lib_name); if (it == ops_kernel_builders_.end()) { - GELOGE(INTERNAL_ERROR, - "Failed to get OpKernelStore. libName = %s, node = %s", - lib_name.c_str(), - op_desc->GetName().c_str()); + GELOGE(INTERNAL_ERROR,"[Find][LibName] fail for libName = %s, node = %s.", + lib_name.c_str(), op_desc->GetName().c_str()); + REPORT_INNER_ERROR("E19999", + "find LibName for CalcOpRunningParam failed, libName = %s, node = %s not exist.", + lib_name.c_str(), op_desc->GetName().c_str()); return INTERNAL_ERROR; } GELOGD("To invoke CalcOpRunningParam, node = %s, lib name = %s", op_desc->GetName().c_str(), lib_name.c_str()); GE_CHK_STATUS_RET(it->second->CalcOpRunningParam(node), - "Failed to invoke CalcOpRunningParam, libName = %s, node = %s", - lib_name.c_str(), - op_desc->GetName().c_str()); + "[Invoke][CalcOpRunningParam]failed, libName = %s, node = %s", lib_name.c_str(), op_desc->GetName().c_str()); GELOGD("Done invoking CalcOpRunningParam successfully"); return SUCCESS; } @@ -152,20 +152,17 @@ Status OpsKernelBuilderManager::GenerateTask(const Node &node, const std::string &lib_name = op_desc->GetOpKernelLibName(); auto it = ops_kernel_builders_.find(lib_name); if (it == ops_kernel_builders_.end()) { - GELOGE(INTERNAL_ERROR, - "Failed to get OpKernelStore. 
libName = %s, node = %s", - lib_name.c_str(), + GELOGE(INTERNAL_ERROR, "[Find][LibName]fail for libName = %s, node:%s", lib_name.c_str(), op_desc->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "find LibName for GenerateTask failed, libName = %s, node = %s not exist", + lib_name.c_str(), op_desc->GetName().c_str()); return INTERNAL_ERROR; } GELOGD("To invoke GenerateTask, node = %s, lib name = %s", op_desc->GetName().c_str(), lib_name.c_str()); GE_CHK_STATUS_RET(it->second->GenerateTask(node, context, tasks), - "Failed to invoke GenerateTask, libName = %s, node = %s", - lib_name.c_str(), - op_desc->GetName().c_str()); + "[Invoke][GenerateTask]failed, libName = %s, node = %s", lib_name.c_str(), op_desc->GetName().c_str()); GELOGD("Done invoking GenerateTask successfully"); return SUCCESS; } - } // namespace ge diff --git a/ge/opskernel_manager/ops_kernel_manager.cc b/ge/opskernel_manager/ops_kernel_manager.cc index 30f39c0d..ac5e9153 100644 --- a/ge/opskernel_manager/ops_kernel_manager.cc +++ b/ge/opskernel_manager/ops_kernel_manager.cc @@ -56,7 +56,8 @@ Status OpsKernelManager::Initialize(const map &options_const) { std::map options(options_const); Status ret = InitPluginOptions(options); if (ret != SUCCESS) { - GELOGE(ret, "[OpsKernelManager] [Initialize] parse pluginFlag from ge options failed."); + GELOGE(ret, "[Init][PluginOptions] parse pluginFlag from ge options failed."); + REPORT_CALL_ERROR("E19999", "InitPluginOptions failed."); return ret; } @@ -85,7 +86,8 @@ Status OpsKernelManager::Initialize(const map &options_const) { initialize_ = options; Status rst0 = plugin_manager_.InvokeAll &, Status>(kInitialize, initialize_); if (rst0 == FAILED) { - GELOGE(GE_OPS_GET_NO_VALID_SO, "There is invalid so about OpsKernelInfoStore."); + GELOGE(GE_OPS_GET_NO_VALID_SO, "[Invoke][OpsKernelInfo]PluginManager InvokeAll failed."); + REPORT_INNER_ERROR("E19999", "PluginManager InvokeAll failed."); return GE_OPS_GET_NO_VALID_SO; } Status rst1 = @@ -114,18 +116,21 @@ 
Status OpsKernelManager::Initialize(const map &options_const) { } ret = InitGraphOptimizerPriority(); if ((ret != SUCCESS)) { - GELOGE(ret, "Init graph optimizer priority failed."); + GELOGE(ret, "[Init][GraphOptimizerPriority] failed."); + REPORT_CALL_ERROR("E19999", "InitGraphOptimizerPriority failed."); return ret; } init_flag_ = true; return SUCCESS; } else { - GELOGE(ret, "Failed to find any valid so file."); + GELOGE(ret, "[Check][SoFile] not find any valid so file."); + REPORT_INNER_ERROR("E19999", "OpsKernelManager::Initialize failed for not find any valid so file."); return ret; } } -void OpsKernelManager::GetExternalEnginePath(std::string &extern_engine_path, const std::map& options) { +void OpsKernelManager::GetExternalEnginePath(std::string &extern_engine_path, + const std::map& options) { GELOGI("Enter get external engine so path schedule"); const char *path_env = std::getenv("ASCEND_ENGINE_PATH"); if (path_env != nullptr) { @@ -175,21 +180,35 @@ Status OpsKernelManager::ParsePluginOptions(const map &options, } else if (flag == 1) { enable_flag = true; } else { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.", + GELOGE(GE_GRAPH_OPTIONS_INVALID, + "[Parse][PluginOptions]option_key:%s, its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(), iter->second.c_str()); + REPORT_INNER_ERROR("E19999", "ParsePluginOptions failed, option_key:%s, " + "its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(), iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; } } catch (std::invalid_argument &) { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:ge.feFlag, its value %s is invalid_argument, it must be 0 or 1.", + GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Parse][PluginOptions] failed, option_key:ge.feFlag," + "its value %s is invalid_argument, it must be 0 or 1.", iter->second.c_str()); + REPORT_INNER_ERROR("E19999", "ParsePluginOptions failed, option_key:ge.feFlag," + "its value %s is invalid_argument, it 
must be 0 or 1.", + iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; } catch (std::out_of_range &) { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:ge.feFlag, its value %s is out of range, it must be 0 or 1.", + GELOGE(GE_GRAPH_OPTIONS_INVALID, + "[Parse][PluginOptions]failed, option_key:ge.feFlag, its value %s is out of range, it must be 0 or 1.", iter->second.c_str()); + REPORT_INNER_ERROR("E19999", "ParsePluginOptions failed, option_key:ge.feFlag," + "its value %s is out of range, it must be 0 or 1.", + iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; } catch (...) { - GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.", + GELOGE(GE_GRAPH_OPTIONS_INVALID, + "[Parse][PluginOptions]option_key:%s, its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(), iter->second.c_str()); + REPORT_INNER_ERROR("E19999", "ParsePluginOptions failed, option_key:%s, " + "its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(), iter->second.c_str()); return GE_GRAPH_OPTIONS_INVALID; } } else { @@ -203,13 +222,15 @@ Status OpsKernelManager::ParsePluginOptions(const map &options, Status OpsKernelManager::CheckPluginPtr() const { for (auto iter = ops_kernel_store_.begin(); iter != ops_kernel_store_.end(); ++iter) { if (iter->second == nullptr) { - GELOGE(INTERNAL_ERROR, "CheckPluginPtr OpsKernelInfoStorePtr is null"); + GELOGE(INTERNAL_ERROR, "[Check][PluginPtr] OpsKernelInfoStorePtr key=%s is null", iter->first.c_str()); + REPORT_INNER_ERROR("E19999", "CheckPluginPtr OpsKernelInfoStorePtr key=%s is null", iter->first.c_str()); return FAILED; } } for (auto iter1 = graph_optimizers_.begin(); iter1 != graph_optimizers_.end(); ++iter1) { if (iter1->second == nullptr) { - GELOGE(INTERNAL_ERROR, "CheckPluginPtr GraphOptimizerPtr is null"); + GELOGE(INTERNAL_ERROR, "[Check][PluginPtr] GraphOptimizerPtr key=%s is null", iter1->first.c_str()); + REPORT_INNER_ERROR("E19999", "GraphOptimizerPtr key=%s is null", 
iter1->first.c_str()); return FAILED; } } @@ -222,7 +243,9 @@ Status OpsKernelManager::InitOpKernelInfoStores(const map &optio GELOGI("OpKernelInfoStore name: %s.", (it.first).c_str()); Status ret = it.second->Initialize(options); if (ret != SUCCESS) { - GELOGE(GE_OPS_KERNEL_STORE_INIT_FAILED, "OpKernelInfoStore: %s initialize failed.", (it.first).c_str()); + GELOGE(GE_OPS_KERNEL_STORE_INIT_FAILED, + "[Init][OpKernelLib]OpKernelInfoStore: %s initialize failed.", (it.first).c_str()); + REPORT_CALL_ERROR("E19999", "OpKernelInfoStore: %s initialize failed.", (it.first).c_str()); return GE_OPS_KERNEL_STORE_INIT_FAILED; } } @@ -247,7 +270,8 @@ void OpsKernelManager::InitOpsKernelInfo() { } std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "InitOpsKernelInfo failed."); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Get][GELib]malloc instance_ptr failed."); + REPORT_INNER_ERROR("E19999", "InitOpsKernelInfo failed for new GELib."); return; } // sort opinfo of ops_kernel_info_ @@ -291,7 +315,8 @@ Status OpsKernelManager::InitGraphOptimzers(const map &options) GE_CHK_STATUS_RET(it.second->GetAttributes(attrs)) std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "InitGraphOptimzers failed."); + GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Get][GELib]malloc instance_ptr failed."); + REPORT_INNER_ERROR("E19999", "InitGraphOptimzers failed for new GELib."); return GE_CLI_GE_NOT_INITIALIZED; } if (!instance_ptr->DNNEngineManagerObj().IsEngineRegistered(attrs.engineName)) { @@ -300,7 +325,9 @@ Status OpsKernelManager::InitGraphOptimzers(const map &options) } Status ret = it.second->Initialize(options); if (ret != SUCCESS) { - GELOGE(GE_OPS_GRAPH_OPTIMIZER_INIT_FAILED, "GraphOptimzer: %s initialize failed.", (it.first).c_str()); + GELOGE(GE_OPS_GRAPH_OPTIMIZER_INIT_FAILED, + "[Init][GraphOptimzer]GraphOptimzer: %s initialize failed.", (it.first).c_str()); 
+ REPORT_CALL_ERROR("E19999", "InitGraphOptimzers failed. %s initialize failed.", (it.first).c_str()); return GE_OPS_GRAPH_OPTIMIZER_INIT_FAILED; } } @@ -317,7 +344,8 @@ Status OpsKernelManager::Finalize() { GELOGI("OpsKernelStore finalize, name: %s.", (iter->first).c_str()); Status status = iter->second->Finalize(); if (SUCCESS != status) { - GELOGE(status, "OpsKernelStore finalize failed, name: %s.", (iter->first).c_str()); + GELOGE(status, "[Check][Status]OpsKernelStore finalize failed, name: %s.", (iter->first).c_str()); + REPORT_CALL_ERROR("E19999", "OpsKernelStore finalize failed, name: %s.", (iter->first).c_str()); return status; } } @@ -325,14 +353,16 @@ Status OpsKernelManager::Finalize() { GELOGI("GraphOptimzers finalize, name: %s.", (iter->first).c_str()); Status status = iter->second->Finalize(); if (status != SUCCESS) { - GELOGE(status, "GraphOptimzers finalize failed, name: %s.", (iter->first).c_str()); + GELOGE(status, "[Check][Status]GraphOptimzers finalize failed, name: %s.", (iter->first).c_str()); + REPORT_CALL_ERROR("E19999", "GraphOptimzers finalize failed, name: %s.", (iter->first).c_str()); return status; } } Status ret = FinalizeOpsKernel(); if (ret != SUCCESS) { - GELOGE(ret, "free ops kernel resource failed."); + GELOGE(ret, "[Free][Ops Kernel Resource] failed."); + REPORT_CALL_ERROR("E19999", "FinalizeOpsKernel failed, Free Ops kernel resource failed."); return ret; } @@ -443,7 +473,8 @@ Status OpsKernelManager::FinalizeOpsKernel() { GELOGI("ge invoke ops kernal finalize."); Status ret = plugin_manager_.InvokeAll(kFinalize); if (ret != SUCCESS) { - GELOGE(ret, "[Finalize] invoke Fe finalize failed."); + GELOGE(ret, "[Finalize][Check][Status] invoke Fe finalize failed."); + REPORT_INNER_ERROR("E19999", "PluginManager InvokeAll failed."); return ret; } diff --git a/ge/plugin/engine/CMakeLists.txt b/ge/plugin/engine/CMakeLists.txt index e5736b51..3aace4ac 100644 --- a/ge/plugin/engine/CMakeLists.txt +++ b/ge/plugin/engine/CMakeLists.txt @@ 
-41,6 +41,7 @@ target_link_options(engine PRIVATE target_link_libraries(engine PRIVATE $ -Wl,--no-as-needed + c_sec slog -Wl,--as-needed -lrt diff --git a/ge/plugin/engine/dnnengines.cc b/ge/plugin/engine/dnnengines.cc index cf6b7517..5b06310c 100755 --- a/ge/plugin/engine/dnnengines.cc +++ b/ge/plugin/engine/dnnengines.cc @@ -55,7 +55,7 @@ void VectorCoreDNNEngine::GetAttributes(DNNEngineAttribute &attrs) const { attrs AICpuDNNEngine::AICpuDNNEngine(const std::string &engine_name) { engine_attribute_.engine_name = engine_name; - engine_attribute_.compute_cost = COST_3; + engine_attribute_.compute_cost = COST_2; engine_attribute_.runtime_type = DEVICE; engine_attribute_.engine_input_format = FORMAT_RESERVED; engine_attribute_.engine_output_format = FORMAT_RESERVED; @@ -71,7 +71,7 @@ void AICpuDNNEngine::GetAttributes(DNNEngineAttribute &attrs) const { attrs = en AICpuTFDNNEngine::AICpuTFDNNEngine(const std::string &engine_name) { engine_attribute_.engine_name = engine_name; - engine_attribute_.compute_cost = COST_2; + engine_attribute_.compute_cost = COST_3; engine_attribute_.runtime_type = DEVICE; engine_attribute_.engine_input_format = FORMAT_RESERVED; engine_attribute_.engine_output_format = FORMAT_RESERVED; diff --git a/ge/plugin/engine/engine_manage.cc b/ge/plugin/engine/engine_manage.cc index a14c92ea..0e129526 100644 --- a/ge/plugin/engine/engine_manage.cc +++ b/ge/plugin/engine/engine_manage.cc @@ -21,6 +21,7 @@ #include #include "common/ge/ge_util.h" +#include "securec.h" #include "framework/common/debug/ge_log.h" #include "plugin/engine/dnnengines.h" @@ -29,7 +30,8 @@ std::unique_ptr> EngineManager::engine_map_; Status EngineManager::RegisterEngine(const std::string &engine_name, DNNEnginePtr engine_ptr) { if (engine_ptr == nullptr) { - GELOGE(FAILED, "enginePtr is nullptr"); + GELOGE(FAILED, "[Register][Engine] failed, as input engine_ptr is nullptr"); + REPORT_INNER_ERROR("E19999", "RegisterEngine failed for input engine_ptr is nullptr."); return 
FAILED; } @@ -64,7 +66,8 @@ void RegisterAiCoreEngine() { DNNEngineAttribute attr_aicore = {ai_core, mem_type_aicore, COST_0, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; DNNEnginePtr aicore_engine_ptr = MakeShared(attr_aicore); if (aicore_engine_ptr == nullptr) { - GELOGE(ge::FAILED, "make aiCoreEnginePtr failed"); + GELOGE(ge::FAILED, "[Register][AiCoreEngine] failed, as malloc shared_ptr failed."); + REPORT_INNER_ERROR("E19999", "RegisterAiCoreEngine failed for new DNNEnginePtr failed."); return; } if (EngineManager::RegisterEngine(ai_core, aicore_engine_ptr) != SUCCESS) { @@ -80,7 +83,8 @@ void RegisterVectorEngine() { DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; DNNEnginePtr vectorcore_engine_ptr = MakeShared(attr_vector_core); if (vectorcore_engine_ptr == nullptr) { - GELOGE(ge::FAILED, "make vectorCoreEnginePtr failed"); + GELOGE(ge::FAILED, "[Register][VectorEngine] failed, as malloc shared_ptr failed."); + REPORT_INNER_ERROR("E19999", "RegisterVectorEngine failed for new DNNEnginePtr failed."); return; } if (EngineManager::RegisterEngine(vector_core, vectorcore_engine_ptr) != SUCCESS) { @@ -92,10 +96,13 @@ void RegisterAiCpuEngine() { const std::string vm_aicpu = "DNN_VM_AICPU_ASCEND"; std::vector mem_type_aicpu; mem_type_aicpu.emplace_back(GE_ENGINE_ATTR_MEM_TYPE_HBM); - DNNEngineAttribute attr_aicpu = {vm_aicpu, mem_type_aicpu, COST_3, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; + + DNNEngineAttribute attr_aicpu = {vm_aicpu, mem_type_aicpu, COST_2, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; + DNNEnginePtr vm_engine_ptr = MakeShared(attr_aicpu); if (vm_engine_ptr == nullptr) { - GELOGE(ge::FAILED, "make vm_engine_ptr failed"); + GELOGE(ge::FAILED, "[Register][AiCpuEngine] failed, as malloc shared_ptr failed."); + REPORT_INNER_ERROR("E19999", "RegisterAiCpuEngine failed for new DNNEnginePtr failed."); return; } if (EngineManager::RegisterEngine(vm_aicpu, vm_engine_ptr) != SUCCESS) { @@ -107,10 +114,13 @@ void RegisterAiCpuTFEngine() { const std::string 
vm_aicpu_tf = "DNN_VM_AICPU"; std::vector mem_type_aicpu_tf; mem_type_aicpu_tf.emplace_back(GE_ENGINE_ATTR_MEM_TYPE_HBM); - DNNEngineAttribute attr_aicpu_tf = {vm_aicpu_tf, mem_type_aicpu_tf, COST_2, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; + + DNNEngineAttribute attr_aicpu_tf = {vm_aicpu_tf, mem_type_aicpu_tf, COST_3, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; + DNNEnginePtr vm_engine_ptr = MakeShared(attr_aicpu_tf); if (vm_engine_ptr == nullptr) { - GELOGE(ge::FAILED, "make vm_engine_ptr failed"); + GELOGE(ge::FAILED, "[Register][AiCpuTFEngine]make vm_engine_ptr failed"); + REPORT_INNER_ERROR("E19999", "RegisterAiCpuTFEngine failed for new DNNEnginePtr failed."); return; } if (EngineManager::RegisterEngine(vm_aicpu_tf, vm_engine_ptr) != SUCCESS) { @@ -126,7 +136,8 @@ void RegisterGeLocalEngine() { DNNEngineAttribute attr_ge_local = {vm_ge_local, mem_type_ge_local, COST_9, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; DNNEnginePtr ge_local_engine = MakeShared(attr_ge_local); if (ge_local_engine == nullptr) { - GELOGE(ge::FAILED, "make ge_local_engine failed"); + GELOGE(ge::FAILED, "[Register][GeLocalEngine] failed, as malloc shared_ptr failed."); + REPORT_INNER_ERROR("E19999", "RegisterGeLocalEngine failed for new DNNEnginePtr failed."); return; } if (EngineManager::RegisterEngine(vm_ge_local, ge_local_engine) != SUCCESS) { @@ -139,10 +150,12 @@ void RegisterHostCpuEngine() { std::vector mem_type_host_cpu; mem_type_host_cpu.emplace_back(GE_ENGINE_ATTR_MEM_TYPE_HBM); // HostCpu use minimum priority, set it as 10 - DNNEngineAttribute attr_host_cpu = {vm_host_cpu, mem_type_host_cpu, COST_10, HOST, FORMAT_RESERVED, FORMAT_RESERVED}; + DNNEngineAttribute attr_host_cpu = {vm_host_cpu, mem_type_host_cpu, COST_10, + HOST, FORMAT_RESERVED, FORMAT_RESERVED}; DNNEnginePtr host_cpu_engine = MakeShared(attr_host_cpu); if (host_cpu_engine == nullptr) { - GELOGE(ge::FAILED, "make host_cpu_engine failed"); + GELOGE(ge::FAILED, "[Register][HostCpuEngine] failed, as malloc 
shared_ptr failed."); + REPORT_INNER_ERROR("E19999", "RegisterHostCpuEngine failed for new DNNEnginePtr failed."); return; } if (EngineManager::RegisterEngine(vm_host_cpu, host_cpu_engine) != SUCCESS) { @@ -157,7 +170,8 @@ void RegisterRtsEngine() { DNNEngineAttribute attr_rts = {vm_rts, mem_type_rts, COST_1, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; DNNEnginePtr rts_engine = MakeShared(attr_rts); if (rts_engine == nullptr) { - GELOGE(ge::FAILED, "make rts_engine failed"); + GELOGE(ge::FAILED, "[Register][RtsEngine] failed, as malloc shared_ptr failed."); + REPORT_INNER_ERROR("E19999", "RegisterRtsEngine failed for new DNNEnginePtr failed."); return; } if (EngineManager::RegisterEngine(vm_rts, rts_engine) != SUCCESS) { @@ -172,7 +186,8 @@ void RegisterHcclEngine() { DNNEngineAttribute attr_hccl = {dnn_hccl, mem_type_hccl, COST_1, DEVICE, FORMAT_RESERVED, FORMAT_RESERVED}; DNNEnginePtr hccl_engine = MakeShared(attr_hccl); if (hccl_engine == nullptr) { - GELOGE(ge::FAILED, "make hccl_engine failed"); + GELOGE(ge::FAILED, "[Register][HcclEngine] failed, as malloc shared_ptr failed."); + REPORT_INNER_ERROR("E19999", "RegisterHcclEngine failed for new DNNEnginePtr failed."); return; } if (EngineManager::RegisterEngine(dnn_hccl, hccl_engine) != SUCCESS) { diff --git a/ge/proto/insert_op.proto b/ge/proto/insert_op.proto index bf918b20..7d708865 100644 --- a/ge/proto/insert_op.proto +++ b/ge/proto/insert_op.proto @@ -88,6 +88,7 @@ message AippOpParams { int32 right_padding_size = 69; int32 top_padding_size = 70; int32 bottom_padding_size = 71; + float padding_value = 72; int32 mean_chn_0 = 10; int32 mean_chn_1 = 11; diff --git a/ge/session/inner_session.cc b/ge/session/inner_session.cc index d11ba10e..e8b3ae0e 100755 --- a/ge/session/inner_session.cc +++ b/ge/session/inner_session.cc @@ -47,7 +47,10 @@ Status CheckReuseMemoryOption(const std::map &options) { } else if (iter->second == "1") { GELOGD("%s=1, reuse memory is close", OPTION_EXEC_DISABLE_REUSED_MEMORY); } else 
{ - GELOGE(PARAM_INVALID, "option %s=%s is invalid", OPTION_EXEC_DISABLE_REUSED_MEMORY, iter->second.c_str()); + GELOGE(PARAM_INVALID, "[CheckReuse][MemoryOption]option %s=%s is invalid", + OPTION_EXEC_DISABLE_REUSED_MEMORY, iter->second.c_str()); + REPORT_INNER_ERROR("E19999", "CheckReuseMemoryOption failed because option %s=%s is invalid.", + OPTION_EXEC_DISABLE_REUSED_MEMORY, iter->second.c_str()); return FAILED; } } @@ -72,7 +75,8 @@ Status InnerSession::Initialize() { Status ret = CheckReuseMemoryOption(all_options); if (ret != SUCCESS) { - GELOGE(ret, "[InnerSession:%lu] check reuse memory option failed.", session_id_); + GELOGE(ret, "[CheckReuse][MemoryOption] failed, [InnerSession:%lu].", session_id_); + REPORT_CALL_ERROR("E19999", "CheckReuseMemoryOption failed, InnerSession=%lu.", session_id_); return ret; } @@ -99,20 +103,22 @@ Status InnerSession::Initialize() { DumpProperties dump_properties; dump_properties.InitByOptions(); - GE_CHK_STATUS_RET(AddDumpProperties(dump_properties), "Add dump properties failed"); + GE_CHK_STATUS_RET(AddDumpProperties(dump_properties), "[Add][DumpProperties] failed."); ret = graph_manager_.Initialize(options_); if (ret != SUCCESS) { - GELOGE(ret, "[InnerSession:%lu] initialize failed.", session_id_); - GE_CHK_STATUS(RemoveDumpProperties(), "Remove dump properties failed"); + GELOGE(ret, "[Init][GraphManager] failed, InnerSession:%lu.", session_id_); + REPORT_CALL_ERROR("E19999", "GraphManager initialize failed, InnerSession:%lu.", session_id_); + GE_CHK_STATUS(RemoveDumpProperties(), "[Remove][DumpProperties] failed."); return ret; } ret = VarManager::Instance(session_id_)->SetMemoryMallocSize(all_options); if (ret != SUCCESS) { - GELOGE(ret, "failed to set malloc size"); + GELOGE(ret, "[Set][MemoryMallocSize] failed."); + REPORT_CALL_ERROR("E19999", "VarManager SetMemoryMallocSize failed, InnerSession:%lu.", session_id_); (void)graph_manager_.Finalize(); - GE_CHK_STATUS(RemoveDumpProperties(), "Remove dump properties 
failed"); + GE_CHK_STATUS(RemoveDumpProperties(), "[Remove][DumpProperties] failed."); GE_CHK_RT(rtDeviceReset(static_cast(GetContext().DeviceId()))); return ret; } @@ -122,8 +128,9 @@ Status InnerSession::Initialize() { const int DEFAULT_JOB_ID = 0; ret = VarManager::Instance(session_id_)->Init(version, session_id_, DEFAULT_DEVICE_ID, DEFAULT_JOB_ID); if (ret != SUCCESS) { - GELOGE(ret, "failed to init session instance"); - GE_CHK_STATUS(RemoveDumpProperties(), "Remove dump properties failed"); + GELOGE(ret, "[Init][VarManager] failed."); + REPORT_CALL_ERROR("E19999", "VarManager init failed, InnerSession:%lu.", session_id_); + GE_CHK_STATUS(RemoveDumpProperties(), "[Remove][DumpProperties] failed."); } init_flag_ = true; return SUCCESS; @@ -139,7 +146,8 @@ Status InnerSession::Finalize() { Status ret = graph_manager_.Finalize(); if (ret != SUCCESS) { // Subsequent code execution is required, so no return is required - GELOGE(ret, "[InnerSession:%lu] finalize failed.", session_id_); + GELOGE(ret, "[Finalize][GraphManager] failed, InnerSession:%lu.", session_id_); + REPORT_CALL_ERROR("E19999", "GraphManager Finalize failed, InnerSession:%lu.", session_id_); } ModelManager::GetInstance()->DestroyAicpuSession(session_id_); @@ -151,7 +159,7 @@ Status InnerSession::Finalize() { Analyzer::GetInstance()->DestroySessionJsonObject(session_id_); GE_CHK_RT(rtDeviceReset(static_cast(GetContext().DeviceId()))); - GE_CHK_STATUS_RET(RemoveDumpProperties(), "Remove dump properties failed"); + GE_CHK_STATUS_RET(RemoveDumpProperties(), "[Remove][DumpProperties] failed."); return ret; } @@ -170,13 +178,17 @@ Status InnerSession::AddGraph(uint32_t graph_id, const Graph &graph, const std::map &options) { std::lock_guard lock(resource_mutex_); if (!init_flag_) { - GELOGE(GE_SESS_INIT_FAILED, "[InnerSession:%lu] initialize failed.", session_id_); + GELOGE(GE_SESS_INIT_FAILED, "[Add][Graph] failed because GraphManager not init, InnerSession:%lu, graph_id:%u.", + session_id_, graph_id); + 
REPORT_INNER_ERROR("E19999", "AddGraph failed because GraphManager not init, InnerSession:%lu, graph_id:%u.", + session_id_, graph_id); return GE_SESS_INIT_FAILED; } UpdateThreadContext(options); Status ret = graph_manager_.AddGraph(graph_id, graph, options, domi::GetContext()); if (ret != SUCCESS) { - GELOGE(ret, "[InnerSession:%lu] add graph %u failed.", session_id_, graph_id); + GELOGE(ret, "[Add][Graph] failed, InnerSession:%lu graphid: %u.", session_id_, graph_id); + REPORT_CALL_ERROR("E19999", "GraphManager AddGraph failed, InnerSession:%lu graphid: %u.", session_id_, graph_id); return ret; } @@ -188,13 +200,19 @@ Status InnerSession::AddGraphWithCopy(uint32_t graph_id, const Graph &graph, const std::map &options) { std::lock_guard lock(resource_mutex_); if (!init_flag_) { - GELOGE(GE_SESS_INIT_FAILED, "[InnerSession:%lu] initialize failed.", session_id_); + GELOGE(GE_SESS_INIT_FAILED, "[Add][Graph] failed because GraphManager not init, InnerSession:%lu, graph_id:%u.", + session_id_, graph_id); + REPORT_INNER_ERROR("E19999", + "AddGraphWithCopy failed because GraphManager not init, InnerSession:%lu, graph_id:%u.", + session_id_, graph_id); return GE_SESS_INIT_FAILED; } UpdateThreadContext(options); Status ret = graph_manager_.AddGraphWithCopy(graph_id, graph, options, domi::GetContext()); if (ret != SUCCESS) { - GELOGE(ret, "[InnerSession:%lu] add graph %u failed.", session_id_, graph_id); + GELOGE(ret, "[Add][Graph] failed, InnerSession:%lu graphid: %u.", session_id_, graph_id); + REPORT_CALL_ERROR("E19999", + "GraphManager AddGraphWithCopy failed, InnerSession:%lu graphid: %u.", session_id_, graph_id); return ret; } @@ -207,7 +225,10 @@ Status InnerSession::RunGraph(uint32_t graph_id, const std::vector &inpu if (mutex_.try_lock()) { std::lock_guard lock(mutex_, std::adopt_lock); if (!init_flag_) { - GELOGE(GE_SESS_INIT_FAILED, "[InnerSession:%lu] initialize failed.", session_id_); + GELOGE(GE_SESS_INIT_FAILED, "[Run][Graph]failed because GraphManager not 
Init, InnerSession:%lu, graph_id:%u.", + session_id_, graph_id); + REPORT_INNER_ERROR("E19999", "RunGraph failed because GraphManager not Init, InnerSession:%lu, graph_id:%u.", + session_id_, graph_id); return GE_SESS_INIT_FAILED; } UpdateThreadContext(graph_id); @@ -220,7 +241,9 @@ Status InnerSession::RunGraph(uint32_t graph_id, const std::vector &inpu domi::GetContext().out_nodes_map.clear(); domi::GetContext().user_out_nodes.clear(); if (ret != SUCCESS) { - GELOGE(ret, "[InnerSession:%lu] run graph failed, graph_id=%u.", session_id_, graph_id); + GELOGE(ret, "[Run][Graph]failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id); + REPORT_CALL_ERROR("E19999", + "GraphManager RunGraph failed, InnerSession:%lu graph_id=%u.", session_id_, graph_id); return ret; } outputs.clear(); @@ -231,7 +254,10 @@ Status InnerSession::RunGraph(uint32_t graph_id, const std::vector &inpu GELOGI("[InnerSession:%lu] run graph success, graph_id=%u.", session_id_, graph_id); return SUCCESS; } else { - GELOGE(GE_SESS_ALREADY_RUNNING, "[InnerSession:%lu] run graph failed, graph_id=%u.", session_id_, graph_id); + GELOGE(GE_SESS_ALREADY_RUNNING, "[Run][Graph]failed, InnerSession:%lu, graph_id=%u.", session_id_, graph_id); + REPORT_INNER_ERROR("E19999", + "RunGraph failed because mutex try_lock false, InnerSession:%lu, graph_id=%u.", + session_id_, graph_id); return GE_SESS_ALREADY_RUNNING; } } @@ -239,13 +265,20 @@ Status InnerSession::RunGraph(uint32_t graph_id, const std::vector &inpu Status InnerSession::RemoveGraph(uint32_t graph_id) { std::lock_guard lock(resource_mutex_); if (!init_flag_) { - GELOGE(GE_SESS_INIT_FAILED, "[InnerSession:%lu] initialize failed.", session_id_); + GELOGE(GE_SESS_INIT_FAILED, + "[Remove][Graph] failed because GraphManager not init, InnerSession:%lu, graph_id=%u.", + session_id_, graph_id); + REPORT_INNER_ERROR("E19999", + "RemoveGraph failed, because GraphManager not init, InnerSession:%lu, graph_id=%u.", + session_id_, graph_id); return 
GE_SESS_INIT_FAILED; } UpdateThreadContext(graph_id); Status ret = graph_manager_.RemoveGraph(graph_id); if (ret != SUCCESS) { - GELOGE(ret, "[InnerSession:%lu] remove graph failed, graph_id=%u.", session_id_, graph_id); + GELOGE(ret, "[Remove][Graph] failed, InnerSession:%lu, graph_id=%u.", session_id_, graph_id); + REPORT_CALL_ERROR("E19999", + "GraphManager RemoveGraph failed, InnerSession:%lu, graph_id=%u.", session_id_, graph_id); return ret; } @@ -258,13 +291,19 @@ Status InnerSession::RegisterCallBackFunc( const std::function &)> &callback) { std::lock_guard lock(resource_mutex_); if (!init_flag_) { - GELOGE(GE_SESS_INIT_FAILED, "[InnerSession:%lu] initialize failed.", session_id_); + GELOGE(GE_SESS_INIT_FAILED, + "[Register][CallBackFunc] failed because GraphManager not initialize, InnerSession:%lu.", session_id_); + REPORT_INNER_ERROR("E19999", + "RegisterCallBackFunc failed because GraphManager not init, InnerSession:%lu.", session_id_); return GE_SESS_INIT_FAILED; } UpdateThreadContext(std::map{}); Status ret = graph_manager_.RegisterCallBackFunc(key, callback); if (ret != SUCCESS) { - GELOGE(ret, "[InnerSession:%lu] register %s callback function failed.", session_id_, key.c_str()); + GELOGE(ret, "[Register][CallBackFunc] failed, InnerSession:%lu register %s.", session_id_, key.c_str()); + REPORT_CALL_ERROR("E19999", + "GraphManager RegisterCallBackFunc failed, InnerSession:%lu register %s.", + session_id_, key.c_str()); return ret; } @@ -277,13 +316,20 @@ Status InnerSession::RegisterCallBackFunc( const std::function &)> &callback) { std::lock_guard lock(resource_mutex_); if (!init_flag_) { - GELOGE(GE_SESS_INIT_FAILED, "[InnerSession:%lu] initialize failed.", session_id_); + GELOGE(GE_SESS_INIT_FAILED, + "[Register][CallBackFunc]failed because GraphManager not initialize, InnerSession:%lu.", session_id_); + REPORT_INNER_ERROR("E19999", + "RegisterCallBackFunc failed because GraphManager not initialize, InnerSession:%lu.", + session_id_); return 
GE_SESS_INIT_FAILED; } UpdateThreadContext(std::map{}); Status ret = graph_manager_.RegisterCallBackFunc(key, callback); if (ret != SUCCESS) { - GELOGE(ret, "[InnerSession:%lu] register %s callback function failed.", session_id_, key.c_str()); + GELOGE(ret, "[Register][CallBackFunc] failed, InnerSession:%lu register %s.", session_id_, key.c_str()); + REPORT_CALL_ERROR("E19999", + "GraphManager RegisterCallBackFunc failed, InnerSession:%lu register %s.", + session_id_, key.c_str()); return ret; } @@ -308,7 +354,9 @@ Status InnerSession::BuildGraph(uint32_t graph_id, const std::vector &atc_params, } } -static Status CheckInputShapeNode(const ComputeGraphPtr &graph, const bool is_dynamic_input, RunMode run_mode) { - if (!is_dynamic_input && run_mode != MODEL_TO_JSON) { +static Status CheckInputShapeNode(const ComputeGraphPtr &graph, bool is_dynamic_input, + const std::string &input_shape_range, RunMode run_mode) { + if (!is_dynamic_input && run_mode != MODEL_TO_JSON && input_shape_range.empty()) { for (auto node : graph->GetDirectNode()) { if (node->GetType() == DATA) { auto data_op_desc = node->GetOpDesc(); @@ -576,6 +577,7 @@ Status InitDomiOmgContext(const string &input_shape, const string &input_format, GELOGE(PARAM_INVALID, "Failed to parse input shape: %s", input_shape.c_str()); return PARAM_INVALID; } + return SUCCESS; } @@ -759,8 +761,9 @@ FMK_FUNC_HOST_VISIBILITY Status ParseGraph(ge::Graph &graph, const std::map &options, SessionId &session_id) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, "[Create][Session]fail for Session manager is not initialized."); + REPORT_INNER_ERROR("E19999", "CreateSession fail for Session manager is not initialized."); return GE_SESSION_MANAGER_NOT_INIT; } SessionId next_session_id = 0; @@ -92,7 +93,10 @@ Status SessionManager::CreateSession(const std::map &o Status SessionManager::DestroySession(SessionId session_id) { if (!init_flag_) { - 
GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Destroy][Session]fail for Session manager is not initialized, session_id:%lu.", session_id); + REPORT_INNER_ERROR("E19999", "DestroySession fail for Session manager is not initialized, session_id:%lu.", + session_id); return GE_SESSION_MANAGER_NOT_INIT; } std::lock_guard lock(mutex_); @@ -119,7 +123,12 @@ Status SessionManager::DestroySession(SessionId session_id) { Status SessionManager::GetVariable(SessionId session_id, const std::string &name, Tensor &val) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Get][Variable]fail for Session manager is not initialized, session_id:%lu, input_name:%s.", + session_id, name.c_str()); + REPORT_INNER_ERROR("E19999", + "GetVariable fail for Session manager is not initialized, session_id:%lu, input_name:%s.", + session_id, name.c_str()); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -143,7 +152,11 @@ Status SessionManager::AddGraph(SessionId session_id, uint32_t graph_id, const G Status SessionManager::AddGraph(SessionId session_id, uint32_t graph_id, const Graph &graph, const std::map &options) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Add][Graph]fail for Session manager is not initialized, session_id:%lu, graph_id:%u.", + session_id, graph_id); + REPORT_INNER_ERROR("E19999", "AddGraph fail for Session manager is not initialized, session_id:%lu, graph_id:%u.", + session_id, graph_id); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -173,7 +186,12 @@ Status SessionManager::AddGraph(SessionId session_id, uint32_t graph_id, const G Status SessionManager::AddGraphWithCopy(SessionId session_id, uint32_t graph_id, const Graph &graph, const std::map &options) { if 
(!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Add][GraphWithCopy]fail for Session manager is not initialized, session_id:%lu, graph_id:%u.", + session_id, graph_id); + REPORT_INNER_ERROR("E19999", + "AddGraphWithCopy fail for Session manager is not initialized, session_id:%lu, graph_id:%u", + session_id, graph_id); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -203,7 +221,12 @@ Status SessionManager::AddGraphWithCopy(SessionId session_id, uint32_t graph_id, Status SessionManager::RunGraph(SessionId session_id, uint32_t graph_id, const std::vector &inputs, std::vector &outputs) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Run][Graph]fail for Session manager is not initialized, session_id:%lu, graph_id:%u.", + session_id, graph_id); + REPORT_INNER_ERROR("E19999", + "RunGraph fail for Session manager is not initialized, session_id:%lu, graph_id:%u.", + session_id, graph_id); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -221,7 +244,12 @@ Status SessionManager::RunGraph(SessionId session_id, uint32_t graph_id, const s Status SessionManager::RemoveGraph(SessionId session_id, uint32_t graph_id) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Remove][Graph]fail for Session manager is not initialized, session_id:%lu graph_id:%u.", + session_id, graph_id); + REPORT_INNER_ERROR("E19999", + "RemoveGraph fail for Session manager is not initialized, session_id:%lu graph_id:%u.", + session_id, graph_id); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -239,7 +267,10 @@ Status SessionManager::RemoveGraph(SessionId session_id, uint32_t graph_id) { bool SessionManager::HasSession(SessionId session_id) { if (!init_flag_) { 
- GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Has][Session]fail for Session manager is not initialized, session_id:%lu.", session_id); + REPORT_INNER_ERROR("E19999", + "HasSession fail for Session manager is not initialized, session_id:%lu.", session_id); return false; } return session_manager_map_.find(session_id) != session_manager_map_.end(); @@ -247,7 +278,8 @@ bool SessionManager::HasSession(SessionId session_id) { Status SessionManager::GetNextSessionId(SessionId &next_session_id) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, "[Get][NextSessionId]fail for Session manager is not initialized."); + REPORT_INNER_ERROR("E19999", "GetNextSessionId fail for Session manager is not initialized."); return GE_SESSION_MANAGER_NOT_INIT; } static SessionId session_id = 0; @@ -260,7 +292,11 @@ Status SessionManager::RegisterCallBackFunc( SessionId session_id, const std::string &key, const std::function &)> &callback) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Register][CallBackFunc]fail for Session manager is not initialized, session_id:%lu, input_key:%s.", + session_id, key.c_str()); + REPORT_INNER_ERROR("E19999", "RegisterCallBackFunc fail for Session manager is not initialized," + "session_id:%lu, input_key:%s.", session_id, key.c_str()); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -280,7 +316,11 @@ Status SessionManager::RegisterCallBackFunc( SessionId session_id, const std::string &key, const std::function &)> &callback) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Register][CallBackFunc]fail for Session manager is not initialized, session_id:%lu, input_key:%s.", + session_id, 
key.c_str()); + REPORT_INNER_ERROR("E19999", "RegisterCallBackFunc fail for Session manager is not initialized," + "session_id:%lu, input_key:%s.", session_id, key.c_str()); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -298,7 +338,10 @@ Status SessionManager::RegisterCallBackFunc( Status SessionManager::BuildGraph(SessionId session_id, uint32_t graph_id, const std::vector &inputs) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, "[Build][Graph]fail for Session manager is not initialized," + "session_id:%lu, graph_id:%u.", session_id, graph_id); + REPORT_INNER_ERROR("E19999", "BuildGraph fail for Session manager is not initialized," + "session_id:%lu, graph_id:%u.", session_id, graph_id); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -317,7 +360,12 @@ Status SessionManager::BuildGraph(SessionId session_id, uint32_t graph_id, const Status SessionManager::RunGraphAsync(SessionId session_id, uint32_t graph_id, const std::vector &inputs, RunAsyncCallback callback) { if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[AsyncRun][Graph]fail for Session manager is not initialized, session_id:%lu, graph_id:%u.", + session_id, graph_id); + REPORT_INNER_ERROR("E19999", + "RunGraphAsync fail for Session manager is not initialized, session_id:%lu, graph_id:%u.", + session_id, graph_id); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -337,7 +385,10 @@ Status SessionManager::GetVariables(SessionId session_id, const std::vector &var_values) { // step 0: init session manager if (!init_flag_) { - GELOGE(GE_SESSION_MANAGER_NOT_INIT, "Session manager is not initialized."); + GELOGE(GE_SESSION_MANAGER_NOT_INIT, + "[Get][Variables]fail for Session manager is not initialized, session_id:%lu", session_id); + 
REPORT_INNER_ERROR("E19999", + "GetVariables fail for Session manager is not initialized, session_id:%lu", session_id); return GE_SESSION_MANAGER_NOT_INIT; } SessionPtr innerSession = nullptr; @@ -355,7 +406,7 @@ Status SessionManager::GetVariables(SessionId session_id, const std::vector all_variables; Status ret = innerSession->GetAllVariables(all_variables); if (ret != SUCCESS) { - GELOGE(FAILED, "Get all variables failed."); + GELOGE(FAILED, "[Get][AllVariables]failed."); return FAILED; } @@ -363,7 +414,7 @@ Status SessionManager::GetVariables(SessionId session_id, const std::vectorGenCheckPointGraph(all_variables, graph); if (ret != SUCCESS) { - GELOGE(FAILED, "Build check point graph failed."); + GELOGE(FAILED, "[GenCheck][PointGraph] failed."); return FAILED; } @@ -371,7 +422,7 @@ Status SessionManager::GetVariables(SessionId session_id, const std::vector outputs; ret = RunGraph(session_id, graph_id, inputs, outputs); if (ret != SUCCESS) { - GELOGE(FAILED, "Run check point graph failed."); + GELOGE(FAILED, "[Run][Graph] failed."); return FAILED; } @@ -388,14 +439,14 @@ Status SessionManager::GetVariables(SessionId session_id, const std::vectorRemoveGraph(graph_id); if (ret != SUCCESS) { - GELOGE(FAILED, "Remove graph failed."); + GELOGE(FAILED, "[Remove][Graph] failed."); return FAILED; } return ret; @@ -403,7 +454,12 @@ Status SessionManager::GetVariables(SessionId session_id, const std::vector lock(mutex_); auto it = session_manager_map_.find(session_id); if (it == session_manager_map_.end()) { - GELOGE(GE_SESSION_NOT_EXIST, "The session %lu does not exists", session_id); + GELOGE(GE_SESSION_NOT_EXIST, "[Find][InnerSession] fail for %lu does not exists", session_id); + REPORT_INNER_ERROR("E19999", + "IsGraphNeedRebuild fail for InnerSession is not exists, session_id:%lu, graph_id:%u.", + session_id, graph_id); return true; } else { innerSession = it->second; diff --git a/ge/single_op/single_op.cc b/ge/single_op/single_op.cc index 4a59a2b6..4b3f17cf 100755 
--- a/ge/single_op/single_op.cc +++ b/ge/single_op/single_op.cc @@ -34,6 +34,9 @@ const size_t kDataMemAlignSize = 32; const size_t kDataMemAlignUnit = 2; const string kShapeTypeDynamic = "dynamic"; const string kShapeTypeStatic = "static"; +const int64_t kHostMemType = 1; +const uint32_t kFuzzDeviceBufferSize = 1 * 1024 * 1024; +const uint32_t kAlignBytes = 512; size_t GetAlignedSize(size_t size) { size_t aligned_size = (size + kDataMemAlignUnit * kDataMemAlignSize - 1) / kDataMemAlignSize * kDataMemAlignSize; @@ -48,7 +51,7 @@ Status ProfilingTaskInfo(OpTask *op_task, const string &shape_type) { TaskDescInfo tmp_task_desc_info; uint32_t model_id; if (op_task->GetProfilingArgs(tmp_task_desc_info, model_id) != SUCCESS) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Get profiling data of task failed"); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Get][ProfilingArgs] failed."); return ACL_ERROR_GE_PARAM_INVALID; } GELOGD("ProfilingReport of op[%s] model[%s] start.", @@ -65,6 +68,68 @@ Status ProfilingTaskInfo(OpTask *op_task, const string &shape_type) { profiling_manager.ReportProfilingData(model_id, task_desc_info); return SUCCESS; } + +Status CalInputsHostMemSize(const std::vector &inputs, + std::vector> &inputs_size) { + int64_t total_size = 0; + size_t index = 0; + for (auto &input_buffer : inputs) { + int64_t input_size = 0; + if (input_buffer.placement == kHostMemType) { + GE_CHECK_LE(input_buffer.length, INT64_MAX); + input_size = input_buffer.length; + // input_size pad to 512 + GE_CHK_STATUS_RET(CheckInt64AddOverflow(input_size, (kAlignBytes - 1)), "Padding size is beyond the INT64_MAX."); + input_size = ((input_size + kAlignBytes - 1) / kAlignBytes) * kAlignBytes; + inputs_size.emplace_back(index, input_size); + GE_CHK_STATUS_RET(CheckInt64AddOverflow(total_size, input_size), "Total size is beyond the INT64_MAX."); + total_size += input_size; + GELOGD("The %zu input mem type is host, the tensor size is %ld.", index, input_size); + } + index++; + } + if (total_size > 
kFuzzDeviceBufferSize) { + GELOGE(FAILED, "[Check][Size]Total size is %ld, larger than 1M.", total_size); + return FAILED; + } + return SUCCESS; +} + +Status UpdateInputsBufferAddr(StreamResource *stream_resource, rtStream_t stream, + const std::vector> &inputs_size, + std::vector &update_buffers) { + GE_CHECK_NOTNULL(stream_resource); + auto dst_addr = reinterpret_cast(stream_resource->GetDeviceBufferAddr()); + // copy host mem from input_buffer to device mem of dst_addr + for (const auto &input_size : inputs_size) { + auto index = input_size.first; + auto size = input_size.second; + GELOGD("Do h2d for %zu input, dst size is %zu, src length is %lu.", index, size, update_buffers[index].length); + GE_CHK_RT_RET(rtMemcpyAsync(dst_addr, size, update_buffers[index].data, update_buffers[index].length, + RT_MEMCPY_HOST_TO_DEVICE_EX, stream)); + update_buffers[index].data = dst_addr; + dst_addr = dst_addr + size; + } + return SUCCESS; +} + +Status InitHybridModelArgs(const std::vector &input_buffers, + const std::vector &output_buffers, + const std::vector &inputs_desc, + hybrid::HybridModelExecutor::ExecuteArgs &args) { + for (auto &input : input_buffers) { + args.inputs.emplace_back(hybrid::TensorValue(input.data, input.length)); + } + for (auto &output : output_buffers) { + args.outputs.emplace_back(hybrid::TensorValue(output.data, output.length)); + } + for (auto &tensor_desc : inputs_desc) { + auto desc = MakeShared(tensor_desc); + GE_CHECK_NOTNULL(desc); + args.input_desc.emplace_back(desc); + } + return SUCCESS; +} } // namespace SingleOp::SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream) @@ -81,8 +146,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() { Status SingleOp::ValidateArgs(const std::vector &inputs, const std::vector &outputs) { auto num_inputs = inputs.size(); if (num_inputs != input_sizes_.size()) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input num mismatch. 
model expect %zu, but given %zu", input_addr_list_.size(), + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][Param:inputs]Input num mismatch. model expect %zu, but given %zu", input_addr_list_.size(), inputs.size()); + REPORT_INPUT_ERROR("E10401", std::vector({"expect_num", "input_num"}), + std::vector({std::to_string(input_addr_list_.size()), std::to_string(num_inputs)})); return ACL_ERROR_GE_PARAM_INVALID; } @@ -92,16 +160,22 @@ Status SingleOp::ValidateArgs(const std::vector &inputs, const std:: GELOGI("Input [%zu], aligned_size:%zu, inputs.length:%lu, input_sizes_:%zu", i, aligned_size, inputs[i].length, input_sizes_[i]); if (aligned_size < input_sizes_[i]) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input size mismatch. index = %zu, model expect %zu," - " but given %zu(after align)", i, input_sizes_[i], aligned_size); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][Param:inputs]Input size mismatch. index = %zu, model expect %zu, but given %zu(after align)", + i, input_sizes_[i], aligned_size); + REPORT_INPUT_ERROR("E10402", std::vector({"index", "expect_size", "input_size"}), + std::vector({std::to_string(i), std::to_string(input_sizes_[i]), std::to_string(aligned_size)}) + ); return ACL_ERROR_GE_PARAM_INVALID; } } auto num_outputs = outputs.size(); if (num_outputs != output_sizes_.size()) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "output num mismatch. model expect %zu, but given %zu", - output_sizes_.size(), outputs.size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param:outputs]output num mismatch. 
model expect %zu, but given %zu", + output_sizes_.size(), outputs.size()); + REPORT_INPUT_ERROR("E10403", std::vector({"expect_num", "input_num"}), + std::vector({std::to_string(output_sizes_.size()), std::to_string(outputs.size())})); return ACL_ERROR_GE_PARAM_INVALID; } @@ -111,8 +185,12 @@ Status SingleOp::ValidateArgs(const std::vector &inputs, const std:: GELOGI("Output [%zu], aligned_size:%zu, outputs.length:%lu, output_sizes_:%zu", i, aligned_size, outputs[i].length, output_sizes_[i]); if (aligned_size < output_sizes_[i]) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Output size mismatch. index = %zu, model expect %zu," - "but given %zu(after align)", i, output_sizes_[i], aligned_size); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "[Check][Param:outputs]Output size mismatch. index = %zu, model expect %zu, but given %zu(after align)", + i, output_sizes_[i], aligned_size); + REPORT_INPUT_ERROR("E10404", std::vector({"index", "expect_size", "input_size"}), + std::vector({std::to_string(i), std::to_string(output_sizes_[i]), std::to_string(aligned_size)}) + ); return ACL_ERROR_GE_PARAM_INVALID; } } @@ -155,25 +233,39 @@ Status SingleOp::UpdateArgs(const std::vector &inputs, const std::ve FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(const std::vector &inputs, const std::vector &outputs) { + GELOGD("Start SingleOp::ExecuteAsync."); Status ret = ValidateArgs(inputs, outputs); if (ret != SUCCESS) { return ret; } GE_CHECK_NOTNULL(stream_resource_); + vector> inputs_size; + GE_CHK_STATUS_RET_NOLOG(CalInputsHostMemSize(inputs, inputs_size)); std::lock_guard lk(*stream_mutex_); + vector update_buffers = inputs; + if (!inputs_size.empty()) { + GE_CHK_STATUS_RET_NOLOG(UpdateInputsBufferAddr(stream_resource_, stream_, inputs_size, update_buffers)); + } + + if (hybrid_model_executor_ != nullptr) { + GELOGD("Execute multi-task single op by hybrid model executor"); + hybrid::HybridModelExecutor::ExecuteArgs args; + 
GE_CHK_STATUS_RET_NOLOG(InitHybridModelArgs(update_buffers, outputs, inputs_desc_, args)); + return hybrid_model_executor_->Execute(args); + } + auto current_mem_base = stream_resource_->GetMemoryBase(); if (running_param_->mem_base != current_mem_base) { running_param_->mem_base = const_cast(current_mem_base); GELOGD("Memory base changed, new memory base = %p", current_mem_base); for (auto &task : tasks_) { auto new_address = BuildTaskUtils::GetAddresses(task->GetOpdesc(), *running_param_); - GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_), - "[%s] Failed to update arg table", - task->GetOpdesc()->GetName().c_str()); + GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_), "[Update][ArgTable] failed, single op:%s.", + task->GetOpdesc()->GetName().c_str()); } } - ret = UpdateArgs(inputs, outputs); + ret = UpdateArgs(update_buffers, outputs); if (ret != SUCCESS) { return ret; } @@ -183,7 +275,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c if (ret != SUCCESS) { return ret; } - GE_CHK_STATUS_RET(task->OpenDump(stream_), "Open single op %s dump filed",task->GetOpdesc()->GetName().c_str()); + GE_CHK_STATUS_RET(task->OpenDump(stream_), "[Open][Dump]failed, single op:%s.", + task->GetOpdesc()->GetName().c_str()); GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(task, kShapeTypeStatic)); } @@ -204,66 +297,99 @@ Status DynamicSingleOp::ValidateParams(const vector &input_desc, std::vector &outputs) const { if (inputs.size() != input_desc.size()) { GELOGE(ACL_ERROR_GE_PARAM_INVALID, - "Input number mismatches input desc number. Input num = %zu, input desc num = %zu", - inputs.size(), - input_desc.size()); + "[Check][Param:inputs]Input number mismatches input desc number. 
Input num = %zu, input desc num = %zu", + inputs.size(), input_desc.size()); + REPORT_INPUT_ERROR("E10405", std::vector({"input_num", "input_desc_num"}), + std::vector({std::to_string(inputs.size()), std::to_string(input_desc.size())})); return ACL_ERROR_GE_PARAM_INVALID; } if (outputs.size() != output_desc.size()) { GELOGE(ACL_ERROR_GE_PARAM_INVALID, - "Output number mismatches output desc number. Output num = %zu, output desc num = %zu", - outputs.size(), - output_desc.size()); + "[Check][Param:outputs]Output number mismatches output desc number. Output num = %zu, output desc num = %zu", + outputs.size(), output_desc.size()); + REPORT_INPUT_ERROR("E10406", std::vector({"out_num", "out_desc_num"}), + std::vector({std::to_string(outputs.size()), std::to_string(output_desc.size())})); return ACL_ERROR_GE_PARAM_INVALID; } if (input_desc.size() != num_inputs_) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, - "Input number mismatches. expect %zu, but given %zu", - num_inputs_, - input_desc.size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param:input_desc]Input number mismatches. expect %zu, but given %zu", + num_inputs_, input_desc.size()); + REPORT_INPUT_ERROR("E10401", std::vector({"expect_num", "input_num"}), + std::vector({std::to_string(num_inputs_), std::to_string(input_desc.size())})); return ACL_ERROR_GE_PARAM_INVALID; } if (output_desc.size() != num_outputs_) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, - "Output number mismatches. expect %zu, but given %zu", - num_outputs_, - output_desc.size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param:output_desc]Output number mismatches. 
expect %zu, but given %zu", + num_outputs_, output_desc.size()); + REPORT_INPUT_ERROR("E10403", std::vector({"expect_num", "input_num"}), + std::vector({std::to_string(num_outputs_), std::to_string(output_desc.size())})); return ACL_ERROR_GE_PARAM_INVALID; } return SUCCESS; } +Status DynamicSingleOp::SetHostTensorValue(const std::vector> &inputs_size, + const vector &input_desc, + const std::vector &input_buffers) { + auto op_desc = op_task_->GetOpdesc(); + GE_CHECK_NOTNULL(op_desc); + GELOGD("Start update inputs tensor value of %s.", op_desc->GetName().c_str()); + for (const auto &input_size : inputs_size) { + size_t index = input_size.first; + auto ge_tensor_desc = input_desc.at(index); + // reconstruct GeTensor by DataBuffer + GeTensorPtr ge_tensor = MakeShared(ge_tensor_desc); + GE_CHECK_NOTNULL(ge_tensor); + GELOGD("The %zu tensor input type is host, desc data type is %d, input buffer addr is %p, size is %ld.", + index, ge_tensor_desc.GetDataType(), input_buffers[index].data, input_buffers[index].length); + if (ge_tensor->SetData(reinterpret_cast(input_buffers[index].data), + static_cast(input_buffers[index].length)) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "[Set][Data]Failed to set data of ge tensor."); + return INTERNAL_ERROR; + } + auto tensor_desc = op_desc->MutableInputDesc(index); + GE_CHECK_NOTNULL(tensor_desc); + if (!AttrUtils::SetTensor(tensor_desc, ATTR_NAME_VALUE, ge_tensor)) { + GELOGE(FAILED, "[Set][ATTR_NAME_VALUE]Failed to set ATTR_NAME_VALUE to %s.", op_desc->GetName().c_str()); + return FAILED; + } + } + return SUCCESS; +} + Status DynamicSingleOp::ExecuteAsync(const vector &input_desc, const vector &input_buffers, vector &output_desc, vector &output_buffers) { + GELOGD("Start DynamicSingleOp::ExecuteAsync."); GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers)); + vector> inputs_size; + GE_CHK_STATUS_RET_NOLOG(CalInputsHostMemSize(input_buffers, inputs_size)); + vector update_buffers = input_buffers; 
+ std::lock_guard lk(*stream_mutex_); + if (!inputs_size.empty()) { + StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_); + GE_CHK_STATUS_RET_NOLOG(UpdateInputsBufferAddr(stream_resource, stream_, inputs_size, update_buffers)); + } + if (hybrid_model_executor_ != nullptr) { GELOGD("Execute multi-task dynamic single op by hybrid model executor"); hybrid::HybridModelExecutor::ExecuteArgs args; - for (auto &input : input_buffers) { - args.inputs.emplace_back(hybrid::TensorValue(input.data, input.length)); - } - for (auto &output : output_buffers) { - args.outputs.emplace_back(hybrid::TensorValue(output.data, output.length)); - } - for (auto &tensor_desc : input_desc) { - auto desc = MakeShared(tensor_desc); - GE_CHECK_NOTNULL(desc); - args.input_desc.emplace_back(desc); - } + GE_CHK_STATUS_RET_NOLOG(InitHybridModelArgs(update_buffers, output_buffers, input_desc, args)); return hybrid_model_executor_->Execute(args); } - - std::lock_guard lk(*stream_mutex_); GE_CHECK_NOTNULL(op_task_); - - GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_)); + if (!inputs_size.empty()) { + GE_CHK_STATUS_RET_NOLOG(SetHostTensorValue(inputs_size, input_desc, input_buffers)); + GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, update_buffers, output_desc, output_buffers, stream_)); + } else { + GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_)); + } GE_CHK_STATUS_RET_NOLOG(op_task_->OpenDump(stream_)); GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get(), kShapeTypeDynamic)); return SUCCESS; diff --git a/ge/single_op/single_op.h b/ge/single_op/single_op.h index b350b684..01d6dfc0 100755 --- a/ge/single_op/single_op.h +++ b/ge/single_op/single_op.h @@ -59,6 +59,9 @@ class SingleOp { std::vector tasks_; std::vector> arg_table_; std::unique_ptr running_param_; + std::unique_ptr hybrid_model_; + std::unique_ptr 
hybrid_model_executor_; + std::vector inputs_desc_; }; class DynamicSingleOp { @@ -76,7 +79,8 @@ class DynamicSingleOp { const std::vector &inputs, std::vector &output_desc, std::vector &outputs) const; - + Status SetHostTensorValue(const std::vector> &inputs_size, + const vector &input_desc, const std::vector &input_buffers); std::unique_ptr op_task_; std::unique_ptr hybrid_model_; std::unique_ptr hybrid_model_executor_; @@ -85,6 +89,7 @@ class DynamicSingleOp { rtStream_t stream_ = nullptr; size_t num_inputs_ = 0; size_t num_outputs_ = 0; + ComputeGraphPtr compute_graph_; }; } // namespace ge #endif // GE_SINGLE_OP_SINGLE_OP_H_ diff --git a/ge/single_op/single_op_manager.cc b/ge/single_op/single_op_manager.cc index fddbeec2..667e987b 100644 --- a/ge/single_op/single_op_manager.cc +++ b/ge/single_op/single_op_manager.cc @@ -19,6 +19,9 @@ #include #include +#include "graph/manager/graph_mem_allocator.h" +#include "graph/manager/graph_caching_allocator.h" + namespace ge { FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOpManager::~SingleOpManager() { for (auto &it : stream_resources_) { @@ -34,7 +37,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOpManager::GetOpFr const uint64_t model_id) { GELOGI("GetOpFromModel in. 
model name = %s, model id = %lu", model_name.c_str(), model_id); if (single_op == nullptr) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "single op is null"); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Param:single_op] is null."); + REPORT_INPUT_ERROR("E10412", std::vector({"inputparam"}), std::vector({"single_op"})); return ACL_ERROR_GE_INTERNAL_ERROR; } @@ -42,7 +46,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOpManager::GetOpFr GE_CHK_STATUS_RET(GetResourceId(stream, resource_id)); StreamResource *res = GetResource(resource_id, stream); if (res == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "GetResource failed"); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Get][Resource] failed."); + REPORT_CALL_ERROR("E19999", "GetOpFromModel fail because GetResource return nullptr."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -67,6 +72,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOpManager::Release delete it->second; it->second = nullptr; (void)stream_resources_.erase(it); + MemManager::Instance().CachingInstance(RT_MEMORY_HBM).TryFreeBlocks(); return SUCCESS; } @@ -75,8 +81,13 @@ StreamResource *SingleOpManager::GetResource(uintptr_t resource_id, rtStream_t s auto it = stream_resources_.find(resource_id); StreamResource *res = nullptr; if (it == stream_resources_.end()) { - res = new (std::nothrow) StreamResource(resource_id); + res = new(std::nothrow) StreamResource(resource_id); if (res != nullptr) { + if (res->Init() != SUCCESS) { + GELOGE(FAILED, "[Malloc][Memory]Failed to malloc device buffer."); + delete res; + return nullptr; + } res->SetStream(stream); stream_resources_.emplace(resource_id, res); } @@ -112,7 +123,8 @@ Status SingleOpManager::GetDynamicOpFromModel(const string &model_name, GE_CHK_STATUS_RET(GetResourceId(stream, resource_id)); StreamResource *res = GetResource(resource_id, stream); if (res == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "GetResource failed"); + 
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Get][Resource] failed."); + REPORT_CALL_ERROR("E19999", "GetDynamicOpFromModel fail because GetResource return nullptr."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -143,7 +155,9 @@ Status SingleOpManager::GetResourceId(rtStream_t stream, uintptr_t &resource_id) rtContext_t rt_cur_ctx = nullptr; auto rt_err = rtCtxGetCurrent(&rt_cur_ctx); if (rt_err != RT_ERROR_NONE) { - GELOGE(rt_err, "get current context failed, runtime result is %d", static_cast(rt_err)); + GELOGE(rt_err, "[Get][CurrentContext] failed, runtime result is %d", static_cast(rt_err)); + REPORT_CALL_ERROR("E19999", + "GetResourceId failed because rtCtxGetCurrent result is %d", static_cast(rt_err)); return RT_ERROR_TO_GE_STATUS(rt_err); } // use current context as resource key instead diff --git a/ge/single_op/single_op_model.cc b/ge/single_op/single_op_model.cc index 49dde9c4..a4135999 100755 --- a/ge/single_op/single_op_model.cc +++ b/ge/single_op/single_op_model.cc @@ -43,20 +43,51 @@ using std::vector; namespace ge { namespace { const size_t kDataOutputNum = 1; +const uint32_t kOutputIndexOfData = 0; +constexpr char const *kAttrSupportDynamicShape = "support_dynamicshape"; -bool NeedHybridModel(GeModelPtr &ge_model) { +Status IfInferDepend(GeModelPtr &ge_model, bool &flag) { + auto comp_graph = GraphUtils::GetComputeGraph(ge_model->GetGraph()); + GE_CHECK_NOTNULL(comp_graph); + for (const auto &node : comp_graph->GetAllNodes()) { + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + const auto &depends = op_desc->GetOpInferDepends(); + bool support_dynamic_shape = false; + (void)AttrUtils::GetBool(op_desc, kAttrSupportDynamicShape, support_dynamic_shape); + if (!depends.empty() && support_dynamic_shape) { + flag = true; + return SUCCESS; + } + } + return SUCCESS; +} + +Status NeedHybridModel(GeModelPtr &ge_model, bool &flag) { + bool infer_depend_flag = false; + GE_CHK_STATUS_RET(IfInferDepend(ge_model, infer_depend_flag), 
"[Check][InferDepend] failed."); auto tasks = ge_model->GetModelTaskDefPtr()->task(); int32_t kernel_task_num = 0; for (int i = 0; i < tasks.size(); ++i) { auto task_type = static_cast(tasks[i].type()); if (task_type == RT_MODEL_TASK_KERNEL || task_type == RT_MODEL_TASK_ALL_KERNEL) { - kernel_task_num++; - if (kernel_task_num > 1) { - return true; + const auto &context = task_type == RT_MODEL_TASK_KERNEL ? tasks[i].kernel().context() : + tasks[i].kernel_with_handle().context(); + auto kernel_type = static_cast(context.kernel_type()); + if (kernel_type == ccKernelType::TE) { + if (infer_depend_flag) { + flag = true; + return SUCCESS; + } + kernel_task_num++; + if (kernel_task_num > 1) { + flag = true; + return SUCCESS; + } } } } - return false; + return SUCCESS; } } // namespace @@ -75,7 +106,8 @@ Status SingleOpModel::InitModel() { auto ret = model_helper_.LoadModel(model); if (ret != SUCCESS) { - GELOGE(ret, "LoadModel failed"); + GELOGE(ret, "[Load][Model] failed."); + REPORT_CALL_ERROR("E19999", "InitModel fail for ModelHelper LoadModel failed."); return ret; } @@ -100,7 +132,7 @@ void SingleOpModel::ParseOpModelParams(ModelHelper &model_helper, SingleOpModelP ret = ge::AttrUtils::GetInt(model, ATTR_MODEL_CORE_TYPE, value); param.core_type = ret ? value : 0; - GELOGI("ParseOpModelParams(), total_memory_size:%lu, zero_copy_size:%lu, weight_size:%lu. 
core_type = %lu", + GELOGI("ParseOpModelParams(), total_memory_size:%lu, zero_copy_size:%lu, weight_size:%lu, core_type = %lu", param.memory_size, param.zero_copy_mem_size, param.weight_size, param.core_type); } @@ -141,7 +173,11 @@ Status SingleOpModel::ParseInputNode(const OpDescPtr &op_desc) { vector offsets = op_desc->GetOutputOffset(); if (offsets.size() != kDataOutputNum) { GELOGE(ACL_ERROR_GE_PARAM_INVALID, - "Data op should have only one output, but got %zu", op_desc->GetOutputOffset().size()); + "[Parse][InputNode]Data op should have only one output, but got %zu, op_name:%s, op_type:%s.", + op_desc->GetOutputOffset().size(), op_desc->GetName().c_str(), op_desc->GetType().c_str()); + REPORT_INNER_ERROR("E19999", "ParseInputNode fail for Data op should have only one output, but got %zu," + "op_name:%s, op_type:%s.", op_desc->GetOutputOffset().size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); return ACL_ERROR_GE_PARAM_INVALID; } @@ -179,7 +215,9 @@ Status SingleOpModel::LoadAllNodes() { model_id_ = ge_model->GetModelId(); auto compute_graph = GraphUtils::GetComputeGraph(graph); if (compute_graph == nullptr) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[%s] compute_graph is null", model_name_.c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][ComputeGraph] fail, model_name:%s.", model_name_.c_str()); + REPORT_CALL_ERROR("E19999", "LoadAllNodes fail for GetComputeGraph return nullptr, model_name:%s.", + model_name_.c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } @@ -287,7 +325,11 @@ Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &s single_op.tasks_.emplace_back(task); } else { GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, - "Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", context.kernel_type()); + "[Check][KernelType]Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", + context.kernel_type()); + REPORT_INNER_ERROR("E19999", + "BuildTaskList fail for %u not supported, Only TBE, AI_CPU, 
CUST_AI_CPU kernel are supported.", + context.kernel_type()); return ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID; } } else if (task_type == RT_MODEL_TASK_KERNEL_EX) { @@ -313,7 +355,8 @@ Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &s void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) { if (task == nullptr) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "tbe op task is nullptr"); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Parse][ArgTable] fail for input OpTask is nullptr."); + REPORT_INNER_ERROR("E19999", "ParseArgTable fail for input OpTask is nullptr."); return; } @@ -340,13 +383,15 @@ Status SingleOpModel::BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask * task_def.kernel_with_handle().context(); auto iter = op_list_.find(context.op_index()); if (iter == op_list_.end()) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "op desc not found. op index = %u", context.op_index()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Param:TaskDef]op desc not found. op index = %u", context.op_index()); + REPORT_INNER_ERROR("E19999", "BuildKernelTask fail for op desc not found. op index = %u", context.op_index()); return ACL_ERROR_GE_INTERNAL_ERROR; } auto *tbe_task = new (std::nothrow) TbeOpTask(); if (tbe_task == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "create tbe op task failed"); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][TbeOpTask]failed."); + REPORT_INNER_ERROR("E19999", "BuildKernelTask fail for new TbeOpTask."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -366,19 +411,24 @@ Status SingleOpModel::BuildKernelExTask(const domi::KernelExDef &kernel_def, AiC bool dynamic_flag, bool& depend_compute_flag, uint64_t kernel_id) { auto iter = op_list_.find(kernel_def.op_index()); if (iter == op_list_.end()) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "op desc not found. op index = %u", kernel_def.op_index()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, + "[Check][Param:KernelExDef]op not found. 
op index = %u", kernel_def.op_index()); + REPORT_INNER_ERROR("E19999", + "BuildKernelExTask fail for param kernel_def, because op of kernel_def not found, op index:%u.", + kernel_def.op_index()); return ACL_ERROR_GE_INTERNAL_ERROR; } std::unique_ptr aicpu_task(new (std::nothrow) AiCpuTask()); if (aicpu_task == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "create aicpu_TF op task failed"); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][AiCpuTask] failed."); + REPORT_INNER_ERROR("E19999", "BuildKernelExTask fail for new AiCpuTask, model_name:%s.", model_name_.c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } auto builder = AiCpuTaskBuilder(iter->second->GetOpDesc(), kernel_def); auto ret = builder.BuildTask(*aicpu_task, model_params_, dynamic_flag, kernel_id); if (ret != SUCCESS) { - GELOGE(ret, "build aicpu_TF op task failed"); + GELOGE(ret, "[Build][Task] failed, kernel_id:%lu.", kernel_id); return ret; } depend_compute_flag = (aicpu_task->GetUnknownType() == DEPEND_COMPUTE); @@ -391,25 +441,56 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa const auto &context = kernel_def.context(); auto iter = op_list_.find(context.op_index()); if (iter == op_list_.end()) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "op desc not found. op index = %u", context.op_index()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, + "[Check][Param:KernelDef] op desc not found. 
op index = %u", context.op_index()); + REPORT_INNER_ERROR("E19999", + "BuildCpuKernelTask fail for kernel_def is invalid, because op of kernel_def not found, op index:%u.", + context.op_index()); return ACL_ERROR_GE_INTERNAL_ERROR; } std::unique_ptr aicpucc_task(new (std::nothrow) AiCpuCCTask()); if (aicpucc_task == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "create aicpu_CC op task failed"); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][AiCpuCCTask] failed"); + REPORT_INNER_ERROR("E19999", "BuildCpuKernelTask fail for new AiCpuCCTask, model_name:%s.", model_name_.c_str()); return ACL_ERROR_GE_MEMORY_ALLOCATION; } auto builder = AiCpuCCTaskBuilder(iter->second->GetOpDesc(), kernel_def); auto ret = builder.BuildTask(*aicpucc_task, kernel_id, model_params_); if (ret != SUCCESS) { - GELOGE(ret, "build aicpu_CC op task failed"); + GELOGE(ret, "[Build][AiCpuCCTask]failed, kernel_id:%lu.", kernel_id); + REPORT_CALL_ERROR("E19999", "BuildCpuKernelTask fail for build AiCpuTask, kernel_id:%lu.", kernel_id); return ret; } *task = aicpucc_task.release(); return SUCCESS; } +Status SingleOpModel::InitHybridModelExecutor(const StreamResource &resource, const GeModelPtr &ge_model, + SingleOp &single_op) { + for (const auto &op_desc : data_ops_) { + auto output_tensor_desc = op_desc->GetOutputDesc(kOutputIndexOfData); + GeTensorDesc tensor_desc(output_tensor_desc); + single_op.inputs_desc_.emplace_back(tensor_desc); + GELOGD("Init inputs desc from %s.", op_desc->GetName().c_str()); + } + GE_CHK_STATUS_RET_NOLOG(hybrid::NodeExecutorManager::GetInstance().EnsureInitialized()); + auto root_model = model_helper_.GetGeRootModel(); + GE_CHECK_NOTNULL(root_model); + root_model->SetRootGraph(GraphUtils::GetComputeGraph(ge_model->GetGraph())); + root_model->SetSubgraphInstanceNameToModel(root_model->GetRootGraph()->GetName(), ge_model); + single_op.hybrid_model_.reset(new (std::nothrow)hybrid::HybridModel(root_model)); + GE_CHECK_NOTNULL(single_op.hybrid_model_); + 
GE_CHK_STATUS_RET(single_op.hybrid_model_->Init(true), "[Init][HybridModel]Failed."); + int32_t device_id = 0; + GE_CHK_RT_RET(rtGetDevice(&device_id)); + single_op.hybrid_model_executor_.reset(new (std::nothrow)hybrid::HybridModelExecutor(single_op.hybrid_model_.get(), + device_id, + resource.GetStream())); + GE_CHECK_NOTNULL(single_op.hybrid_model_executor_); + GE_CHK_STATUS_RET(single_op.hybrid_model_executor_->Init(), "[Init][HybridModelExecutor]Failed."); + return SUCCESS; +} Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) { GE_CHK_STATUS_RET_NOLOG(ParseInputsAndOutputs()); @@ -417,20 +498,34 @@ Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) { single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params_)); GE_CHECK_NOTNULL(single_op.running_param_); GE_CHK_STATUS_RET_NOLOG(SetInputsAndOutputs(single_op)); + auto ge_model = model_helper_.GetGeModel(); + GE_CHECK_NOTNULL(ge_model); + bool infer_depend_flag = false; + GE_CHK_STATUS_RET(IfInferDepend(ge_model, infer_depend_flag), "[Check][InferDepend] failed."); + if (infer_depend_flag) { + // construct single_op, do single op with HybridModelExecutor + GELOGD("Init hybrid model params of single op, and will do execute with hybrid model executor."); + return InitHybridModelExecutor(resource, ge_model, single_op); + } return BuildTaskList(&resource, single_op); } -Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) { +Status SingleOpModel::BuildModelTaskKernel(StreamResource *stream_resource, const TaskDef &task_def, + DynamicSingleOp &single_op) { auto task_type = static_cast(task_def.type()); const auto &context = task_type == RT_MODEL_TASK_KERNEL ? 
task_def.kernel().context() : task_def.kernel_with_handle().context(); auto kernel_type = static_cast(context.kernel_type()); if (kernel_type == ccKernelType::TE) { - GELOGD("Building TBE task"); + GELOGD("Building TBE task."); TbeOpTask *tbe_task = nullptr; GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def, &tbe_task)); tbe_task->SetModelArgs(model_name_, model_id_); + if (tbe_task->tiling_buffer_ != nullptr) { + GELOGD("tiling buffer is not nullptr."); + tbe_task->stream_resource_ = stream_resource; + } single_op.op_task_.reset(tbe_task); } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) { GELOGD("Building AICPU_CC task"); @@ -442,31 +537,42 @@ Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingl single_op.op_task_.reset(task); } else { GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, - "Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", context.kernel_type()); + "[Check][Param:TaskDef]Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", + context.kernel_type()); + REPORT_INNER_ERROR("E19999", + "BuildModelTaskKernel fail for got:%u not supported, Only TBE, AI_CPU, CUST_AI_CPU kernel are supported.", + context.kernel_type()); return ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID; } return SUCCESS; } -Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) { +Status SingleOpModel::BuildTaskListForDynamicOp(StreamResource *stream_resource, DynamicSingleOp &single_op) { auto ge_model = model_helper_.GetGeModel(); GE_CHECK_NOTNULL(ge_model); + auto compute_graph = GraphUtils::GetComputeGraph(ge_model->GetGraph()); + GE_CHECK_NOTNULL(compute_graph); + single_op.compute_graph_ = compute_graph; auto tasks = ge_model->GetModelTaskDefPtr()->task(); for (int i = 0; i < tasks.size(); ++i) { const TaskDef &task_def = tasks[i]; - GELOGI("[%s] Task[%d], type = %u, DebugString = %s", model_name_.c_str(), i, task_def.type(), + GELOGI("[%s] Task[%d], type = [%u], DebugString = 
[%s]", model_name_.c_str(), i, task_def.type(), task_def.DebugString().c_str()); auto task_type = static_cast(task_def.type()); if (task_type == RT_MODEL_TASK_KERNEL || task_type == RT_MODEL_TASK_ALL_KERNEL) { if (single_op.op_task_ != nullptr) { - GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, "Do not support dynamic op with multiple tasks."); + GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, "[Check][TaskType]Do not support dynamic op with multiple tasks."); + REPORT_INNER_ERROR("E19999", + "BuildTaskListForDynamicOp fail for Do not support dynamic op with multiple tasks."); return ACL_ERROR_GE_OP_TASK_TYPE_INVALID; } - GE_CHK_STATUS_RET_NOLOG(BuildModelTaskKernel(task_def, single_op)); + GE_CHK_STATUS_RET_NOLOG(BuildModelTaskKernel(stream_resource, task_def, single_op)); } else if (task_type == RT_MODEL_TASK_KERNEL_EX) { if (single_op.op_task_ != nullptr) { - GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, "Do not support dynamic op with multiple tasks."); + GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, "[Check][TaskType]Do not support dynamic op with multiple tasks."); + REPORT_INNER_ERROR("E19999", + "BuildTaskListForDynamicOp fail for Do not support dynamic op with multiple tasks."); return ACL_ERROR_GE_OP_TASK_TYPE_INVALID; } GELOGD("Building AICPU_TF task"); @@ -478,7 +584,8 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) { depend_compute_flag, dynamic_singleop_kernel_id)); if (depend_compute_flag) { if (i >= tasks.size() - 1) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "The copy task of the fourth operator was not found."); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Task]The copy task of the fourth operator was not found."); + REPORT_INNER_ERROR("E19999", "The copy task of the fourth operator was not found."); return ACL_ERROR_GE_PARAM_INVALID; } ++i; @@ -500,10 +607,13 @@ Status SingleOpModel::BuildDynamicOp(StreamResource &resource, DynamicSingleOp & single_op.num_outputs_ = netoutput_op_->GetAllInputsSize(); 
GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource)); model_params_.memory_size = UINT_MAX; + model_params_.graph_is_dynamic = true; auto ge_model = model_helper_.GetGeModel(); GE_CHECK_NOTNULL(ge_model); - if (NeedHybridModel(ge_model)) { + bool need_hybrid_model = false; + GE_CHK_STATUS_RET(NeedHybridModel(ge_model, need_hybrid_model), "[Check][NeedHybridModel] failed."); + if (need_hybrid_model) { GELOGD("Build single op HybridModel."); GE_CHK_STATUS_RET_NOLOG(hybrid::NodeExecutorManager::GetInstance().EnsureInitialized()); auto root_model = model_helper_.GetGeRootModel(); @@ -512,16 +622,16 @@ Status SingleOpModel::BuildDynamicOp(StreamResource &resource, DynamicSingleOp & root_model->SetSubgraphInstanceNameToModel(root_model->GetRootGraph()->GetName(), ge_model); single_op.hybrid_model_.reset(new (std::nothrow)hybrid::HybridModel(root_model)); GE_CHECK_NOTNULL(single_op.hybrid_model_); - GE_CHK_STATUS_RET(single_op.hybrid_model_->Init(true), "Failed to init hybrid model"); + GE_CHK_STATUS_RET(single_op.hybrid_model_->Init(true), "[Init][HybridModel]Failed."); int32_t device_id = 0; GE_CHK_RT_RET(rtGetDevice(&device_id)); single_op.hybrid_model_executor_.reset(new (std::nothrow)hybrid::HybridModelExecutor(single_op.hybrid_model_.get(), device_id, resource.GetStream())); GE_CHECK_NOTNULL(single_op.hybrid_model_executor_); - GE_CHK_STATUS_RET(single_op.hybrid_model_executor_->Init(), "Failed to init hybrid model"); + GE_CHK_STATUS_RET(single_op.hybrid_model_executor_->Init(), "[Init][HybridModelExecutor]Failed."); return SUCCESS; } - return BuildTaskListForDynamicOp(single_op); + return BuildTaskListForDynamicOp(&resource, single_op); } } // namespace ge diff --git a/ge/single_op/single_op_model.h b/ge/single_op/single_op_model.h index b1a7d3ea..d900f09f 100755 --- a/ge/single_op/single_op_model.h +++ b/ge/single_op/single_op_model.h @@ -40,6 +40,7 @@ struct SingleOpModelParam { std::map addr_mapping_; int64_t core_type = 0; + bool graph_is_dynamic = false; }; class 
SingleOpModel { @@ -65,15 +66,17 @@ class SingleOpModel { void ParseOutputNode(const OpDescPtr &op_desc); Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op); - Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op); + Status BuildTaskListForDynamicOp(StreamResource *stream_resource, DynamicSingleOp &dynamic_single_op); Status BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask **task); Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, bool dynamic_flag, bool& depend_compute_flag, uint64_t kernel_id); Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task, uint64_t kernel_id); - Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op); + Status BuildModelTaskKernel(StreamResource *stream_resource, const domi::TaskDef &task_def, + DynamicSingleOp &single_op); static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam ¶m); void ParseArgTable(OpTask *task, SingleOp &op); + Status InitHybridModelExecutor(const StreamResource &resource, const GeModelPtr &ge_model, SingleOp &single_op); std::string model_name_; uint32_t model_id_ = 0; diff --git a/ge/single_op/stream_resource.cc b/ge/single_op/stream_resource.cc index 21d127ec..9fe8f26a 100755 --- a/ge/single_op/stream_resource.cc +++ b/ge/single_op/stream_resource.cc @@ -22,6 +22,11 @@ #include "single_op/single_op_model.h" namespace ge { +namespace { +// limit available device mem size 1M +const uint32_t kFuzzDeviceBufferSize = 1 * 1024 * 1024; +} + StreamResource::StreamResource(uintptr_t resource_id) : resource_id_(resource_id) { } @@ -29,16 +34,27 @@ StreamResource::~StreamResource() { for (auto mem : memory_list_) { if (mem != nullptr) { auto rt_ret = rtFree(mem); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtFree failed")); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "[Free][Rt] failed.")); } } for (auto weight : weight_list_) { if (weight != 
nullptr) { auto rt_ret = rtFree(weight); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtFree failed")); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "[Free][Rt] failed.")); } } + + if (device_buffer_ != nullptr) { + auto rt_ret = rtFree(device_buffer_); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "[Free][Rt] failed.")); + } +} + +Status StreamResource::Init() { + auto rt_ret = rtMalloc(&device_buffer_, kFuzzDeviceBufferSize, RT_MEMORY_HBM); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "[Malloc][Rt] failed.")); + return SUCCESS; } SingleOp *StreamResource::GetOperator(const uint64_t key) { @@ -95,16 +111,18 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose, uint8_t *buffer = nullptr; auto ret = rtMalloc(reinterpret_cast(&buffer), size, RT_MEMORY_HBM); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtMalloc failed, size = %zu, ret = %d", size, ret); + GELOGE(RT_FAILED, "[RtMalloc][Memory] failed, size = %zu, ret = %d", size, ret); + REPORT_INNER_ERROR("E19999", "rtMalloc failed, size = %zu, ret = %d.", size, ret); return nullptr; } GE_PRINT_DYNAMIC_MEMORY(rtMalloc, purpose.c_str(), size) ret = rtMemset(buffer, size, 0U, size); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtMemset failed, ret = %d", ret); + GELOGE(RT_FAILED, "[RtMemset][Memory] failed, ret = %d", ret); + REPORT_INNER_ERROR("E19999", "rtMemset failed, ret = %d.", ret); auto rt_ret = rtFree(buffer); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtFree failed")); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "[RtFree][Memory] failed")); return nullptr; } @@ -129,7 +147,8 @@ uint8_t *StreamResource::MallocWeight(const std::string &purpose, size_t size) { uint8_t *buffer = nullptr; auto ret = rtMalloc(reinterpret_cast(&buffer), size, RT_MEMORY_HBM); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtMalloc failed, size = %zu, ret = %d", size, ret); + GELOGE(RT_FAILED, 
"[RtMalloc][Memory] failed, size = %zu, ret = %d", size, ret); + REPORT_INNER_ERROR("E19999", "rtMalloc failed, size = %zu, ret = %d.", size, ret); return nullptr; } @@ -152,7 +171,8 @@ Status StreamResource::BuildDynamicOperator(const ModelData &model_data, SingleOpModel model(model_name, model_data.model_data, model_data.model_len); auto ret = model.Init(); if (ret != SUCCESS) { - GELOGE(ret, "Init model failed. model = %s, ret = %u", model_name.c_str(), ret); + GELOGE(ret, "[Init][SingleOpModel] failed. model = %s, ret = %u", model_name.c_str(), ret); + REPORT_CALL_ERROR("E19999", "SingleOpModel init failed, model = %s, ret = %u", model_name.c_str(), ret); return ret; } @@ -161,7 +181,7 @@ Status StreamResource::BuildDynamicOperator(const ModelData &model_data, GELOGI("To build operator: %s", model_name.c_str()); GE_CHK_STATUS_RET(model.BuildDynamicOp(*this, *new_op), - "Build op failed. op = %s, ret = %u", model_name.c_str(), ret); + "[Build][DynamicOp]failed. op = %s, ret = %u", model_name.c_str(), ret); *single_op = new_op.get(); dynamic_op_map_[model_id] = std::move(new_op); return SUCCESS; @@ -179,18 +199,20 @@ Status StreamResource::BuildOperator(const ModelData &model_data, SingleOp **sin SingleOpModel model(model_name, model_data.model_data, model_data.model_len); auto ret = model.Init(); if (ret != SUCCESS) { - GELOGE(ret, "Init model failed. model = %s, ret = %u", model_name.c_str(), ret); + GELOGE(ret, "[Init][SingleOpModel] failed. 
model = %s, ret = %u", model_name.c_str(), ret); + REPORT_CALL_ERROR("E19999", "SingleOpModel init failed, model = %s, ret = %u", model_name.c_str(), ret); return ret; } auto new_op = std::unique_ptr(new(std::nothrow) SingleOp(this, &stream_mu_, stream_)); if (new_op == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "new SingleOp failed"); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[New][SingleOp] failed."); + REPORT_CALL_ERROR("E19999", "new SingleOp failed."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } GELOGI("To build operator: %s", model_name.c_str()); - GE_CHK_STATUS_RET(model.BuildOp(*this, *new_op), "Build op failed. op = %s, ret = %u", model_name.c_str(), ret); + GE_CHK_STATUS_RET(model.BuildOp(*this, *new_op), "[Build][Op] failed. op = %s, ret = %u", model_name.c_str(), ret); *single_op = new_op.get(); op_map_[model_id] = std::move(new_op); diff --git a/ge/single_op/stream_resource.h b/ge/single_op/stream_resource.h index 73a6231b..aecb38c8 100755 --- a/ge/single_op/stream_resource.h +++ b/ge/single_op/stream_resource.h @@ -40,6 +40,7 @@ class StreamResource { rtStream_t GetStream() const; void SetStream(rtStream_t stream); + Status Init(); SingleOp *GetOperator(const uint64_t key); DynamicSingleOp *GetDynamicOperator(const uint64_t key); @@ -49,6 +50,9 @@ class StreamResource { uint8_t *MallocMemory(const std::string &purpose, size_t size, bool holding_lock = true); uint8_t *MallocWeight(const std::string &purpose, size_t size); const uint8_t *GetMemoryBase() const; + void *GetDeviceBufferAddr() const { + return device_buffer_; + } private: uint8_t *DoMallocMemory(const std::string &purpose, @@ -65,6 +69,7 @@ class StreamResource { rtStream_t stream_ = nullptr; std::mutex mu_; std::mutex stream_mu_; + void *device_buffer_ = nullptr; }; } // namespace ge diff --git a/ge/single_op/task/aicpu_kernel_task_builder.cc b/ge/single_op/task/aicpu_kernel_task_builder.cc index 6580ea31..18f13691 100755 --- a/ge/single_op/task/aicpu_kernel_task_builder.cc +++ 
b/ge/single_op/task/aicpu_kernel_task_builder.cc @@ -26,7 +26,8 @@ AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::Ker Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam ¶m) { size_t aicpu_arg_size = kernel_def_.args_size(); if (aicpu_arg_size <= sizeof(aicpu::AicpuParamHead)) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]aicpu_arg_size is invalid, value = %zu", aicpu_arg_size); + REPORT_INNER_ERROR("E19999", "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size); return ACL_ERROR_GE_PARAM_INVALID; } @@ -36,13 +37,15 @@ Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelP std::unique_ptr aicpu_args; aicpu_args.reset(new(std::nothrow) uint8_t[aicpu_arg_size]()); if (aicpu_args == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "malloc failed, size = %zu", aicpu_arg_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[New][Memory] failed, size = %zu", aicpu_arg_size); + REPORT_INNER_ERROR("E19999", "new Memory failed, size = %zu", aicpu_arg_size); return ACL_ERROR_GE_MEMORY_ALLOCATION; } auto err = memcpy_s(aicpu_args.get(), aicpu_arg_size, kernel_def_.args().data(), aicpu_arg_size); if (err != EOK) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "memcpy_s args failed, size = %zu, err = %d", aicpu_arg_size, err); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Memcpy_s][Args] failed, size = %zu, err = %d", aicpu_arg_size, err); + REPORT_INNER_ERROR("E19999", "memcpy_s aicpu_args failed, size = %zu, err = %d", aicpu_arg_size, err); return ACL_ERROR_GE_INTERNAL_ERROR; } @@ -76,9 +79,9 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, cons task.dump_flag_ |= RT_KERNEL_CUSTOM_AICPU; bool loaded = false; GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc_, so_name, loaded), - "launch cust aicpu so failed"); + "[Load][CustAicpuSo] failed."); if 
(!loaded) { - GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed."); + GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "[Launch][CustAicpuSo] failed."); } } @@ -89,18 +92,19 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, cons auto &kernel_ext_info = kernel_def_.kernel_ext_info(); auto kernel_ext_info_size = kernel_def_.kernel_ext_info_size(); GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, FAILED, - "task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", + "[Check][Size]task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", kernel_ext_info.size(), kernel_ext_info_size); ret = task.SetExtInfoAndType(kernel_ext_info, kernel_id); if (ret != SUCCESS) { - GELOGE(ret, "Init ext info failed."); + GELOGE(ret, "[Set][ExtInfoAndType]failed, kernel_id=%lu.", kernel_id); + REPORT_CALL_ERROR("E19999", "SetExtInfoAndType failed, kernel_id=%lu.", kernel_id); return ret; } - GE_CHK_STATUS_RET(task.SetInputConst(), "AiCpuCCTask set input_const failed."); + GE_CHK_STATUS_RET(task.SetInputConst(), "[Set][InputConst] failed."); if (task.GetUnknownType() == DEPEND_COMPUTE) { - GELOGE(FAILED, "AiCpuCCTask unknown type is depend compute, it's not supported now."); + GELOGE(FAILED, "[Get][UnknownType] is depend compute, it's not supported now."); return FAILED; } auto aicpu_param_head = reinterpret_cast(task.args_.get()); diff --git a/ge/single_op/task/aicpu_task_builder.cc b/ge/single_op/task/aicpu_task_builder.cc index a01ee0f0..805b1306 100755 --- a/ge/single_op/task/aicpu_task_builder.cc +++ b/ge/single_op/task/aicpu_task_builder.cc @@ -30,7 +30,8 @@ namespace ge { auto sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), kernel_def_.args().data(), kernel_def_.args().size()); if (sec_ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "memcpy failed, ret: %d", sec_ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, 
"[Memcpy_s][Param:fwk_op_kernel] failed, ret: %d", sec_ret); + REPORT_INNER_ERROR("E19999", "memcpy_s fwk_op_kernel failed, ret:%d.", sec_ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } @@ -45,7 +46,8 @@ namespace ge { void *fwk_op_args = nullptr; auto rt_ret = rtMalloc(&fwk_op_args, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "malloc arg memory failed, ret = %d", rt_ret); + GELOGE(rt_ret, "[RtMalloc][Memory] failed, ret = %d", rt_ret); + REPORT_INNER_ERROR("E19999", "rtMalloc Memory failed, ret = %d", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } @@ -53,7 +55,8 @@ namespace ge { sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { (void)rtFree(fwk_op_args); - GELOGE(rt_ret, "copy args failed, ret = %d", rt_ret); + GELOGE(rt_ret, "[rtMemcpy][Fwk_Op_Args] failed, ret = %d", rt_ret); + REPORT_INNER_ERROR("E19999", "rtMemcpy fwk_op_args failed, ret = %d", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } *args = fwk_op_args; @@ -62,8 +65,10 @@ namespace ge { Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag) { if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", - sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", + sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size()); + REPORT_INNER_ERROR("E19999", "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", + sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size()); return ACL_ERROR_GE_PARAM_INVALID; } GE_CHK_RT_RET(rtMalloc(&task.workspace_addr_, kernel_def_.task_info_size(), RT_MEMORY_HBM)); @@ -97,16 +102,16 @@ namespace ge { auto &kernel_ext_info = kernel_def_.kernel_ext_info(); auto kernel_ext_info_size = kernel_def_.kernel_ext_info_size(); 
GE_CHK_BOOL_RET_STATUS(kernel_ext_info.size() == kernel_ext_info_size, ACL_ERROR_GE_PARAM_INVALID, - "task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", + "[Check][Size]task def kernel_ext_info.size=%zu, but kernel_ext_info_size=%u.", kernel_ext_info.size(), kernel_ext_info_size); - GE_CHK_STATUS_RET(task.SetExtInfoAndType(kernel_ext_info, kernel_id), "Init ext info failed."); + GE_CHK_STATUS_RET(task.SetExtInfoAndType(kernel_ext_info, kernel_id), "[Set][ExtInfoAndType]failed."); if (task.ext_info_addr_dev_ != nullptr) { fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr = reinterpret_cast(task.ext_info_addr_dev_); fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoLen = kernel_ext_info_size; } - GE_CHK_STATUS_RET(task.SetInputConst(), "AiCpuTask set input_const failed."); - GE_CHK_STATUS_RET(task.InitForSummaryAndCopy(), "AiCpuTask init for summary and copy task failed."); + GE_CHK_STATUS_RET(task.SetInputConst(), "[Set][InputConst] failed."); + GE_CHK_STATUS_RET(task.InitForSummaryAndCopy(), "[Init][SummaryAndCopy] failed."); fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID = ULLONG_MAX; fwk_op_kernel.fwkKernelBase.fwk_kernel.kernelID = kernel_id; diff --git a/ge/single_op/task/op_task.cc b/ge/single_op/task/op_task.cc index f754af28..fbc3d68b 100755 --- a/ge/single_op/task/op_task.cc +++ b/ge/single_op/task/op_task.cc @@ -56,9 +56,11 @@ Status OpTask::OpenDump(rtStream_t stream) { size_t arg_num = 0; GetIoAddr(arg_base, arg_num); if (arg_num < input_size + output_size) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", - arg_num, - input_size + output_size); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, + "[Check][Size]io_addrs_for_dump_ size %zu is not equal input and output size %zu", + arg_num, input_size + output_size); + REPORT_INNER_ERROR("E19999", "io_addrs_for_dump_ size %zu is not equal input and output size %zu", + arg_num, input_size + output_size); return ACL_ERROR_GE_INTERNAL_ERROR; } @@ 
-74,7 +76,7 @@ Status OpTask::OpenDump(rtStream_t stream) { op_desc_, input_addrs, output_adds, stream); auto status = dump_op_.LaunchDumpOp(); if (status != SUCCESS) { - GELOGE(status, "Launch dump op failed in single op"); + GELOGE(status, "[Launch][DumpOp] failed in single op."); return status; } return SUCCESS; @@ -116,12 +118,13 @@ Status OpTask::GetProfilingArgs(TaskDescInfo &task_desc_info, uint32_t &model_id uint32_t stream_id = 0; auto rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Get task_id and stream_id failed ret: 0x%X.", rt_ret); + GELOGE(RT_FAILED, "[Get][TaskIdAndStreamID] failed, ret: 0x%X.", rt_ret); + REPORT_CALL_ERROR("E19999", "rtGetTaskIdAndStreamID failed, ret: 0x%X.", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } GE_CHECK_NOTNULL(op_desc_); string op_name = op_desc_->GetName(); - GELOGD("Get profiling args of op [%s] end, task_id[%u], stream_id[%u]", op_name.c_str(), task_id, stream_id); + GELOGD("Get profiling args of op [%s] end, task_id[%u], stream_id[%u].", op_name.c_str(), task_id, stream_id); model_id = model_id_; task_desc_info.model_name = model_name_; task_desc_info.block_dim = block_dim_; @@ -134,7 +137,7 @@ Status OpTask::GetProfilingArgs(TaskDescInfo &task_desc_info, uint32_t &model_id return SUCCESS; } -Status OpTask::UpdateRunInfo(const vector &input_desc, const vector &output_desc) { +Status OpTask::UpdateRunInfo() { return UNSUPPORTED; } @@ -145,10 +148,11 @@ Status OpTask::DoUpdateArgTable(const SingleOpModelParam ¶m, bool keep_works size_t arg_num = 0; GetIoAddr(arg_base, arg_num); if (arg_num < all_addresses.size()) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[%s] arg number mismatches, expect at least = %zu, but got = %zu", - op_desc_->GetName().c_str(), - all_addresses.size(), - arg_num); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, + "[Check][Size][%s] arg number mismatches, expect at least = %zu, but got = %zu.", + op_desc_->GetName().c_str(), all_addresses.size(), 
arg_num); + REPORT_INNER_ERROR("E19999", "%s arg number mismatches, expect at least = %zu, but got = %zu.", + op_desc_->GetName().c_str(), all_addresses.size(), arg_num); return ACL_ERROR_GE_INTERNAL_ERROR; } @@ -196,18 +200,19 @@ void TbeOpTask::SetHandle(void *handle) { Status TbeOpTask::LaunchKernel(rtStream_t stream) { GELOGD("To invoke rtKernelLaunch. task = %s, block_dim = %u", this->stub_name_.c_str(), block_dim_); - auto *sm_desc = reinterpret_cast(sm_desc_); - auto ret = rtKernelLaunch(stub_func_, block_dim_, args_.get(), static_cast(arg_size_), sm_desc, stream); + auto ret = DoLaunchKernel(stream); + int retry_times = 0; while (ret != RT_ERROR_NONE && retry_times < kLaunchRetryTimes) { retry_times++; GELOGW("Retry after %d ms, retry_times: %d", kSleepTime, retry_times); std::this_thread::sleep_for(std::chrono::milliseconds(kSleepTime)); - ret = rtKernelLaunch(stub_func_, block_dim_, args_.get(), arg_size_, sm_desc, stream); + ret = DoLaunchKernel(stream); } if (ret != RT_ERROR_NONE) { - GELOGE(ret, "Invoke rtKernelLaunch failed. ret = %d, task = %s", ret, this->stub_name_.c_str()); + GELOGE(ret, "[Invoke][RtKernelLaunch] failed. ret = %d, task = %s", ret, this->stub_name_.c_str()); + REPORT_INNER_ERROR("E19999", "invoke rtKernelLaunch failed, ret = %d, task = %s", ret, this->stub_name_.c_str()); return RT_ERROR_TO_GE_STATUS(ret); } GELOGI("[TASK_INFO] %s", this->stub_name_.c_str()); @@ -215,24 +220,23 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) { return SUCCESS; } -Status TbeOpTask::UpdateRunInfo(const vector &input_desc, const vector &output_desc) { - GE_CHK_STATUS_RET_NOLOG(UpdateNodeByShape(input_desc, output_desc)); +Status TbeOpTask::UpdateRunInfo() { // invoke OpParaCalculate GELOGD("Start to invoke OpParaCalculate."); optiling::OpRunInfo run_info; run_info.block_dim = 0; auto ret = optiling::OpParaCalculate(*node_, run_info); if (ret != GRAPH_SUCCESS) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Failed to invoke OpParaCalculate. 
ret = %u", ret); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Invoke][OpParaCalculate] failed, ret = %u.", ret); + REPORT_INNER_ERROR("E19999", "invoke OpParaCalculate failed, ret = %u.", ret); return ACL_ERROR_GE_INTERNAL_ERROR; } block_dim_ = run_info.block_dim; tiling_data_ = run_info.tiling_data.str(); tiling_key_ = run_info.tiling_key; + run_info_workspaces_ = run_info.workspaces; GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu, tiling_key = %u", block_dim_, tiling_data_.size(), tiling_key_); - - GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces"); return SUCCESS; } @@ -248,7 +252,7 @@ Status TbeOpTask::UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc } else { std::vector storage_shape; if (!AttrUtils::GetListInt(src_tensor, ge::ATTR_NAME_STORAGE_SHAPE, storage_shape)) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Failed to get storage_shape while storage_format was set"); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][ListInt]failed while storage_format was set."); return ACL_ERROR_GE_INTERNAL_ERROR; } @@ -282,14 +286,33 @@ Status TbeOpTask::UpdateNodeByShape(const vector &input_desc, cons return SUCCESS; } -void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size) { +Status TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, uint32_t max_tiling_size) { + if (tiling_buffer != nullptr) { + uintptr_t *arg_base = nullptr; + size_t arg_num = 0; + GetIoAddr(arg_base, arg_num); + GE_CHECK_NOTNULL(node); + GE_CHECK_NOTNULL(node->GetOpDesc()); + uint32_t inputs_num = node->GetOpDesc()->GetInputsSize(); + uint32_t outputs_num = node->GetOpDesc()->GetOutputsSize(); + uint32_t workspace_nums = node->GetOpDesc()->GetWorkspace().size(); + uint32_t tiling_index = inputs_num + outputs_num + workspace_nums; + if (arg_num == 0 || arg_num < tiling_index) { + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Size]Tiling index %u, arg 
number %zu is invalid.", + tiling_index, arg_num); + return ACL_ERROR_GE_INTERNAL_ERROR; + } + arg_base[tiling_index] = reinterpret_cast(tiling_buffer); + } node_ = node; tiling_buffer_ = tiling_buffer; max_tiling_size_ = max_tiling_size; + return SUCCESS; } Status TbeOpTask::AllocateWorkspaces(const vector &workspace_sizes) { static const std::string kPurpose("malloc workspace memory for dynamic op."); + workspaces_.clear(); if (workspace_sizes.empty()) { GELOGD("No need to allocate workspace."); return SUCCESS; @@ -309,7 +332,8 @@ Status TbeOpTask::AllocateWorkspaces(const vector &workspace_sizes) { GE_CHECK_NOTNULL(stream_resource_); auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast(total_size)); if (ws_base == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Malloc][Memory] failed, size: %ld", total_size); + REPORT_INNER_ERROR("E19999", "MallocMemory failed, size: %ld", total_size); return ACL_ERROR_GE_MEMORY_ALLOCATION; } GELOGD("Done allocating workspace memory successfully."); @@ -326,8 +350,10 @@ Status TbeOpTask::LaunchKernel(const vector &input_desc, vector &output_desc, vector &output_buffers, rtStream_t stream) { - GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo(input_desc, output_desc)); GELOGD("[%s] Start to launch kernel", node_->GetName().c_str()); + GE_CHK_STATUS_RET_NOLOG(UpdateNodeByShape(input_desc, output_desc)); + GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo()); + GE_CHK_STATUS_RET(AllocateWorkspaces(run_info_workspaces_), "[Allocate][Workspaces] failed."); std::vector args; for (auto &buffer : input_buffers) { args.emplace_back(buffer.data); @@ -347,24 +373,38 @@ Status TbeOpTask::LaunchKernel(const vector &input_desc, args.emplace_back(tiling_buffer_); } + GELOGD("Dst size is %zu, src size is %zu.", arg_size_, args.size() * sizeof(void *)); + // node with workspace: build can not get size of workspace, need to update arg_size_ when execute + 
if (arg_size_ < (args.size() * sizeof(void *))) { + size_t temp_size = args.size() * sizeof(void *); + GELOGD("Need to reset size of args_ from %zu to %zu.", arg_size_, temp_size); + args_.reset(new(std::nothrow) uint8_t[temp_size]()); + GE_CHECK_NOTNULL(args_); + arg_size_ = temp_size; + } if (memcpy_s(args_.get(), arg_size_, args.data(), args.size() * sizeof(void *)) != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[%s] Failed to update kernel args.", - node_->GetName().c_str()); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][KernelArgs] failed for [%s].", node_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "update kernel args failed for %s.", node_->GetName().c_str()); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } GELOGD("[%s] Start to invoke rtKernelLaunch", node_->GetName().c_str()); + GE_CHK_STATUS_RET(DoLaunchKernel(stream), "Failed to do launch kernel."); + + return SUCCESS; +} + +Status TbeOpTask::DoLaunchKernel(rtStream_t stream) { + auto *sm_desc = reinterpret_cast(sm_desc_); if (handle_ == nullptr) { - GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), arg_size_, nullptr, stream)); - GELOGD("[%s] Done invoking rtKernelLaunch successfully", node_->GetName().c_str()); + GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), static_cast(arg_size_), + sm_desc, stream)); } else { std::string dev_func = original_kernel_key_ + "_" + std::to_string(tiling_key_); std::string kernel_info = node_info_ + "/" + std::to_string(tiling_key_); - GE_CHK_RT_RET(rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, args_.get(), arg_size_, nullptr, - stream, kernel_info.c_str())); - GELOGD("[%s] Done invoking rtKernelLaunchWithHandle successfully", node_->GetName().c_str()); + GE_CHK_RT_RET(rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, args_.get(), + static_cast(arg_size_), sm_desc, stream, kernel_info.c_str())); } - return SUCCESS; } @@ -398,17 +438,19 @@ Status AiCpuBaseTask::SetExtInfoAndType(const 
std::string &kernel_ext_info, uint num_outputs_, unknown_type_)); GE_CHK_BOOL_RET_STATUS(aicpu_ext_handle_ != nullptr, ACL_ERROR_GE_MEMORY_ALLOCATION, - "Malloc aicpu_ext_handle mem failed!"); + "[Malloc][Memory] failed for aicpu_ext_handle!"); Status ret = aicpu_ext_handle_->Parse(kernel_ext_info); if (ret != SUCCESS) { - GELOGE(ret, "Parse kernel ext info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size()); + GELOGE(ret, "[Parse][Param:kernel_ext_info] failed, kernel_ext_info_size=%zu.", kernel_ext_info.size()); + REPORT_INNER_ERROR("E19999", + "Parse Param:kernel_ext_info failed, kernel_ext_info_size=%zu.", kernel_ext_info.size()); return ret; } GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateSessionInfo(ULLONG_MAX, kernel_id, false), - "UpdateSessionInfo failed."); - GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(true), "UpdateExecuteMode failed."); + "[Update][SessionInfo] failed."); + GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(true), "[Update][ExecuteMode] failed."); GE_CHK_RT_RET(rtMalloc(&ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(), RT_MEMORY_HBM)); GE_CHK_RT_RET(rtMemcpy(ext_info_addr_dev_, aicpu_ext_handle_->GetExtInfoLen(), @@ -441,7 +483,7 @@ Status AiCpuBaseTask::UpdateExtInfo(const std::vector &input_desc, rtStream_t stream) { GELOGI("Update ext info begin, unknown_type=%d.", unknown_type_); GE_CHECK_NOTNULL(aicpu_ext_handle_); - GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(false), "UpdateExecuteMode failed."); + GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateExecuteMode(false), "[Update][ExecuteMode] failed."); if (num_inputs_ == 0 && num_outputs_ == 0) { GELOGI("No input and output, no need update ext info."); @@ -455,21 +497,28 @@ Status AiCpuBaseTask::UpdateExtInfo(const std::vector &input_desc, auto const_input_desc = op_desc_->MutableInputDesc(static_cast(input_index)); GE_CHECK_NOTNULL(const_input_desc); GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, *const_input_desc), - 
"Input[%zu] update input shape failed.", input_index); + "[Update][InputShapeAndType] failed, input_index:%zu.", input_index); continue; } GE_CHK_BOOL_RET_STATUS(non_const_index < input_desc.size(), ACL_ERROR_GE_PARAM_INVALID, - "Input_desc size is %zu, but get non_const_index is %zu", - input_desc.size(), non_const_index); + "[Check][Size]Input_desc size is %zu, but get non_const_index is %zu", input_desc.size(), non_const_index); GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, input_desc[non_const_index]), - "Input[%zu] update input shape failed.", input_index); + "[Update][InputShapeAndType]failed, input_index:%zu.", input_index); + if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) { + GE_CHK_STATUS_RET(op_desc_->UpdateInputDesc(input_index, input_desc[non_const_index]), + "AiCpuTask Update [%zu]th input desc failed.",input_index); + } non_const_index++; } if (unknown_type_ != DEPEND_COMPUTE) { for (size_t j = 0; j < num_outputs_; ++j) { - GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]), - "Output[%zu] UpdateOutputShapeAndType failed.", j); + GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateOutputShapeAndType(j, output_desc[j]), + "[Update][OutputShapeAndType] failed, Output:%zu.", j); + if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) { + GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(j, output_desc[j]), + "AiCpuTask Update [%zu]th output desc failed.",j); + } } } @@ -498,11 +547,10 @@ Status AiCpuBaseTask::UpdateOutputShape(vector &output_desc) { GeShape shape; DataType data_type; aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type); - GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), "AiCpuCCTask Update [%zu]th output shape failed.", - i); + GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), + "[Update][ShapeToOutputDesc] failed, output:%zu.", i); if 
(DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) { - GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "AiCpuCCTask Update [%zu]th output desc failed.", - i); + GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "[Update][OutputDesc] failed, output:%zu.", i); } } GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape finished."); @@ -527,7 +575,7 @@ Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensor auto trans_ret = formats::TransShape(format, shape_new.GetDims(), output_desc.GetDataType(), origin_format, origin_dims_new); GE_CHK_STATUS_RET(trans_ret, - "AiCpuTask originFormat[%d] is not same as format[%d], but TransShape failed, shape=%s.", + "[Trans][Shape] failed, AiCpuTask originFormat[%d] is not same as format[%d], shape=%s.", origin_format, format, shape_new.ToString().c_str()); auto origin_shape_new = GeShape(origin_dims_new); @@ -553,8 +601,7 @@ Status AiCpuBaseTask::UpdateIoAddr(const vector &inputs, const vecto continue; } GE_CHK_BOOL_RET_STATUS(non_const_index < inputs.size(), ACL_ERROR_GE_PARAM_INVALID, - "Input size is %zu, but get non_const_index is %zu", - inputs.size(), non_const_index); + "[Check][Size] Input size is %zu, but get non_const_index is %zu", inputs.size(), non_const_index); auto addr = inputs[non_const_index].data; GE_CHECK_NOTNULL(addr); GELOGD("AICpuTask input[%zu] addr = %p", input_index, addr); @@ -602,14 +649,16 @@ Status AiCpuTask::LaunchKernel(rtStream_t stream) { RT_MEMCPY_HOST_TO_DEVICE_EX, stream); if (ret != RT_ERROR_NONE) { - GELOGE(ret, "rtMemcpyAsync workspace data failed. ret = %d, task = %s", ret, this->op_type_.c_str()); + GELOGE(ret, "[MemcpyAsync][Date] failed. ret = %d, task = %s", ret, this->op_type_.c_str()); + REPORT_CALL_ERROR("E19999", "rtMemcpyAsync data failed, ret = %d, task = %s", ret, this->op_type_.c_str()); return RT_ERROR_TO_GE_STATUS(ret); } GELOGI("To invoke rtKernelLaunchEx. 
task = %s", this->op_type_.c_str()); ret = rtKernelLaunchEx(args_, arg_size_, 0, stream); if (ret != RT_ERROR_NONE) { - GELOGE(ret, "Invoke rtKernelLaunch failed. ret = %d, task = %s", ret, this->op_type_.c_str()); + GELOGE(ret, "[Invoke][rtKernelLaunch] failed. ret = %d, task = %s", ret, this->op_type_.c_str()); + REPORT_CALL_ERROR("E19999", "invoke rtKernelLaunchEx failed, ret = %d, task = %s", ret, this->op_type_.c_str()); return RT_ERROR_TO_GE_STATUS(ret); } GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str()); @@ -706,10 +755,9 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector &output_desc) { } GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]), - "AiCpuTask update [%zu]th output shape failed.", i); + "[Update][ShapeToOutputDesc] failed , output:%zu.", i); if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) { - GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "AiCpuTask update [%zu]th output desc failed.", - i); + GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "[Update][OutputDesc] failed, output:%zu.", i); } } return SUCCESS; @@ -731,13 +779,13 @@ Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector &output } out_shape_hbm_.clear(); GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(), - "Read ResultSummary and update output shape failed."); + "[Read][ResultSummaryAndPrepareMemory] failed."); GE_CHK_STATUS_RET(CopyDataToHbm(outputs, stream), - "Copy data to output failed."); + "[Copy][DataToHbm] failed."); GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc), - "Update shape by hbm buffer failed."); + "[Update][ShapeByHbmBuffer] failed."); for (auto out_shape : out_shape_hbm_) { FreeHbm(out_shape); @@ -787,8 +835,10 @@ Status AiCpuTask::InitForSummaryAndCopy() { Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) { if (kernel_def.args_size() > sizeof(STR_FWK_OP_KERNEL)) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof 
STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", - sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", + sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size()); + REPORT_INNER_ERROR("E19999", "[sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", + sizeof(STR_FWK_OP_KERNEL), kernel_def.args_size()); return ACL_ERROR_GE_PARAM_INVALID; } GE_CHK_RT_RET(rtMalloc(©_workspace_buf_, kernel_def.task_info_size(), RT_MEMORY_HBM)); @@ -799,7 +849,8 @@ Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) { auto sec_ret = memcpy_s(&aicpu_task, sizeof(STR_FWK_OP_KERNEL), kernel_def.args().data(), kernel_def.args().size()); if (sec_ret != EOK) { - GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "memcpy failed, ret: %d", sec_ret); + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][TaskArgs] failed, ret: %d", sec_ret); + REPORT_INNER_ERROR("E19999", "update STR_FWK_OP_KERNEL args failed because memcpy_s return %d.", sec_ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } @@ -883,7 +934,8 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) { block_dim_, args_.get(), static_cast(arg_size_), sm_desc, stream, dump_flag_); if (ret != RT_ERROR_NONE) { - GELOGE(ret, "Invoke rtCpuKernelLaunch failed. ret = %d", ret); + GELOGE(ret, "[Invoke][rtCpuKernelLaunchWithFlag] failed. 
ret = %d.", ret); + REPORT_CALL_ERROR("E19999", "invoke rtCpuKernelLaunchWithFlag failed, ret:%d.", ret); return RT_ERROR_TO_GE_STATUS(ret); } GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str()); diff --git a/ge/single_op/task/op_task.h b/ge/single_op/task/op_task.h index 8c91bd5f..0c64ecb4 100644 --- a/ge/single_op/task/op_task.h +++ b/ge/single_op/task/op_task.h @@ -30,6 +30,7 @@ #include "cce/aicpu_engine_struct.h" #include "hybrid/node_executor/aicpu/aicpu_ext_info.h" #include "init/gelib.h" +#include "register/op_tiling.h" namespace ge { class StreamResource; @@ -39,8 +40,7 @@ class OpTask { OpTask() = default; virtual ~OpTask() = default; virtual Status LaunchKernel(rtStream_t stream) = 0; - virtual Status UpdateRunInfo(const vector &input_desc, - const vector &output_desc); + virtual Status UpdateRunInfo(); virtual Status UpdateArgTable(const SingleOpModelParam ¶m); void SetModelArgs(std::string model_name, uint32_t model_id); Status GetProfilingArgs(TaskDescInfo &task_desc_info, uint32_t &model_id); @@ -81,22 +81,23 @@ class TbeOpTask : public OpTask { void SetKernelWithHandleArgs(std::unique_ptr &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc, const domi::KernelDefWithHandle& kernel_def_with_handle); - Status UpdateRunInfo(const vector &input_desc, - const vector &output_desc) override; + Status UpdateRunInfo() override; const void *GetArgs() const; size_t GetArgSize() const; const std::string &GetStubName() const; - void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size); + Status EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, uint32_t max_tiling_size); const std::string &GetTaskType() const override; void SetHandle(void *handle); private: friend class SingleOpModel; + friend class TbeTaskBuilder; static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor); Status UpdateNodeByShape(const vector &input_desc, const vector &output_desc); Status 
AllocateWorkspaces(const std::vector &workspace_sizes); + Status DoLaunchKernel(rtStream_t stream); const void *stub_func_ = nullptr; std::unique_ptr args_; @@ -108,6 +109,7 @@ class TbeOpTask : public OpTask { void *tiling_buffer_ = nullptr; uint32_t max_tiling_size_ = 0; std::string tiling_data_; + std::vector run_info_workspaces_; std::vector workspaces_; NodePtr node_; diff --git a/ge/single_op/task/tbe_task_builder.cc b/ge/single_op/task/tbe_task_builder.cc index 606f8087..c7ff13d1 100644 --- a/ge/single_op/task/tbe_task_builder.cc +++ b/ge/single_op/task/tbe_task_builder.cc @@ -112,8 +112,10 @@ Status TbeTaskBuilder::DoRegisterBinary(const OpKernelBin &kernel_bin, void **bi ret = rtDevBinaryRegister(&binary, bin_handle); } if (ret != RT_ERROR_NONE) { - GELOGE(ret, "DoRegisterBinary failed, bin key = %s, core_type = %ld, rt ret = %d", stub_name_.c_str(), - param.core_type, static_cast(ret)); + GELOGE(ret, "[DoRegister][Binary] failed, bin key = %s, core_type = %ld, rt ret = %d", stub_name_.c_str(), + param.core_type, static_cast(ret)); + REPORT_CALL_ERROR("E19999", "DoRegisterBinary failed, bin key = %s, core_type = %ld, rt ret = %d", + stub_name_.c_str(), param.core_type, static_cast(ret)); return ret; } @@ -127,8 +129,10 @@ Status TbeTaskBuilder::DoRegisterMeta(void *bin_handle) { if (!meta_data.empty()) { auto rt_ret = rtMetadataRegister(bin_handle, meta_data.c_str()); if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMetadataRegister failed. bin key = %s, meta_data = %s, rt ret = %d", stub_name_.c_str(), - meta_data.c_str(), static_cast(rt_ret)); + GELOGE(rt_ret, "[Invoke][rtMetadataRegister] failed. 
bin key = %s, meta_data = %s, rt ret = %d", + stub_name_.c_str(), meta_data.c_str(), static_cast(rt_ret)); + REPORT_CALL_ERROR("E19999", "rtMetadataRegister failed, bin key = %s, meta_data = %s, rt ret = %d", + stub_name_.c_str(), meta_data.c_str(), static_cast(rt_ret)); return rt_ret; } } @@ -139,8 +143,10 @@ Status TbeTaskBuilder::DoRegisterMeta(void *bin_handle) { Status TbeTaskBuilder::DoRegisterFunction(void *bin_handle, const char *stub_name, const char *kernel_name) { auto rt_ret = rtFunctionRegister(bin_handle, stub_name, stub_name, kernel_name, FUNC_MODE_NORMAL); if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtFunctionRegister failed. bin key = %s, kernel name = %s, rt ret = %d", stub_name, kernel_name, - static_cast(rt_ret)); + GELOGE(rt_ret, "[Invoke][rtFunctionRegister] failed. bin key = %s, kernel name = %s, rt ret = %d", + stub_name, kernel_name, static_cast(rt_ret)); + REPORT_CALL_ERROR("E19999", "rtFunctionRegister failed. bin key = %s, kernel name = %s, rt ret = %d", + stub_name, kernel_name, static_cast(rt_ret)); return rt_ret; } @@ -197,27 +203,32 @@ Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task, const SingleOpModelParam auto tbe_kernel = GetTbeKernel(op_desc_); if (tbe_kernel == nullptr) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "OP EXT ATTR NAME TBE_KERNEL not found. op = %s", - op_desc_->GetName().c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][TbeKernel] fail for OP EXT ATTR NAME TBE_KERNEL not found. op = %s", + op_desc_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "GetTbeKernel fail for OP EXT ATTR NAME TBE_KERNEL not found. 
op = %s", + op_desc_->GetName().c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } auto holder = std::unique_ptr(new (std::nothrow) KernelHolder(stub_func, tbe_kernel)); if (holder == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "create KernelHodler failed."); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][KernelHodler] failed."); + REPORT_INNER_ERROR("E19999", "Create KernelHodler failed."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } void *bin_handle = nullptr; auto ret = DoRegisterKernel(*tbe_kernel, stub_func, &bin_handle, param); if (ret != SUCCESS) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "RegisterKernel failed. stub name = %s", stub_name_.c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Register][Kernel] failed. stub name = %s", stub_name_.c_str()); + REPORT_CALL_ERROR("E19999", "DoRegisterKernel failed, stub name = %s", stub_name_.c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } holder->SetBinHandle(bin_handle); if (!registry.AddKernel(stub_name_, std::move(holder))) { // should not happen. only one thread can reach here - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Add kernel failed. stub name = %s", stub_name_.c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Add][Kernel] failed. stub name = %s", stub_name_.c_str()); + REPORT_CALL_ERROR("E19999", "AddKernel failed. stub name = %s", stub_name_.c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } } @@ -231,24 +242,29 @@ Status TbeTaskBuilder::RegisterKernelWithHandle(TbeOpTask &task, const SingleOpM HandleRegistry ®istry = HandleRegistry::GetInstance(); auto tbe_kernel = GetTbeKernel(op_desc_); if (tbe_kernel == nullptr) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "OP EXT ATTR NAME TBE_KERNEL not found. op = %s", - op_desc_->GetName().c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][TbeKernel] fail for OP EXT ATTR NAME TBE_KERNEL not found. op = %s", + op_desc_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "GetTbeKernel fail for OP EXT ATTR NAME TBE_KERNEL not found. 
op = %s", + op_desc_->GetName().c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } void *bin_handle = nullptr; auto ret = DoRegisterKernel(*tbe_kernel, nullptr, &bin_handle, param); if (ret != SUCCESS) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "RegisterKernel failed. node name = %s", op_desc_->GetName().c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Register][Kernel] failed. node name = %s", op_desc_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "DoRegisterKernel failed, node name = %s", op_desc_->GetName().c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } handle_ = bin_handle; auto holder = std::unique_ptr(new (std::nothrow) HandleHolder(handle_)); if (holder == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "create HandleHodler failed."); + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][HandleHolder] failed."); + REPORT_INNER_ERROR("E19999", "Create HandleHolder failed."); return ACL_ERROR_GE_MEMORY_ALLOCATION; } if (!registry.AddHandle(std::move(holder))) { - GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "Add handle failed. node name = %s", op_desc_->GetName().c_str()); + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Add][Handle] failed. 
node name = %s", op_desc_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "AddHandle failed, node name = %s", op_desc_->GetName().c_str()); return ACL_ERROR_GE_INTERNAL_ERROR; } @@ -274,14 +290,16 @@ Status TbeTaskBuilder::GetSmDesc(void **sm_desc, const SingleOpModelParam ¶m auto rt_ret = rtMemAllocManaged(sm_desc, sm_desc_str.size(), RT_MEMORY_SPM); if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMemAllocManaged failed, ret: %d", static_cast(rt_ret)); + GELOGE(rt_ret, "[Invoke][rtMemAllocManaged] failed, ret: %d.", static_cast(rt_ret)); + REPORT_CALL_ERROR("E19999", "rtMemAllocManaged failed, ret: %d.", static_cast(rt_ret)); return rt_ret; } rt_ret = rtMemcpy(*sm_desc, sm_desc_str.size(), sm_desc_str.data(), sm_desc_str.size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { (void)rtMemFreeManaged(*sm_desc); - GELOGE(rt_ret, "rtMemcpy, ret: %d", static_cast(rt_ret)); + GELOGE(rt_ret, "[Update][Param:sm_desc] fail for rtMemcpy return: %d.", static_cast(rt_ret)); + REPORT_INNER_ERROR("E19999", "rtMemcpy failed, ret:%d.", static_cast(rt_ret)); return rt_ret; } } @@ -290,86 +308,65 @@ Status TbeTaskBuilder::GetSmDesc(void **sm_desc, const SingleOpModelParam ¶m } Status TbeTaskBuilder::SetKernelArgs(TbeOpTask &task, const SingleOpModelParam ¶m, const OpDescPtr &op_desc) { - size_t arg_size = kernel_def_.args_size(); - auto args = std::unique_ptr(new (std::nothrow) uint8_t[arg_size]); - GE_CHECK_NOTNULL(args); - - auto rt_ret = rtMemcpy(args.get(), arg_size, kernel_def_.args().data(), arg_size, RT_MEMCPY_HOST_TO_HOST); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMemcpy args failed, size = %zu, ret = %d", arg_size, static_cast(rt_ret)); - return RT_ERROR_TO_GE_STATUS(rt_ret); + auto task_type = static_cast(task_def_.type()); + bool is_task_all_kernel = (task_type == RT_MODEL_TASK_ALL_KERNEL); + size_t arg_size = 0; + std::unique_ptr args = nullptr; + if (is_task_all_kernel) { + GELOGD("SetKernelArgs of %s in branch of RT_MODEL_TASK_ALL_KERNEL.", 
op_desc->GetName().c_str()); + arg_size = kernel_def_with_handle_.args_size(); + args = std::unique_ptr(new (std::nothrow) uint8_t[arg_size]); + GE_CHECK_NOTNULL(args); + GE_CHK_RT_RET(rtMemcpy(args.get(), arg_size, kernel_def_with_handle_.args().data(), arg_size, + RT_MEMCPY_HOST_TO_HOST)) + } else { + GELOGD("SetKernelArgs of %s in branch of RT_MODEL_TASK_KERNEL.", op_desc->GetName().c_str()); + arg_size = kernel_def_.args_size(); + args = std::unique_ptr(new (std::nothrow) uint8_t[arg_size]); + GE_CHECK_NOTNULL(args); + GE_CHK_RT_RET(rtMemcpy(args.get(), arg_size, kernel_def_.args().data(), arg_size, RT_MEMCPY_HOST_TO_HOST)) } - const domi::KernelContext &context = kernel_def_.context(); + const domi::KernelContext &context = task_type == RT_MODEL_TASK_ALL_KERNEL ? + kernel_def_with_handle_.context() : kernel_def_.context(); const auto *args_offset_tmp = reinterpret_cast(context.args_offset().data()); uint16_t offset = *args_offset_tmp; - bool is_dynamic = false; - (void)AttrUtils::GetBool(op_desc_, kAttrSupportDynamicShape, is_dynamic); - if (is_dynamic) { - GE_CHK_STATUS_RET_NOLOG(InitTilingInfo(task)); - } else { - // copy args - std::vector tensor_device_addr_vec = BuildTaskUtils::GetKernelArgs(op_desc_, param); - void *src_addr = reinterpret_cast(tensor_device_addr_vec.data()); - uint64_t src_len = sizeof(void *) * tensor_device_addr_vec.size(); - rt_ret = rtMemcpy(args.get() + offset, arg_size - offset, src_addr, src_len, RT_MEMCPY_HOST_TO_HOST); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", static_cast(rt_ret)); - return RT_ERROR_TO_GE_STATUS(rt_ret); - } - } - task.SetKernelArgs(std::move(args), arg_size, kernel_def_.block_dim(), op_desc); + // copy args + std::vector tensor_device_addr_vec = BuildTaskUtils::GetKernelArgs(op_desc_, param); + void *src_addr = reinterpret_cast(tensor_device_addr_vec.data()); + uint64_t src_len = sizeof(void *) * tensor_device_addr_vec.size(); + GE_CHK_RT_RET(rtMemcpy(args.get() + 
offset, arg_size - offset, src_addr, src_len, RT_MEMCPY_HOST_TO_HOST)); - return SUCCESS; -} - -Status TbeTaskBuilder::SetKernelWithHandleArgs(TbeOpTask &task, const SingleOpModelParam ¶m, - const OpDescPtr &op_desc) { - size_t arg_size = kernel_def_with_handle_.args_size(); - auto args = std::unique_ptr(new (std::nothrow) uint8_t[arg_size]); - GE_CHECK_NOTNULL(args); - - auto rt_ret = rtMemcpy(args.get(), arg_size, kernel_def_with_handle_.args().data(), arg_size, RT_MEMCPY_HOST_TO_HOST); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMemcpy args failed, size = %zu, ret = %d", arg_size, static_cast(rt_ret)); - return rt_ret; + if (is_task_all_kernel) { + task.SetKernelWithHandleArgs(std::move(args), arg_size, kernel_def_with_handle_.block_dim(), op_desc, + kernel_def_with_handle_); + } else { + task.SetKernelArgs(std::move(args), arg_size, kernel_def_.block_dim(), op_desc); } - const domi::KernelContext &context = kernel_def_with_handle_.context(); - const auto *args_offset_tmp = reinterpret_cast(context.args_offset().data()); - uint16_t offset = *args_offset_tmp; - bool is_dynamic = false; (void)AttrUtils::GetBool(op_desc_, kAttrSupportDynamicShape, is_dynamic); if (is_dynamic) { GE_CHK_STATUS_RET_NOLOG(InitTilingInfo(task)); - } else { - // copy args - std::vector tensor_device_addr_vec = BuildTaskUtils::GetKernelArgs(op_desc_, param); - void *src_addr = reinterpret_cast(tensor_device_addr_vec.data()); - uint64_t src_len = sizeof(void *) * tensor_device_addr_vec.size(); - rt_ret = rtMemcpy(args.get() + offset, arg_size - offset, src_addr, src_len, RT_MEMCPY_HOST_TO_HOST); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", static_cast(rt_ret)); - return rt_ret; + if (!param.graph_is_dynamic && task.tiling_buffer_ != nullptr) { + GELOGD("Need to update run info when graph is static with dynamic node: %s.", op_desc->GetName().c_str()); + task.UpdateRunInfo(); + GE_CHK_RT_RET(rtMemcpy(task.tiling_buffer_, 
task.max_tiling_size_, task.tiling_data_.data(), + task.tiling_data_.size(), RT_MEMCPY_HOST_TO_DEVICE)); } } - task.SetKernelWithHandleArgs(std::move(args), arg_size, kernel_def_with_handle_.block_dim(), op_desc, - kernel_def_with_handle_); - return SUCCESS; } Status TbeTaskBuilder::BuildTask(TbeOpTask &task, const SingleOpModelParam ¶m) { GELOGD("Build tbe task begin"); - auto task_type = static_cast(task_def_.type()); - auto ret = task_type == RT_MODEL_TASK_ALL_KERNEL ? SetKernelWithHandleArgs(task, param, op_desc_) : - SetKernelArgs(task, param, op_desc_); + auto ret = SetKernelArgs(task, param, op_desc_); if (ret != SUCCESS) { return ret; } + auto task_type = static_cast(task_def_.type()); ret = task_type == RT_MODEL_TASK_ALL_KERNEL ? RegisterKernelWithHandle(task, param) : RegisterKernel(task, param); task.SetHandle(handle_); @@ -384,7 +381,8 @@ Status TbeTaskBuilder::BuildTask(TbeOpTask &task, const SingleOpModelParam ¶ void *stub_func = nullptr; auto rt_ret = rtGetFunctionByName(stub_name_.c_str(), &stub_func); if (rt_ret != SUCCESS) { - GELOGE(rt_ret, "rtGetFunctionByName failed."); + GELOGE(rt_ret, "[Get][FunctionByName] failed. 
stub_name:%s.", stub_name_.c_str()); + REPORT_CALL_ERROR("E19999", "rtGetFunctionByName failed, stub_name:%s.", stub_name_.c_str()); return RT_ERROR_TO_GE_STATUS(rt_ret); } task.SetStubFunc(stub_name_, stub_func); @@ -399,7 +397,10 @@ Status TbeTaskBuilder::InitTilingInfo(TbeOpTask &task) { (void)AttrUtils::GetInt(op_desc_, kAttrOpParamSize, max_size); GELOGD("Got op param size by key: %s, ret = %ld", kAttrOpParamSize, max_size); if (max_size < 0) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[%s] Invalid op_param_size: %ld.", op_desc_->GetName().c_str(), max_size); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Get][Int] %s Invalid op_param_size: %ld.", + op_desc_->GetName().c_str(), max_size); + REPORT_CALL_ERROR("E19999", "AttrUtils::GetInt failed, %s Invalid op_param_size: %ld.", + op_desc_->GetName().c_str(), max_size); return ACL_ERROR_GE_PARAM_INVALID; } void *tiling_buffer = nullptr; @@ -409,7 +410,7 @@ Status TbeTaskBuilder::InitTilingInfo(TbeOpTask &task) { GELOGD("[%s] Done allocating tiling buffer, size=%ld.", op_desc_->GetName().c_str(), max_size); } - task.EnableDynamicSupport(node_, tiling_buffer, static_cast(max_size)); + task.EnableDynamicSupport(node_, tiling_buffer, static_cast(max_size)); return SUCCESS; } } // namespace ge diff --git a/ge/single_op/task/tbe_task_builder.h b/ge/single_op/task/tbe_task_builder.h index 8af9a68d..a202cbf1 100755 --- a/ge/single_op/task/tbe_task_builder.h +++ b/ge/single_op/task/tbe_task_builder.h @@ -97,7 +97,6 @@ class TbeTaskBuilder { private: Status InitTilingInfo(TbeOpTask &task); Status SetKernelArgs(TbeOpTask &task, const SingleOpModelParam ¶m, const OpDescPtr &op_desc); - Status SetKernelWithHandleArgs(TbeOpTask &task, const SingleOpModelParam ¶m, const OpDescPtr &op_desc); Status GetSmDesc(void **sm_desc, const SingleOpModelParam ¶m) const; Status RegisterKernel(TbeOpTask &task, const SingleOpModelParam ¶m); diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h index 834df591..5ae5f036 100644 --- 
a/inc/external/ge/ge_api_types.h +++ b/inc/external/ge/ge_api_types.h @@ -110,6 +110,7 @@ const char *const SAVE_ORIGINAL_MODEL = "ge.saveOriginalModel"; const char *const ORIGINAL_MODEL_FILE = "ge.originalModelFile"; const char *const INPUT_FP16_NODES = "ge.INPUT_NODES_SET_FP16"; const char *const OP_DEBUG_LEVEL = "ge.opDebugLevel"; +const char *const PERFORMANCE_MODE = "ge.performance_mode"; } // namespace configure_option // Configure stream num by Session constructor options param, // its value should be int32_t type, default value is "1" @@ -311,6 +312,14 @@ const std::string OP_BANK_UPDATE_FLAG = "ge.op_bank_update"; // 0: data multi; 1: model multi; const std::string HCOM_MULTI_MODE = "ge.hcomMultiMode"; +// atc and ir option +const char *const INPUT_SHAPE_RANGE = "input_shape_range"; + +// Configure express high compile performance or high execute performance +// normal: no need to compile, used saved .o files directly +// high: need to recompile, high execute performance mode +const std::string PERFORMANCE_MODE = "ge.performance_mode"; + // Graph run mode enum GraphRunMode { PREDICTION = 0, TRAIN }; @@ -385,11 +394,13 @@ static const char *const MDL_BANK_PATH = ge::MDL_BANK_PATH_FLAG.c_str(); static const char *const OP_BANK_PATH = ge::OP_BANK_PATH_FLAG.c_str(); static const char *const OP_BANK_UPDATE = ge::OP_BANK_UPDATE_FLAG.c_str(); static const char *const OP_DEBUG_LEVEL = ge::OP_DEBUG_LEVEL.c_str(); +static const char *const PERFORMANCE_MODE = ge::PERFORMANCE_MODE.c_str(); // for interface: aclgrphBuildModel #ifdef __GNUC__ const std::set ir_builder_suppported_options = {INPUT_FORMAT, INPUT_SHAPE, + INPUT_SHAPE_RANGE, OP_NAME_MAP, DYNAMIC_BATCH_SIZE, DYNAMIC_IMAGE_SIZE, @@ -408,7 +419,8 @@ const std::set ir_builder_suppported_options = {INPUT_FORMAT, OP_COMPILER_CACHE_MODE, MDL_BANK_PATH, OP_BANK_PATH, - OP_BANK_UPDATE}; + OP_BANK_UPDATE, + PERFORMANCE_MODE}; // for interface: aclgrphParse const std::set ir_parser_suppported_options = { diff --git 
a/inc/framework/common/debug/ge_log.h b/inc/framework/common/debug/ge_log.h index c1359a20..754712f3 100644 --- a/inc/framework/common/debug/ge_log.h +++ b/inc/framework/common/debug/ge_log.h @@ -20,6 +20,7 @@ #include #include "framework/common/ge_inner_error_codes.h" +#include "common/util/error_manager/error_manager.h" #include "toolchain/slog.h" #ifdef __GNUC__ #include @@ -41,9 +42,9 @@ class GE_FUNC_VISIBILITY GeLog { public: static uint64_t GetTid() { #ifdef __GNUC__ - thread_local static uint64_t tid = static_cast(syscall(__NR_gettid)); + uint64_t tid = static_cast(syscall(__NR_gettid)); #else - thread_local static uint64_t tid = static_cast(GetCurrentThreadId()); + uint64_t tid = static_cast(GetCurrentThreadId()); #endif return tid; } @@ -55,9 +56,10 @@ inline bool IsLogEnable(int module_name, int log_level) { return (enable == 1); } -#define GELOGE(ERROR_CODE, fmt, ...) \ - dlog_error(GE_MODULE_NAME, "%lu %s: ErrorNo: %d(%s) " fmt, GeLog::GetTid(), __FUNCTION__, ERROR_CODE, \ - ((GE_GET_ERRORNO_STR(ERROR_CODE)).c_str()), ##__VA_ARGS__) +#define GELOGE(ERROR_CODE, fmt, ...) \ + dlog_error(GE_MODULE_NAME, "%lu %s: ErrorNo: %d(%s) %s" fmt, GeLog::GetTid(), __FUNCTION__, ERROR_CODE, \ + ((GE_GET_ERRORNO_STR(ERROR_CODE)).c_str()), ErrorManager::GetInstance().GetLogHeader().c_str(), \ + ##__VA_ARGS__) #define GELOGW(fmt, ...) \ if (IsLogEnable(GE_MODULE_NAME, DLOG_WARN)) \ dlog_warn(GE_MODULE_NAME, "%lu %s:" fmt, GeLog::GetTid(), __FUNCTION__, ##__VA_ARGS__) diff --git a/inc/framework/common/debug/log.h b/inc/framework/common/debug/log.h index 58cb3693..f06faa1b 100644 --- a/inc/framework/common/debug/log.h +++ b/inc/framework/common/debug/log.h @@ -19,6 +19,7 @@ #include #include +#include #include "runtime/rt.h" #include "common/string_util.h" @@ -85,12 +86,13 @@ } while (0); // If expr is not GRAPH_SUCCESS, print the log and return FAILED -#define GE_CHK_GRAPH_STATUS_RET(expr, ...) 
\ - do { \ - if ((expr) != ge::GRAPH_SUCCESS) { \ - DOMI_LOGE(__VA_ARGS__); \ - return FAILED; \ - } \ +#define GE_CHK_GRAPH_STATUS_RET(expr, ...) \ + do { \ + if ((expr) != ge::GRAPH_SUCCESS) { \ + REPORT_CALL_ERROR("E19999", "Operator graph failed"); \ + DOMI_LOGE(__VA_ARGS__); \ + return FAILED; \ + } \ } while (0); // If expr is not SUCCESS, print the log and execute a custom statement @@ -105,6 +107,7 @@ do { \ bool b = (expr); \ if (!b) { \ + REPORT_INNER_ERROR("E19999", __VA_ARGS__); \ GELOGE(_status, __VA_ARGS__); \ return _status; \ } \ @@ -193,6 +196,7 @@ { \ bool b = (expr); \ if (b) { \ + REPORT_INNER_ERROR("E19999", __VA_ARGS__); \ DOMI_LOGE(__VA_ARGS__); \ exec_expr; \ return _status; \ @@ -229,13 +233,14 @@ } // If expr is not RT_ERROR_NONE, print the log and return -#define GE_CHK_RT_RET(expr) \ - do { \ - rtError_t _rt_ret = (expr); \ - if (_rt_ret != RT_ERROR_NONE) { \ - DOMI_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ - return RT_ERROR_TO_GE_STATUS(_rt_ret); \ - } \ +#define GE_CHK_RT_RET(expr) \ + do { \ + rtError_t _rt_ret = (expr); \ + if (_rt_ret != RT_ERROR_NONE) { \ + REPORT_CALL_ERROR("E19999", "Call %s fail, ret: 0x%X", #expr, _rt_ret); \ + DOMI_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ + return RT_ERROR_TO_GE_STATUS(_rt_ret); \ + } \ } while (0); // If expr is true, execute exec_expr without printing logs @@ -255,10 +260,10 @@ exec_expr1; \ } -#define GE_ERRORLOG_AND_ERRORMSG(_status, errormsg) \ - { \ - GELOGE(_status, "%s", errormsg); \ - ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {errormsg}); \ +#define GE_ERRORLOG_AND_ERRORMSG(_status, errormsg) \ + { \ + GELOGE(_status, "[Check][InnerData]%s", errormsg); \ + REPORT_INNER_ERROR("E19999", "%s", errormsg); \ } #define GE_WARNINGLOG_AND_ERRORMSG(errormsg) \ diff --git a/inc/framework/common/ge_types.h b/inc/framework/common/ge_types.h index 0d996a67..b37574f7 100644 --- a/inc/framework/common/ge_types.h +++ b/inc/framework/common/ge_types.h @@ 
-67,8 +67,9 @@ struct DataBuffer { void *data; // Data address uint64_t length; // Data length bool isDataSupportMemShare = false; - DataBuffer(void *dataIn, uint64_t len, bool isSupportMemShare) - : data(dataIn), length(len), isDataSupportMemShare(isSupportMemShare) {} + uint32_t placement = 0; + DataBuffer(void *dataIn, uint64_t len, bool isSupportMemShare, uint32_t placement = 0) + : data(dataIn), length(len), isDataSupportMemShare(isSupportMemShare), placement(placement) {} DataBuffer() : data(nullptr), length(0), isDataSupportMemShare(false) {} }; diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h index 2dbb1753..91759b8f 100644 --- a/inc/framework/common/types.h +++ b/inc/framework/common/types.h @@ -130,6 +130,8 @@ REGISTER_OPTYPE_DECLARE(REFORMAT, "ReFormat"); REGISTER_OPTYPE_DECLARE(DEPCONVOLUTION, "ConvolutionDepthwise"); REGISTER_OPTYPE_DECLARE(DROPOUT, "Dropout"); REGISTER_OPTYPE_DECLARE(DROPOUTDOMASK, "DropOutDoMask"); +REGISTER_OPTYPE_DECLARE(DROPOUTDOMASKV3, "DropOutDoMaskV3"); +REGISTER_OPTYPE_DECLARE(DROPOUTDOMASKV3D, "DropOutDoMaskV3D"); REGISTER_OPTYPE_DECLARE(DROPOUTGENMASK, "DropOutGenMask"); REGISTER_OPTYPE_DECLARE(CONCAT, "Concat"); REGISTER_OPTYPE_DECLARE(ROIPOOLING, "ROIPooling"); diff --git a/inc/framework/common/util.h b/inc/framework/common/util.h index 525cf3ea..92cb8397 100644 --- a/inc/framework/common/util.h +++ b/inc/framework/common/util.h @@ -30,12 +30,12 @@ #include "framework/common/ge_inner_error_codes.h" #include "mmpa/mmpa_api.h" -#define GE_CHECK_POSITIVE_SIZE_RANGE(size) \ - do { \ - if (size <= 0) { \ - DOMI_LOGE("param[%s] is not a positive number", #size); \ - return PARAM_INVALID; \ - } \ +#define GE_CHECK_POSITIVE_SIZE_RANGE(size) \ + do { \ + if (size <= 0) { \ + DOMI_LOGE("param[%s] is not a positive number", #size); \ + return PARAM_INVALID; \ + } \ } while (0) #define CHECK_FALSE_EXEC(expr, exec_expr, ...) \ @@ -113,84 +113,77 @@ } while (0) // Check if the parameter is null. 
If yes, return PARAM_INVALID and record the error -#define GE_CHECK_NOTNULL(val) \ - do { \ - if (val == nullptr) { \ - DOMI_LOGE("param[%s] must not be null.", #val); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_NOTNULL(val) \ + do { \ + if (val == nullptr) { \ + REPORT_INNER_ERROR("E19999", "Param:%s is nullptr, check invalid when %s", \ + #val, __FUNCTION__); \ + DOMI_LOGE("[Check][Param:%s]null is invalid when %s.", #val, __FUNCTION__); \ + return ge::PARAM_INVALID; \ + } \ } while (0) // Check if the parameter is null. If yes, just return and record the error -#define GE_CHECK_NOTNULL_JUST_RETURN(val) \ - do { \ - if (val == nullptr) { \ - DOMI_LOGE("param[%s] must not be null.", #val); \ - return; \ - } \ +#define GE_CHECK_NOTNULL_JUST_RETURN(val) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE("param[%s] must not be null.", #val); \ + return; \ + } \ } while (0) // Check whether the parameter is null. If so, execute the exec_expr expression and record the error log -#define GE_CHECK_NOTNULL_EXEC(val, exec_expr) \ - do { \ - if (val == nullptr) { \ - DOMI_LOGE("param[%s] must not be null.", #val); \ - exec_expr; \ - } \ +#define GE_CHECK_NOTNULL_EXEC(val, exec_expr) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE("param[%s] must not be null.", #val); \ + exec_expr; \ + } \ } while (0) // Check whether the parameter is null. If yes, return directly and record the error log -#define GE_RT_VOID_CHECK_NOTNULL(val) \ - do { \ - if (val == nullptr) { \ - DOMI_LOGE("param[%s] must not be null.", #val); \ - return; \ - } \ +#define GE_RT_VOID_CHECK_NOTNULL(val) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE("param[%s] must not be null.", #val); \ + return; \ + } \ } while (0) // Check if the parameter is null. 
If yes, return false and record the error log -#define GE_RT_FALSE_CHECK_NOTNULL(val) \ - do { \ - if (val == nullptr) { \ - DOMI_LOGE("param[%s] must not be null.", #val); \ - return false; \ - } \ +#define GE_RT_FALSE_CHECK_NOTNULL(val) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE("param[%s] must not be null.", #val); \ + return false; \ + } \ } while (0) // Check if the parameter is out of bounds -#define GE_CHECK_SIZE(size) \ - do { \ - if (size == 0) { \ - DOMI_LOGE("param[%s] is out of range", #size); \ - return ge::PARAM_INVALID; \ - } \ - } while (0) - -// Check if the container is empty -#define GE_CHECK_VECTOR_NOT_EMPTY(vector) \ - do { \ - if (vector.empty()) { \ - DOMI_LOGE("param[%s] is empty!", #vector); \ - return ge::FAILED; \ - } \ +#define GE_CHECK_SIZE(size) \ + do { \ + if (size == 0) { \ + DOMI_LOGE("param[%s] is out of range", #size); \ + return ge::PARAM_INVALID; \ + } \ } while (0) // Check if the value on the left is greater than or equal to the value on the right -#define GE_CHECK_GE(lhs, rhs) \ - do { \ - if (lhs < rhs) { \ - DOMI_LOGE("param[%s] is less than[%s]", #lhs, #rhs); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_GE(lhs, rhs) \ + do { \ + if (lhs < rhs) { \ + DOMI_LOGE("param[%s] is less than[%s]", #lhs, #rhs); \ + return ge::PARAM_INVALID; \ + } \ } while (0) // Check if the value on the left is less than or equal to the value on the right -#define GE_CHECK_LE(lhs, rhs) \ - do { \ - if (lhs > rhs) { \ - DOMI_LOGE("param[%s] is greater than[%s]", #lhs, #rhs); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_LE(lhs, rhs) \ + do { \ + if (lhs > rhs) { \ + DOMI_LOGE("param[%s] is greater than[%s]", #lhs, #rhs); \ + return ge::PARAM_INVALID; \ + } \ } while (0) #define GE_DELETE_NEW_SINGLE(var) \ @@ -209,6 +202,17 @@ } \ } while (0) +#define GE_FREE_RT_LOG(addr) \ + do { \ + if (addr != nullptr) { \ + rtError_t error = rtFree(addr); \ + if (error != RT_ERROR_NONE) { \ + GELOGE(RT_FAILED, "Call rtFree failed, 
error: %#x", error); \ + } \ + addr = nullptr; \ + } \ + } while (0) + /** * @ingroup domi_common * @brief version of om.proto file diff --git a/inc/framework/generator/ge_generator.h b/inc/framework/generator/ge_generator.h index 2d7d007b..24f969dd 100644 --- a/inc/framework/generator/ge_generator.h +++ b/inc/framework/generator/ge_generator.h @@ -31,6 +31,7 @@ #include "omg/omg_inner_types.h" namespace ge { +class GeRootModel; class GE_FUNC_VISIBILITY GeGenerator { public: static GeGenerator &GetInstance() { @@ -64,10 +65,12 @@ class GE_FUNC_VISIBILITY GeGenerator { /// @param [in] inputs: input tensors. /// @param [in] outputs: output tensors. /// @param [in] model_file_name: name of model file. + /// @param [in] compile_flag: op build flag, accurate build is 0, fuzz build is 1 /// @return SUCCESS or FAILED /// Status BuildSingleOpModel(OpDescPtr &op_desc, const std::vector &inputs, - const std::vector &outputs, const std::string &model_file_name); + const std::vector &outputs, const std::string &model_file_name, + int32_t compile_flag = 0); /// /// @ingroup ge /// @brief: Build single Op into model buff. @@ -75,10 +78,13 @@ class GE_FUNC_VISIBILITY GeGenerator { /// @param [in] inputs: input tensors. /// @param [in] outputs: output tensors. /// @param [in] engine_type: engine type. + /// @param [in] compile_flag: op build flag, accurate build is 0, fuzz build is 1 /// @param [out] model_buff: model buff of op. /// @return SUCCESS or FAILED Status BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, OpEngineType engine_type, ModelBufferData &model_buff); + Status BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, + OpEngineType engine_type, int32_t compile_flag, ModelBufferData &model_buff); /// /// @ingroup ge /// @brief: Build single Op into model buff. 
@@ -96,9 +102,14 @@ class GE_FUNC_VISIBILITY GeGenerator { ge::ModelBufferData &model, bool is_offline = true); Status BuildSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff, - bool is_offline = true); + bool is_offline = true, int32_t compile_flag = 0); + bool CheckNoAicore(const ComputeGraphPtr &graph); + void RemoveConst(const vector &inputs, vector &outputs); Status CheckForSingleOp(OpDescPtr &op_desc, const vector &inputs, const vector &outputs); + using GeRootModelPtr = std::shared_ptr; + Status SetModelNameForDump(const GeRootModelPtr &ge_root_model); + class Impl; std::shared_ptr impl_; diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h index 54c9ab4a..84f6ef46 100644 --- a/inc/framework/omg/omg_inner_types.h +++ b/inc/framework/omg/omg_inner_types.h @@ -123,6 +123,7 @@ struct OmgContext { bool need_multi_batch = false; std::vector data_nodes; std::vector getnext_nosink_nodes; + bool fuzz_compile_flag = false; }; } // namespace ge diff --git a/metadef b/metadef index 2607691f..99627af3 160000 --- a/metadef +++ b/metadef @@ -1 +1 @@ -Subproject commit 2607691fc5edaad412d21c9f4a3284b02cfc8c5e +Subproject commit 99627af3e039343ee972701acaf9a6f376a6ca77 diff --git a/parser b/parser index 6a07f1a8..424ac060 160000 --- a/parser +++ b/parser @@ -1 +1 @@ -Subproject commit 6a07f1a8b9b8b4630a5b60d9d8d02ec4a6314d68 +Subproject commit 424ac0609fe17f455865436462a2c62f85aea2b1 diff --git a/tests/depends/error_manager/src/error_manager_stub.cc b/tests/depends/error_manager/src/error_manager_stub.cc index eadc8687..5f62c91b 100644 --- a/tests/depends/error_manager/src/error_manager_stub.cc +++ b/tests/depends/error_manager/src/error_manager_stub.cc @@ -18,6 +18,14 @@ using namespace ErrorMessage; +namespace ErrorMessage { +int FormatErrorMessage(char *str_dst, size_t dst_max, const char *format, ...) 
{ + return 1; +} +} + +thread_local Context ErrorManager::error_context_ = {0, "", "", ""}; + ErrorManager &ErrorManager::GetInstance() { static ErrorManager instance; return instance; @@ -40,6 +48,10 @@ using namespace ErrorMessage; return 0; } + int ErrorManager::ReportInterErrMessage(std::string error_code, const std::string &error_msg) { + return 0; + } + /// /// @brief output error message /// @param [in] handle: print handle @@ -84,7 +96,7 @@ using namespace ErrorMessage; void ErrorManager::GenWorkStreamIdBySessionGraph(uint64_t session_id, uint64_t graph_id) {} - const std::string &ErrorManager::GetLogHeader() { return "[TEST][TEST]"; } + const std::string &ErrorManager::GetLogHeader() { return error_context_.log_header; } struct Context &ErrorManager::GetErrorContext() { struct Context error_context; diff --git a/tests/depends/mmpa/src/mmpa_stub.cc b/tests/depends/mmpa/src/mmpa_stub.cc index 5b6dbd22..62499ca1 100644 --- a/tests/depends/mmpa/src/mmpa_stub.cc +++ b/tests/depends/mmpa/src/mmpa_stub.cc @@ -269,7 +269,7 @@ CHAR *mmDlerror() INT32 mmDladdr(VOID *addr, mmDlInfo *info) { - return 0; + return -1; } VOID *mmDlopen(const CHAR *fileName, INT32 mode) diff --git a/tests/depends/runtime/src/runtime_stub.cc b/tests/depends/runtime/src/runtime_stub.cc index 440b98e7..00873b8f 100644 --- a/tests/depends/runtime/src/runtime_stub.cc +++ b/tests/depends/runtime/src/runtime_stub.cc @@ -43,6 +43,11 @@ rtError_t rtEventCreate(rtEvent_t *event) { *event = new int[EVENT_LENTH]; return RT_ERROR_NONE; } + +rtError_t rtEventCreateWithFlag(rtEvent_t *event, uint32_t flag) { + return rtEventCreate(event); +} + rtError_t rtEventRecord(rtEvent_t event, rtStream_t stream) { return RT_ERROR_NONE; } rtError_t rtEventSynchronize(rtEvent_t event) { return RT_ERROR_NONE; } @@ -313,6 +318,8 @@ rtError_t rtFlushCache(uint64_t base, uint32_t len) { return RT_ERROR_NONE; } rtError_t rtProfilerTrace(uint64_t id, bool notify, uint32_t flags, rtStream_t stream_) { return 
RT_ERROR_NONE; } +rtError_t rtProfilerTraceEx(uint64_t id, uint64_t modelId, uint16_t tagId, rtStream_t stream) { return RT_ERROR_NONE; } + rtError_t rtMemSetRC(const void *dev_ptr, uint64_t size, uint32_t read_count) { return RT_ERROR_NONE; } rtError_t rtStreamSwitch(void *ptr, rtCondition_t condition, int64_t value, rtStream_t true_stream, rtStream_t stream) { @@ -435,3 +442,7 @@ rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId) rtError_t rtDebugRegisterForStream(rtStream_t stream, uint32_t flag, const void *addr, uint32_t *streamId, uint32_t *taskId) { return RT_ERROR_NONE; } + +rtError_t rtDebugUnRegisterForStream(rtStream_t stream) { + return RT_ERROR_NONE; +} \ No newline at end of file diff --git a/tests/ut/common/graph/CMakeLists.txt b/tests/ut/common/graph/CMakeLists.txt index 1c64dce1..a957298a 100644 --- a/tests/ut/common/graph/CMakeLists.txt +++ b/tests/ut/common/graph/CMakeLists.txt @@ -20,7 +20,7 @@ set(CMAKE_CXX_STANDARD 11) set(PROTO_LIST "${GE_CODE_DIR}/metadef/proto/om.proto" "${GE_CODE_DIR}/metadef/proto/ge_ir.proto" - "${GE_CODE_DIR}/metadef/proto/proto_inner/ge_onnx.proto" + "${GE_CODE_DIR}/metadef/proto/onnx/ge_onnx.proto" ) protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) @@ -38,6 +38,7 @@ include_directories(${GE_CODE_DIR}/metadef/inc) include_directories(${GE_CODE_DIR}/metadef/inc/graph) include_directories(${GE_CODE_DIR}/metadef/inc/common) include_directories(${GE_CODE_DIR}/metadef/third_party) +include_directories(${GE_CODE_DIR}/metadef/third_party/transformer/inc) include_directories(${GE_CODE_DIR}/third_party/fwkacllib/inc) include_directories(${GE_CODE_DIR}/third_party/fwkacllib/inc/ops) include_directories(${CMAKE_BINARY_DIR}) @@ -95,11 +96,12 @@ set(SRC_FILES "${GE_CODE_DIR}/metadef/graph/utils/tensor_utils.cc" "${GE_CODE_DIR}/metadef/ops/op_imp.cpp" "${GE_CODE_DIR}/metadef/graph/opsproto/opsproto_manager.cc" - "${GE_CODE_DIR}/metadef/graph/utils/transformer_utils.cc" 
"${GE_CODE_DIR}/metadef/graph/runtime_inference_context.cc" "${GE_CODE_DIR}/metadef/graph/ref_relation.cc" - "${GE_CODE_DIR}/metadef/third_party/transformer/src/transfer_shape_according_to_format.cpp" - "${GE_CODE_DIR}/metadef/third_party/transformer/src/axis_util.cpp" + "${GE_CODE_DIR}/metadef/third_party/transformer/src/transfer_shape_according_to_format.cc" + "${GE_CODE_DIR}/metadef/third_party/transformer/src/axis_util.cc" + "${GE_CODE_DIR}/metadef/third_party/transformer/src/expand_dimension.cc" + "${GE_CODE_DIR}/metadef/graph/utils/transformer_utils.cc" ) #add_executable(ut_libgraph ${UT_FILES} ${SRC_FILES} ${PROTO_SRCS} ${PROTO_HDRS}) diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt index c1a61c67..dabc1485 100755 --- a/tests/ut/ge/CMakeLists.txt +++ b/tests/ut/ge/CMakeLists.txt @@ -33,7 +33,7 @@ set(PROTO_LIST "${GE_CODE_DIR}/metadef/proto/tensorflow/tensor_shape.proto" "${GE_CODE_DIR}/metadef/proto/tensorflow/types.proto" "${GE_CODE_DIR}/metadef/proto/tensorflow/node_def.proto" - "${GE_CODE_DIR}/metadef/proto/proto_inner/ge_onnx.proto" + "${GE_CODE_DIR}/metadef/proto/onnx/ge_onnx.proto" ) protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) @@ -45,6 +45,7 @@ include_directories(${GE_CODE_DIR}/inc) include_directories(${GE_CODE_DIR}/metadef/inc) include_directories(${GE_CODE_DIR}/ge) include_directories(${GE_CODE_DIR}/ge/inc) +include_directories(${GE_CODE_DIR}/ge/ir_build) include_directories(${GE_CODE_DIR}/metadef) include_directories(${GE_CODE_DIR}/metadef/graph) include_directories(${GE_CODE_DIR}/inc/external) @@ -54,6 +55,7 @@ include_directories(${GE_CODE_DIR}/metadef/inc/graph) include_directories(${GE_CODE_DIR}/inc/framework) include_directories(${GE_CODE_DIR}/metadef/inc/common) include_directories(${GE_CODE_DIR}/metadef/third_party) +include_directories(${GE_CODE_DIR}/metadef/third_party/transformer/inc) include_directories(${GE_CODE_DIR}/parser) include_directories(${GE_CODE_DIR}/parser/parser) 
include_directories(${GE_CODE_DIR}/third_party/fwkacllib/inc) @@ -61,6 +63,7 @@ include_directories(${GE_CODE_DIR}/third_party/fwkacllib/inc/cce) include_directories(${GE_CODE_DIR}/third_party/fwkacllib/inc/ops) include_directories(${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain) include_directories(${GE_CODE_DIR}/tests/ut/ge) +include_directories(${GE_CODE_DIR}/tests/ut/common) include_directories(${CMAKE_BINARY_DIR}) include_directories(${CMAKE_BINARY_DIR}/proto/ge) include_directories(${CMAKE_BINARY_DIR}/proto/ge/proto) @@ -85,8 +88,9 @@ set(GRAPH_SRC_FILES "${GE_CODE_DIR}/metadef/graph/node.cc" "${GE_CODE_DIR}/metadef/graph/runtime_inference_context.cc" "${GE_CODE_DIR}/metadef/graph/op_desc.cc" - "${GE_CODE_DIR}/metadef/third_party/transformer/src/transfer_shape_according_to_format.cpp" - "${GE_CODE_DIR}/metadef/third_party/transformer/src/axis_util.cpp" + "${GE_CODE_DIR}/metadef/third_party/transformer/src/transfer_shape_according_to_format.cc" + "${GE_CODE_DIR}/metadef/third_party/transformer/src/axis_util.cc" + "${GE_CODE_DIR}/metadef/third_party/transformer/src/expand_dimension.cc" "${GE_CODE_DIR}/metadef/graph/operator.cc" "${GE_CODE_DIR}/metadef/graph/operator_factory.cc" "${GE_CODE_DIR}/metadef/graph/operator_factory_impl.cc" @@ -162,7 +166,9 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/common/dump/dump_properties.cc" "${GE_CODE_DIR}/ge/common/helper/model_helper.cc" "${GE_CODE_DIR}/ge/common/dump/dump_manager.cc" + "${GE_CODE_DIR}/ge/common/dump/exception_dumper.cc" "${GE_CODE_DIR}/ge/common/dump/opdebug_register.cc" + "${GE_CODE_DIR}/ge/common/dump/dump_op.cc" "${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc" "${GE_CODE_DIR}/ge/model/ge_root_model.cc" "${GE_CODE_DIR}/ge/common/model_parser/model_parser.cc" @@ -266,10 +272,14 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/graph/passes/link_gen_mask_nodes_pass.cc" "${GE_CODE_DIR}/ge/graph/passes/replace_with_empty_const_pass.cc" "${GE_CODE_DIR}/ge/graph/passes/hccl_group_pass.cc" + 
"${GE_CODE_DIR}/ge/graph/passes/hccl_tailing_optimization_pass.cc" "${GE_CODE_DIR}/ge/graph/passes/memcpy_addr_async_pass.cc" "${GE_CODE_DIR}/ge/graph/passes/set_input_output_offset_pass.cc" "${GE_CODE_DIR}/ge/graph/passes/remove_same_const_pass.cc" "${GE_CODE_DIR}/ge/graph/passes/useless_control_out_remove_pass.cc" + "${GE_CODE_DIR}/ge/graph/passes/parallel_group_pass.cc" + "${GE_CODE_DIR}/ge/graph/passes/buffer_pool_memory_pass.cc" + "${GE_CODE_DIR}/ge/graph/passes/mark_node_unknown_shape_pass.cc" "${GE_CODE_DIR}/ge/model/ge_model.cc" "${GE_CODE_DIR}/ge/common/cust_aicpu_kernel_store.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/model_utils.cc" @@ -295,7 +305,6 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/ir_build/attr_options/keep_dtype_option.cc" "${GE_CODE_DIR}/ge/ir_build/attr_options/weight_compress_option.cc" "${GE_CODE_DIR}/ge/graph/build/label_allocator.cc" - "${GE_CODE_DIR}/ge/graph/passes/memcpy_addr_async_pass.cc" "${GE_CODE_DIR}/ge/graph/partition/stage_partition.cc" "${GE_CODE_DIR}/ge/graph/partition/dynamic_shape_partition.cc" "${GE_CODE_DIR}/ge/graph/optimize/summary_optimize.cc" @@ -317,6 +326,7 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/graph/build/memory/block_mem_assigner.cc" "${GE_CODE_DIR}/ge/graph/build/memory/binary_block_mem_assigner.cc" "${GE_CODE_DIR}/ge/graph/build/memory/max_block_mem_assigner.cc" + "${GE_CODE_DIR}/ge/graph/build/memory/buffer_pool_mem_assigner.cc" "${GE_CODE_DIR}/ge/graph/manager/graph_mem_allocator.cc" "${GE_CODE_DIR}/ge/graph/manager/graph_var_manager.cc" "${GE_CODE_DIR}/ge/analyzer/analyzer.cc" @@ -421,7 +431,6 @@ set(DISTINCT_GRAPH_LOAD_SRC_FILES "${GE_CODE_DIR}/ge/graph/load/model_manager/task_info/memcpy_async_task_info.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/task_info/profiler_trace_task_info.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/task_info/stream_active_task_info.cc" - "${GE_CODE_DIR}/ge/graph/load/model_manager/task_info/stream_switch_task_info.cc" 
"${GE_CODE_DIR}/ge/graph/load/model_manager/task_info/end_graph_task_info.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/task_info/model_exit_task_info.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/task_info/super_kernel/super_kernel.cc" @@ -515,6 +524,7 @@ set(GRAPH_PASS_COMMON_SRC_FILES "${GE_CODE_DIR}/ge/graph/passes/compile_nodes_pass.cc" "${GE_CODE_DIR}/ge/graph/common/transop_util.cc" "${GE_CODE_DIR}/ge/graph/passes/flow_ctrl_pass.cc" + "${GE_CODE_DIR}/ge/graph/passes/parallel_group_pass.cc" #"${GE_CODE_DIR}/ge/graph/optimize/optimizer/allreduce_fusion_pass.cc" "${GE_CODE_DIR}/ge/graph/passes/folding_pass.cc" "${GE_CODE_DIR}/ge/graph/passes/variable_op_pass.cc" @@ -584,6 +594,7 @@ set(SINGLE_OP_SRC_FILES "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_model_executor.cc" "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_model_async_executor.cc" "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_execution_context.cc" + "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_model_pipeline_executor.cc" "${GE_CODE_DIR}/ge/hybrid/executor/subgraph_context.cc" "${GE_CODE_DIR}/ge/hybrid/executor/subgraph_executor.cc" "${GE_CODE_DIR}/ge/hybrid/executor/worker/task_compile_engine.cc" @@ -620,6 +631,7 @@ set(SINGLE_OP_SRC_FILES # test files set(COMMON_TEST_FILES "graph/passes/graph_builder_utils.cc" + "graph/utils/buffer_pool_graph_builder.cc" "test.cc" ) @@ -662,8 +674,10 @@ set(PASS_TEST_FILES "graph/passes/addn_pass_unittest.cc" "graph/passes/save_pass_unittest.cc" "graph/passes/merge_pass_unittest.cc" - #"graph/passes/switch_pass_unittest.cc" "graph/passes/switch_logic_remove_pass_unittest.cc" + "graph/passes/cond_branch_v1_unittest.cc" + "graph/passes/loop_branch_v1_unittest.cc" + "graph/passes/switch_dead_branch_elimination_unittest.cc" "graph/passes/assert_pass_unittest.cc" "graph/passes/dropout_pass_unittest.cc" "graph/passes/unused_const_pass_unittest.cc" @@ -691,7 +705,12 @@ set(PASS_TEST_FILES "graph/passes/infershape_pass_unittest.cc" "graph/passes/multi_batch_clone_pass_unittest.cc" 
"graph/passes/replace_with_empty_const_pass_unittest.cc" + "graph/passes/link_gen_mask_nodes_pass_unittest.cc" "graph/passes/transpose_transdata_pass_unittest.cc" + "graph/passes/parallel_group_pass_unittest.cc" + "graph/passes/buffer_pool_memory_pass_unittest.cc" + "graph/passes/mark_node_unknown_shape_pass_unittest.cc" + "graph/passes/reshape_recovery_pass_unittest.cc" ) set(KERNEL_TEST_FILES @@ -728,13 +747,17 @@ set(KERNEL_TEST_FILES "graph/passes/folding_kernel/gather_v2_kernel_unittest.cc" "graph/passes/folding_kernel/slice_kernel_unittest.cc" "graph/passes/folding_kernel/dynamic_stitch_kernel_unittest.cc" + "graph/passes/atomic_addr_clean_pass_unittest.cc" ) set(MULTI_PARTS_TEST_FILES "graph_ir/ge_operator_factory_unittest.cc" + "graph_ir/ge_ir_build_unittest.cc" "graph/transop_util_unittest.cc" "common/datatype_transfer_unittest.cc" "common/dump_manager_unittest.cc" + "common/dump_op_unittest.cc" + "common/dump_exception_unittest.cc" "common/opdebug_register_unittest.cc" "common/format_transfer_unittest.cc" "common/format_transfer_transpose_unittest.cc" @@ -753,12 +776,19 @@ set(MULTI_PARTS_TEST_FILES "common/format_transfer_fracz_nhwc_unittest.cc" "common/format_transfer_fracz_hwcn_unittest.cc" "common/ge_format_util_unittest.cc" + "common/ge_auth_file_saver_unittest.cc" "graph/variable_accelerate_ctrl_unittest.cc" "graph/build/logical_stream_allocator_unittest.cc" + "graph/build/model_builder_unittest.cc" "graph/build/mem_assigner_unittest.cc" + "graph/build/task_generator_unittest.cc" + "graph/build/buffer_pool_mem_assigner_unittest.cc" + "graph/execute/graph_execute_unittest.cc" "graph/preprocess/graph_preprocess_unittest.cc" "graph/manager/hcom_util_unittest.cc" "graph/manager/graph_caching_allocator_unittest.cc" + "graph/partition/dynamic_shape_partition_unittest.cc" + "graph/manager/graph_manager_unittest.cc" "session/omg_omg_unittest.cc" ) @@ -775,6 +805,7 @@ set(SINGLE_OP_TEST_FILES "single_op/single_op_manager_unittest.cc" 
"single_op/stream_resource_unittest.cc" "single_op/single_op_task_unittest.cc" + "single_op/single_op_unittest.cc" ) set(PROFILING_MNG_TEST_FILES @@ -783,6 +814,7 @@ set(PROFILING_MNG_TEST_FILES set(HYBRID_TEST_FILES "hybrid/ge_hybrid_unittest.cc" + "hybrid/known_node_executor_unittest.cc" ) set(OTHERS_TEST_FILES @@ -825,6 +857,7 @@ add_library(ge_ut_common STATIC ${COMMON_SRC_FILES} ${PROTO_HDRS}) target_compile_definitions(ge_ut_common PRIVATE google=ascend_private + $<$:ONLY_COMPILE_OPEN_SRC> ) target_compile_options(ge_ut_common PRIVATE @@ -838,6 +871,7 @@ target_link_libraries(ge_ut_common PRIVATE ascend_protobuf json ge_ut_graph + runtime_stub ) # build common format @@ -845,6 +879,7 @@ add_library(ge_ut_common_format STATIC ${COMMON_SRC_FILES} ${COMMON_FORMAT_SRC_F target_compile_definitions(ge_ut_common_format PRIVATE google=ascend_private + $<$:ONLY_COMPILE_OPEN_SRC> ) target_compile_options(ge_ut_common_format PRIVATE @@ -997,6 +1032,7 @@ add_library(ge_single_op STATIC ${SINGLE_OP_SRC_FILES} ${PROTO_HDRS}) target_compile_definitions(ge_single_op PRIVATE google=ascend_private + $<$:ONLY_COMPILE_OPEN_SRC> ) target_compile_options(ge_single_op PRIVATE @@ -1093,6 +1129,7 @@ target_compile_options(ut_libge_distinct_load_utest PRIVATE target_compile_definitions(ut_libge_distinct_load_utest PRIVATE google=ascend_private + $<$:ONLY_COMPILE_OPEN_SRC> ) target_link_libraries(ut_libge_distinct_load_utest diff --git a/tests/ut/ge/common/dump_exception_unittest.cc b/tests/ut/ge/common/dump_exception_unittest.cc new file mode 100644 index 00000000..339d532e --- /dev/null +++ b/tests/ut/ge/common/dump_exception_unittest.cc @@ -0,0 +1,54 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#define protected public +#define private public +#include "common/dump/exception_dumper.h" +#include "common/debug/log.h" +#include "common/ge_inner_error_codes.h" +#undef private +#undef protected + +namespace ge { +class UTEST_dump_exception : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +TEST_F(UTEST_dump_exception, save_dump_op_info_success) { + OpDescPtr op_desc = std::make_shared("GatherV2", "GatherV2"); + uint32_t task_id = 1; + uint32_t stream_id = 233; + vector input_addr; + vector output_addr; + ExceptionDumper exception_dumper; + exception_dumper.SaveDumpOpInfo(op_desc, task_id, stream_id, input_addr, output_addr); +} + +TEST_F(UTEST_dump_exception, dump_exception_info) { + rtExceptionInfo exception_info = {1, 2, 3, 4, 5}; + std::vector exception_infos = { exception_info }; + OpDescInfo op_desc_info = {"Save", "Save", 1, 2, {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {}, {2}, + {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {}, {2}}; + + ExceptionDumper exception_dumper; + exception_dumper.op_desc_info_ = { op_desc_info }; + exception_dumper.DumpExceptionInfo(exception_infos); +} +} // namespace ge \ No newline at end of file diff --git a/tests/ut/ge/common/dump_op_unittest.cc b/tests/ut/ge/common/dump_op_unittest.cc new file mode 100644 index 00000000..9007ac95 --- /dev/null +++ b/tests/ut/ge/common/dump_op_unittest.cc @@ -0,0 +1,61 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#define protected public +#define private public +#include "common/dump/dump_op.h" +#include "common/debug/log.h" +#include "common/ge_inner_error_codes.h" +#include "common/dump/dump_properties.h" +#undef private +#undef protected + +namespace ge { +class UTEST_dump_op : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +TEST_F(UTEST_dump_op, launch_dump_op_success) { + DumpOp dump_op; + DumpProperties dump_properties; + OpDescPtr op_desc = std::make_shared("GatherV2", "GatherV2"); + std::set temp; + dump_properties.model_dump_properties_map_.emplace("model1", temp); + dump_properties.enable_dump_ = "1"; + dump_op.SetDynamicModelInfo("model1", "model2", 1); + dump_op.SetDumpInfo(dump_properties, op_desc, {}, {}, nullptr); + auto ret = dump_op.LaunchDumpOp(); + EXPECT_EQ(ret, ge::SUCCESS); +} + +TEST_F(UTEST_dump_op, launch_dump_op_success_2) { + DumpOp dump_op; + DumpProperties dump_properties; + OpDescPtr op_desc = std::make_shared("GatherV2", "GatherV2"); + std::set temp; + dump_properties.model_dump_properties_map_.emplace("model1", temp); + dump_properties.enable_dump_ = "1"; + dump_op.SetDynamicModelInfo("modle2", "model2", 1); + dump_op.SetDumpInfo(dump_properties, op_desc, {}, {}, nullptr); + auto ret = dump_op.LaunchDumpOp(); + EXPECT_EQ(ret, ge::SUCCESS); +} + +} // namespace ge \ No newline at end of file diff --git a/tests/ut/ge/common/format_transfer_5d_nchw_unittest.cc b/tests/ut/ge/common/format_transfer_5d_nchw_unittest.cc index 64664a5c..0eded4d7 100644 
--- a/tests/ut/ge/common/format_transfer_5d_nchw_unittest.cc +++ b/tests/ut/ge/common/format_transfer_5d_nchw_unittest.cc @@ -569,7 +569,7 @@ TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_shape1) { TransResult result; FormatTransferNc1hwc0Nchw transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_shape2) { @@ -579,7 +579,7 @@ TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_shape2) { TransResult result; FormatTransferNc1hwc0Nchw transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_dst_shape1) { @@ -588,7 +588,7 @@ TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_dst_shape1) { TransResult result; FormatTransferNc1hwc0Nchw transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_dst_shape2) { @@ -598,7 +598,7 @@ TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_dst_shape2) { TransResult result; FormatTransferNc1hwc0Nchw transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_dst_shape_relation) { @@ -608,7 +608,7 @@ TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_dst_shape_relation) { TransResult result; FormatTransferNc1hwc0Nchw transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_format) { @@ -618,10 +618,10 @@ TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_format) { TransResult result; 
FormatTransferNc1hwc0Nchw transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_FORMAT_INVALID); Status status = transfer.TransShape(args.src_format, args.src_shape, args.src_data_type, args.dst_format, args.dst_shape); - EXPECT_EQ(status, UNSUPPORTED); + EXPECT_EQ(status, ACL_ERROR_GE_FORMAT_INVALID); } TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_dst_format) { @@ -631,7 +631,7 @@ TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_dst_format) { TransResult result; FormatTransferNc1hwc0Nchw transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_FORMAT_INVALID); } TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_data_type) { @@ -642,7 +642,7 @@ TEST_F(UTEST_FormatTransferNc1hwc0ToNchw, invalid_src_data_type) { TransResult result; FormatTransferNc1hwc0Nchw transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_DATATYPE_INVALID); } } // namespace formats } // namespace ge diff --git a/tests/ut/ge/common/format_transfer_fractal_nz_unittest.cc b/tests/ut/ge/common/format_transfer_fractal_nz_unittest.cc index fb579fc0..02f8251a 100644 --- a/tests/ut/ge/common/format_transfer_fractal_nz_unittest.cc +++ b/tests/ut/ge/common/format_transfer_fractal_nz_unittest.cc @@ -9148,7 +9148,7 @@ TEST_F(UtestFormatTransferNdFractNz, invalid_src_data_type3) { 4, 4, }, - DT_VARIANT}; + DT_STRING}; TransResult result; FormatTransferFractalNzND transfer; EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_DATATYPE_INVALID); diff --git a/tests/ut/ge/common/format_transfer_fracz_nhwc_unittest.cc b/tests/ut/ge/common/format_transfer_fracz_nhwc_unittest.cc index e406eb43..a4d6f9ae 100644 --- a/tests/ut/ge/common/format_transfer_fracz_nhwc_unittest.cc +++ b/tests/ut/ge/common/format_transfer_fracz_nhwc_unittest.cc @@ -39,7 +39,7 @@ 
TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_data_type) { TransResult result; FormatTransferFracZNhwc transfer; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_DATATYPE_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_format_reserved) { @@ -50,7 +50,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_format_reserved) reinterpret_cast(data), FORMAT_RESERVED, FORMAT_NHWC, {16, 1, 16, 16}, {1, 4, 4, 1}, DT_FLOAT}; TransResult result; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_FORMAT_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_dst_format_reserved) { @@ -61,7 +61,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_dst_format_reserved) reinterpret_cast(data), FORMAT_FRACTAL_Z, FORMAT_RESERVED, {16, 1, 16, 16}, {1, 4, 4, 1}, DT_FLOAT}; TransResult result; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_FORMAT_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_shape) { @@ -71,7 +71,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_shape) { TransArgs args{reinterpret_cast(data), FORMAT_FRACTAL_Z, FORMAT_NHWC, {16, 1, 16}, {1, 4, 4, 1}, DT_FLOAT}; TransResult result; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_shape2) { @@ -82,7 +82,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_shape2) { reinterpret_cast(data), FORMAT_FRACTAL_Z, FORMAT_NHWC, {16, -1, 16, 16}, {1, 4, 4, 1}, DT_FLOAT}; TransResult result; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), 
ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_dst_shape) { @@ -93,7 +93,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_dst_shape) { reinterpret_cast(data), FORMAT_FRACTAL_Z, FORMAT_NHWC, {16, 1, 16, 16}, {1, 4, 4}, DT_FLOAT}; TransResult result; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_dst_shape2) { @@ -104,7 +104,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_dst_shape2) { reinterpret_cast(data), FORMAT_FRACTAL_Z, FORMAT_NHWC, {16, 1, 16, 16}, {1, 4, 4, -1}, DT_FLOAT}; TransResult result; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_dst_shape_relation1) { @@ -115,7 +115,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_dst_shape_relatio reinterpret_cast(data), FORMAT_FRACTAL_Z, FORMAT_NHWC, {16, 1, 16, 16}, {17, 4, 4, 1}, DT_FLOAT}; TransResult result; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_dst_shape_relation2) { @@ -126,7 +126,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_invalid_src_dst_shape_relatio reinterpret_cast(data), FORMAT_FRACTAL_Z, FORMAT_NHWC, {16, 1, 16, 16}, {1, 4, 4, 17}, DT_FLOAT}; TransResult result; - EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransFormat(args, result), ACL_ERROR_GE_SHAPE_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_fp16_success_lt_cube) { @@ -301,7 +301,7 @@ TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_fp16_success_eq_cube) { } Status status = transfer.TransShape(args.src_format, 
args.src_shape, args.src_data_type, args.dst_format, args.dst_shape); - EXPECT_EQ(status, UNSUPPORTED); + EXPECT_EQ(status, ACL_ERROR_GE_FORMAT_INVALID); } TEST_F(UtestFormatTransferFraczNhwc, fracz_to_nhwc_fp16_success_gt_cube) { diff --git a/tests/ut/ge/common/format_transfer_hwcn_fractalz_unittest.cc b/tests/ut/ge/common/format_transfer_hwcn_fractalz_unittest.cc index b0c0b706..651b733b 100644 --- a/tests/ut/ge/common/format_transfer_hwcn_fractalz_unittest.cc +++ b/tests/ut/ge/common/format_transfer_hwcn_fractalz_unittest.cc @@ -34427,6 +34427,240 @@ TEST_F(UtestFormatTransferHwcnFz, fp32_2c_2n_pad) { } } +TEST_F(UtestFormatTransferHwcnFz, fp16_1c_1n_with_groups) { + uint16_t data[1 * 1 * 1 * 2] = {19, 88}; + uint16_t ret[1 * 1 * 16 * 16] ={19 , 0, 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 88, 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0, + 0 , 0 , 0, 0 ,0 , 0, 0, 0 , 0 , 0 , 0, 0, 0 , 0 , 0, 0}; + FormatTransferFractalZ transfer; + ge::Format old_format = FORMAT_FRACTAL_Z; + int32_t groups = 2; + ge::Format new_format = static_cast(ge::GetFormatFromSub(old_format, groups)); + TransArgs args{ + reinterpret_cast(data), FORMAT_HWCN, new_format, std::vector({1, 1, 1, 2}), + std::vector({1, 1, 16, 16}), 
DT_FLOAT16}; + + TransResult result; + EXPECT_EQ(transfer.TransFormat(args, result), SUCCESS); + EXPECT_EQ(result.length, sizeof(ret) / sizeof(ret[0]) * 2); + for (int i = 0; i < sizeof(ret) / sizeof(ret[0]); ++i) { + EXPECT_EQ((reinterpret_cast(result.data.get()))[i], ret[i]); + } +} + +TEST_F(UtestFormatTransferHwcnFz, fp16_4c_8n_with_groups_02) { +uint16_t data[3 * 3 * 4 * 8] = { + 11 , 99 , 68 , 2 , 14 , 59 , 24 , 100, + 4 , 65 , 11 , 7 , 74 , 28 , 71 , 81, + 94 , 63 , 80 , 7 , 95 , 29 , 92 , 76, + 88 , 68 , 67 , 98 , 82 , 11 , 20 , 68, + 36 , 17 , 15 , 89 , 31 , 8 , 51 , 49, + 49 , 89 , 79 , 97 , 7 , 91 , 14 , 34, + 55 , 40 , 85 , 59 , 31 , 35 , 41 , 89, + 4 , 82 , 90 , 48 , 44 , 19 , 9 , 84, +100 , 43 , 7 , 94 , 4 , 91 , 67 , 16, + 63 , 79 , 20 , 62 , 55 , 38 , 13 , 61, + 98 , 99 , 44 , 0 , 97 , 42 , 65 , 80, + 78 , 56 , 26 , 17 , 23 , 22 , 76 , 84, + 34 , 88 , 38 , 57 , 37 , 77 , 46 , 28, + 48 , 11 , 6 , 18 , 8 , 66 , 24 , 29, + 7 , 72 , 34 , 79 , 99 , 14 , 75 , 62, + 44 , 98 , 11 , 31 , 4 , 79 , 51 , 37, + 84 , 3 , 89 , 74 , 68 , 85 , 17 , 93, + 81 , 88 , 38 , 8 , 69 , 82 , 91 , 91, + 45 , 42 , 7 , 96 , 81 , 96 , 39 , 35, + 93 , 46 , 73 , 7 , 9 , 81 , 5 , 63, + 35 , 30 , 27 , 42 , 20 , 52 , 36 , 91, + 87 , 1 , 8 , 7 , 78 , 21 , 76 , 97, + 52 , 18 , 55 , 57 , 95 , 67 , 3 , 69, + 98 , 85 , 75 , 75 , 38 , 3 , 94 , 66, + 92 , 27 , 9 , 39 , 5 , 21 , 4 , 48, + 55 , 38 , 58 , 84 , 23 , 13 , 71 , 91, + 99 , 58 , 58 , 16 , 86 , 45 , 63 , 97, + 30 , 10 , 21 , 37 , 78 , 94 , 8 , 49, + 18 , 52 , 67 , 65 , 78 , 82 , 74 , 35, + 97 , 15 , 43 , 22 , 30 , 87 , 98 , 91, + 22 , 88 , 83 , 63 , 79 , 63 , 42 , 74, + 29 , 62 , 2 , 97 , 65 , 45 , 76 , 57, + 71 , 65 , 0 , 69 , 76 , 41 , 58 , 98, + 90 , 3 , 75 , 56 , 41 , 66 , 41 , 96, + 44 , 87 , 61 , 26 , 62 , 57 , 49 , 29, + 49 , 94 , 90 , 96 , 33 , 32 , 10 , 25}; + uint16_t ret[9 * 1 * 16 * 16] ={ + 11 , 4 , 94 , 88 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 99 , 65 , 63 , 68 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 68 , 11 , 80 , 
67 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2 , 7 , 7 , 98 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 14 , 74, 95, 82, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 59 , 28, 29, 11, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 24 , 71, 92, 20, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 ,100 , 81, 76, 68, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 36 , 49 , 55 , 4 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 17 , 89 , 40 , 82 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 15 , 79 , 85 , 90 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 89 , 97 , 59 , 48 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 31 , 7, 31, 44, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 8 , 91, 35, 19, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 51 , 14, 41, 9, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 49 , 34, 89, 84, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100 , 63 , 98 , 78 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 43 , 79 , 99 , 56 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 7 , 20 , 44 , 26 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 94 , 62 , 0 , 17 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 4 , 55, 97, 23, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 91 , 
38, 42, 22, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 67 , 13, 65, 76, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 16 , 61, 80, 84, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 34 , 48 , 7 , 44 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 88 , 11 , 72 , 98 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 38 , 6 , 34 , 11 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 57 , 18 , 79 , 31 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 37 , 8, 99, 4, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 77 , 66, 14, 79, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 46 , 24, 75, 51, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 28 , 29, 62, 37, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 84 , 81 , 45 , 93 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 3 , 88 , 42 , 46 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 89 , 38 , 7 , 73 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 74 , 8 , 96 , 7 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 68 , 69, 81, 9, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 85 , 82, 96, 81, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 17 , 91, 39, 5, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 93 , 91, 35, 63, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 35 , 87 , 52 , 98 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 30 , 1 , 18 , 85 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 27 , 8 , 55 , 75 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 42 , 7 , 57 , 75 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 20 , 78, 95, 38, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 52 , 21, 67, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 36 , 76, 3, 94, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 91 , 97, 69, 66, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 92 , 55 , 99 , 30 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 27 , 38 , 58 , 10 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 9 , 58 , 58 , 21 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 39 , 84 , 16 , 37 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 5 , 23, 86, 78, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 21 , 13, 45, 94, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 4 , 71, 63, 8, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 48 , 91, 97, 49, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 18 , 97 , 22 , 29 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 52 , 15 , 88 , 62 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 67 , 43 , 83 , 2 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 65 , 22 , 63 , 97 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 78 , 30, 79, 65, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 82 , 87, 63, 45, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 74 , 98, 42, 76, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 35 , 91, 74, 57, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 71 , 90 , 44 , 49 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 65 , 3 , 87 , 94 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 75 , 61 , 90 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 69 , 56 , 26 , 96 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 76 , 41, 62, 33, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 41 , 66, 57, 32, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 58 , 41, 49, 10, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 98 , 96, 29, 25, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 , 0 
, 0 , 0 , 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + FormatTransferFractalZ transfer; + ge::Format old_format = FORMAT_FRACTAL_Z; + int32_t groups = 2; + ge::Format new_format = static_cast(ge::GetFormatFromSub(old_format, groups)); + TransArgs args{ + reinterpret_cast(data), FORMAT_HWCN, new_format, std::vector({3, 3, 4, 8}), + std::vector({9, 1, 16, 16}), DT_FLOAT16}; + + TransResult result; + EXPECT_EQ(transfer.TransFormat(args, result), SUCCESS); + EXPECT_EQ(result.length, sizeof(ret) / sizeof(ret[0]) * 2); + for (int i = 0; i < sizeof(ret) / sizeof(ret[0]); ++i) { + EXPECT_EQ((reinterpret_cast(result.data.get()))[i], ret[i]); + } +} + + TEST_F(UtestFormatTransferHwcnFz, build_transfer_fp32) { float data[5 * 5 * 31 * 17]; TransArgs args{ diff --git a/tests/ut/ge/common/format_transfer_nhwc_fractalz_unittest.cc b/tests/ut/ge/common/format_transfer_nhwc_fractalz_unittest.cc index ade28c02..7431440b 100644 --- a/tests/ut/ge/common/format_transfer_nhwc_fractalz_unittest.cc +++ b/tests/ut/ge/common/format_transfer_nhwc_fractalz_unittest.cc @@ -5357,7 +5357,7 @@ TEST_F(UtestFormatTransferNhwcFz, build_transfer_uint8) { TEST_F(UtestFormatTransferNhwcFz, invalid_data_type) { uint16_t data[1 * 4 * 4 * 1] = {0}; TransArgs args{ - reinterpret_cast(data), FORMAT_NHWC, FORMAT_FRACTAL_NZ, {1, 4, 4}, {1, 1, 1, 16, 16}, DT_VARIANT}; + reinterpret_cast(data), FORMAT_NHWC, FORMAT_FRACTAL_NZ, {1, 4, 4}, {1, 1, 1, 16, 16}, DT_STRING}; FormatTransferFractalZ transfer; EXPECT_EQ(transfer.TransShape(args.src_format, args.src_shape, args.src_data_type, args.dst_format, args.dst_shape), ACL_ERROR_GE_DATATYPE_INVALID); diff --git a/tests/ut/ge/common/format_transfer_transpose_unittest.cc b/tests/ut/ge/common/format_transfer_transpose_unittest.cc index 04f2a557..b710acde 100644 --- a/tests/ut/ge/common/format_transfer_transpose_unittest.cc +++ b/tests/ut/ge/common/format_transfer_transpose_unittest.cc @@ -4676,5 +4676,24 @@ TEST_F(UtestFormatTranspose, invalid_dst_format) { 
EXPECT_EQ(transfer.TransShape(FORMAT_NCHW, src_shape, DT_FLOAT16, FORMAT_C1HWNC0, dst_shape), ACL_ERROR_GE_FORMAT_INVALID); } + +TEST_F(UtestFormatTranspose, invalid_src_data) { + uint8_t *data = nullptr; + TransArgs args{data, FORMAT_NCHW, FORMAT_NHWC, std::vector({1, 3, 8, 8}), std::vector({1, 8, 8, 3}), DT_INT64}; + FormatTransferTranspose transpose; + TransResult result; + EXPECT_EQ(transpose.TransFormat(args, result), ACL_ERROR_GE_PARAM_INVALID); + + uint16_t data1[3] = {14583, 12849, 14184}; + TransArgs args1{reinterpret_cast(data1), FORMAT_NCHW, FORMAT_NHWC, std::vector({-1, 3, 1, 1}), std::vector({1, 1, 1, 3}), DT_INT64}; + FormatTransferTranspose transpose1; + TransResult result1; + EXPECT_EQ(transpose1.TransFormat(args1, result1), ACL_ERROR_GE_SHAPE_INVALID); + + TransArgs args2{reinterpret_cast(data1), FORMAT_NCHW, FORMAT_NHWC, std::vector({3, 1, 1}), std::vector({1, 1, 1, 3}), DT_INT64}; + FormatTransferTranspose transpose2; + TransResult result2; + EXPECT_EQ(transpose2.TransFormat(args2, result2), ACL_ERROR_GE_SHAPE_INVALID); +} } // namespace formats } // namespace ge diff --git a/tests/ut/ge/common/format_transfer_unittest.cc b/tests/ut/ge/common/format_transfer_unittest.cc index fd2a296c..73b7703d 100644 --- a/tests/ut/ge/common/format_transfer_unittest.cc +++ b/tests/ut/ge/common/format_transfer_unittest.cc @@ -75,11 +75,11 @@ TEST_F(UtestFormatTransfer, get_size_by_data_type) { EXPECT_EQ(GetSizeByDataType(DT_QINT32), 4); EXPECT_EQ(GetSizeByDataType(DT_QUINT8), 1); EXPECT_EQ(GetSizeByDataType(DT_QUINT16), 2); - EXPECT_EQ(GetSizeByDataType(DT_RESOURCE), -1); + EXPECT_EQ(GetSizeByDataType(DT_RESOURCE), 8); EXPECT_EQ(GetSizeByDataType(DT_STRING_REF), -1); EXPECT_EQ(GetSizeByDataType(DT_DUAL), 5); EXPECT_EQ(GetSizeByDataType(DT_UNDEFINED), -1); - EXPECT_EQ(DT_UNDEFINED, 27); + EXPECT_EQ(DT_UNDEFINED, 28); } } // namespace formats } // namespace ge diff --git a/tests/ut/ge/common/ge_auth_file_saver_unittest.cc 
b/tests/ut/ge/common/ge_auth_file_saver_unittest.cc new file mode 100644 index 00000000..de44fb25 --- /dev/null +++ b/tests/ut/ge/common/ge_auth_file_saver_unittest.cc @@ -0,0 +1,53 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "common/auth/file_saver.h" + +namespace ge { +class UTEST_file_saver : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +TEST_F(UTEST_file_saver, save_model_data_to_buff_success) { + ModelFileHeader file_header; + std::vector data; + data.resize(sizeof(ModelPartitionTable) + sizeof(ModelPartitionMemInfo), 0); + ModelPartitionTable *partition_table = reinterpret_cast(data.data()); + partition_table->num = 1; + partition_table->partition[0] = { MODEL_DEF, 0, 12 }; + std::vector partition_tables; + partition_tables.push_back(partition_table); + auto buff = reinterpret_cast(malloc(12)); + struct ge::ModelPartition model_partition; + model_partition.type = MODEL_DEF; + model_partition.data = buff; + model_partition.size = 12; + std::vector model_partitions = { model_partition }; + std::vector> all_partition_datas = { model_partitions }; + ge::ModelBufferData model; + + Status ret = FileSaver::SaveToBuffWithFileHeader(file_header, partition_tables, all_partition_datas, model); + EXPECT_EQ(ret, ge::SUCCESS); + + free(buff); + buff = nullptr; + model_partition.data = nullptr; +} +} // namespace ge \ No newline at end of 
file diff --git a/tests/ut/ge/common/opdebug_register_unittest.cc b/tests/ut/ge/common/opdebug_register_unittest.cc index fcdaddaf..528fd9e3 100644 --- a/tests/ut/ge/common/opdebug_register_unittest.cc +++ b/tests/ut/ge/common/opdebug_register_unittest.cc @@ -31,7 +31,7 @@ TEST_F(UTEST_opdebug_register, register_debug_for_model_success) { OpdebugRegister opdebug_register; rtModel_t model_handle = (void*)0x111; uint32_t op_debug_mode = 1; - DataDumper data_dumper; + DataDumper data_dumper({}); auto ret = opdebug_register.RegisterDebugForModel(model_handle, op_debug_mode, data_dumper); opdebug_register.UnregisterDebugForModel(model_handle); EXPECT_EQ(ret, ge::SUCCESS); @@ -41,7 +41,7 @@ TEST_F(UTEST_opdebug_register, register_debug_for_stream_success) { OpdebugRegister opdebug_register; rtStream_t stream = (void*)0x111; uint32_t op_debug_mode = 1; - DataDumper data_dumper; + DataDumper data_dumper({}); auto ret = opdebug_register.RegisterDebugForStream(stream, op_debug_mode, data_dumper); opdebug_register.UnregisterDebugForStream(stream); EXPECT_EQ(ret, ge::SUCCESS); diff --git a/tests/ut/ge/executor/ge_executor_unittest.cc b/tests/ut/ge/executor/ge_executor_unittest.cc index a98f9290..a4606320 100644 --- a/tests/ut/ge/executor/ge_executor_unittest.cc +++ b/tests/ut/ge/executor/ge_executor_unittest.cc @@ -39,4 +39,10 @@ TEST_F(UtestGeExecutor, test_single_op_exec) { EXPECT_EQ(exeutor.LoadSingleOp(model_name, model_data, nullptr, nullptr), ACL_ERROR_GE_INTERNAL_ERROR); EXPECT_EQ(exeutor.LoadDynamicSingleOp(model_name, model_data, nullptr, nullptr), PARAM_INVALID); } + +TEST_F(UtestGeExecutor, test_ge_initialize) { + GeExecutor executor; + EXPECT_EQ(executor.Initialize(), SUCCESS); + EXPECT_EQ(executor.Initialize(), SUCCESS); +} } // namespace ge \ No newline at end of file diff --git a/tests/ut/ge/generator/ge_generator_unittest.cc b/tests/ut/ge/generator/ge_generator_unittest.cc index 3daa5592..fb256c7c 100644 --- a/tests/ut/ge/generator/ge_generator_unittest.cc +++ 
b/tests/ut/ge/generator/ge_generator_unittest.cc @@ -20,6 +20,12 @@ #define protected public #include "generator/ge_generator.h" #include "graph/utils/tensor_utils.h" +#include "graph/attr_value.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" +#include "../graph/passes/graph_builder_utils.h" +#include "../graph/manager/graph_manager.h" +#include "all_ops.h" using namespace std; @@ -31,6 +37,25 @@ class UtestGeGenerator : public testing::Test { void TearDown() {} }; +namespace { +ComputeGraphPtr MakeGraph() { + ge::ut::GraphBuilder builder("graph"); + auto data = builder.AddNode("data", "Data", 1, 1); + auto addn1 = builder.AddNode("addn1", "AddN", 1, 1); + builder.AddDataEdge(data, 0, addn1, 0); + return builder.GetGraph(); +} + +static GeAttrValue::NamedAttrs CreateNamedAttrs(const string &name, std::map map) { + GeAttrValue::NamedAttrs named_attrs; + named_attrs.SetName(name); + for (auto it : map) { + named_attrs.SetAttr(it.first, it.second); + } + return named_attrs; +} +} // namespace + /* TEST_F(UtestGeGenerator, test_build_single_op_offline) { GeTensorDesc tensor_desc(GeShape(), FORMAT_NCHW, DT_FLOAT); @@ -69,6 +94,69 @@ TEST_F(UtestGeGenerator, test_build_single_op_online) { GeGenerator generator; generator.Initialize({}); ModelBufferData model_buffer; - EXPECT_EQ(generator.BuildSingleOpModel(op_desc, inputs, outputs, ENGINE_AIVECTOR, model_buffer), FAILED); + EXPECT_EQ(generator.BuildSingleOpModel(op_desc, inputs, outputs, ENGINE_AIVECTOR, false, model_buffer), FAILED); +} + +TEST_F(UtestGeGenerator, test_check_aicore) { + GeGenerator generator; + generator.Initialize({}); + auto graph = MakeGraph(); + EXPECT_EQ(generator.CheckNoAicore(graph), true); +} + +TEST_F(UtestGeGenerator, test_graph_manager) { + GraphManager graph_manager; + GraphPartitioner graph_partitioner; + + auto root_graph = MakeGraph(); + auto sub_graph = MakeGraph(); + root_graph->AddSubGraph(sub_graph); + + auto sgi = MakeShared(); + // set engine name + 
sgi->SetEngineName("AIcoreEngine"); + sgi->SetSubGraph(sub_graph); + + auto sgi_gelocal = MakeShared(); + // set engine name + sgi_gelocal->SetEngineName("GELOCAL"); + sgi_gelocal->SetSubGraph(sub_graph); + + graph_partitioner.graph_2_input_subgraph_[root_graph] = sgi_gelocal; + graph_partitioner.graph_2_subgraph_list_.insert({root_graph, {sgi, sgi_gelocal}}); + graph_partitioner.graph_2_subgraph_list_.insert({sub_graph, {sgi, sgi_gelocal}}); + EXPECT_EQ(graph_manager.ConvertGraphToFile(root_graph, graph_partitioner, "./"), GRAPH_SUCCESS); +} + +TEST_F(UtestGeGenerator, test_set_model_name) { + GeGenerator generator; + generator.Initialize({}); + GeRootModelPtr ge_root_model = make_shared(GeRootModel()); + ComputeGraphPtr graph = make_shared(ComputeGraph("graph")); + (void)AttrUtils::SetBool(graph, "_dynamic_shape_partitioned", true); + ge_root_model->root_graph_ = std::move(graph); + EXPECT_EQ(generator.SetModelNameForDump(ge_root_model), SUCCESS); +} + +TEST_F(UtestGeGenerator, test_remove_const) { + GeGenerator generator; + GeTensorDesc tensor_desc; + GeTensor tensor(tensor_desc); + const vector inputs = {tensor}; + vector outputs; + generator.RemoveConst(inputs, outputs); +} + +TEST_F(UtestGeGenerator, test_generate_online_model) { + GeTensorDesc tensor_desc; + GeTensor tensor(tensor_desc); + const vector inputs = { tensor, tensor }; + auto compute_graph = MakeGraph(); + compute_graph->TopologicalSorting(); + Graph graph = ge::GraphUtils::CreateGraphFromComputeGraph(compute_graph); + GeGenerator generator; + generator.Initialize({}); + std::string name; + EXPECT_NE(generator.GenerateOfflineModel(graph, name, inputs), SUCCESS); } } // namespace ge diff --git a/tests/ut/ge/graph/build/buffer_pool_mem_assigner_unittest.cc b/tests/ut/ge/graph/build/buffer_pool_mem_assigner_unittest.cc new file mode 100644 index 00000000..96283250 --- /dev/null +++ b/tests/ut/ge/graph/build/buffer_pool_mem_assigner_unittest.cc @@ -0,0 +1,607 @@ +/** + * Copyright 2019-2020 Huawei 
Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "common/ge_inner_error_codes.h" +#include "common/types.h" +#include "graph/utils/attr_utils.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/tensor_utils.h" +#include "graph/debug/ge_attr_define.h" +#include "../utils/buffer_pool_graph_builder.h" +#include "graph/passes/buffer_pool_memory_pass.h" + +#define protected public +#define private public +#include "graph/build/memory/buffer_pool_mem_assigner.h" +#include "graph/build/memory/graph_mem_assigner.h" +#include "graph/build/stream_allocator.h" +#undef protected +#undef private + +namespace ge { +namespace { +const int64_t kMemoryTypeHBM = static_cast(RT_MEMORY_HBM); +const int64_t kMemoryTypeP2P = static_cast(RT_MEMORY_P2P_HBM); +const int64_t kMemoryTypeDDR = static_cast(RT_MEMORY_DDR); +const size_t kOffsetHBM = 10240; +const size_t kOffsetP2P = 20480; +const size_t kOffsetDDR = 30720; +const int64_t kMemAlignSize = 512; + +int64_t AlignMemSize(int64_t mem_size, int64_t align_size = kMemAlignSize) { + int64_t tmp = (mem_size + align_size - 1) / align_size * align_size; + return tmp; +} +int64_t AlignOutputMemSize(int64_t mem_size) { + int64_t tmp = (mem_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize; + // hccl need alignment + tmp = kMemAlignSize + tmp + kMemAlignSize; + return tmp; +} +} // namespace +class UtestBufferPoolMemAssignerTest : public testing::Test { + protected: + 
void SetUp() {} + void TearDown() {} + +}; + +TEST_F(UtestBufferPoolMemAssignerTest, buffer_pool_normal_assign_success) { + ut::BufferPoolGraphBuilder builder("NormalGraph"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraph(); + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + std::map mem_type_to_offset = {{kMemoryTypeHBM, kOffsetHBM}, + {kMemoryTypeP2P, kOffsetP2P}}; + int64_t offset_base = static_cast(kOffsetHBM + kMemAlignSize); + std::vector expect_offset = {(offset_base + 0), + (offset_base + AlignOutputMemSize(500)), + (offset_base + (AlignOutputMemSize(500) * 2)), + (offset_base + 0), + (offset_base + AlignOutputMemSize(1024))}; + + BufferPoolMemAssigner buffer_pool_mem_assigner(graph, mem_type_to_offset); + ret = buffer_pool_mem_assigner.Assign(); + EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ(buffer_pool_mem_assigner.GetMemOffset(), offset_base + + AlignMemSize(5600, kMemAlignSize) + kMemAlignSize); + + { + auto prefetch = graph->FindNode("prefetch1"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(0)); + } + + { + auto prefetch = graph->FindNode("prefetch2"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(1)); + } + + { + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(2)); + } + + { + auto prefetch = graph->FindNode("prefetch4"); + EXPECT_NE(prefetch, nullptr); + 
EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(3)); + } + + { + auto prefetch = graph->FindNode("prefetch5"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(4)); + } +} + +TEST_F(UtestBufferPoolMemAssignerTest, buffer_pool_normal_graph_with_multi_buffer_pool_assign_success) { + ut::BufferPoolGraphBuilder builder("NormalGraphWithMultiBufferPool"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraphWithMultiBufferPool(); + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + std::map mem_type_to_offset = {{kMemoryTypeHBM, kOffsetHBM}, + {kMemoryTypeP2P, kOffsetP2P}}; + int64_t offset_base_0 = static_cast(kOffsetHBM + kMemAlignSize); + int64_t offset_base_1 = static_cast(kOffsetHBM + kMemAlignSize) + + AlignMemSize(5000, kMemAlignSize) + kMemAlignSize; + std::vector expect_offset = {(offset_base_0 + 0), + (offset_base_1 + 0), + (offset_base_0 + AlignOutputMemSize(500)), + (offset_base_0 + 0), + (offset_base_1 + AlignOutputMemSize(500))}; + + BufferPoolMemAssigner buffer_pool_mem_assigner(graph, mem_type_to_offset); + ret = buffer_pool_mem_assigner.Assign(); + EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ(buffer_pool_mem_assigner.GetMemOffset(), offset_base_1 + + AlignMemSize(5000, kMemAlignSize) + kMemAlignSize); + + { + auto prefetch = graph->FindNode("prefetch1"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(0)); + } + + { + auto prefetch = graph->FindNode("prefetch2"); + 
EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(1)); + } + + { + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(2)); + } + + { + auto prefetch = graph->FindNode("prefetch4"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(3)); + } + + { + auto prefetch = graph->FindNode("prefetch5"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(4)); + } +} + +TEST_F(UtestBufferPoolMemAssignerTest, buffer_pool_serial_graph_assign_success) { + ut::BufferPoolGraphBuilder builder("SerialGraph"); + ge::ComputeGraphPtr graph = builder.BuildSerialGraph(); + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + std::map mem_type_to_offset = {{kMemoryTypeHBM, kOffsetHBM}, + {kMemoryTypeP2P, kOffsetP2P}}; + int64_t offset_base = static_cast(kOffsetHBM + kMemAlignSize); + std::vector expect_offset = {offset_base, offset_base, offset_base, offset_base, offset_base}; + + BufferPoolMemAssigner buffer_pool_mem_assigner(graph, mem_type_to_offset); + ret = buffer_pool_mem_assigner.Assign(); + EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ(buffer_pool_mem_assigner.GetMemOffset(), offset_base + + AlignMemSize(2048, kMemAlignSize) + kMemAlignSize); + + { + 
auto prefetch = graph->FindNode("prefetch1"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(0)); + } + + { + auto prefetch = graph->FindNode("prefetch2"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(1)); + } + + { + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(2)); + } + + { + auto prefetch = graph->FindNode("prefetch4"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(3)); + } + + { + auto prefetch = graph->FindNode("prefetch5"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(4)); + } +} + +TEST_F(UtestBufferPoolMemAssignerTest, buffer_pool_subgraph_with_inner_dependency_assign_success) { + ut::BufferPoolGraphBuilder builder("SubgraphWithInnerDependency"); + ge::ComputeGraphPtr graph = builder.BuildSubgraphWithInnerDependency(); + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + std::map mem_type_to_offset = {{kMemoryTypeHBM, kOffsetHBM}, + {kMemoryTypeP2P, kOffsetP2P}}; + int64_t offset_base = 
static_cast(kOffsetHBM + kMemAlignSize); + std::vector expect_offset = {(offset_base + 0), + (offset_base + AlignOutputMemSize(500)), + (offset_base + (AlignOutputMemSize(500) * 2)), + (offset_base + 0), + (offset_base + AlignOutputMemSize(1024))}; + + BufferPoolMemAssigner buffer_pool_mem_assigner(graph, mem_type_to_offset); + ret = buffer_pool_mem_assigner.Assign(); + EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ(buffer_pool_mem_assigner.GetMemOffset(), offset_base + + AlignMemSize(5600, kMemAlignSize) + kMemAlignSize); + + std::map all_nodes; + for (auto node : graph->GetAllNodes()) { + EXPECT_NE(node, nullptr); + all_nodes[node->GetName()] = node; + } + + { + auto prefetch = all_nodes.at("prefetch1"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(0)); + } + + { + auto prefetch = all_nodes.at("prefetch2"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(1)); + } + + { + auto prefetch = all_nodes.at("prefetch3"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(2)); + } + + { + auto prefetch = all_nodes.at("prefetch4"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(3)); + } + + { + auto prefetch = all_nodes.at("prefetch5"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset 
= prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(4)); + } +} + +TEST_F(UtestBufferPoolMemAssignerTest, buffer_pool_graph_with_multi_batch_assign_success) { + ut::BufferPoolGraphBuilder builder("GraphWithMultiBatch"); + ge::ComputeGraphPtr graph = builder.BuildGraphWithMultiBatch(); + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + std::map mem_type_to_offset = {{kMemoryTypeHBM, kOffsetHBM}, + {kMemoryTypeP2P, kOffsetP2P}}; + int64_t offset_base = static_cast(kOffsetHBM + kMemAlignSize); + std::vector expect_offset = {(offset_base + 0), + (offset_base + AlignOutputMemSize(500)), + (offset_base + (AlignOutputMemSize(500) * 2)), + (offset_base + 0), + (offset_base + AlignOutputMemSize(1024))}; + + BufferPoolMemAssigner buffer_pool_mem_assigner(graph, mem_type_to_offset); + ret = buffer_pool_mem_assigner.Assign(); + EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ(buffer_pool_mem_assigner.GetMemOffset(), offset_base + + AlignMemSize(5600, kMemAlignSize) + kMemAlignSize); + + { + auto prefetch = graph->FindNode("batch_label_128/prefetch1"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(0)); + } + + { + auto prefetch = graph->FindNode("batch_label_128/prefetch2"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(1)); + } + + { + auto prefetch = graph->FindNode("batch_label_128/prefetch3"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + 
EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(2)); + } + + { + auto prefetch = graph->FindNode("batch_label_128/prefetch4"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(3)); + } + + { + auto prefetch = graph->FindNode("batch_label_128/prefetch5"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(4)); + } + + { + auto prefetch = graph->FindNode("batch_label_256/prefetch1"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(0)); + } + + { + auto prefetch = graph->FindNode("batch_label_256/prefetch2"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(1)); + } + + { + auto prefetch = graph->FindNode("batch_label_256/prefetch3"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(2)); + } + + { + auto prefetch = graph->FindNode("batch_label_256/prefetch4"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(3)); + } + + { + 
auto prefetch = graph->FindNode("batch_label_256/prefetch5"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector output_offset = prefetch->GetOpDesc()->GetOutputOffset(); + EXPECT_EQ(output_offset.size(), 1); + EXPECT_EQ(output_offset.at(0), expect_offset.at(4)); + } +} + +TEST_F(UtestBufferPoolMemAssignerTest, test_AssignBufferPoolMemory_success) { + ut::BufferPoolGraphBuilder builder("NormalGraph"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraph(); + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + std::map memory_offset = {{kMemoryTypeHBM, MemoryOffset(RT_MEMORY_HBM, kOffsetHBM)}, + {kMemoryTypeP2P, MemoryOffset(RT_MEMORY_P2P_HBM, kOffsetP2P)}}; + + GraphMemoryAssigner graph_memory_assigner(graph); + graph_memory_assigner.memory_offset_ = memory_offset; + ret = graph_memory_assigner.AssignBufferPoolMemory(); + EXPECT_EQ(ret, SUCCESS); +} + +TEST_F(UtestBufferPoolMemAssignerTest, test_AssignBufferPoolMemory_fail) { + ut::BufferPoolGraphBuilder builder("NormalGraph"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraph(); + std::map memory_offset = {{kMemoryTypeHBM, MemoryOffset(RT_MEMORY_HBM, kOffsetHBM)}, + {kMemoryTypeP2P, MemoryOffset(RT_MEMORY_P2P_HBM, kOffsetP2P)}}; + { + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + std::vector type_list = {static_cast(RT_MEMORY_P2P_HBM)}; + bool set_attr = ge::AttrUtils::SetListInt(prefetch->GetOpDesc(), ATTR_NAME_OUTPUT_MEM_TYPE_LIST, type_list); + EXPECT_EQ(set_attr, true); + + GraphMemoryAssigner graph_memory_assigner(graph); + graph_memory_assigner.memory_offset_ = memory_offset; + Status ret = graph_memory_assigner.AssignBufferPoolMemory(); + EXPECT_EQ(ret, FAILED); + } + + { + std::vector node_list = {"prefetch1", "prefetch2", "prefetch3", "prefetch4", "prefetch5"}; + std::vector type_list = {static_cast(RT_MEMORY_L1)}; 
+ for (auto &node_name : node_list) { + auto prefetch = graph->FindNode(node_name); + EXPECT_NE(prefetch, nullptr); + EXPECT_NE(prefetch->GetOpDesc(), nullptr); + bool set_attr = ge::AttrUtils::SetListInt(prefetch->GetOpDesc(), ATTR_NAME_OUTPUT_MEM_TYPE_LIST, type_list); + EXPECT_EQ(set_attr, true); + } + GraphMemoryAssigner graph_memory_assigner(graph); + graph_memory_assigner.memory_offset_ = memory_offset; + Status ret = graph_memory_assigner.AssignBufferPoolMemory(); + EXPECT_EQ(ret, FAILED); + } +} + +TEST_F(UtestBufferPoolMemAssignerTest, test_RefreshEventsWithReuse_success) { + ut::BufferPoolGraphBuilder builder("NormalGraph"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraph(); + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + std::map all_nodes; + for (auto node : graph->GetAllNodes()) { + EXPECT_NE(node, nullptr); + all_nodes[node->GetName()] = node; + } + + Graph2SubGraphInfoList sub_graphs; + StreamAllocator stream_allocator(graph, sub_graphs); + stream_allocator.event_num_ = 65520; + + // stream ctrl event + stream_allocator.AddSendEventId(all_nodes.at("prefetch1"), 30); + stream_allocator.AddRecvEventId(all_nodes.at("add1"), 30); + + stream_allocator.AddSendEventId(all_nodes.at("prefetch2"), 31); + stream_allocator.AddRecvEventId(all_nodes.at("add2"), 31); + + stream_allocator.AddSendEventId(all_nodes.at("prefetch3"), 32); + stream_allocator.AddRecvEventId(all_nodes.at("add3"), 32); + + stream_allocator.AddSendEventId(all_nodes.at("prefetch4"), 33); + stream_allocator.AddRecvEventId(all_nodes.at("add4"), 33); + + stream_allocator.AddSendEventId(all_nodes.at("add2"), 34); + stream_allocator.AddRecvEventId(all_nodes.at("prefetch4"), 34); + + stream_allocator.AddSendEventId(all_nodes.at("prefetch5"), 35); + stream_allocator.AddRecvEventId(all_nodes.at("add5"), 35); + + stream_allocator.AddSendEventId(all_nodes.at("add3"), 36); + 
stream_allocator.AddRecvEventId(all_nodes.at("prefetch5"), 36); + + // other event + stream_allocator.AddSendEventId(all_nodes.at("prefetch1"), 37); + stream_allocator.AddRecvEventId(all_nodes.at("add5"), 37); + + + ret = stream_allocator.RefreshEventsWithReuse(); + EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ((stream_allocator.node_to_send_events_.at(all_nodes.at("prefetch1"))).size(), 2); + EXPECT_EQ((stream_allocator.node_to_send_events_.at(all_nodes.at("prefetch5"))).size(), 1); + EXPECT_EQ((stream_allocator.node_to_recv_events_.at(all_nodes.at("prefetch5"))).size(), 1); + EXPECT_EQ((stream_allocator.node_to_recv_events_.at(all_nodes.at("add5"))).size(), 2); + EXPECT_EQ(stream_allocator.event_num_, 5); +} + +TEST_F(UtestBufferPoolMemAssignerTest, test_RefreshEventsWithReuse_fail) { + ut::BufferPoolGraphBuilder builder("NormalGraph"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraph(); + + std::map all_nodes; + for (auto node : graph->GetAllNodes()) { + EXPECT_NE(node, nullptr); + all_nodes[node->GetName()] = node; + } + std::vector> event_info = {{"SendTo;add1;0"}, + {"SendTo;add2;1"}, + {"SendTo;add3;2"}, + {"SendTo;add4;3", "RecvFrom;add2;0"}, + {"SendTo;add5;0", "RecvFrom;add3;1"}}; + + (void) AttrUtils::SetListStr(all_nodes.at("prefetch1")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[0]); + (void) AttrUtils::SetListStr(all_nodes.at("prefetch2")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[1]); + (void) AttrUtils::SetListStr(all_nodes.at("prefetch3")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[2]); + (void) AttrUtils::SetListStr(all_nodes.at("prefetch4")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[3]); + (void) AttrUtils::SetListStr(all_nodes.at("prefetch5")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[4]); + + Graph2SubGraphInfoList sub_graphs; + StreamAllocator stream_allocator(graph, sub_graphs); + stream_allocator.event_num_ = 65520; + + // Item num of raw event info is invalid + event_info[0][0] = 
"SendTo;add1;0;1"; + (void) AttrUtils::SetListStr(all_nodes.at("prefetch1")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[0]); + Status ret = stream_allocator.RefreshEventsWithReuse(); + EXPECT_EQ(ret, PARAM_INVALID); + + // Event id is invalid argument + event_info[0][0] = "SendTo;add1;event_id"; + (void) AttrUtils::SetListStr(all_nodes.at("prefetch1")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[0]); + ret = stream_allocator.RefreshEventsWithReuse(); + EXPECT_EQ(ret, PARAM_INVALID); + + // Event id is out of range + event_info[0][0] = "SendTo;add1;666666666666666666666666666666666666666"; + (void) AttrUtils::SetListStr(all_nodes.at("prefetch1")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[0]); + ret = stream_allocator.RefreshEventsWithReuse(); + EXPECT_EQ(ret, PARAM_INVALID); + + // Event id is negative + event_info[0][0] = "SendTo;add1;-2"; + (void) AttrUtils::SetListStr(all_nodes.at("prefetch1")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[0]); + ret = stream_allocator.RefreshEventsWithReuse(); + EXPECT_EQ(ret, PARAM_INVALID); + + // Key word is not supported + event_info[0][0] = "SendToKey;add1;2"; + (void) AttrUtils::SetListStr(all_nodes.at("prefetch1")->GetOpDesc(), ATTR_NAME_EVENT_MULTIPLEXING, event_info[0]); + ret = stream_allocator.RefreshEventsWithReuse(); + EXPECT_EQ(ret, PARAM_INVALID); +} +} // namespace ge + diff --git a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc index 5b87939f..218bfd0d 100644 --- a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc +++ b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc @@ -32,6 +32,7 @@ #include "graph/compute_graph.h" #include "graph/utils/attr_utils.h" #include "graph/utils/graph_utils.h" +#include "graph/debug/ge_attr_define.h" using namespace std; @@ -153,6 +154,22 @@ class UtestLogicalStreamAllocator : public testing::Test { return CreateSubgraphWithName("graph", 
engine, stream_label, in_num, out_num); } + SubGraphInfoPtr CreateParallelGroupSubgraphWithName(const string &name, const string &engine, + const string &stream_label = "", + std::string group_name = "1") { + ComputeGraphPtr compute_graph = make_shared(name); + OpDescPtr op_desc = std::make_shared("relu", "Relu"); + op_desc->AddInputDesc(GeTensorDesc()); + op_desc->AddOutputDesc(GeTensorDesc()); + AttrUtils::SetStr(op_desc, ATTR_NAME_PARALLEL_GROUP, group_name); + compute_graph->AddNode(op_desc); + + SubGraphInfoPtr subgraph = BuildSubGraph(compute_graph, engine, stream_label); + AddPlaceHolderAndEnd(subgraph, 1, 1); + + return subgraph; + } + void LinkSubGraph(SubGraphInfoPtr subgraph1, const string &end_name, SubGraphInfoPtr subgraph2, const string &placeholder_name) { NodePtr end_node = subgraph1->GetSubGraph()->FindNode(end_name); @@ -878,4 +895,30 @@ TEST_F(UtestLogicalStreamAllocator, test_all_reduce_parallel_pass) { EXPECT_EQ(ret, NOT_CHANGED); } +TEST_F(UtestLogicalStreamAllocator, test_parallel_group) { + SubGraphInfoPtr data = CreateDataSubgraph(); + SubGraphInfoPtr subgraph1 = CreateParallelGroupSubgraphWithName("graph1", "engine1", ""); + SubGraphInfoPtr subgraph2 = CreateParallelGroupSubgraphWithName("graph2", "engine2", "", "2"); + SubGraphInfoPtr subgraph3 = CreateParallelGroupSubgraphWithName("graph3", "engine3", "", "3"); + SubGraphInfoPtr subgraph4 = CreateParallelGroupSubgraphWithName("graph4", "engine4", "", "4"); + LinkSubGraph(data, "end", subgraph1, "placeholder"); + LinkSubGraph(subgraph1, "end", subgraph2, "placeholder"); + LinkSubGraph(subgraph2, "end", subgraph3, "placeholder"); + LinkSubGraph(subgraph3, "end", subgraph4, "placeholder"); + + EngineConfPtr conf1 = make_shared(); + conf1->id = subgraph1->GetEngineName(); + EngineConfPtr conf2 = make_shared(); + conf2->id = subgraph2->GetEngineName(); + conf2->attach = false; + EngineConfPtr conf3 = make_shared(); + conf3->id = subgraph3->GetEngineName(); + conf3->attach = false; + 
EngineConfPtr conf4 = make_shared(); + conf4->id = subgraph4->GetEngineName(); + + Status status = AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4}, {conf1, conf2, conf3, conf4}); + EXPECT_EQ(status, ge::SUCCESS); +} + } // namespace ge diff --git a/tests/ut/ge/graph/build/mem_assigner_unittest.cc b/tests/ut/ge/graph/build/mem_assigner_unittest.cc index 0024185b..92f9b5b4 100644 --- a/tests/ut/ge/graph/build/mem_assigner_unittest.cc +++ b/tests/ut/ge/graph/build/mem_assigner_unittest.cc @@ -33,6 +33,7 @@ #include "graph/build/memory/graph_mem_assigner.h" #include "graph/build/memory/hybrid_mem_assigner.h" #include "graph/build/memory/max_block_mem_assigner.h" +#include "graph/manager/graph_var_manager.h" #undef protected #undef private @@ -77,8 +78,8 @@ class UtestMemoryAssignerTest : public testing::Test { op_def->SetWorkspaceBytes(workspace_bytes); return op_def; } - void MakeGraph(ge::ComputeGraphPtr &graph) { - ge::OpDescPtr op_def_a = CreateOpWithWsSize("A", 6000); + void MakeGraph(ge::ComputeGraphPtr &graph, const string &type = "some") { + ge::OpDescPtr op_def_a = CreateOpWithWsSize("A", 6000, type); op_def_a->SetStreamId(0); ge::OpDescPtr op_def_b = CreateOpWithWsSize("B", 120000); op_def_b->SetStreamId(0); @@ -190,6 +191,30 @@ class UtestMemoryAssignerTest : public testing::Test { return builder.GetGraph(); } + ComputeGraphPtr MakeRefNodeGraph() { + ge::ut::GraphBuilder builder("graph"); + auto var_input = builder.AddNode("var", "Variable", 1, 1); + auto const_input = builder.AddNode("const", "Const", 1, 1); + auto assign = builder.AddNode("assgin", "Assign", 2, 1); + // add link + builder.AddDataEdge(var_input, 0, assign, 0); + builder.AddDataEdge(const_input, 0, assign, 1); + // set offset + assign->GetOpDesc()->SetInputOffset({100, 0}); + assign->GetOpDesc()->SetOutputOffset({10000}); + var_input->GetOpDesc()->SetOutputOffset({10000}); + const_input->GetOpDesc()->SetOutputOffset({1000}); + // set mem type + 
ge::AttrUtils::SetListInt(assign->GetOpDesc(), ATTR_NAME_INPUT_MEM_TYPE_LIST, {RT_MEMORY_HBM, RT_MEMORY_L1}); + // set ref + auto output_tensordesc = assign->GetOpDesc()->MutableOutputDesc(0); + ge::TensorUtils::SetReuseInput(*output_tensordesc, true); + uint32_t reuse_input_index = 0; + ge::TensorUtils::SetReuseInputIndex(*output_tensordesc, reuse_input_index); + + return builder.GetGraph(); + } + protected: void SetUp() {} @@ -249,3 +274,68 @@ TEST_F(UtestMemoryAssignerTest, graph_memory_assign_continuous_input) { EXPECT_EQ(addn1->GetOpDesc()->GetOutputOffset()[0], 500); EXPECT_EQ(addn2->GetOpDesc()->GetOutputOffset()[0], 600); } + +TEST_F(UtestMemoryAssignerTest, graph_memory_set_last_used_attr) { + ge::ComputeGraphPtr graph = make_shared(""); + MakeGraph(graph); + auto node_f = graph->FindNode("F"); + MemoryAssigner memory_assigner(graph); + map mem_offset; + size_t zero_memory_size = 0; + EXPECT_EQ(memory_assigner.AssignMemory(false, mem_offset, zero_memory_size), GRAPH_SUCCESS); + + int32_t flag = 0; + (void) ge::AttrUtils::GetInt(node_f->GetOpDesc()->GetInputDesc(0), ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE, flag); + EXPECT_EQ(flag, 1); +} + +TEST_F(UtestMemoryAssignerTest, graph_memory_assign_ref_var) { + ge::ComputeGraphPtr graph = make_shared(""); + MakeGraph(graph, VARIABLE); + auto node_a = graph->FindNode("A"); + auto node_b = graph->FindNode("B"); + std::string value = "A"; + (void) ge::AttrUtils::SetStr(node_b->GetOpDesc()->MutableOutputDesc(0), REF_VAR_SRC_VAR_NAME, value); + MemoryAssigner memory_assigner(graph); + map mem_offset; + size_t zero_memory_size = 0; + VarManager::Instance(0)->Init(0, 0, 0, 0); + EXPECT_EQ(memory_assigner.AssignMemory(false, mem_offset, zero_memory_size), GRAPH_SUCCESS); + + EXPECT_EQ(node_b->GetOpDesc()->GetOutputOffset()[0], node_a->GetOpDesc()->GetOutputOffset()[0]); +} + +TEST_F(UtestMemoryAssignerTest, graph_memory_assign_ref_var_not_found) { + ge::ComputeGraphPtr graph = make_shared(""); + MakeGraph(graph, VARIABLE); 
+ + ge::ComputeGraphPtr sub_graph = make_shared(""); + MakeReuseGraph(sub_graph); + graph->AddSubGraph(sub_graph); + + auto node_a = graph->FindNode("A"); + auto node_b = graph->FindNode("B"); + std::string value = "M"; + (void) ge::AttrUtils::SetStr(node_b->GetOpDesc()->MutableOutputDesc(0), REF_VAR_SRC_VAR_NAME, value); + MemoryAssigner memory_assigner(graph); + map mem_offset; + size_t zero_memory_size = 0; + VarManager::Instance(0)->Init(0, 0, 0, 0); + EXPECT_NE(memory_assigner.AssignMemory(false, mem_offset, zero_memory_size), GRAPH_SUCCESS); +} + +TEST_F(UtestMemoryAssignerTest, graph_memory_assign_set_input_offset) { + ge::ComputeGraphPtr graph = MakeRefNodeGraph(); + auto assgin = graph->FindNode("assgin"); + EXPECT_EQ(assgin->GetOpDesc()->GetOutputOffset()[0], 10000); + EXPECT_EQ(assgin->GetOpDesc()->GetInputOffset()[0], 100); + EXPECT_EQ(assgin->GetOpDesc()->GetInputOffset()[1], 0); + GraphMemoryAssigner memoryAssigner(graph); + MemoryOffset memory_offset(RT_MEMORY_HBM, 0); + memoryAssigner.memory_offset_.emplace(RT_MEMORY_HBM, memory_offset); + EXPECT_EQ(memoryAssigner.SetInputOffset(), GRAPH_SUCCESS); + EXPECT_EQ(assgin->GetOpDesc()->GetOutputOffset()[0], 10100); + EXPECT_EQ(assgin->GetOpDesc()->GetInputOffset()[0], 10100); + EXPECT_EQ(assgin->GetOpDesc()->GetInputOffset()[1], 0); + EXPECT_EQ(memoryAssigner.CheckOffset(), GRAPH_SUCCESS); +} diff --git a/tests/ut/ge/graph/build/model_builder_unittest.cc b/tests/ut/ge/graph/build/model_builder_unittest.cc new file mode 100644 index 00000000..b9204dbc --- /dev/null +++ b/tests/ut/ge/graph/build/model_builder_unittest.cc @@ -0,0 +1,163 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "graph/anchor.h" +#include "graph/attr_value.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/node_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_utils.h" +#include "omg/omg_inner_types.h" +#include "../passes/graph_builder_utils.h" + +#define protected public +#define private public +#include "graph/build/model_builder.h" +#undef protected +#undef private + +using namespace std; +using namespace testing; +using namespace ge; +using domi::GetContext; + +class UtestModelBuilderTest : public testing::Test { + public: + ge::OpDescPtr CreateOpWithWsSize(const string &name, int64_t wsByte, const string &type = "some") { + ge::OpDescPtr op_def = make_shared(name, type); + auto desc_temp_ptr = make_shared(); + auto desc_temp = *desc_temp_ptr; + + TensorUtils::SetSize(desc_temp, 1024); + op_def->AddInputDesc(desc_temp); + op_def->AddOutputDesc(desc_temp); + + std::vector workspace_bytes; + workspace_bytes.push_back(wsByte); + op_def->SetWorkspaceBytes(workspace_bytes); + return op_def; + } + ge::OpDescPtr CreateRefOpWithWsSize(const string &name, int64_t wsByte, const string &type = "some") { + ge::OpDescPtr op_def = make_shared(name, type); + auto desc_temp_ptr = make_shared(); + auto desc_temp = *desc_temp_ptr; + + TensorUtils::SetSize(desc_temp, 1024); + op_def->AddInputDesc(desc_temp); + + auto desc_output_ptr = make_shared(); + auto desc_output = *desc_output_ptr; + TensorUtils::SetSize(desc_output, 6500); + 
ge::TensorUtils::SetReuseInput(desc_output, true); + ge::TensorUtils::SetReuseInputIndex(desc_output, 0); + op_def->AddOutputDesc(desc_output); + + std::vector workspace_bytes; + workspace_bytes.push_back(wsByte); + op_def->SetWorkspaceBytes(workspace_bytes); + return op_def; + } + void MakeGraph(ge::ComputeGraphPtr &graph) { + ge::OpDescPtr op_def_a = CreateOpWithWsSize("A", 6000); + op_def_a->SetStreamId(0); + ge::OpDescPtr op_def_b = CreateOpWithWsSize("B", 120000); + op_def_b->SetStreamId(0); + ge::OpDescPtr op_def_c = CreateOpWithWsSize("C", 16000); + op_def_c->SetStreamId(1); + ge::OpDescPtr op_def_d = CreateOpWithWsSize("D", 24000); + op_def_d->SetStreamId(2); + ge::OpDescPtr op_def_e = CreateOpWithWsSize("E", 24000); + op_def_e->SetStreamId(3); + ge::OpDescPtr op_def_f = CreateOpWithWsSize("F", 30000); + op_def_f->SetStreamId(2); + ge::OpDescPtr op_def_g = CreateOpWithWsSize("G", 32000); + op_def_g->SetStreamId(3); + ge::OpDescPtr op_def_h = CreateOpWithWsSize("H", 48000); + op_def_h->SetStreamId(2); + ge::OpDescPtr op_def_i = CreateOpWithWsSize("I", 60000); + op_def_i->SetStreamId(2); + ge::OpDescPtr op_def_j = CreateOpWithWsSize("J", 256000, NETOUTPUT); + op_def_j->SetStreamId(3); + + // add node + ge::NodePtr node_a = graph->AddNode(op_def_a); + ge::NodePtr node_b = graph->AddNode(op_def_b); + ge::NodePtr node_c = graph->AddNode(op_def_c); + ge::NodePtr node_d = graph->AddNode(op_def_d); + ge::NodePtr node_e = graph->AddNode(op_def_e); + ge::NodePtr node_f = graph->AddNode(op_def_f); + ge::NodePtr node_g = graph->AddNode(op_def_g); + ge::NodePtr node_h = graph->AddNode(op_def_h); + ge::NodePtr node_i = graph->AddNode(op_def_i); + ge::NodePtr node_j = graph->AddNode(op_def_j); + + // add edge + ge::GraphUtils::AddEdge(node_a->GetOutDataAnchor(0), node_b->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(node_a->GetOutDataAnchor(0), node_c->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(node_b->GetOutDataAnchor(0), node_d->GetInDataAnchor(0)); + 
ge::GraphUtils::AddEdge(node_b->GetOutDataAnchor(0), node_e->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(node_c->GetOutDataAnchor(0), node_g->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(node_d->GetOutDataAnchor(0), node_f->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(node_e->GetOutDataAnchor(0), node_g->GetInDataAnchor(1)); + ge::GraphUtils::AddEdge(node_f->GetOutDataAnchor(0), node_h->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(node_g->GetOutDataAnchor(0), node_j->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(node_h->GetOutDataAnchor(0), node_i->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(node_i->GetOutDataAnchor(0), node_j->GetInDataAnchor(1)); + + GetContext().out_nodes_map["H"] = {0}; + GetContext().out_nodes_map["I"] = {0}; + GetContext().out_nodes_map["J"] = {0}; + graph->TopologicalSorting(); + } + + + protected: + void SetUp() {} + + void TearDown() { GetContext().out_nodes_map.clear(); } +}; + +// when check GetMemoryRanges return fail, Assign return fail +TEST_F(UtestModelBuilderTest, SetInputIsConst) { + Graph2SubGraphInfoList subgraphs; + std::map stream_max_parallel_num; + ge::ComputeGraphPtr graph = make_shared(""); + MakeGraph(graph); + graph->TopologicalSorting(); + ge::ModelBuilder builder(0, graph, subgraphs, stream_max_parallel_num, false); + EXPECT_EQ(builder.PreBuildModel(), SUCCESS); +} + +TEST_F(UtestModelBuilderTest, test_save_atomic_bin) { + Graph2SubGraphInfoList subgraphs; + std::map stream_max_parallel_num; + ge::ComputeGraphPtr graph = make_shared(""); + ge::ModelBuilder builder(0, graph, subgraphs, stream_max_parallel_num, false); + + auto atomic_op_desc = make_shared("Atomic", "Atomic"); + auto kernel_buffer = static_cast(Buffer(10)); + AttrUtils::SetStr(atomic_op_desc, ATTR_NAME_TBE_KERNEL_NAME, "Atomic"); + AttrUtils::SetBytes(atomic_op_desc, ATTR_NAME_TBE_KERNEL_BUFFER, kernel_buffer); + + ge::NodePtr atomic_node = graph->AddNode(atomic_op_desc); + auto op_desc = make_shared("Sum", "Sum"); + 
op_desc->SetExtAttr("atomic_clean_node_ptr", atomic_node); + EXPECT_EQ(builder.SaveAtomicTBEKernel(op_desc), SUCCESS); +} diff --git a/tests/ut/ge/graph/build/task_generator_unittest.cc b/tests/ut/ge/graph/build/task_generator_unittest.cc new file mode 100644 index 00000000..7e996cf1 --- /dev/null +++ b/tests/ut/ge/graph/build/task_generator_unittest.cc @@ -0,0 +1,88 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "graph/anchor.h" +#include "graph/attr_value.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/node_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_utils.h" +#include "omg/omg_inner_types.h" +#include "../passes/graph_builder_utils.h" + +#define protected public +#define private public +#include "graph/build/task_generator.h" +#undef protected +#undef private + +using namespace std; +using namespace testing; +using namespace ge; + +class UtestTaskGeneratorTest : public testing::Test { + public: + ge::ComputeGraphPtr BuildGraphFpProfiling() { + ge::ut::GraphBuilder builder("graph"); + auto data = builder.AddNode("data", "phony", 1, 1); + auto addn1 = builder.AddNode("addn1", "AddN", 1, 1); + auto netoutput = builder.AddNode("netoutput", "NetOutput", 2, 0); + auto op_desc = data->GetOpDesc(); + (void)AttrUtils::SetStr(op_desc, ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, 
"IteratorV2"); + op_desc->SetOpKernelLibName("GE"); + builder.AddDataEdge(data, 0, addn1, 0); + builder.AddDataEdge(addn1, 0, netoutput, 0); + return builder.GetGraph(); + } + ge::ComputeGraphPtr BuildGraphBpProfiling() { + ge::ut::GraphBuilder builder("graph"); + auto data = builder.AddNode("data", "phony", 1, 1); + auto addn1 = builder.AddNode("addn1", "AddN", 1, 1); + auto netoutput = builder.AddNode("netoutput", "NetOutput", 2, 0); + auto op_desc = data->GetOpDesc(); + (void)AttrUtils::SetStr(op_desc, ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, "IteratorV2"); + op_desc->SetOpKernelLibName("GE"); + builder.AddDataEdge(data, 0, addn1, 0); + builder.AddControlEdge(addn1, netoutput); + return builder.GetGraph(); + } + + protected: + void SetUp() {} + void TearDown() {} +}; + +TEST_F(UtestTaskGeneratorTest, AutoFindFpOpIndex) { + auto graph = BuildGraphFpProfiling(); + TaskGenerator task_generator(nullptr, 0); + ProfilingPoint profiling_point; + profiling_point.fp_index = -1; + EXPECT_EQ(task_generator.AutoFindFpOpIndex(graph, profiling_point), SUCCESS); + // addn1 is fp + EXPECT_EQ(profiling_point.fp_index, 2); +} + +TEST_F(UtestTaskGeneratorTest, FindLastBpFromBpNode) { + auto graph = BuildGraphBpProfiling(); + TaskGenerator task_generator(nullptr, 0); + auto net_output = graph->FindNode("netoutput"); + // netoutput has no data input, return default value 0 + EXPECT_EQ(task_generator.FindLastBpFromBpNode(graph, net_output), 0); +} diff --git a/tests/ut/ge/graph/execute/graph_execute_unittest.cc b/tests/ut/ge/graph/execute/graph_execute_unittest.cc new file mode 100644 index 00000000..b24985be --- /dev/null +++ b/tests/ut/ge/graph/execute/graph_execute_unittest.cc @@ -0,0 +1,129 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#define protected public +#define private public +#include "graph/execute/graph_execute.h" +#include "graph/load/model_manager/model_manager.h" +#include "graph/load/model_manager/davinci_model.h" +#include "omm/csa_interact.h" +#undef private +#undef public + + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace testing; +using namespace ge; +using namespace domi; + +namespace ge { +namespace { +const uint32_t kInvalidModelId = UINT32_MAX; +} + +class UtestGraphExecuteTest : public testing::Test { + protected: + void SetUp() {} + + void TearDown() {} +}; + +TEST_F(UtestGraphExecuteTest, get_execute_model_id_invalid) { + GraphExecutor executor; + ComputeGraphPtr graph = MakeShared("test"); + GeRootModelPtr ge_root_model = MakeShared(graph); + auto model_id = executor.GetExecuteModelId(ge_root_model); + EXPECT_EQ(model_id, kInvalidModelId); +} + +TEST_F(UtestGraphExecuteTest, get_execute_model_id_1) { + GraphExecutor executor; + ComputeGraphPtr graph = MakeShared("test"); + GeRootModelPtr ge_root_model = MakeShared(graph); + auto model_manager = ModelManager::GetInstance(); + shared_ptr davinci_model1 = MakeShared(1, nullptr); + davinci_model1->SetId(1); + model_manager->InsertModel(1, davinci_model1); + ge_root_model->SetModelId(1); + auto model_id = executor.GetExecuteModelId(ge_root_model); + EXPECT_EQ(model_id, 1); +} + +TEST_F(UtestGraphExecuteTest, get_execute_model_id_2) { + GraphExecutor executor; + ComputeGraphPtr graph = MakeShared("test"); + 
GeRootModelPtr ge_root_model = MakeShared(graph); + auto model_manager = ModelManager::GetInstance(); + // model1 with 2 load + shared_ptr davinci_model1 = MakeShared(1, nullptr); + davinci_model1->SetId(1); + davinci_model1->data_inputer_ = new DataInputer(); + auto data = MakeShared(); + davinci_model1->data_inputer_->Push(data); + davinci_model1->data_inputer_->Push(data); + model_manager->InsertModel(1, davinci_model1); + // model 2 with 3 load + shared_ptr davinci_model2 = MakeShared(1, nullptr); + davinci_model2->SetId(2); + davinci_model2->data_inputer_ = new DataInputer(); + davinci_model2->data_inputer_->Push(data); + davinci_model2->data_inputer_->Push(data); + davinci_model2->data_inputer_->Push(data); + model_manager->InsertModel(2, davinci_model2); + // model 3 witH 1 load + shared_ptr davinci_model3 = MakeShared(1, nullptr); + davinci_model3->SetId(3); + davinci_model3->data_inputer_ = new DataInputer(); + davinci_model3->data_inputer_->Push(data); + model_manager->InsertModel(3, davinci_model3); + + ge_root_model->SetModelId(1); + ge_root_model->SetModelId(2); + ge_root_model->SetModelId(3); + + auto model_id = executor.GetExecuteModelId(ge_root_model); + // model 3 is picked for having least loads + EXPECT_EQ(model_id, 3); +} + +TEST_F(UtestGraphExecuteTest, test_set_callback) { + GraphExecutor executor; + ComputeGraphPtr graph = MakeShared("test"); + // is_unknown_shape_graph_ = false + GeRootModelPtr ge_root_model = MakeShared(graph); + RunAsyncCallback callback = [](Status, std::vector &) {}; + + auto model_manager = ModelManager::GetInstance(); + auto listener = MakeShared(); + shared_ptr davinci_model1 = MakeShared(1, listener); + davinci_model1->SetId(1); + model_manager->InsertModel(1, davinci_model1); + auto status = executor.SetCallback(1, ge_root_model, callback); + EXPECT_EQ(status, SUCCESS); +} +} // namespace ge \ No newline at end of file diff --git a/tests/ut/ge/graph/load/data_dumper_unittest.cc 
b/tests/ut/ge/graph/load/data_dumper_unittest.cc index 1866f4eb..68040bf1 100644 --- a/tests/ut/ge/graph/load/data_dumper_unittest.cc +++ b/tests/ut/ge/graph/load/data_dumper_unittest.cc @@ -56,7 +56,7 @@ TEST_F(UtestDataDumper, LoadDumpInfo_no_output_addrs_fail) { TEST_F(UtestDataDumper, UnloadDumpInfo_success) { RuntimeParam rts_param; - DataDumper data_dumper(rts_param); + DataDumper data_dumper(&rts_param); data_dumper.SetModelName("test"); data_dumper.SetModelId(2333); diff --git a/tests/ut/ge/graph/load/davinci_model_unittest.cc b/tests/ut/ge/graph/load/davinci_model_unittest.cc index 18cc622b..56a91ef8 100644 --- a/tests/ut/ge/graph/load/davinci_model_unittest.cc +++ b/tests/ut/ge/graph/load/davinci_model_unittest.cc @@ -22,6 +22,7 @@ #include "graph/utils/graph_utils.h" #include "common/profiling/profiling_manager.h" #include "graph/load/model_manager/davinci_model.h" +#include "graph/manager/graph_var_manager.h" using namespace std; @@ -51,6 +52,10 @@ int32_t MsprofReport(uint32_t moduleId, uint32_t type, void *data, uint32_t len) TEST_F(UtestDavinciModel, init_success) { DavinciModel model(0, nullptr); + VarManager::Instance(0)->Init(0, 0, 0, 0); + map options; + options[GRAPH_MEMORY_MAX_SIZE] = "1048576"; + VarManager::Instance(0)->SetMemoryMallocSize(options); ComputeGraphPtr graph = make_shared("default"); ProfilingManager::Instance().is_load_profiling_ = true; @@ -141,6 +146,12 @@ TEST_F(UtestDavinciModel, init_success) { ProfilingManager::Instance().is_load_profiling_ = false; } +TEST_F(UtestDavinciModel, CheckCapability) { + DavinciModel model(0, nullptr); + bool is_support = false; + (void)model.CheckCapability(FEATURE_TYPE_MEMORY, MEMORY_INFO_TS_4G_LIMITED, is_support); +} + TEST_F(UtestDavinciModel, init_data_op) { DavinciModel model(0, nullptr); model.ge_model_ = make_shared(); @@ -777,6 +788,10 @@ TEST_F(UtestDavinciModel, init_data_aipp_input_dims_normal) { // test label_set_task Init TEST_F(UtestDavinciModel, label_task_success) { + 
VarManager::Instance(0)->Init(0, 0, 0, 0); + map options; + options[GRAPH_MEMORY_MAX_SIZE] = "1048576"; + VarManager::Instance(0)->SetMemoryMallocSize(options); DavinciModel model(0, nullptr); ComputeGraphPtr graph = make_shared("default"); @@ -942,4 +957,93 @@ TEST_F(UtestDavinciModel, simple_test_gmock) { EXPECT_EQ(mock_stub.func2(2, 5), 1023); EXPECT_EQ(mock_stub.func2(3, 5), 1023); } + +TEST_F(UtestDavinciModel, NnExecute) { + VarManager::Instance(0)->Init(0, 0, 0, 0); + map options; + options[GRAPH_MEMORY_MAX_SIZE] = "1048576"; + VarManager::Instance(0)->SetMemoryMallocSize(options); + + DavinciModel model(0, nullptr); + ComputeGraphPtr graph = make_shared("default"); + ProfilingManager::Instance().is_load_profiling_ = true; + + GeModelPtr ge_model = make_shared(); + ge_model->SetGraph(GraphUtils::CreateGraphFromComputeGraph(graph)); + AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, 10240); + AttrUtils::SetInt(ge_model, ATTR_MODEL_STREAM_NUM, 1); + + shared_ptr model_task_def = make_shared(); + ge_model->SetModelTaskDef(model_task_def); + + GeTensorDesc tensor(GeShape({1,4,128,128}), FORMAT_NCHW, DT_FLOAT); + TensorUtils::SetSize(tensor, 512); + { + OpDescPtr op_desc = CreateOpDesc("data", DATA); + op_desc->AddInputDesc(tensor); + op_desc->AddOutputDesc(tensor); + op_desc->SetInputOffset({1024}); + op_desc->SetOutputOffset({1024}); + NodePtr node = graph->AddNode(op_desc); // op_index = 0 + } + + { + OpDescPtr op_desc = CreateOpDesc("memcpy", MEMCPYASYNC); + op_desc->AddInputDesc(tensor); + op_desc->AddOutputDesc(tensor); + op_desc->SetInputOffset({1024}); + op_desc->SetOutputOffset({5120}); + NodePtr node = graph->AddNode(op_desc); + + domi::TaskDef *task_def = model_task_def->add_task(); + task_def->set_stream_id(0); + task_def->set_type(RT_MODEL_TASK_MEMCPY_ASYNC); + domi::MemcpyAsyncDef *memcpy_async = task_def->mutable_memcpy_async(); + memcpy_async->set_src(1024); + memcpy_async->set_dst(5120); + memcpy_async->set_dst_max(512); + 
memcpy_async->set_count(1); + memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE); + memcpy_async->set_op_index(op_desc->GetId()); + } + + { + OpDescPtr op_desc = CreateOpDesc("output", NETOUTPUT); + op_desc->AddInputDesc(tensor); + op_desc->SetInputOffset({5120}); + op_desc->SetSrcName( { "memcpy" } ); + op_desc->SetSrcIndex( { 0 } ); + NodePtr node = graph->AddNode(op_desc); // op_index = 3 + } + + EXPECT_EQ(model.Assign(ge_model), SUCCESS); + EXPECT_EQ(model.Init(), SUCCESS); + + rtStream_t stream = nullptr; + InputData input_data; + OutputData output_data; + vector outputs; + EXPECT_EQ(model.GenOutputTensorInfo(&output_data, outputs), SUCCESS); + EXPECT_EQ(output_data.blobs.size(), 1); + EXPECT_EQ(outputs.size(), 1); + input_data.blobs = output_data.blobs; + EXPECT_EQ(input_data.blobs.size(), 1); + + ProfilingManager::Instance().prof_cb_.msprofReporterCallback = MsprofReport; + ProfilingManager::Instance().device_id_.emplace_back(0); + model.task_list_.resize(1); + EXPECT_EQ(model.NnExecute(stream, false, input_data, output_data), SUCCESS); +} +TEST_F(UtestDavinciModel, update_io_addr_success) { + DavinciModel model(0, nullptr); + uint32_t task_id = 1; + uint32_t stream_id = 2; + model.fixed_mem_base_ = 0x22; + model.mem_base_ = reinterpret_cast(&task_id); + OpDescInfo op_desc_info = {"Save", "Save", 1, 2, {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {nullptr}, {2}, + {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {nullptr}, {2}}; + model.exception_dumper_.op_desc_info_ = { op_desc_info }; + vector io_addr = {nullptr, nullptr}; + model.UpdateOpIOAddrs(task_id, stream_id, io_addr); +} } // namespace ge diff --git a/tests/ut/ge/graph/load/kernel_task_info_unittest.cc b/tests/ut/ge/graph/load/kernel_task_info_unittest.cc index 4fbfe61d..2cfb2a76 100644 --- a/tests/ut/ge/graph/load/kernel_task_info_unittest.cc +++ b/tests/ut/ge/graph/load/kernel_task_info_unittest.cc @@ -496,6 +496,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_init_cce_task) { KernelTaskInfo kernel_task_info; 
domi::KernelDef *kernel_def = task_def.mutable_kernel(); kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; kernel_def->set_flowtable("InitCceTask"); domi::KernelContext *context = kernel_def->mutable_context(); @@ -529,6 +530,7 @@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed1) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); domi::KernelDef *kernel_def = task_def.mutable_kernel(); EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR); @@ -546,6 +548,7 @@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed2) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); // KernelTaskInfo::SetContext -> SUCCESS @@ -569,6 +572,7 @@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed3) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); // KernelTaskInfo::SetContext -> SUCCESS @@ -594,6 +598,7 @@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed4) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); // KernelTaskInfo::SetContext -> SUCCESS @@ -620,6 +625,7 @@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed5) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); // KernelTaskInfo::SetContext -> SUCCESS @@ -647,6 +653,7 
@@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed6) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); // KernelTaskInfo::SetContext -> SUCCESS @@ -675,6 +682,7 @@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed7) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); // KernelTaskInfo::SetContext -> SUCCESS @@ -712,6 +720,7 @@ TEST_F(UtestKernelTaskInfo, success_kernel_taskInfo_init_set_context) { context->set_args_count(1); context->set_args_offset("args111111", 10); + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); EXPECT_EQ(kernel_task_info.SetContext(*kernel_def), SUCCESS); EXPECT_EQ(kernel_task_info.Release(), SUCCESS); @@ -733,6 +742,7 @@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_set_context_failed1) { context->set_is_flowtable(true); context->set_args_count(0); + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); EXPECT_EQ(kernel_task_info.SetContext(*kernel_def), INTERNAL_ERROR); kernel_def->clear_context(); @@ -752,6 +762,8 @@ TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_set_context_failed2) { context->set_args_count(5); context->set_args_offset("\0\0"); // args_offset = 0 + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); + EXPECT_EQ(kernel_task_info.SetContext(*kernel_def), PARAM_INVALID); kernel_def->clear_context(); @@ -769,6 +781,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_update_cce_args) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); domi::KernelContext *context = 
kernel_def->mutable_context(); @@ -815,6 +828,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_update_cce_args_failed1) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); domi::KernelContext *context = kernel_def->mutable_context(); @@ -856,6 +870,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_set_flowtable) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); domi::KernelContext *context = kernel_def->mutable_context(); @@ -887,6 +902,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_set_flowtable_failed1) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); domi::KernelContext *context = kernel_def->mutable_context(); @@ -911,6 +927,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_set_flowtable_failed2) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); domi::KernelContext *context = kernel_def->mutable_context(); @@ -935,6 +952,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_set_flowtable_failed3) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = model.op_list_[0]; domi::KernelDef *kernel_def = task_def.mutable_kernel(); domi::KernelContext *context = kernel_def->mutable_context(); @@ -988,6 +1006,7 @@ TEST_F(UtestKernelTaskInfo, success_distribute_dump_task) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + 
kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); domi::KernelDef *kernel_def = task_def.mutable_kernel(); @@ -1033,6 +1052,7 @@ TEST_F(UtestKernelTaskInfo, success_store_input_output_tensor) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); std::vector input_data_addrs; std::vector output_data_addrs; @@ -1062,6 +1082,7 @@ TEST_F(UtestKernelTaskInfo, fail_release) { domi::TaskDef task_def; KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); std::vector input_data_addrs; std::vector output_data_addrs; @@ -1091,6 +1112,7 @@ TEST_F(UtestKernelTaskInfo, update_l2data_success) { DavinciModel model(0, nullptr); KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); domi::KernelDef kernel_def; EXPECT_EQ(kernel_task_info.UpdateL2Data(kernel_def), SUCCESS); @@ -1168,6 +1190,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_update_args_te) { KernelTaskInfo kernel_task_info; kernel_task_info.kernel_type_ = ccKernelType::TE; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); EXPECT_EQ(kernel_task_info.UpdateArgs(), SUCCESS); } @@ -1177,6 +1200,7 @@ TEST_F(UtestKernelTaskInfo, kernel_task_info_update_args_aicpu) { KernelTaskInfo kernel_task_info; kernel_task_info.kernel_type_ = ccKernelType::TE; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); kernel_task_info.args_size_ = 120; kernel_task_info.args_addr = std::unique_ptr(new (std::nothrow) uint8_t[kernel_task_info.args_size_]); kernel_task_info.io_addrs_ = { (void*)0x12345678, (void*)0x22345678 }; @@ -1191,6 +1215,7 @@ TEST_F(UtestKernelTaskInfo, 
kernel_task_info_super_kernel_info) { KernelTaskInfo kernel_task_info; kernel_task_info.davinci_model_ = &model; + kernel_task_info.op_desc_ = CreateOpDesc("FrameworkOp", "FrameworkOp"); EXPECT_EQ(kernel_task_info.SaveSuperKernelInfo(), SUCCESS); diff --git a/tests/ut/ge/graph/load/model_helper_unittest.cc b/tests/ut/ge/graph/load/model_helper_unittest.cc index 03605dc7..8fd8f014 100644 --- a/tests/ut/ge/graph/load/model_helper_unittest.cc +++ b/tests/ut/ge/graph/load/model_helper_unittest.cc @@ -36,13 +36,6 @@ class UtestModelHelper : public testing::Test { void TearDown() override {} }; -TEST_F(UtestModelHelper, save_size_to_modeldef_failed) -{ - GeModelPtr ge_model = ge::MakeShared(); - ModelHelper model_helper; - EXPECT_EQ(ACL_ERROR_GE_MEMORY_ALLOCATION, model_helper.SaveSizeToModelDef(ge_model)); -} - TEST_F(UtestModelHelper, save_size_to_modeldef) { GeModelPtr ge_model = ge::MakeShared(); diff --git a/tests/ut/ge/graph/load/model_manager_unittest.cc b/tests/ut/ge/graph/load/model_manager_unittest.cc index 0e65954d..342f6362 100644 --- a/tests/ut/ge/graph/load/model_manager_unittest.cc +++ b/tests/ut/ge/graph/load/model_manager_unittest.cc @@ -151,6 +151,15 @@ class DModelListener : public ModelListener { uint32_t OnComputeDone(uint32_t model_id, uint32_t data_index, uint32_t resultCode) { return 0; } }; +TEST_F(UtestModelManagerModelManager, case_is_need_hybrid_load) { + ModelManager mm; + uint32_t model_id = 0; + ComputeGraphPtr root_graph = std::make_shared("graph"); + ge::GeRootModel model; + EXPECT_EQ(mm.IsNeedHybridLoad(model), false); + model.SetRootGraph(root_graph); + EXPECT_EQ(mm.IsNeedHybridLoad(model), false); +} TEST_F(UtestModelManagerModelManager, case_load_incorrect_param) { ModelManager mm; diff --git a/tests/ut/ge/graph/load/model_utils_unittest.cc b/tests/ut/ge/graph/load/model_utils_unittest.cc index ac886cea..630a75aa 100644 --- a/tests/ut/ge/graph/load/model_utils_unittest.cc +++ b/tests/ut/ge/graph/load/model_utils_unittest.cc @@ -67,4 
+67,22 @@ TEST_F(UtestModelUtils, get_var_addr_rdma_hbm) { EXPECT_EQ(reinterpret_cast(offset), var_addr); VarManager::Instance(runtime_param.session_id)->Destory(); } + +TEST_F(UtestModelUtils, get_var_addr_rdma_hbm_negative_offset) { + uint8_t test = 2; + uint8_t *pf = &test; + RuntimeParam runtime_param; + runtime_param.session_id = 0; + runtime_param.logic_var_base = 0; + runtime_param.var_base = pf; + + int64_t offset = -1; + EXPECT_EQ(VarManager::Instance(runtime_param.session_id)->Init(0, 0, 0, 0), SUCCESS); + EXPECT_NE(VarManager::Instance(runtime_param.session_id)->var_resource_, nullptr); + VarManager::Instance(runtime_param.session_id)->var_resource_->var_offset_map_[offset] = RT_MEMORY_RDMA_HBM; + std::shared_ptr op_desc = std::make_shared("test", "test"); + uint8_t *var_addr = nullptr; + EXPECT_NE(ModelUtils::GetVarAddr(runtime_param, op_desc, offset, var_addr), SUCCESS); + VarManager::Instance(runtime_param.session_id)->Destory(); +} } // namespace ge diff --git a/tests/ut/ge/graph/manager/graph_manager_unittest.cc b/tests/ut/ge/graph/manager/graph_manager_unittest.cc new file mode 100644 index 00000000..dad55f3d --- /dev/null +++ b/tests/ut/ge/graph/manager/graph_manager_unittest.cc @@ -0,0 +1,375 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#define protected public +#define private public +#include "graph/manager/graph_manager.h" +#include "graph/load/model_manager/model_manager.h" +#include "graph/load/model_manager/davinci_model.h" +#define const +#include "common/helper/model_cache_helper.h" +#undef const +#include "init/gelib.h" +#undef private +#undef public + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/math/math_util.h" +#include "common/thread_pool.h" +#include "common/dump/dump_manager.h" +#include "analyzer/analyzer.h" +#include "graph/common/ge_call_wrapper.h" +#include "graph/common/local_context.h" +#include "graph/common/transop_util.h" +#include "graph/ge_context.h" +#include "graph/ge_global_options.h" +#include "graph/manager/util/rt_context_util.h" +#include "graph/partition/dynamic_shape_partition.h" +#include "graph/passes/enter_pass.h" +#include "graph/partition/stage_partition.h" +#include "graph/passes/addn_pass.h" +#include "graph/passes/bitcast_pass.h" +#include "graph/passes/assign_remove_pass.h" +#include "graph/passes/inplace_support_check_pass.h" +#include "graph/passes/atomic_addr_clean_pass.h" +#include "graph/passes/attach_stream_label_pass.h" +#include "graph/passes/cast_remove_pass.h" +#include "graph/passes/common_subexpression_elimination_pass.h" +#include "graph/passes/compile_nodes_pass.h" +#include "graph/passes/cond_remove_pass.h" +#include "graph/passes/constant_folding_pass.h" +#include "graph/passes/constant_fuse_same_pass.h" +#include "graph/passes/control_trigger_pass.h" +#include "graph/passes/ctrl_edge_transfer_pass.h" +#include "graph/passes/dimension_adjust_pass.h" +#include "graph/passes/dimension_compute_pass.h" +#include "graph/passes/flow_ctrl_pass.h" +#include "graph/passes/fuse_data_nodes_with_common_input_pass.h" +#include "graph/passes/identity_pass.h" +#include "graph/passes/input_output_connection_identify_pass.h" +#include "graph/passes/iterator_op_pass.h" 
+#include "graph/passes/link_gen_mask_nodes_pass.h" +#include "graph/passes/mark_graph_unknown_status_pass.h" +#include "graph/passes/merge_pass.h" +#include "graph/passes/merge_input_memcpy_pass.h" +#include "graph/passes/merge_to_stream_merge_pass.h" +#include "graph/passes/multi_batch_pass.h" +#include "graph/passes/next_iteration_pass.h" +#include "graph/passes/permute_pass.h" +#include "graph/passes/prune_pass.h" +#include "graph/passes/ref_identity_delete_op_pass.h" +#include "graph/passes/remove_same_const_pass.h" +#include "graph/passes/reshape_recovery_pass.h" +#include "graph/passes/reshape_remove_pass.h" +#include "graph/passes/same_transdata_breadth_fusion_pass.h" +#include "graph/passes/subgraph_pass.h" +#include "graph/passes/switch_data_edges_bypass.h" +#include "graph/passes/switch_dead_branch_elimination.h" +#include "graph/passes/switch_logic_remove_pass.h" +#include "graph/passes/switch_to_stream_switch_pass.h" +#include "graph/passes/transop_breadth_fusion_pass.h" +#include "graph/passes/transop_nearby_allreduce_fusion_pass.h" +#include "graph/passes/transop_symmetry_elimination_pass.h" +#include "graph/passes/transop_without_reshape_fusion_pass.h" +#include "graph/passes/transpose_transdata_pass.h" +#include "graph/passes/useless_control_out_remove_pass.h" +#include "graph/passes/variable_op_pass.h" +#include "graph/passes/variable_ref_delete_op_pass.h" +#include "graph/passes/variable_ref_useless_control_out_delete_pass.h" +#include "graph/passes/end_of_sequence_add_control_pass.h" +#include "graph/passes/subexpression_migration_pass.h" +#include "graph/passes/subgraph_const_migration_pass.h" +#include "graph/passes/unused_args_clean_pass.h" +#include "graph/passes/global_step_insert_pass.h" +#include "graph/passes/memcpy_addr_async_pass.h" +#include "graph/passes/hccl_continuous_memcpy_pass.h" +#include "graph/build/label_allocator.h" +#include "graph/utils/tensor_adapter.h" +#include "inc/pass_manager.h" +#include "ir_build/atc_ir_common.h" 
+#include "graph/common/local_context.h" +#include "graph/common/omg_util.h" +#include "common/formats/utils/formats_trans_utils.h" +#include "register/custom_pass_helper.h" +#include "graph/ops_stub.h" + +using namespace std; +using namespace testing; +using namespace ge; +using namespace domi; + +namespace { +const uint32_t kNotAdded = 0; +const uint32_t kStartAdd = 1; +const uint32_t kDoneAdded = 2; +} +class UtestGraphManagerTest : public testing::Test { + protected: + void SetUp() {} + + void TearDown() {} +}; + +void CreateGraph(Graph &graph) { + TensorDesc desc(ge::Shape({1, 3, 224, 224})); + uint32_t size = desc.GetShape().GetShapeSize(); + desc.SetSize(size); + auto data = op::Data("Data").set_attr_index(0); + data.update_input_desc_data(desc); + data.update_output_desc_out(desc); + + auto flatten = op::Flatten("Flatten").set_input_x(data, data.name_out_out()); + + std::vector inputs{data}; + std::vector outputs{flatten}; + std::vector targets{flatten}; + // Graph graph("test_graph"); + graph.SetInputs(inputs).SetOutputs(outputs).SetTargets(targets); +} + +TEST_F(UtestGraphManagerTest, set_and_get_add_graph_flag) { + GraphId graph_id = 1; + GraphManager graph_manager; + graph_manager.SetAddGraphCondition(graph_id, 1); + uint32_t res = graph_manager.GetAddGraphCondition(graph_id); + EXPECT_EQ(res, 1); +} + +TEST_F(UtestGraphManagerTest, test_add_graph_1) { + GraphId graph_id = 1; + GraphManager graph_manager; + // create graph + Graph graph("test_graph"); + CreateGraph(graph); + + std::map options; + OmgContext context; + Status status = graph_manager.AddGraph(graph_id, graph, options, context); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_add_graph_2) { + GraphId graph_id = 1; + GraphManager graph_manager; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node); + graph_manager.SetAddGraphCondition(graph_id, kDoneAdded); + Graph graph("test_graph"); + CreateGraph(graph); + std::map 
options; + OmgContext context; + Status status = graph_manager.AddGraph(graph_id, graph, options, context); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_add_graph_3) { + GraphId graph_id = 1; + GraphManager graph_manager; + Graph graph("test_graph"); + CreateGraph(graph); + + std::map options; + OmgContext context; + + std::future fut1 = std::async(std::launch::async, + &GraphManager::AddGraph, &graph_manager, graph_id, graph, options, context); + std::future fut2 = std::async(std::launch::async, + &GraphManager::AddGraph, &graph_manager, graph_id, graph, options, context); + fut1.wait(); + fut2.wait(); + Status status1 = fut1.get(); + Status status2 = fut2.get(); + EXPECT_EQ(status1, ge::SUCCESS); + EXPECT_EQ(status2, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_remove_graph_1) { + GraphId graph_id = 1; + GraphManager graph_manager; + GraphNodePtr graph_node = MakeShared(graph_id); + Status status = graph_manager.RemoveGraph(graph_id); + EXPECT_EQ(status, ge::GE_GRAPH_GRAPH_NOT_EXIST); + graph_manager.AddGraphNode(graph_id, graph_node); + graph_node->SetRunFlag(true); + status = graph_manager.RemoveGraph(graph_id); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_remove_graph_2) { + GraphId graph_id = 1; + GraphManager graph_manager; + GraphNodePtr graph_node = MakeShared(graph_id); + Graph graph("test_graph"); + CreateGraph(graph); + auto compute_graph = GraphUtils::GetComputeGraph(graph); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + auto model_manager = ModelManager::GetInstance(); + auto listener = MakeShared(); + shared_ptr davinci_model1 = MakeShared(1, listener); + davinci_model1->SetId(1); + shared_ptr davinci_model2 = MakeShared(2, listener); + davinci_model1->SetId(2); + model_manager->InsertModel(1, davinci_model1); + model_manager->InsertModel(2, davinci_model2); + ge_root_model->SetModelId(1); + ge_root_model->SetModelId(2); + graph_node->SetGeRootModel(ge_root_model); + 
graph_node->SetLoadFlag(true); + graph_manager.AddGraphNode(graph_id, graph_node); + Status status = graph_manager.RemoveGraph(graph_id); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_pre_run_thread) { + + GraphManager graph_manager; + graph_manager.thread_run_flag_ = true; + + GraphId graph_id = 1; + std::vector input_tensor; + uint64_t session_id = 0; + ErrorMessage::Context error_context; + GEThreadLocalContext context; + RunAsyncCallback callback; + // PreRunArgs args{graph_id, input_tensor, session_id, error_context, context, callback}; + bool ret = graph_manager.prerun_args_q_.Push({graph_id, input_tensor, session_id, error_context, context, callback}); + EXPECT_EQ(ret, true); + + GraphNodePtr graph_node = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node); + graph_manager.PreRunThread(&graph_manager); + // end with failed +} + +TEST_F(UtestGraphManagerTest, test_pre_run_thread_2) { + + GraphManager graph_manager; + graph_manager.thread_run_flag_ = true; + + GraphId graph_id = 1; + GraphNodePtr graph_node_1 = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node_1); + graph_manager.IncreaseGraphCount(graph_id); + graph_manager.IncreaseGraphCount(graph_id); + graph_node_1->SetBuildFlag(true); + std::vector input_tensor; + uint64_t session_id = 0; + ErrorMessage::Context error_context; + GEThreadLocalContext context; + RunAsyncCallback callback; + // PreRunArgs args{graph_id, input_tensor, session_id, error_context, context, callback}; + bool ret = graph_manager.prerun_args_q_.Push({graph_id, input_tensor, session_id, error_context, context, callback}); + EXPECT_EQ(ret, true); + graph_id = 2; + GraphNodePtr graph_node_2 = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node_2); + ret = graph_manager.prerun_args_q_.Push({graph_id, input_tensor, session_id, error_context, context, callback}); + EXPECT_EQ(ret, true); + graph_manager.PreRunThread(&graph_manager); + // end with 
failed +} + +TEST_F(UtestGraphManagerTest, test_check_and_release_memory) { + + GraphManager graph_manager; + GeModelPtr ge_model = make_shared(); + int64_t memory_size = 25 * 1024UL * 1024UL * 1024UL; + int64_t weight_size = 25 * 1024UL * 1024UL * 1024UL; + uint64_t session_id = 0; + ge::AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, memory_size); + ge::AttrUtils::SetInt(ge_model, ATTR_MODEL_WEIGHT_SIZE, weight_size); + ge::AttrUtils::SetInt(ge_model, MODEL_ATTR_SESSION_ID, session_id); + + + GraphId graph_id = 1; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node); + graph_manager.IncreaseGraphCount(graph_id); + graph_manager.IncreaseGraphCount(graph_id); + + auto model_manager = ModelManager::GetInstance(); + auto listener = MakeShared(); + shared_ptr davinci_model1 = MakeShared(1, listener); + davinci_model1->SetId(1); + shared_ptr davinci_model2 = MakeShared(2, listener); + davinci_model1->SetId(2); + model_manager->InsertModel(1, davinci_model1); + model_manager->InsertModel(2, davinci_model2); + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + bool is_dynamic_shape = false; + (void)AttrUtils::GetBool(compute_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dynamic_shape); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + ge_root_model->SetModelId(1); + ge_root_model->SetModelId(2); + graph_node->SetGeRootModel(ge_root_model); + graph_node->SetLoadFlag(true); + Status status = graph_manager.CheckAndReleaseMemory(ge_model, graph_node); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_check_incre_build_and_pre_run_1) { + // no need to build + GraphId graph_id = 1; + GraphManager graph_manager; + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + GraphManager::PreRunArgs arg; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_node->SetBuildFlag(true); + Status status = 
graph_manager.CheckIncreBuildAndPreRun(&graph_manager, arg, graph_node, ge_root_model); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_check_incre_build_and_pre_run_2) { + // need build while buildflag is true, var format changed + GraphId graph_id = 1; + GraphManager graph_manager; + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + GraphManager::PreRunArgs arg; + arg.callback = [](Status, std::vector &) {}; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_node->SetBuildFlag(true); + graph_node->Lock(); + graph_manager.var_acc_ctrl_.graph_ids_need_rebuild_.insert(graph_id); + Status status = graph_manager.CheckIncreBuildAndPreRun(&graph_manager, arg, graph_node, ge_root_model); + EXPECT_EQ(status, ge::PARAM_INVALID); +} + +TEST_F(UtestGraphManagerTest, test_check_incre_build_and_pre_run_3) { + // need build while buildflag is false, var format unchanged + GraphId graph_id = 1; + GraphManager graph_manager; + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + GraphManager::PreRunArgs arg; + arg.callback = [](Status, std::vector &) {}; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_node->SetBuildFlag(false); + graph_node->Lock(); + Status status = graph_manager.CheckIncreBuildAndPreRun(&graph_manager, arg, graph_node, ge_root_model); + EXPECT_NE(status, ge::SUCCESS); +} diff --git a/tests/ut/ge/graph/partition/dynamic_shape_partition_unittest.cc b/tests/ut/ge/graph/partition/dynamic_shape_partition_unittest.cc new file mode 100644 index 00000000..b60e0ddd --- /dev/null +++ b/tests/ut/ge/graph/partition/dynamic_shape_partition_unittest.cc @@ -0,0 +1,97 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "graph/partition/dynamic_shape_partition.h" +#include "compute_graph.h" +#include "inc/framework/common/types.h" +#include "utils/graph_utils.h" +#include "graph/debug/ge_attr_define.h" + + +#define private public +#define protected public + +namespace ge { + +namespace { + +GeTensorDescPtr CreateTensorDesc(std::initializer_list shape, Format format = FORMAT_NCHW, + DataType data_type = DT_FLOAT) { + GeShape ge_shape{vector(shape)}; + GeTensorDescPtr tensor_desc = std::make_shared(); + tensor_desc->SetShape(ge_shape); + tensor_desc->SetFormat(format); + tensor_desc->SetDataType(data_type); + return tensor_desc; +} + +class NodeBuilder { + public: + NodeBuilder(const std::string &name, const std::string &type) { op_desc_ = std::make_shared(name, type); } + + NodeBuilder &AddInputDesc(std::initializer_list shape = {1, 1, 224, 224}, Format format = FORMAT_NCHW, + DataType data_type = DT_FLOAT) { + op_desc_->AddInputDesc(CreateTensorDesc(shape, format, data_type)->Clone()); + return *this; + } + + NodeBuilder &AddOutputDesc(std::initializer_list shape = {1, 1, 224, 224}, Format format = FORMAT_NCHW, + DataType data_type = DT_FLOAT) { + op_desc_->AddOutputDesc(CreateTensorDesc(shape, format, data_type)->Clone()); + return *this; + } + + NodeBuilder &AddOutputDesc(GeTensorDescPtr tensor_desc) { + op_desc_->AddOutputDesc(tensor_desc->Clone()); + return *this; + } + + NodePtr Build(const ComputeGraphPtr &graph) { + NodePtr node = graph->AddNode(op_desc_); + return node; + } + + private: + OpDescPtr op_desc_; +}; +} // namespace + 
+class UtestDynamicShapePartition : public testing::Test { + protected: + void SetUp() {} + + void TearDown() {} +}; + +TEST_F(UtestDynamicShapePartition, single_op_scene_success) { + ComputeGraphPtr graph = std::make_shared("default"); + + NodePtr node1 = + NodeBuilder("node1", CONSTANTOP).AddInputDesc({1, 1, 224, 224}).AddOutputDesc({1, 1, 224, 224}).Build(graph); + NodePtr add_n_node = + NodeBuilder("add_n_node", ADDN).AddInputDesc({1, 1, 224, 224}).AddOutputDesc({1, 1, 224, 224}).Build(graph); + NodePtr node2 = + NodeBuilder("node2", RELU).AddInputDesc({1, 1, 224, 224}).AddOutputDesc({1, 1, 224, 224}).Build(graph); + GraphUtils::AddEdge(node1->GetOutDataAnchor(0), add_n_node->GetInDataAnchor(0)); + GraphUtils::AddEdge(add_n_node->GetOutDataAnchor(0), node2->GetInDataAnchor(0)); + + (void)AttrUtils::SetBool(add_n_node->GetOpDesc(), ATTR_SINGLE_OP_SCENE, true); + + DynamicShapePartitioner partitioner(graph); + EXPECT_EQ(partitioner.Partition(), SUCCESS); +} +} // namespace ge \ No newline at end of file diff --git a/tests/ut/ge/graph/passes/atomic_addr_clean_pass_unittest.cc b/tests/ut/ge/graph/passes/atomic_addr_clean_pass_unittest.cc new file mode 100644 index 00000000..d9d663d9 --- /dev/null +++ b/tests/ut/ge/graph/passes/atomic_addr_clean_pass_unittest.cc @@ -0,0 +1,96 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "graph/passes/atomic_addr_clean_pass.h" +#include "common/op/ge_op_utils.h" +#include "common/types.h" +#include "graph/anchor.h" +#include "graph/attr_value.h" +#include "graph/compute_graph.h" +#include "graph/op_desc.h" +#include "graph/utils/attr_utils.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_utils.h" +#include "inc/pass_manager.h" +using namespace testing; + +namespace ge { +class UtestGraphPassesAtomicAddrCleanPass : public Test { +public: + UtestGraphPassesAtomicAddrCleanPass() { + graph_ = std::make_shared("test"); + } + + NodePtr NewNode(const string &name, const string &type, int input_cnt, int output_cnt) { + OpDescPtr op_desc = std::make_shared(name, type); + for (int i = 0; i < input_cnt; ++i) { + op_desc->AddInputDesc(GeTensorDesc()); + } + for (int i = 0; i < output_cnt; ++i) { + op_desc->AddOutputDesc(GeTensorDesc()); + } + NodePtr node = graph_->AddNode(op_desc); + return node; + } + + int CountOfAtomicCleanNode() { + int node_num = 0; + for (NodePtr &node : graph_->GetDirectNode()) { + if (node->GetType() == ATOMICADDRCLEAN) { + ++node_num; + } + } + return node_num; + } + + ComputeGraphPtr graph_; +}; + +/* + * Data Data Atomic_clean + * | | / | + * relu relu | + * | ==> | | + * relu(atomic) relu(atomic) + * | | + * netoutput netoutput + */ +TEST_F(UtestGraphPassesAtomicAddrCleanPass, pass_run_success) { + auto node1 = NewNode("node1", DATA, 0, 1); + + auto node2 = NewNode("node2", RELU, 1, 1); + auto node3 = NewNode("node3", RELU, 1, 1); + auto op_desc = node3->GetOpDesc(); + vector atomic_input_index = {123, 456}; + AttrUtils::SetListInt(op_desc, "atomic_input_index", atomic_input_index); + + auto node4 = NewNode("node4", NETOUTPUT, 1, 0); + GraphUtils::AddEdge(node1->GetOutDataAnchor(0), node2->GetInDataAnchor(0)); + GraphUtils::AddEdge(node2->GetOutDataAnchor(0), node3->GetInDataAnchor(0)); + GraphUtils::AddEdge(node3->GetOutDataAnchor(0), 
node4->GetInDataAnchor(0)); + AtomicAddrCleanPass atomi_addr_clean_pass; + Status ret = atomi_addr_clean_pass.Run(graph_); + EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ(1, CountOfAtomicCleanNode()); + + auto atomic_clean = graph_->FindNode("atomic_addr_clean"); + EXPECT_NE(atomic_clean, nullptr); + auto out_ctrl_nodes = atomic_clean->GetOutControlNodes(); + EXPECT_EQ(out_ctrl_nodes.size(), 2); +} +} // namespace ge diff --git a/tests/ut/ge/graph/passes/base_pass_unittest.cc b/tests/ut/ge/graph/passes/base_pass_unittest.cc index 56a7077a..9bba5d77 100644 --- a/tests/ut/ge/graph/passes/base_pass_unittest.cc +++ b/tests/ut/ge/graph/passes/base_pass_unittest.cc @@ -67,6 +67,22 @@ class UtestTestPass : public BaseNodePass { names_to_add_repass_.erase(iter); } } + // simulate infershape pass + if(node->GetType() == WHILE){ + bool need_repass = false; + AttrUtils::GetBool(node->GetOpDesc(),"_need_infer_again", need_repass); + if(!OptionExists(kOptimizeAfterSubGraph)){ + return SUCCESS; + } + if(need_repass){ + AttrUtils::SetBool(node->GetOpDesc(),"_need_infer_again", false); + AddImmediateRePassNode(node); + } + else{ + // clear attr on while + node->GetOpDesc()->DelAttr("_need_infer_again"); + } + } return SUCCESS; } void clear() { iter_nodes_.clear(); } @@ -429,6 +445,7 @@ TEST_F(UTESTGraphPassesBasePass, dead_loop) { EXPECT_EQ(test_pass.GetRunTimes(), 1007); } */ + TEST_F(UTESTGraphPassesBasePass, while_loop) { NamesToPass names_to_pass; auto test_pass = UtestTestPass(true); @@ -438,4 +455,69 @@ TEST_F(UTESTGraphPassesBasePass, while_loop) { auto ge_pass = GEPass(graph); EXPECT_EQ(ge_pass.Run(names_to_pass), SUCCESS); } + +/// data1 const +/// \ / +/// while +/// / \ +/// | | +/// cast1 cast2 +ComputeGraphPtr BuildWhileGraph1() { + // build sub graph + auto builder_sub = ut::GraphBuilder("sub"); + auto data_1 = builder_sub.AddNode("data_1", DATA, 0, 1); + auto data_2 = builder_sub.AddNode("data_2", DATA, 0, 1); + auto add = builder_sub.AddNode("add", ADD, 2, 1); + + 
builder_sub.AddDataEdge(data_1, 0, add, 0); + builder_sub.AddDataEdge(data_2, 0, add, 1); + auto sub_graph = builder_sub.GetGraph(); + sub_graph->SetName("while_sub"); + // build root graph + auto builder = ut::GraphBuilder("g1"); + auto data = builder.AddNode("data1", DATA, 0, 1); + auto const_op = builder.AddNode("const_op", CONSTANT, 0, 1); + auto c1 = builder.AddNode("cast1", CAST, 1, 1); + auto c2 = builder.AddNode("cast2", CAST, 1, 1); + // add while op + auto tensor_desc = std::make_shared(); + tensor_desc->SetShape(GeShape({1,1,1,1})); + tensor_desc->SetFormat(FORMAT_ND); + tensor_desc->SetDataType(DT_INT32); + + auto op_desc = std::make_shared("while", WHILE); + for (int i = 0; i < 2; ++i) { + op_desc->AddInputDesc(tensor_desc->Clone()); + } + for (int i = 0; i < 2; ++i) { + op_desc->AddOutputDesc(tensor_desc->Clone()); + } + AttrUtils::SetBool(op_desc,"_need_infer_again", true); + op_desc->AddSubgraphName(sub_graph->GetName()); + op_desc->SetSubgraphInstanceName(0,sub_graph->GetName()); + auto root_graph = builder.GetGraph(); + auto while_op = root_graph->AddNode(op_desc); + + builder.AddDataEdge(data, 0, while_op, 0); + builder.AddDataEdge(const_op, 0, while_op, 1); + builder.AddDataEdge(while_op, 0, c1, 0); + builder.AddDataEdge(while_op, 1, c2, 0); + sub_graph->SetParentGraph(root_graph); + sub_graph->SetParentNode(while_op); + root_graph->AddSubgraph(sub_graph); + return root_graph; +} + +TEST_F(UTESTGraphPassesBasePass, while_infershape) { +NamesToPass names_to_pass; +auto test_pass = UtestTestPass(); +names_to_pass.push_back(std::make_pair("test", &test_pass)); + +auto graph = BuildWhileGraph1(); +auto ge_pass = GEPass(graph); +auto while_node = graph->FindNode("while"); +EXPECT_EQ(while_node->GetOpDesc()->GetSubgraphInstanceNames().size(),1); +EXPECT_EQ(ge_pass.Run(names_to_pass), SUCCESS); +} + } // namespace ge diff --git a/tests/ut/ge/graph/passes/buffer_pool_memory_pass_unittest.cc b/tests/ut/ge/graph/passes/buffer_pool_memory_pass_unittest.cc 
new file mode 100644 index 00000000..a59ca54f --- /dev/null +++ b/tests/ut/ge/graph/passes/buffer_pool_memory_pass_unittest.cc @@ -0,0 +1,591 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "common/ge_inner_error_codes.h" +#include "common/types.h" +#include "graph/manager/graph_var_manager.h" +#include "graph/utils/attr_utils.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/tensor_utils.h" +#include "inc/pass_manager.h" +#include "graph_builder_utils.h" +#include "../utils/buffer_pool_graph_builder.h" +#include "graph/passes/buffer_pool_memory_pass.h" + +namespace ge { +class UtestBufferPoolMemoryPass : public testing::Test { + protected: + void SetUp() {} + + void TearDown() {} +}; + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_normal_success_test) { + ut::BufferPoolGraphBuilder builder("NormalGraph"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraph(); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch1"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add1;0"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch2"); + 
EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add2;1"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add3;2"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch4"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add4;3"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add2;0"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add2"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch5"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add5;0"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add3;1"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add3"); + } +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_normal_graph_with_multi_buffer_pool_success_test) { + ut::BufferPoolGraphBuilder builder("NormalGraphWithMultiBufferPool"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraphWithMultiBufferPool(); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch1"); + EXPECT_NE(prefetch, nullptr); + (void) 
AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add1;0"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch2"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add2;3"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add3;1"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch4"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add4;2"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add3;0"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add3"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch5"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add5;4"); + } +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_contain_one_node_success_test) { + ut::BufferPoolGraphBuilder builder("SerialGraph"); + ge::ComputeGraphPtr graph = builder.BuildSerialGraph(); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch1"); + EXPECT_NE(prefetch, nullptr); + (void) 
AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add1;0"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch2"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add2;1"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add1;2"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add1"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add3;2"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add2;0"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add2"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch4"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add4;0"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add3;1"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add3"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch5"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add5;1"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add4;2"); + auto in_ctrl_nodes = 
prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add4"); + } +} + +TEST_F(UtestBufferPoolMemoryPass, calc_node_with_multi_buffer_pool_input_success_test) { + ut::BufferPoolGraphBuilder builder("GraphWithMultiPrefetch"); + ge::ComputeGraphPtr graph = builder.BuildGraphWithMultiPrefetch(); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch1"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 0); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch2"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add1;0"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 0); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch4"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add2;1"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add1;2"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add1"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch5"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), 
"SendTo;add3;2"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add2;0"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add2"); + } +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_in_different_subgraph_success_test) { + ut::BufferPoolGraphBuilder builder("GraphWithSubgraph"); + ge::ComputeGraphPtr graph = builder.BuildGraphWithSubgraph(); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + std::map all_nodes; + for (auto node : graph->GetAllNodes()) { + EXPECT_NE(node, nullptr); + all_nodes[node->GetName()] = node; + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch1"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add1;0"); + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch2"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add2;1"); + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add3;2"); + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch4"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add4;3"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 0); + } + + { + std::vector event_info; + auto prefetch = 
all_nodes.at("prefetch5"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add5;4"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 1); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "prefetch4"); + } +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_in_different_subgraph_with_inner_dependency_success_test) { + ut::BufferPoolGraphBuilder builder("SubgraphWithInnerDependency"); + ge::ComputeGraphPtr graph = builder.BuildSubgraphWithInnerDependency(); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + std::map all_nodes; + for (auto node : graph->GetAllNodes()) { + EXPECT_NE(node, nullptr); + all_nodes[node->GetName()] = node; + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch1"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add1;0"); + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch2"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add2;1"); + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;add3;2"); + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch4"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + 
EXPECT_EQ(event_info.at(0), "SendTo;add4;3"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 1); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "prefetch3"); + } + + { + std::vector event_info; + auto prefetch = all_nodes.at("prefetch5"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add5;4"); + EXPECT_EQ(event_info.at(1), "RecvFrom;add3;0"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "add3"); + } +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_with_batch_label_success_test) { + ut::BufferPoolGraphBuilder builder("GraphWithMultiBatch"); + ge::ComputeGraphPtr graph = builder.BuildGraphWithMultiBatch(); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + { + std::vector event_info; + auto prefetch = graph->FindNode("batch_label_256/prefetch1"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;batch_label_256/add1;4"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("batch_label_256/prefetch2"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;batch_label_256/add2;5"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("batch_label_256/prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;batch_label_256/add3;6"); + } + + { + std::vector 
event_info; + auto prefetch = graph->FindNode("batch_label_256/prefetch4"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;batch_label_256/add4;7"); + EXPECT_EQ(event_info.at(1), "RecvFrom;batch_label_256/add2;4"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "batch_label_256/add2"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("batch_label_256/prefetch5"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;batch_label_256/add5;4"); + EXPECT_EQ(event_info.at(1), "RecvFrom;batch_label_256/add3;5"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "batch_label_256/add3"); + } +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_node_has_multi_output_success_test) { + ut::BufferPoolGraphBuilder builder("GraphWithMultiOutputPrefetch"); + ge::ComputeGraphPtr graph = builder.BuildGraphWithMultiOutputPrefetch(); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch1"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;prefetch1_memcpy_async;0"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch2"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + 
EXPECT_EQ(event_info.at(0), "SendTo;prefetch2_memcpy_async;1"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 1); + EXPECT_EQ(event_info.at(0), "SendTo;prefetch3_memcpy_async;2"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch4"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;prefetch4_memcpy_async;3"); + EXPECT_EQ(event_info.at(1), "RecvFrom;prefetch2_memcpy_async;0"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "prefetch2_memcpy_async"); + } + + { + std::vector event_info; + auto prefetch = graph->FindNode("prefetch5"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::GetListStr(prefetch->GetOpDesc(), "_event_multiplexing", event_info); + EXPECT_EQ(event_info.size(), 2); + EXPECT_EQ(event_info.at(0), "SendTo;add5;0"); + EXPECT_EQ(event_info.at(1), "RecvFrom;prefetch3_memcpy_async;1"); + auto in_ctrl_nodes = prefetch->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 2); + EXPECT_EQ(in_ctrl_nodes.at(0)->GetName(), "prefetch3_memcpy_async"); + } +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_has_different_size_fail_test) { + ut::BufferPoolGraphBuilder builder("NormalGraph"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraph(); + const int64_t dummy_size = 256; + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + (void) AttrUtils::SetInt(prefetch->GetOpDesc(), "_buffer_pool_size", dummy_size); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, FAILED); +} + +TEST_F(UtestBufferPoolMemoryPass, 
buffer_pool_size_is_not_enough_fail_test) { + ut::BufferPoolGraphBuilder builder("NormalGraph"); + ge::ComputeGraphPtr graph = builder.BuildNormalGraph(); + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + builder.SetPrefetchNodeInfo(prefetch, buffer_pool_id, buffer_pool_size, {buffer_pool_size + 512}); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, FAILED); +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_size_is_not_enough_for_multi_fail_test) { + ut::BufferPoolGraphBuilder builder("GraphWithMultiPrefetch"); + ge::ComputeGraphPtr graph = builder.BuildGraphWithMultiPrefetch(); + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + auto prefetch = graph->FindNode("prefetch3"); + EXPECT_NE(prefetch, nullptr); + builder.SetPrefetchNodeInfo(prefetch, buffer_pool_id, buffer_pool_size, {buffer_pool_size}); + + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, FAILED); +} + +TEST_F(UtestBufferPoolMemoryPass, buffer_pool_node_has_multi_input_output_fail_test) { + ut::BufferPoolGraphBuilder builder("GraphWithMultiInputOutputPrefetch"); + ge::ComputeGraphPtr graph = builder.BuildGraphWithMultiInputOutputPrefetch(); + BufferPoolMemoryPass buffer_pool_mem_pass; + Status ret = buffer_pool_mem_pass.Run(graph); + EXPECT_EQ(ret, FAILED); +} +} // namespace ge diff --git a/tests/ut/ge/graph/passes/cond_branch_v1_unittest.cc b/tests/ut/ge/graph/passes/cond_branch_v1_unittest.cc new file mode 100644 index 00000000..0927aec4 --- /dev/null +++ b/tests/ut/ge/graph/passes/cond_branch_v1_unittest.cc @@ -0,0 +1,125 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/passes/merge_input_memcpy_pass.h" +#include "graph/passes/switch_to_stream_switch_pass.h" +#include "graph/passes/merge_to_stream_merge_pass.h" +#include "graph/passes/attach_stream_label_pass.h" + +#include +#include "graph_builder_utils.h" + +namespace ge { +class UtestCondBranchV1Pass : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +namespace { +/// +/// net_output +/// | +/// merge +/// / \ +/// square add +/// F| T/ T\ +/// switch1 switch2 +/// / \ / \ +/// var1 var2 var3 +/// +ComputeGraphPtr BuildGraph1() { + auto builder = ut::GraphBuilder("g1"); + auto var1 = builder.AddNode("var1", VARIABLEV2, 0, 1); + auto var2 = builder.AddNode("var2", VARIABLEV2, 0, 1, FORMAT_ND, DT_BOOL, {}); + auto var3 = builder.AddNode("var3", VARIABLEV2, 0, 1); + auto switch1 = builder.AddNode("switch1", REFSWITCH, 2, 2); + auto switch2 = builder.AddNode("switch2", SWITCH, 2, 2); + auto add = builder.AddNode("add", ADD, 2, 1); + auto square = builder.AddNode("square", SQUARE, 1, 1); + auto merge = builder.AddNode("merge", MERGE, 2, 2); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 1, 0); + + builder.AddDataEdge(var1, 0, switch1, 0); + builder.AddDataEdge(var2, 0, switch1, 1); + builder.AddDataEdge(var3, 0, switch2, 0); + builder.AddDataEdge(var2, 0, switch2, 1); + builder.AddDataEdge(switch1, 0, square, 0); + builder.AddDataEdge(switch1, 1, add, 0); + builder.AddDataEdge(switch2, 1, add, 1); + builder.AddDataEdge(square, 0, merge, 0); + builder.AddDataEdge(add, 0, merge, 1); + 
builder.AddDataEdge(merge, 0, net_output, 0); + return builder.GetGraph(); +} +} // namespace + +TEST_F(UtestCondBranchV1Pass, common_cond_branch_v1) { + auto graph = BuildGraph1(); + MergeInputMemcpyPass memcpy_pass; + SwitchToStreamSwitchPass switch_pass; + MergeToStreamMergePass merge_pass; + AttachStreamLabelPass label_pass; + EXPECT_EQ(memcpy_pass.Run(graph), SUCCESS); + EXPECT_EQ(switch_pass.Run(graph), SUCCESS); + EXPECT_EQ(merge_pass.Run(graph), SUCCESS); + EXPECT_EQ(label_pass.Run(graph), SUCCESS); + + uint32_t switch_num = 0; + uint32_t merge_num = 0; + uint32_t cast_num = 0; + uint32_t stream_switch_num = 0; + uint32_t memcpy_num = 0; + uint32_t active_num = 0; + uint32_t stream_merge_num = 0; + + for (const auto &node : graph->GetAllNodes()) { + const auto &op_desc = node->GetOpDesc(); + std::string type = op_desc->GetType(); + if (type == SWITCH || type == REFSWITCH) { + switch_num++; + } else if (type == MERGE) { + merge_num++; + } else if (type == CAST) { + cast_num++; + } else if (type == STREAMSWITCH) { + stream_switch_num++; + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)); + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_ACTIVE_LABEL_LIST)); + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_SWITCH_DATA_TYPE)); + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG)); + } else if (type == STREAMMERGE) { + stream_merge_num++; + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)); + } else if ((type == MEMCPYASYNC) || (type == MEMCPYADDRASYNC)) { + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)); + memcpy_num++; + } else if (type == STREAMACTIVE) { + active_num++; + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_ACTIVE_LABEL_LIST)); + } + } + + EXPECT_EQ(switch_num, 0); + EXPECT_EQ(merge_num, 0); + EXPECT_EQ(cast_num, 1); + EXPECT_EQ(stream_switch_num, 2); + EXPECT_EQ(memcpy_num, 2); + EXPECT_EQ(active_num, 3); + EXPECT_EQ(stream_merge_num, 1); +} + +} // namespace ge diff --git a/tests/ut/ge/graph/passes/dimension_adjust_pass_unittest.cc 
b/tests/ut/ge/graph/passes/dimension_adjust_pass_unittest.cc index 79e34a60..41ea5828 100644 --- a/tests/ut/ge/graph/passes/dimension_adjust_pass_unittest.cc +++ b/tests/ut/ge/graph/passes/dimension_adjust_pass_unittest.cc @@ -28,6 +28,7 @@ #include "graph/types.h" #include "graph/utils/graph_utils.h" #include "graph/utils/op_desc_utils.h" +#include "inc/kernel.h" #include "inc/kernel_factory.h" #undef protected #undef private @@ -37,11 +38,27 @@ using namespace testing; namespace ge { +class TestExpandDimKernel : public Kernel { + public: + Status Compute(const NodePtr &node_ptr) override { + return SUCCESS; + } +}; +REGISTER_KERNEL(EXPANDDIMS, TestExpandDimKernel); +class TestExpandDimKernelNotChange : public Kernel { + public: + Status Compute(const NodePtr &node_ptr) override { + return NOT_CHANGED; + } +}; + class UtestGraphPassesDimensionAdjustPass : public testing::Test { protected: void SetUp() {} - void TearDown() {} + void TearDown() { + KernelFactory::Instance().creator_map_.clear(); + } }; TEST_F(UtestGraphPassesDimensionAdjustPass, succ) { @@ -96,8 +113,11 @@ TEST_F(UtestGraphPassesDimensionAdjustPass, succ) { GraphUtils::AddEdge(op_node->GetOutDataAnchor(0), netoutput_node->GetInDataAnchor(0)); std::shared_ptr pass = make_shared(); + NamesToPass names_to_passes; + EXPECT_EQ(4, graph->GetDirectNodesSize()); ge::Status ret = pass->Run(op_node); EXPECT_EQ(SUCCESS, ret); + EXPECT_EQ(2, op_node->GetOwnerComputeGraph()->GetDirectNodesSize()); } TEST_F(UtestGraphPassesDimensionAdjustPass, input_node_is_nullptr) { diff --git a/tests/ut/ge/graph/passes/infershape_pass_unittest.cc b/tests/ut/ge/graph/passes/infershape_pass_unittest.cc index 8fa5b34e..a7628b2e 100644 --- a/tests/ut/ge/graph/passes/infershape_pass_unittest.cc +++ b/tests/ut/ge/graph/passes/infershape_pass_unittest.cc @@ -26,12 +26,9 @@ #include "graph/operator_factory.h" #include "graph/operator_reg.h" #include "graph_builder_utils.h" -#undef protected -#undef private using namespace std; using 
namespace testing; -using namespace ge; namespace ge { class UtestGraphInfershapePass : public testing::Test { protected: @@ -52,4 +49,17 @@ TEST_F(UtestGraphInfershapePass, infershape_pass_failed) { InferShapePass infershape_pass; EXPECT_EQ(infershape_pass.Run(addn_node), GE_GRAPH_INFERSHAPE_FAILED); } + +TEST_F(UtestGraphInfershapePass, delete_need_infer_again) { + auto graph = std::make_shared("test"); + + auto no_op_desc = std::make_shared("No", "NoOp"); + auto no_op_node = graph->AddNode(no_op_desc); + AttrUtils::SetBool(no_op_desc, "_need_infer_again", false); + + InferShapePass infershape_pass; + infershape_pass.options_[kOptimizeAfterSubGraph] = "yes"; + EXPECT_EQ(infershape_pass.Run(no_op_node), SUCCESS); +} + } // namespace ge diff --git a/tests/ut/ge/graph/passes/link_gen_mask_nodes_pass_unittest.cc b/tests/ut/ge/graph/passes/link_gen_mask_nodes_pass_unittest.cc new file mode 100644 index 00000000..511ddece --- /dev/null +++ b/tests/ut/ge/graph/passes/link_gen_mask_nodes_pass_unittest.cc @@ -0,0 +1,111 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/passes/link_gen_mask_nodes_pass.h" + + +#include +#include +#include + +#include "graph_builder_utils.h" + +namespace ge { +class UtestLinkGenMaskNodesPass : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +namespace { + /* + * do_mask1 do_mask2 do_mask3 do_mask4 do_mask5 do_mask6 + * \| \ / |/ |/ \ / |/ \| \ / |/ + * \ \ / | | \ / | | \ / | + * \ genmask1 | | genmask2 | | genmask3 | + * \ | | | | | | | | | | / + * ----------------------const1 and const2-------------------------- + */ +ut::GraphBuilder Graph1Builder() { + ut::GraphBuilder builder = ut::GraphBuilder("g1"); + auto const1 = builder.AddNode("const1", "Const", 0, 1); + auto const2 = builder.AddNode("const2", "Const", 0, 1); + auto gen_mask1 = builder.AddNode("gen_mask1", "DropOutGenMask", 2, 1); + auto gen_mask2 = builder.AddNode("gen_mask2", "DropOutGenMaskV3", 2, 1); + auto gen_mask3 = builder.AddNode("gen_mask3", "DropOutGenMaskV3D", 2, 1); + auto do_mask1 = builder.AddNode("do_mask1", "DropOutDoMask", 3, 1); + auto do_mask2 = builder.AddNode("do_mask2", "DropOutDoMask", 3, 1); + auto do_mask3 = builder.AddNode("do_mask3", "DropOutDoMask", 3, 1); + auto do_mask4 = builder.AddNode("do_mask4", "DropOutDoMask", 3, 1); + auto do_mask5 = builder.AddNode("do_mask5", "DropOutDoMask", 3, 1); + auto do_mask6 = builder.AddNode("do_mask6", "DropOutDoMask", 3, 1); + gen_mask1->GetOpDesc()->SetOpEngineName("DNN_HCCL"); + gen_mask2->GetOpDesc()->SetOpEngineName("DNN_HCCL"); + gen_mask3->GetOpDesc()->SetOpEngineName("DNN_HCCL"); + + builder.AddDataEdge(const1, 0, gen_mask1, 0); + builder.AddDataEdge(const1, 0, gen_mask2, 0); + builder.AddDataEdge(const1, 0, gen_mask3, 0); + builder.AddDataEdge(const1, 0, do_mask1, 0); + builder.AddDataEdge(const1, 0, do_mask2, 0); + builder.AddDataEdge(const1, 0, do_mask3, 0); + builder.AddDataEdge(const1, 0, do_mask4, 0); + builder.AddDataEdge(const1, 0, do_mask5, 0); + builder.AddDataEdge(const1, 0, do_mask6, 0); + 
builder.AddDataEdge(gen_mask1, 0, do_mask1, 1); + builder.AddDataEdge(gen_mask1, 0, do_mask2, 1); + builder.AddDataEdge(gen_mask2, 0, do_mask3, 1); + builder.AddDataEdge(gen_mask2, 0, do_mask4, 1); + builder.AddDataEdge(gen_mask3, 0, do_mask5, 1); + builder.AddDataEdge(gen_mask3, 0, do_mask6, 1); + builder.AddDataEdge(const2, 0, gen_mask1, 1); + builder.AddDataEdge(const2, 0, gen_mask2, 1); + builder.AddDataEdge(const2, 0, gen_mask3, 1); + builder.AddDataEdge(const2, 0, do_mask1, 2); + builder.AddDataEdge(const2, 0, do_mask2, 2); + builder.AddDataEdge(const2, 0, do_mask3, 2); + builder.AddDataEdge(const2, 0, do_mask4, 2); + builder.AddDataEdge(const2, 0, do_mask5, 2); + builder.AddDataEdge(const2, 0, do_mask6, 2); + return builder; +} +} // namespace + + +TEST_F(UtestLinkGenMaskNodesPass, link_gen_mask_nodes_pass_success) { + auto builder = Graph1Builder(); + auto graph = builder.GetGraph(); + + std::map stream_max_parallel_num; + stream_max_parallel_num["DNN_HCCL"] = 1; + LinkGenMaskNodesPass link_pass(stream_max_parallel_num); + Status ret = link_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + + auto gen_mask2 = graph->FindNode("gen_mask2"); + EXPECT_NE(gen_mask2, nullptr); + + auto in_ctrl_nodes = gen_mask2->GetInControlNodes(); + EXPECT_EQ(in_ctrl_nodes.size(), 1); + auto in_ctrl_node = in_ctrl_nodes.at(0); + EXPECT_EQ(in_ctrl_node->GetName(), "gen_mask3"); + + auto out_ctrl_nodes = gen_mask2->GetOutControlNodes(); + EXPECT_EQ(out_ctrl_nodes.size(), 1); + auto out_ctrl_node = out_ctrl_nodes.at(0); + EXPECT_EQ(out_ctrl_node->GetName(), "gen_mask1"); +} +} // namespace ge diff --git a/tests/ut/ge/graph/passes/loop_branch_v1_unittest.cc b/tests/ut/ge/graph/passes/loop_branch_v1_unittest.cc new file mode 100644 index 00000000..0663ac54 --- /dev/null +++ b/tests/ut/ge/graph/passes/loop_branch_v1_unittest.cc @@ -0,0 +1,149 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not 
use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/passes/merge_input_memcpy_pass.h" +#include "graph/passes/next_iteration_pass.h" +#include "graph/passes/switch_to_stream_switch_pass.h" +#include "graph/passes/merge_to_stream_merge_pass.h" +#include "graph/passes/attach_stream_label_pass.h" + +#include +#include "graph_builder_utils.h" + +namespace ge { +class UtestLoopBranchV1Pass : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +namespace { +/// +/// net_output +/// | +/// exit next_iteration +/// \ | | +/// \ add | +/// F\ T/ \ | +/// switch1 enter1 | +/// / | | | +/// loop_cond | const1 | +/// | | | +/// less | | +/// / \ | | +/// enter2 merge ---------| +/// | | +/// const2 enter3 +/// | +/// var +/// +ComputeGraphPtr BuildGraph1() { + auto builder = ut::GraphBuilder("g1"); + auto const1 = builder.AddNode("const1", CONSTANTOP, 0, 1); + auto enter1 = builder.AddNode("enter1", ENTER, 1, 1); + AttrUtils::SetStr(enter1->GetOpDesc(), ENTER_ATTR_FRAME_NAME, "frame_name"); + auto const2 = builder.AddNode("const2", CONSTANTOP, 0, 1); + auto enter2 = builder.AddNode("enter2", ENTER, 1, 1); + AttrUtils::SetStr(enter2->GetOpDesc(), ENTER_ATTR_FRAME_NAME, "frame_name"); + auto var = builder.AddNode("var", VARIABLEV2, 0, 1); + auto enter3 = builder.AddNode("enter3", ENTER, 1, 1); + AttrUtils::SetStr(enter3->GetOpDesc(), ENTER_ATTR_FRAME_NAME, "frame_name"); + auto merge = builder.AddNode("merge", MERGE, 2, 2); + auto less = builder.AddNode("less", LESS, 2, 1); + auto loop_cond = 
builder.AddNode("loop_cond", LOOPCOND, 1, 1, FORMAT_ND, DT_BOOL, {}); + auto switch1 = builder.AddNode("switch1", SWITCH, 2, 2); + auto add = builder.AddNode("add", ADD, 2, 1); + auto next_iteration = builder.AddNode("next_iteration", NEXTITERATION, 1, 1); + auto exit = builder.AddNode("exit", EXIT, 1, 1); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 1, 0); + + builder.AddDataEdge(const1, 0, enter1, 0); + builder.AddDataEdge(const2, 0, enter2, 0); + builder.AddDataEdge(var, 0, enter3, 0); + builder.AddDataEdge(enter3, 0, merge, 0); + builder.AddDataEdge(enter2, 0, less, 0); + builder.AddDataEdge(merge, 0, less, 1); + builder.AddDataEdge(merge, 0, switch1, 0); + builder.AddDataEdge(less, 0, loop_cond, 0); + builder.AddDataEdge(loop_cond, 0, switch1, 1); + builder.AddDataEdge(switch1, 1, add, 0); + builder.AddDataEdge(enter1, 0, add, 1); + builder.AddDataEdge(add, 0, next_iteration, 0); + builder.AddDataEdge(next_iteration, 0, merge, 1); + builder.AddDataEdge(switch1, 0, exit, 0); + builder.AddDataEdge(exit, 0, net_output, 0); + return builder.GetGraph(); +} +} // namespace + +TEST_F(UtestLoopBranchV1Pass, common_loop_branch_v1) { + auto graph = BuildGraph1(); + MergeInputMemcpyPass memcpy_pass; + NextIterationPass loop_pass; + SwitchToStreamSwitchPass switch_pass; + MergeToStreamMergePass merge_pass; + AttachStreamLabelPass label_pass; + EXPECT_EQ(memcpy_pass.Run(graph), SUCCESS); + EXPECT_EQ(loop_pass.Run(graph), SUCCESS); + EXPECT_EQ(switch_pass.Run(graph), SUCCESS); + EXPECT_EQ(merge_pass.Run(graph), SUCCESS); + EXPECT_EQ(label_pass.Run(graph), SUCCESS); + + uint32_t switch_num = 0; + uint32_t merge_num = 0; + uint32_t cast_num = 0; + uint32_t stream_switch_num = 0; + uint32_t active_num = 0; + uint32_t stream_merge_num = 0; + uint32_t memcpy_num = 0; + + for (const auto &node : graph->GetAllNodes()) { + const auto &op_desc = node->GetOpDesc(); + std::string type = op_desc->GetType(); + if (type == SWITCH || type == REFSWITCH) { + switch_num++; + 
} else if (type == MERGE) { + merge_num++; + } else if (type == CAST) { + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)); + cast_num++; + } else if (type == STREAMSWITCH) { + stream_switch_num++; + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)); + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_ACTIVE_LABEL_LIST)); + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_SWITCH_DATA_TYPE)); + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG)); + } else if (type == STREAMMERGE) { + stream_merge_num++; + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)); + } else if (type == STREAMACTIVE) { + active_num++; + EXPECT_TRUE(op_desc->HasAttr(ATTR_NAME_ACTIVE_LABEL_LIST)); + } else if (type == MEMCPYASYNC) { + memcpy_num++; + } + } + + EXPECT_EQ(switch_num, 0); + EXPECT_EQ(merge_num, 0); + EXPECT_EQ(cast_num, 1); + EXPECT_EQ(stream_switch_num, 2); + EXPECT_EQ(active_num, 3); + EXPECT_EQ(stream_merge_num, 1); + EXPECT_EQ(memcpy_num, 0); +} + +} // namespace ge diff --git a/tests/ut/ge/graph/passes/mark_node_unknown_shape_pass_unittest.cc b/tests/ut/ge/graph/passes/mark_node_unknown_shape_pass_unittest.cc new file mode 100644 index 00000000..5157e510 --- /dev/null +++ b/tests/ut/ge/graph/passes/mark_node_unknown_shape_pass_unittest.cc @@ -0,0 +1,115 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#define private public +#include "graph/passes/mark_node_unknown_shape_pass.h" + +#include "common/ge_inner_error_codes.h" +#include "inc/pass_manager.h" +#include "graph/common/local_context.h" +#undef private + +namespace ge { +class UtestMarkNodeUnknownShapePass : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} +public: + NodePtr MakeNode(const ComputeGraphPtr &graph, uint32_t in_num, uint32_t out_num, string name, string type) { + GeTensorDesc test_desc(GeShape(), FORMAT_NCHW, DT_FLOAT); + auto op_desc = std::make_shared(name, type); + for (auto i = 0; i < in_num; ++i) { + op_desc->AddInputDesc(test_desc); + } + for (auto i = 0; i < out_num; ++i) { + op_desc->AddOutputDesc(test_desc); + } + return graph->AddNode(op_desc); + } +/// netoutput1 +/// | +/// conv1 +/// \ / +/// data + void make_graph(const ComputeGraphPtr &graph) { + GetLocalOmgContext().fuzz_compile_flag = true; + auto conv2d_node = MakeNode(graph, 2, 1, "conv1", "Conv2D"); + { + auto data1 = MakeNode(graph, 1, 1, "data", "Data"); + GeTensorDesc tensor_desc(GeShape({1,3,224,224}), FORMAT_NCHW, DT_FLOAT); + data1->GetOpDesc()->UpdateInputDesc(0, tensor_desc); + data1->GetOpDesc()->UpdateOutputDesc(0, tensor_desc); + GraphUtils::AddEdge(data1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(0)); + GraphUtils::AddEdge(data1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(1)); + } + + conv2d_node->GetOpDesc()->SetOpKernelLibName("AIcoreEngine"); + AttrUtils::SetBool(conv2d_node->GetOpDesc(), ATTR_NAME_FUZZ_BUILD_RES_ATTRS, true); + auto output_node = MakeNode(graph, 1, 0, "output1", "NetOutput"); + GraphUtils::AddEdge(conv2d_node->GetOutDataAnchor(0), output_node->GetInDataAnchor(0)); + } +}; + +TEST_F(UtestMarkNodeUnknownShapePass, test_run_with_GE_kernel) { + OpDescPtr op_desc = std::make_shared("Mul", MATMUL); + ComputeGraphPtr graph = std::make_shared("default"); + op_desc->SetOpKernelLibName("GE"); + 
graph->AddNode(op_desc); + PassManager pass; + pass.AddPass("MarkNodeUnknownShapePass", new (std::nothrow) MarkNodeUnknownShapePass); + EXPECT_EQ(pass.Run(graph), SUCCESS); +} + +TEST_F(UtestMarkNodeUnknownShapePass, test_run_without_fuzz_attrs) { + OpDescPtr op_desc = std::make_shared("Mul", MATMUL); + ComputeGraphPtr graph = std::make_shared("default"); + op_desc->SetOpKernelLibName("AIcoreEngine"); + graph->AddNode(op_desc); + GetLocalOmgContext().fuzz_compile_flag = true; + PassManager pass; + pass.AddPass("MarkNodeUnknownShapePass", new (std::nothrow) MarkNodeUnknownShapePass); + EXPECT_EQ(pass.Run(graph), SUCCESS); +} + +TEST_F(UtestMarkNodeUnknownShapePass, test_run_with_fuzz_attrs) { + ComputeGraphPtr graph = std::make_shared("test_graph"); + make_graph(graph); + PassManager pass; + pass.AddPass("MarkNodeUnknownShapePass", new (std::nothrow) MarkNodeUnknownShapePass); + EXPECT_EQ(pass.Run(graph), SUCCESS); + EXPECT_EQ(graph->GetAllNodes().size(), 3); + for (const auto &node : graph->GetAllNodes()) { + if (node->GetName() == "conv1") { + auto op_desc = node->GetOpDesc(); + EXPECT_NE(op_desc, nullptr); + for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { + auto input_desc = op_desc->MutableInputDesc(static_cast(i)); + EXPECT_TRUE(input_desc->GetShape().GetDim(0) == -2); + } + for (auto &output_desc : op_desc->GetAllOutputsDescPtr()) { + EXPECT_NE(output_desc, nullptr); + EXPECT_TRUE(output_desc->GetShape().GetDim(0) == -2); + } + } + } +} + +} // namespace ge diff --git a/tests/ut/ge/graph/passes/net_output_pass_unittest.cc b/tests/ut/ge/graph/passes/net_output_pass_unittest.cc index 031985f3..ac6cd63a 100644 --- a/tests/ut/ge/graph/passes/net_output_pass_unittest.cc +++ b/tests/ut/ge/graph/passes/net_output_pass_unittest.cc @@ -631,6 +631,23 @@ TEST_F(UtestGraphPassesNetOutputPass, no_output_no_target_no_retval_success) { EXPECT_EQ(status, ge::SUCCESS); } +TEST_F(UtestGraphPassesNetOutputPass, no_output_no_target_no_retval_no_outnodes_success) { + 
ge::ComputeGraphPtr compute_graph = build_graph(); + + ge::PassManager pass_managers; + pass_managers.AddPass("", new (std::nothrow) NetOutputPass); + Status status = pass_managers.Run(compute_graph); + EXPECT_EQ(status, ge::SUCCESS); + + NodePtr net_out_node = compute_graph->FindNode(NODE_NAME_NET_OUTPUT); + EXPECT_NE(net_out_node, nullptr); + EXPECT_EQ(net_out_node->GetInControlNodes().size(), 2); + + int stream_label = -1; + EXPECT_TRUE(ge::AttrUtils::GetInt(net_out_node->GetOpDesc(), ATTR_NAME_TRUE_BRANCH_STREAM, stream_label)); + EXPECT_EQ(stream_label, 0); +} + TEST_F(UtestGraphPassesNetOutputPass, user_out_node_success) { ge::ComputeGraphPtr compute_graph = build_graph(); diff --git a/tests/ut/ge/graph/passes/parallel_group_pass_unittest.cc b/tests/ut/ge/graph/passes/parallel_group_pass_unittest.cc new file mode 100644 index 00000000..d5b1db41 --- /dev/null +++ b/tests/ut/ge/graph/passes/parallel_group_pass_unittest.cc @@ -0,0 +1,304 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#define private public + +#include "common/ge_inner_error_codes.h" +#include "inc/pass_manager.h" +#include "utils/graph_utils.h" +#include "graph/passes/parallel_group_pass.h" +#undef private + +namespace ge { +namespace { + +class UtestGraphPassesParallelGgroupPass : public testing::Test { + protected: + UtestGraphPassesParallelGgroupPass() { + graph_ = std::make_shared("test"); + sub_graph_ = std::make_shared("test_subgraph"); + vector shape_vec{1, 1, 1, 1}; + GeShape shape = GeShape(shape_vec); + default_tensor_desc_ = std::make_shared(); + default_tensor_desc_->SetShape(shape); + default_tensor_desc_->SetFormat(FORMAT_NCHW); + default_tensor_desc_->SetDataType(DT_FLOAT); + } + + NodePtr NewNode(const std::string &name, const std::string &type, + int input_cnt, int output_cnt, bool isSubgraph = false) { + OpDescPtr op_desc = std::make_shared(name, type); + for (int i = 0; i < input_cnt; ++i) { + op_desc->AddInputDesc(default_tensor_desc_->Clone()); + } + + for (int i = 0; i < output_cnt; ++i) { + op_desc->AddOutputDesc(default_tensor_desc_->Clone()); + } + NodePtr node = nullptr; + if (isSubgraph) { + node = sub_graph_->AddNode(op_desc); + (void)node->SetOwnerComputeGraph(sub_graph_); + } else { + node = graph_->AddNode(op_desc); + (void)node->SetOwnerComputeGraph(graph_); + } + + return node; + } + + void BuildDefaultGraph() { + /// input + /// \ + /// sqrt pred + /// \ / + /// cast + /// / \ + /// switch_t switch_f + /// | | + /// F T + /// | | + /// Merge + /// | + /// relu + /// | + /// sqrt1 + input_node_ = NewNode("input", RELU, 0, 1); + sqrt_node_ = NewNode("sqrt", SQRT, 1, 1); + pred_node_ = NewNode("pred", GREATER, 2, 1); + cast_node_ = NewNode("cast", CAST, 2, 2); + AttrUtils::SetStr(input_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + + switch_node_t = NewNode("switch_t", STREAMSWITCH, 1, 1); + AttrUtils::SetBool(switch_node_t->GetOpDesc(), ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, true); + switch_node_f = 
NewNode("switch_f", STREAMSWITCH, 1, 1); + AttrUtils::SetBool(switch_node_f->GetOpDesc(), ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, false); + output_false_node_ = NewNode("false_output", RELU, 1, 1); + AttrUtils::SetStr(output_false_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + output_true_node_ = NewNode("true_output", RELU, 1, 1); + AttrUtils::SetStr(output_true_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + merge_node_ = NewNode("merge", STREAMMERGE, 2, 1); + relu_node_ = NewNode("relu", RELU, 1, 1); + sqrt_node1_ = NewNode("sqrt1", SQRT, 1, 1); + AttrUtils::SetStr(sqrt_node1_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + + GraphUtils::AddEdge(input_node_->GetOutDataAnchor(0), sqrt_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(pred_node_->GetOutDataAnchor(0), cast_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(sqrt_node_->GetOutDataAnchor(0), cast_node_->GetInDataAnchor(1)); + GraphUtils::AddEdge(cast_node_->GetOutDataAnchor(0), switch_node_t->GetInDataAnchor(0)); + GraphUtils::AddEdge(cast_node_->GetOutDataAnchor(1), switch_node_f->GetInDataAnchor(0)); + GraphUtils::AddEdge(switch_node_f->GetOutDataAnchor(0), output_false_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(switch_node_t->GetOutDataAnchor(0), output_true_node_->GetInDataAnchor(0)); + + GraphUtils::AddEdge(output_false_node_->GetOutDataAnchor(0), merge_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(output_true_node_->GetOutDataAnchor(0), merge_node_->GetInDataAnchor(1)); + GraphUtils::AddEdge(merge_node_->GetOutDataAnchor(0), relu_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(relu_node_->GetOutDataAnchor(0), sqrt_node1_->GetInDataAnchor(0)); + + output_false_node_->GetOpDesc()->SetIsInputConst({false}); + output_true_node_->GetOpDesc()->SetIsInputConst({false}); + } + + void BuildDefaultGraph1() { + /// input + /// \ + /// sqrt pred + /// \ / + /// Switch + /// | | + /// ----F T---- + /// \ | / \ + /// \ Merge1 Merge2 + /// \_________| + input_node_ = NewNode("input", RELU, 
0, 1); + AttrUtils::SetStr(input_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + pred_node_ = NewNode("pred", GREATER, 2, 1); + sqrt_node_ = NewNode("sqrt", SQRT, 1, 1); + cast_node_ = NewNode("cast", CAST, 2, 2); + + switch_node_t = NewNode("switch_t", STREAMSWITCH, 1, 1); + AttrUtils::SetBool(switch_node_t->GetOpDesc(), ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, true); + switch_node_f = NewNode("switch_f", STREAMSWITCH, 1, 1); + AttrUtils::SetBool(switch_node_f->GetOpDesc(), ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, false); + output_false_node_ = NewNode("false_output", RELU, 1, 2); + AttrUtils::SetStr(output_false_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + output_true_node_ = NewNode("true_output", RELU, 1, 2); + AttrUtils::SetStr(output_true_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + merge_node_ = NewNode("merge", STREAMMERGE, 2, 1); + merge_node1_ = NewNode("merge1", STREAMMERGE, 2, 1); + + GraphUtils::AddEdge(input_node_->GetOutDataAnchor(0), sqrt_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(pred_node_->GetOutDataAnchor(0), cast_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(sqrt_node_->GetOutDataAnchor(0), cast_node_->GetInDataAnchor(1)); + GraphUtils::AddEdge(cast_node_->GetOutDataAnchor(0), switch_node_t->GetInDataAnchor(0)); + GraphUtils::AddEdge(cast_node_->GetOutDataAnchor(1), switch_node_f->GetInDataAnchor(0)); + GraphUtils::AddEdge(switch_node_f->GetOutDataAnchor(0), output_false_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(switch_node_t->GetOutDataAnchor(0), output_true_node_->GetInDataAnchor(0)); + + GraphUtils::AddEdge(output_false_node_->GetOutDataAnchor(0), merge_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(output_true_node_->GetOutDataAnchor(0), merge_node_->GetInDataAnchor(1)); + GraphUtils::AddEdge(output_false_node_->GetOutDataAnchor(1), merge_node1_->GetInDataAnchor(0)); + GraphUtils::AddEdge(output_true_node_->GetOutDataAnchor(1), merge_node1_->GetInDataAnchor(1)); + + 
output_false_node_->GetOpDesc()->SetIsInputConst({false}); + output_true_node_->GetOpDesc()->SetIsInputConst({false}); + } + + + void BuildDefaultGraph2() { + /// input input1 + /// \ \ + /// sqrt pred sqrt1 pred1 + /// \ / \ / + /// Switch Switch1 + /// | | _______| + /// | | / + /// ____F T____ + /// \ | / \ + /// \ Merge1 Merge2 + /// \__________| + input_node_ = NewNode("input", RELU, 0, 2); + input_node1_ = NewNode("input_1", RELU, 0, 2); + sqrt_node_ = NewNode("sqrt", SQRT, 1, 1); + pred_node_ = NewNode("pred", GREATER, 2, 1); + sqrt_node1_ = NewNode("sqrt_1", SQRT, 1, 1); + pred_node1_ = NewNode("pred_1", LESS, 2, 1); + cast_node_ = NewNode("cast", CAST, 2, 2); + cast_node1_ = NewNode("cast_1", CAST, 2, 2); + AttrUtils::SetStr(input_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + AttrUtils::SetStr(input_node1_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "2"); + + switch_node_t = NewNode("switch_t", STREAMSWITCH, 1, 1); + AttrUtils::SetBool(switch_node_t->GetOpDesc(), ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, true); + switch_node_f = NewNode("switch_f", STREAMSWITCH, 1, 1); + AttrUtils::SetBool(switch_node_f->GetOpDesc(), ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, false); + switch_node1_t = NewNode("switch1_t", STREAMSWITCH, 1, 1); + AttrUtils::SetBool(switch_node1_t->GetOpDesc(), ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, true); + switch_node1_f = NewNode("switch1_f", STREAMSWITCH, 1, 1); + AttrUtils::SetBool(switch_node1_f->GetOpDesc(), ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, false); + output_false_node_ = NewNode("false_output", RELU, 2, 2); + AttrUtils::SetStr(output_false_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + output_true_node_ = NewNode("true_output", RELU, 2, 2); + AttrUtils::SetStr(output_true_node_->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "2"); + merge_node_ = NewNode("merge", STREAMMERGE, 2, 1); + merge_node1_ = NewNode("merge1", STREAMMERGE, 2, 1); + + GraphUtils::AddEdge(input_node_->GetOutDataAnchor(0), sqrt_node_->GetInDataAnchor(0)); + 
GraphUtils::AddEdge(pred_node_->GetOutDataAnchor(0), cast_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(sqrt_node_->GetOutDataAnchor(0), cast_node_->GetInDataAnchor(1)); + GraphUtils::AddEdge(cast_node_->GetOutDataAnchor(0), switch_node_t->GetInDataAnchor(0)); + GraphUtils::AddEdge(cast_node_->GetOutDataAnchor(1), switch_node_f->GetInDataAnchor(0)); + GraphUtils::AddEdge(switch_node_f->GetOutDataAnchor(0), output_false_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(switch_node_t->GetOutDataAnchor(0), output_true_node_->GetInDataAnchor(0)); + + GraphUtils::AddEdge(input_node1_->GetOutDataAnchor(0), sqrt_node1_->GetInDataAnchor(0)); + GraphUtils::AddEdge(pred_node1_->GetOutDataAnchor(0), cast_node1_->GetInDataAnchor(0)); + GraphUtils::AddEdge(sqrt_node1_->GetOutDataAnchor(0), cast_node1_->GetInDataAnchor(1)); + GraphUtils::AddEdge(cast_node1_->GetOutDataAnchor(0), switch_node1_t->GetInDataAnchor(0)); + GraphUtils::AddEdge(cast_node1_->GetOutDataAnchor(1), switch_node1_f->GetInDataAnchor(0)); + GraphUtils::AddEdge(switch_node1_f->GetOutDataAnchor(0), output_false_node_->GetInDataAnchor(1)); + GraphUtils::AddEdge(switch_node1_t->GetOutDataAnchor(0), output_true_node_->GetInDataAnchor(1)); + + GraphUtils::AddEdge(output_false_node_->GetOutDataAnchor(0), merge_node_->GetInDataAnchor(0)); + GraphUtils::AddEdge(output_true_node_->GetOutDataAnchor(0), merge_node_->GetInDataAnchor(1)); + GraphUtils::AddEdge(output_false_node_->GetOutDataAnchor(1), merge_node1_->GetInDataAnchor(0)); + GraphUtils::AddEdge(output_true_node_->GetOutDataAnchor(1), merge_node1_->GetInDataAnchor(1)); + + output_false_node_->GetOpDesc()->SetIsInputConst({false}); + output_true_node_->GetOpDesc()->SetIsInputConst({false}); + } + + ComputeGraphPtr graph_; + ComputeGraphPtr sub_graph_; + GeTensorDescPtr default_tensor_desc_; + ParallelGroupPass pass_; + NodePtr pred_node_; + NodePtr pred_node1_; + NodePtr cast_node_; + NodePtr cast_node1_; + NodePtr sqrt_node_; + NodePtr sqrt_node1_; + NodePtr 
input_node_; + NodePtr input_node1_; + NodePtr switch_node_t; + NodePtr switch_node_f; + NodePtr switch_node1_t; + NodePtr switch_node1_f; + NodePtr output_false_node_; + NodePtr output_true_node_; + NodePtr merge_node_; + NodePtr merge_node1_; + NodePtr relu_node_; +}; + +TEST_F(UtestGraphPassesParallelGgroupPass, null_graph) { + ComputeGraphPtr graph = nullptr; + auto ret = pass_.Run(graph); + EXPECT_EQ(ret, PARAM_INVALID); +} + +TEST_F(UtestGraphPassesParallelGgroupPass, normal_graph) { + BuildDefaultGraph(); + auto ret = pass_.Run(graph_); + EXPECT_EQ(ret, GRAPH_SUCCESS); + EXPECT_EQ(true, input_node_->GetOutControlAnchor()->IsLinkedWith(cast_node_->GetInControlAnchor())); + EXPECT_EQ(true, merge_node_->GetOutControlAnchor()->IsLinkedWith(sqrt_node1_->GetInControlAnchor())); + EXPECT_EQ(false, output_false_node_->GetOutControlAnchor()->IsLinkedWith(output_true_node_->GetInControlAnchor())); +} + +TEST_F(UtestGraphPassesParallelGgroupPass, normal_graph1) { + BuildDefaultGraph1(); + auto ret = pass_.Run(graph_); + EXPECT_EQ(ret, GRAPH_SUCCESS); + EXPECT_EQ(true, input_node_->GetOutControlAnchor()->IsLinkedWith(cast_node_->GetInControlAnchor())); +} + +TEST_F(UtestGraphPassesParallelGgroupPass, normal_graph2) { + BuildDefaultGraph2(); + auto ret = pass_.Run(graph_); + EXPECT_EQ(ret, GRAPH_SUCCESS); + EXPECT_EQ(true, input_node_->GetOutControlAnchor()->IsLinkedWith(cast_node_->GetInControlAnchor())); + EXPECT_EQ(true, input_node1_->GetOutControlAnchor()->IsLinkedWith(cast_node1_->GetInControlAnchor())); +} + +TEST_F(UtestGraphPassesParallelGgroupPass, normal_subgraph) { + BuildDefaultGraph1(); + NodePtr input_node1 = NewNode("input1", RELU, 0, 1, true); + NodePtr input_node2 = NewNode("input2", RELU, 0, 1, true); + NodePtr add = NewNode("add", ADD, 2, 1, true); + AttrUtils::SetStr(input_node1->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + AttrUtils::SetStr(input_node2->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, "1"); + + sub_graph_->SetParentNode(input_node_); + 
sub_graph_->SetParentGraph(graph_); + auto ret = graph_->AddSubgraph(sub_graph_->GetName(), sub_graph_); + EXPECT_EQ(ret, GRAPH_SUCCESS); + ret = input_node_->GetOpDesc()->AddSubgraphName(sub_graph_->GetName()); + EXPECT_EQ(ret, GRAPH_SUCCESS); + ret = input_node_->GetOpDesc()->SetSubgraphInstanceName(0, sub_graph_->GetName()); + EXPECT_EQ(ret, GRAPH_SUCCESS); + ret = pass_.Run(sub_graph_); + EXPECT_EQ(ret, GRAPH_SUCCESS); + ret = pass_.Run(graph_); + EXPECT_EQ(ret, GRAPH_SUCCESS); +} + +} // namespace +} // namespace ge diff --git a/tests/ut/ge/graph/passes/reshape_recovery_pass_unittest.cc b/tests/ut/ge/graph/passes/reshape_recovery_pass_unittest.cc new file mode 100644 index 00000000..af60021c --- /dev/null +++ b/tests/ut/ge/graph/passes/reshape_recovery_pass_unittest.cc @@ -0,0 +1,69 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/passes/reshape_recovery_pass.h" + +#include +#include +#include + +#include "graph_builder_utils.h" + +namespace ge { +class UtestReshapeRecoveryPass : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +namespace { +/// netoutput1 +/// | \ +///transdata1 \ +/// | \ +/// | transdata2 +/// | / +/// var1 const1 +ut::GraphBuilder Graph1Builder() { + ut::GraphBuilder builder = ut::GraphBuilder("g2"); + auto var1 = builder.AddNode("var1", "Variable", 0, 1, FORMAT_ND, DT_FLOAT, {-1}); + auto const1 = builder.AddNode("const1", "Const", 0, 1, FORMAT_ND, DT_FLOAT, {1, 1, 224, 224}); + auto transdata2 = builder.AddNode("transdata2", "Transdata", 1, 1, FORMAT_ND, DT_FLOAT, {224, 224}); + auto transdata1 = builder.AddNode("transdata1", "Transdata", 1, 1, FORMAT_ND, DT_FLOAT, {224, 224}); + auto netoutput1 = builder.AddNode("netoutput1", "Netoutput", 2, 0); + + builder.AddDataEdge(var1, 0, transdata1, 0); + builder.AddDataEdge(const1, 0, transdata2, 0); + builder.AddDataEdge(transdata2, 0, netoutput1, 1); + builder.AddDataEdge(transdata1, 0, netoutput1, 0); + + return builder; +} +} // namespace + +TEST_F(UtestReshapeRecoveryPass, reshape_recovery_with_dynamic_shape) { + auto builder = Graph1Builder(); + auto graph = builder.GetGraph(); + ReshapeRecoveryPass reshape_recovery_pass; + EXPECT_EQ(graph->GetDirectNodesSize(),5); + Status ret = reshape_recovery_pass.Run(graph); + EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ(graph->GetDirectNodesSize(),8); + + auto reshape1 = graph->FindNode("Reshape_ReshapeRecoveryPass_0"); + EXPECT_NE(reshape1, nullptr); +} +} // namespace ge diff --git a/tests/ut/ge/graph/passes/switch_dead_branch_elimination_unittest.cc b/tests/ut/ge/graph/passes/switch_dead_branch_elimination_unittest.cc new file mode 100644 index 00000000..c3f21251 --- /dev/null +++ b/tests/ut/ge/graph/passes/switch_dead_branch_elimination_unittest.cc @@ -0,0 +1,163 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "common/ge_inner_error_codes.h" +#include "graph/passes/switch_dead_branch_elimination.h" +#include "graph_builder_utils.h" + +namespace ge { +class UtestSwitchDeadBranchElimination : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +namespace { +/* + * data1 const1 + * \ / + * case1 + * | + * relu1 + * | + * netoutput + */ +ut::GraphBuilder ParentGraphBuilder() { + ut::GraphBuilder builder = ut::GraphBuilder("g1"); + auto data1 = builder.AddNode("data1", "Data", 0, 1); + auto const1 = builder.AddNode("const1", "Const", 0, 1); + auto case1 = builder.AddNode("case1", CASE, 2, 1); + auto relu1 = builder.AddNode("relu1", "Relu", 1, 1); + auto netoutput = builder.AddNode("netoutput", NETOUTPUT, 1, 0); + + int32_t weight[1] = {1}; + GeTensorDesc weight_desc(GeShape({1}), FORMAT_NHWC, DT_INT32); + GeTensorPtr tensor = std::make_shared(weight_desc, (uint8_t *)weight, sizeof(weight)); + OpDescUtils::SetWeights(const1, {tensor}); + + builder.AddDataEdge(data1, 0, case1, 0); + builder.AddDataEdge(const1, 0, case1, 1); + builder.AddDataEdge(case1, 0, relu1, 0); + builder.AddDataEdge(relu1, 0, netoutput, 0); + return builder; +} + +/* + * data1 data2 + * \ / + * switch + * / \ + * relu1 relu2 + * \ / + * merge + * | + * netoutput + */ +ut::GraphBuilder SwitchSubgraphBuilder(string graph_name, uint32_t num) { + ut::GraphBuilder builder = 
ut::GraphBuilder(graph_name); + + string data1_name = "data1_" + std::to_string(num); + auto data1 = builder.AddNode(data1_name, "Data", 0, 1); + auto data1_desc = data1->GetOpDesc(); + EXPECT_NE(data1_desc, nullptr); + AttrUtils::SetInt(data1_desc, "_parent_node_index", 0); + + string data2_name = "data2_" + std::to_string(num); + auto data2 = builder.AddNode(data2_name, "Data", 0, 1); + auto data2_desc = data2->GetOpDesc(); + EXPECT_NE(data2_desc, nullptr); + AttrUtils::SetInt(data2_desc, "_parent_node_index", 1); + + string switch_name = "switch_" + std::to_string(num); + auto switch1 = builder.AddNode(switch_name, "Switch", 2, 2); + + string relu1_name = "relu1_" + std::to_string(num); + auto relu1 = builder.AddNode(relu1_name, "Relu", 1, 1); + + string relu2_name = "relu2_" + std::to_string(num); + auto relu2 = builder.AddNode(relu2_name, "Relu", 1, 1); + + string merge_name = "merge_" + std::to_string(num); + auto merge = builder.AddNode(merge_name, "Merge", 2, 1); + + string output_name = "output_" + std::to_string(num); + auto netoutput = builder.AddNode(output_name, NETOUTPUT, 1, 0); + + builder.AddDataEdge(data1, 0, switch1, 0); + builder.AddDataEdge(data2, 0, switch1, 1); + builder.AddDataEdge(switch1, 0, relu1, 0); + builder.AddDataEdge(switch1, 1, relu2, 0); + builder.AddDataEdge(relu1, 0, merge, 0); + builder.AddDataEdge(relu2, 0, merge, 1); + builder.AddDataEdge(merge, 0, netoutput, 0); + + return builder; +} + +void AddCaseSubgraph(ComputeGraphPtr &parent_graph, uint32_t branch_num) { + auto case_node = parent_graph->FindNode("case1"); + EXPECT_NE(case_node, nullptr); + + for (uint32_t i = 0; i < branch_num; ++i) { + string name = "Branch_Graph_" + std::to_string(i); + + auto builder_subgraph = SwitchSubgraphBuilder(name, i); + auto switch_subgraph = builder_subgraph.GetGraph(); + + case_node->GetOpDesc()->AddSubgraphName(switch_subgraph->GetName()); + case_node->GetOpDesc()->SetSubgraphInstanceName(i, switch_subgraph->GetName()); + + 
switch_subgraph->SetParentNode(case_node); + switch_subgraph->SetParentGraph(parent_graph); + EXPECT_EQ(parent_graph->AddSubgraph(switch_subgraph->GetName(), switch_subgraph), GRAPH_SUCCESS); + } +} +} // namespace + + +TEST_F(UtestSwitchDeadBranchElimination, switch_dead_branch_elimination_across_case_success) { + auto builder = ParentGraphBuilder(); + auto parent_graph = builder.GetGraph(); + + AddCaseSubgraph(parent_graph, 2); + auto subgraphs = parent_graph->GetAllSubgraphs(); + EXPECT_EQ(subgraphs.size(), 2); + + SwitchDeadBranchElimination switch_pass; + for (auto &subgraph : subgraphs) { + auto switch_node = subgraph->FindFirstNodeMatchType("Switch"); + if (switch_node != nullptr) { + EXPECT_EQ(switch_pass.Run(switch_node), SUCCESS); + } + } + + auto all_nodes = parent_graph->GetAllNodes(); + EXPECT_EQ(all_nodes.size(), 17); + + for (auto &subgraph : subgraphs) { + EXPECT_EQ(subgraph->GetDirectNode().size(), 6); + EXPECT_EQ(subgraph->FindFirstNodeMatchType("Switch"), nullptr); + auto merge_node = subgraph->FindFirstNodeMatchType("Merge"); + EXPECT_NE(merge_node, nullptr); + auto merge_innode = merge_node->GetInDataNodes(); + EXPECT_EQ(merge_innode.size(), 1); + } +} +} // namespace ge diff --git a/tests/ut/ge/graph/preprocess/graph_preprocess_unittest.cc b/tests/ut/ge/graph/preprocess/graph_preprocess_unittest.cc index 2f149761..ff49f34c 100644 --- a/tests/ut/ge/graph/preprocess/graph_preprocess_unittest.cc +++ b/tests/ut/ge/graph/preprocess/graph_preprocess_unittest.cc @@ -50,6 +50,28 @@ ComputeGraphPtr BuildGraph1(){ return builder.GetGraph(); } +ComputeGraphPtr BuildGraph2() { + auto builder = ut::GraphBuilder("g2"); + auto data1 = builder.AddNode("data1", DATA, 1, 1, FORMAT_NCHW, DT_FLOAT, std::vector({22, -1})); + ge::AttrUtils::SetStr(data1->GetOpDesc(), ATTR_ATC_USER_DEFINE_DATATYPE, "DT_INT8"); + auto data_opdesc = data1->GetOpDesc(); + AttrUtils::SetInt(data_opdesc, ATTR_NAME_INDEX, 0); + + data1->UpdateOpDesc(data_opdesc); + return 
builder.GetGraph(); +} + +ComputeGraphPtr BuildGraph3() { + auto builder = ut::GraphBuilder("g3"); + auto data1 = builder.AddNode("data1", DATA, 1, 1, FORMAT_NCHW, DT_FLOAT); + ge::AttrUtils::SetStr(data1->GetOpDesc(), ATTR_ATC_USER_DEFINE_DATATYPE, "DT_INT8"); + auto data_opdesc = data1->GetOpDesc(); + AttrUtils::SetInt(data_opdesc, ATTR_NAME_INDEX, 0); + + data1->UpdateOpDesc(data_opdesc); + return builder.GetGraph(); +} + TEST_F(UtestGraphPreproces, test_dynamic_input_shape_parse) { ge::GraphPrepare graph_prepare; graph_prepare.compute_graph_ = BuildGraph1(); @@ -74,4 +96,26 @@ TEST_F(UtestGraphPreproces, test_dynamic_input_shape_parse) { EXPECT_EQ(result_shape.GetDim(i), expect_shape.at(i)); } } + +TEST_F(UtestGraphPreproces, test_check_user_input) { + ge::GraphPrepare graph_prepare; + graph_prepare.compute_graph_ = BuildGraph1(); + + vector dim = {2, -3}; + GeTensor tensor; + tensor.SetTensorDesc(GeTensorDesc(GeShape(dim))); + std::vector user_input; + user_input.emplace_back(tensor); + + Status ret = graph_prepare.CheckUserInput(user_input); + EXPECT_EQ(ret, GE_GRAPH_INIT_FAILED); +} + +TEST_F(UtestGraphPreproces, test_update_input_output1) { + ge::GraphPrepare graph_prepare; + graph_prepare.compute_graph_ = BuildGraph3(); + + Status ret = graph_prepare.UpdateInputOutputByOptions(); + EXPECT_EQ(ret, SUCCESS); +} } \ No newline at end of file diff --git a/tests/ut/ge/graph/utils/buffer_pool_graph_builder.cc b/tests/ut/ge/graph/utils/buffer_pool_graph_builder.cc new file mode 100644 index 00000000..dd52f287 --- /dev/null +++ b/tests/ut/ge/graph/utils/buffer_pool_graph_builder.cc @@ -0,0 +1,978 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "buffer_pool_graph_builder.h" +#include "common/ge_inner_error_codes.h" +#include "common/types.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/attr_utils.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/tensor_utils.h" +#include "graph/utils/graph_utils.h" + +namespace ge { +namespace ut { +BufferPoolGraphBuilder::BufferPoolGraphBuilder(const std::string &name) { + graph_name_ = name; +} + +BufferPoolGraphBuilder::InnerGraphBuilder::InnerGraphBuilder(const std::string &name) { + graph_ = std::make_shared(name); + EXPECT_NE(graph_, nullptr); +} + +NodePtr BufferPoolGraphBuilder::InnerGraphBuilder::AddNode(const std::string &name, const std::string &type, + int in_cnt, int out_cnt, + Format format, DataType data_type, + std::vector shape) { + auto tensor_desc = std::make_shared(); + EXPECT_NE(tensor_desc, nullptr); + tensor_desc->SetShape(GeShape(std::move(shape))); + tensor_desc->SetFormat(format); + tensor_desc->SetDataType(data_type); + auto op_desc = std::make_shared(name, type); + EXPECT_NE(op_desc, nullptr); + for (int i = 0; i < in_cnt; ++i) { + op_desc->AddInputDesc(tensor_desc->Clone()); + } + for (int i = 0; i < out_cnt; ++i) { + op_desc->AddOutputDesc(tensor_desc->Clone()); + } + return graph_->AddNode(op_desc); +} + +void BufferPoolGraphBuilder::InnerGraphBuilder::AddDataEdge(NodePtr &src_node, int src_idx, + NodePtr &dst_node, int dst_idx) { + EXPECT_NE(src_node, nullptr); + EXPECT_NE(dst_node, nullptr); + GraphUtils::AddEdge(src_node->GetOutDataAnchor(src_idx), 
dst_node->GetInDataAnchor(dst_idx)); +} + +void BufferPoolGraphBuilder::InnerGraphBuilder::AddControlEdge(NodePtr &src_node, NodePtr &dst_node) { + EXPECT_NE(src_node, nullptr); + EXPECT_NE(dst_node, nullptr); + GraphUtils::AddEdge(src_node->GetOutControlAnchor(), dst_node->GetInControlAnchor()); +} + +void BufferPoolGraphBuilder::SetBufferPool(NodePtr &node, int64_t pool_id, int64_t pool_size, + const std::string &batch_label) { + EXPECT_NE(node, nullptr); + (void) AttrUtils::SetInt(node->GetOpDesc(), ATTR_NAME_BUFFER_POOL_ID, pool_id); + (void) AttrUtils::SetInt(node->GetOpDesc(), ATTR_NAME_BUFFER_POOL_SIZE, pool_size); + if (!batch_label.empty()) { + (void) AttrUtils::SetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label); + } +} + +void BufferPoolGraphBuilder::SetBatchLabel(NodePtr &node, const std::string &batch_label) { + EXPECT_NE(node, nullptr); + (void) AttrUtils::SetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label); + +} + +void BufferPoolGraphBuilder::SetOutputMemSize(NodePtr &node, const std::vector &mem_size) { + EXPECT_NE(node, nullptr); + EXPECT_NE(node->GetOpDesc(), nullptr); + size_t output_size = node->GetOpDesc()->GetOutputsSize(); + EXPECT_EQ(output_size, mem_size.size()); + for (size_t i = 0; i < output_size; ++i) { + auto output_op_desc = node->GetOpDesc()->MutableOutputDesc(i); + ge::TensorUtils::SetSize(*output_op_desc, mem_size[i]); + } +} + +void BufferPoolGraphBuilder::SetWorkSpaceMemSize(NodePtr &node, const std::vector &ws_bytes) { + EXPECT_NE(node, nullptr); + EXPECT_NE(node->GetOpDesc(), nullptr); + node->GetOpDesc()->SetWorkspaceBytes(ws_bytes); +} + +void BufferPoolGraphBuilder::SetPrefetchNodeInfo(NodePtr &node, int64_t pool_id, int64_t pool_size, + const std::vector &mem_size, + const std::vector &ws_bytes, + const std::string &batch_label) { + SetBufferPool(node, pool_id, pool_size, batch_label); + SetOutputMemSize(node, mem_size); + SetWorkSpaceMemSize(node, ws_bytes); +} + +/// +/// Normal graph +/// +/// w1 w2 
w3 w4 w5 +/// \ \ \ \ \ +/// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 +/// \ \ \ \ \ +/// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 ----- net_output +/// +/// +/// Memory distribution: +/// +/// |___w1__|__w2__|__w3__|__| +/// +/// |_____w4_____|_____w5____| +/// +ComputeGraphPtr BufferPoolGraphBuilder::BuildNormalGraph() { + auto builder = InnerGraphBuilder(graph_name_); + auto w1 = builder.AddNode("w1", VARIABLE, 0, 1); + auto w2 = builder.AddNode("w2", VARIABLE, 0, 1); + auto w3 = builder.AddNode("w3", VARIABLE, 0, 1); + auto w4 = builder.AddNode("w4", VARIABLE, 0, 1); + auto w5 = builder.AddNode("w5", VARIABLE, 0, 1); + + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + + auto prefetch1 = builder.AddNode("prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch2 = builder.AddNode("prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch3 = builder.AddNode("prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch4 = builder.AddNode("prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024}); + auto prefetch5 = builder.AddNode("prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}); + + auto add1 = builder.AddNode("add1", ADD, 2, 1); + auto add2 = builder.AddNode("add2", ADD, 2, 1); + auto add3 = builder.AddNode("add3", ADD, 2, 1); + auto add4 = builder.AddNode("add4", ADD, 2, 1); + auto add5 = builder.AddNode("add5", ADD, 2, 1); + auto const1 = builder.AddNode("const1", CONSTANTOP, 0, 1); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 1, 0); + + builder.AddDataEdge(w1, 0, prefetch1, 0); + builder.AddDataEdge(w2, 0, prefetch2, 0); + builder.AddDataEdge(w3, 0, prefetch3, 0); + 
builder.AddDataEdge(w4, 0, prefetch4, 0); + builder.AddDataEdge(w5, 0, prefetch5, 0); + + builder.AddDataEdge(const1, 0, add1, 0); + builder.AddDataEdge(prefetch1, 0, add1, 1); + + builder.AddDataEdge(add1, 0, add2, 0); + builder.AddDataEdge(prefetch2, 0, add2, 1); + + builder.AddDataEdge(add2, 0, add3, 0); + builder.AddDataEdge(prefetch3, 0, add3, 1); + + builder.AddDataEdge(add3, 0, add4, 0); + builder.AddDataEdge(prefetch4, 0, add4, 1); + + builder.AddDataEdge(add4, 0, add5, 0); + builder.AddDataEdge(prefetch5, 0, add5, 1); + + builder.AddDataEdge(add5, 0, net_output, 0); + + auto compute_graph = builder.GetGraph(); + + return compute_graph; +} + +/// +/// Normal graph with multi buffer pool +/// +/// w1 w2 w3 w4 w5 +/// \ \ \ \ \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 +/// (pool0) (pool1) (pool0) (pool0) (pool1) +/// \ \ \ \ \ + /// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 ----- net_output +/// +/// +/// Memory distribution: +/// +/// |___w1__|__w3__|_________| +/// |_____w4_____|___________| +/// +/// |___w2__|_____w5___|_____| +/// +ComputeGraphPtr BufferPoolGraphBuilder::BuildNormalGraphWithMultiBufferPool() { + auto builder = InnerGraphBuilder(graph_name_); + auto w1 = builder.AddNode("w1", VARIABLE, 0, 1); + auto w2 = builder.AddNode("w2", VARIABLE, 0, 1); + auto w3 = builder.AddNode("w3", VARIABLE, 0, 1); + auto w4 = builder.AddNode("w4", VARIABLE, 0, 1); + auto w5 = builder.AddNode("w5", VARIABLE, 0, 1); + + const int64_t buffer_pool_id_0 = 0; + const int64_t buffer_pool_id_1 = 1; + const int64_t buffer_pool_size = 5000; + + auto prefetch1 = builder.AddNode("prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id_0, buffer_pool_size, {500}); + auto prefetch2 = builder.AddNode("prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id_1, buffer_pool_size, {500}); + auto prefetch3 = builder.AddNode("prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, 
buffer_pool_id_0, buffer_pool_size, {500}); + auto prefetch4 = builder.AddNode("prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id_0, buffer_pool_size, {1024}); + auto prefetch5 = builder.AddNode("prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id_1, buffer_pool_size, {1024}); + + auto add1 = builder.AddNode("add1", ADD, 2, 1); + auto add2 = builder.AddNode("add2", ADD, 2, 1); + auto add3 = builder.AddNode("add3", ADD, 2, 1); + auto add4 = builder.AddNode("add4", ADD, 2, 1); + auto add5 = builder.AddNode("add5", ADD, 2, 1); + auto const1 = builder.AddNode("const1", CONSTANTOP, 0, 1); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 1, 0); + + builder.AddDataEdge(w1, 0, prefetch1, 0); + builder.AddDataEdge(w2, 0, prefetch2, 0); + builder.AddDataEdge(w3, 0, prefetch3, 0); + builder.AddDataEdge(w4, 0, prefetch4, 0); + builder.AddDataEdge(w5, 0, prefetch5, 0); + + builder.AddDataEdge(const1, 0, add1, 0); + builder.AddDataEdge(prefetch1, 0, add1, 1); + + builder.AddDataEdge(add1, 0, add2, 0); + builder.AddDataEdge(prefetch2, 0, add2, 1); + + builder.AddDataEdge(add2, 0, add3, 0); + builder.AddDataEdge(prefetch3, 0, add3, 1); + + builder.AddDataEdge(add3, 0, add4, 0); + builder.AddDataEdge(prefetch4, 0, add4, 1); + + builder.AddDataEdge(add4, 0, add5, 0); + builder.AddDataEdge(prefetch5, 0, add5, 1); + + builder.AddDataEdge(add5, 0, net_output, 0); + + auto compute_graph = builder.GetGraph(); + + return compute_graph; +} + +/// +/// SerialGraph: Buffer pool size only can contain one prefetch node +/// +/// w1 w2 w3 w4 w5 +/// \ \ \ \ \ +/// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 +/// \ \ \ \ \ +/// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 ----- net_output +/// +/// +/// Memory distribution: +/// +/// |____w1_____|__| +/// +/// |____w2_____|__| +/// +/// |____w3_____|__| +/// +/// |______w4______| +/// +/// |______w5______| +/// +ComputeGraphPtr 
BufferPoolGraphBuilder::BuildSerialGraph() { + auto builder = InnerGraphBuilder(graph_name_); + auto w1 = builder.AddNode("w1", VARIABLE, 0, 1); + auto w2 = builder.AddNode("w2", VARIABLE, 0, 1); + auto w3 = builder.AddNode("w3", VARIABLE, 0, 1); + auto w4 = builder.AddNode("w4", VARIABLE, 0, 1); + auto w5 = builder.AddNode("w5", VARIABLE, 0, 1); + + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 2048; + + auto prefetch1 = builder.AddNode("prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch2 = builder.AddNode("prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch3 = builder.AddNode("prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch4 = builder.AddNode("prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024}); + auto prefetch5 = builder.AddNode("prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}); + + auto add1 = builder.AddNode("add1", ADD, 2, 1); + auto add2 = builder.AddNode("add2", ADD, 2, 1); + auto add3 = builder.AddNode("add3", ADD, 2, 1); + auto add4 = builder.AddNode("add4", ADD, 2, 1); + auto add5 = builder.AddNode("add5", ADD, 2, 1); + auto const1 = builder.AddNode("const1", CONSTANTOP, 0, 1); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 1, 0); + + builder.AddDataEdge(w1, 0, prefetch1, 0); + builder.AddDataEdge(w2, 0, prefetch2, 0); + builder.AddDataEdge(w3, 0, prefetch3, 0); + builder.AddDataEdge(w4, 0, prefetch4, 0); + builder.AddDataEdge(w5, 0, prefetch5, 0); + + builder.AddDataEdge(const1, 0, add1, 0); + builder.AddDataEdge(prefetch1, 0, add1, 1); + + builder.AddDataEdge(add1, 0, add2, 0); + builder.AddDataEdge(prefetch2, 0, add2, 1); + + builder.AddDataEdge(add2, 0, add3, 0); + 
builder.AddDataEdge(prefetch3, 0, add3, 1); + + builder.AddDataEdge(add3, 0, add4, 0); + builder.AddDataEdge(prefetch4, 0, add4, 1); + + builder.AddDataEdge(add4, 0, add5, 0); + builder.AddDataEdge(prefetch5, 0, add5, 1); + + builder.AddDataEdge(add5, 0, net_output, 0); + + auto compute_graph = builder.GetGraph(); + + return compute_graph; +} + +/// +/// GraphWithMultiPrefetch: Calc node with more prefetch node +/// +/// w1 w2 w3 w4 w5 +/// \ \ \ \ \ +/// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 const1 +/// \ / \ / \ / +/// \ / \ / \ / +/// \ / \ / \ / +/// add1 ------ c ------- add2 ----- c ----- add3 +/// | | | +/// | | | +/// --------------- net_output ------------ +/// +/// Memory distribution: +/// +/// |___w1__|__w2__|__w3__|__| +/// +/// |_____w4_____|_____w5____| +/// +ComputeGraphPtr BufferPoolGraphBuilder::BuildGraphWithMultiPrefetch() { + auto builder = InnerGraphBuilder(graph_name_); + auto w1 = builder.AddNode("w1", VARIABLE, 0, 1); + auto w2 = builder.AddNode("w2", VARIABLE, 0, 1); + auto w3 = builder.AddNode("w3", VARIABLE, 0, 1); + auto w4 = builder.AddNode("w4", VARIABLE, 0, 1); + auto w5 = builder.AddNode("w5", VARIABLE, 0, 1); + + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + + auto prefetch1 = builder.AddNode("prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch2 = builder.AddNode("prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch3 = builder.AddNode("prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch4 = builder.AddNode("prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024}); + auto prefetch5 = builder.AddNode("prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}); + + auto const1 = 
builder.AddNode("const1", CONSTANTOP, 0, 1); + auto add1 = builder.AddNode("add1", ADD, 2, 1); + auto add2 = builder.AddNode("add2", ADD, 2, 1); + auto add3 = builder.AddNode("add3", ADD, 2, 1); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 3, 0); + + builder.AddDataEdge(w1, 0, prefetch1, 0); + builder.AddDataEdge(w2, 0, prefetch2, 0); + builder.AddDataEdge(w3, 0, prefetch3, 0); + builder.AddDataEdge(w4, 0, prefetch4, 0); + builder.AddDataEdge(w5, 0, prefetch5, 0); + + builder.AddDataEdge(prefetch1, 0, add1, 0); + builder.AddDataEdge(prefetch2, 0, add1, 1); + + builder.AddDataEdge(prefetch3, 0, add2, 0); + builder.AddDataEdge(prefetch4, 0, add2, 1); + + builder.AddDataEdge(const1, 0, add3, 0); + builder.AddDataEdge(prefetch5, 0, add3, 1); + + builder.AddDataEdge(add1, 0, net_output, 0); + builder.AddDataEdge(add2, 0, net_output, 1); + builder.AddDataEdge(add3, 0, net_output, 2); + + builder.AddControlEdge(add1, add2); + builder.AddControlEdge(add2, add3); + + auto compute_graph = builder.GetGraph(); + + return compute_graph; +} + +/// +/// GraphWithSubgraph: Calc node in different subgraph +/// +/// +/// call_node1(with Subgraph1) --------------- call_node2 (with Subgraph2) --------------- net_output +/// +/// +/// Subgraph1: Subgraph2: +/// +/// w1 w2 w3 w4 w5 +/// \ \ \ \ \ +/// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 +/// \ \ \ \ \ +/// const1 ----- add1 ----- add2 ----- add3 ---- subgraph1_out data1 ---- add4 ----- add5 ---- subgraph2_out +/// +/// +/// Memory distribution: +/// +/// |___w1__|__w2__|__w3__|__| +/// +/// |_____w4_____|_____w5____| +/// +ComputeGraphPtr BufferPoolGraphBuilder::BuildGraphWithSubgraph() { + auto builder = InnerGraphBuilder(graph_name_); + + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + + // Subgraph1 + auto subgraph_builder1 = InnerGraphBuilder("Subgraph1"); + auto w1 = subgraph_builder1.AddNode("w1", VARIABLE, 0, 1); + auto w2 = subgraph_builder1.AddNode("w2", VARIABLE, 0, 
1); + auto w3 = subgraph_builder1.AddNode("w3", VARIABLE, 0, 1); + + auto prefetch1 = subgraph_builder1.AddNode("prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch2 = subgraph_builder1.AddNode("prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch3 = subgraph_builder1.AddNode("prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500}); + auto subgraph1_out = subgraph_builder1.AddNode("subgraph1_out", NETOUTPUT, 1, 0); + auto const1 = subgraph_builder1.AddNode("const1", CONSTANTOP, 0, 1); + + auto add1 = subgraph_builder1.AddNode("add1", ADD, 2, 1); + auto add2 = subgraph_builder1.AddNode("add2", ADD, 2, 1); + auto add3 = subgraph_builder1.AddNode("add3", ADD, 2, 1); + + subgraph_builder1.AddDataEdge(w1, 0, prefetch1, 0); + subgraph_builder1.AddDataEdge(w2, 0, prefetch2, 0); + subgraph_builder1.AddDataEdge(w3, 0, prefetch3, 0); + subgraph_builder1.AddDataEdge(const1, 0, add1, 0); + subgraph_builder1.AddDataEdge(prefetch1, 0, add1, 1); + subgraph_builder1.AddDataEdge(add1, 0, add2, 0); + subgraph_builder1.AddDataEdge(prefetch2, 0, add2, 1); + subgraph_builder1.AddDataEdge(add2, 0, add3, 0); + subgraph_builder1.AddDataEdge(prefetch3, 0, add3, 1); + subgraph_builder1.AddDataEdge(add3, 0, subgraph1_out, 0); + auto subgraph1 = subgraph_builder1.GetGraph(); + for (auto &node : subgraph1->GetDirectNode()) { + node->SetOwnerComputeGraph(subgraph1); + } + + // Subgraph2 + auto subgraph_builder2 = InnerGraphBuilder("Subgraph2"); + auto w4 = subgraph_builder2.AddNode("w4", VARIABLE, 0, 1); + auto w5 = subgraph_builder2.AddNode("w5", VARIABLE, 0, 1); + + auto prefetch4 = subgraph_builder2.AddNode("prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024}); + auto prefetch5 = subgraph_builder2.AddNode("prefetch5", HCOMALLGATHER, 1, 1); + 
SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}); + + auto add4 = subgraph_builder2.AddNode("add4", ADD, 2, 1); + auto add5 = subgraph_builder2.AddNode("add5", ADD, 2, 1); + auto data1 = subgraph_builder2.AddNode("data1", DATA, 0, 1); + auto subgraph2_out = subgraph_builder2.AddNode("subgraph2_out", NETOUTPUT, 1, 1); + + subgraph_builder2.AddDataEdge(w4, 0, prefetch4, 0); + subgraph_builder2.AddDataEdge(w5, 0, prefetch5, 0); + subgraph_builder2.AddDataEdge(data1, 0, add4, 0); + subgraph_builder2.AddDataEdge(prefetch4, 0, add4, 1); + subgraph_builder2.AddDataEdge(add4, 0, add5, 0); + subgraph_builder2.AddDataEdge(prefetch5, 0, add5, 1); + subgraph_builder2.AddDataEdge(add5, 0, subgraph2_out, 0); + + auto subgraph2 = subgraph_builder2.GetGraph(); + for (auto &node : subgraph2->GetDirectNode()) { + node->SetOwnerComputeGraph(subgraph2); + } + + // root graph + auto call_node1 = builder.AddNode("call_node1", PARTITIONEDCALL, 0, 1); + auto call_node2 = builder.AddNode("call_node2", PARTITIONEDCALL, 1, 0); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 1, 0); + builder.AddDataEdge(call_node1, 0, call_node2, 0); + builder.AddDataEdge(call_node2, 0, net_output, 0); + auto compute_graph = builder.GetGraph(); + call_node1->SetOwnerComputeGraph(compute_graph); + call_node1->GetOpDesc()->AddSubgraphName(subgraph1->GetName()); + call_node1->GetOpDesc()->SetSubgraphInstanceName(0, subgraph1->GetName()); + call_node2->SetOwnerComputeGraph(compute_graph); + call_node2->GetOpDesc()->AddSubgraphName(subgraph2->GetName()); + call_node2->GetOpDesc()->SetSubgraphInstanceName(0, subgraph2->GetName()); + + subgraph1->SetParentNode(call_node1); + subgraph1->SetParentGraph(compute_graph); + subgraph2->SetParentNode(call_node2); + subgraph2->SetParentGraph(compute_graph); + compute_graph->AddSubGraph(subgraph1); + compute_graph->AddSubGraph(subgraph2); + + return compute_graph; +} + +/// +/// SubgraphWithInnerDependency: Calc node in different subgraph 
with inner dependency +/// +/// +/// call_node1(with Subgraph1) --------------------- call_node2 (with Subgraph2) ---------- net_output +/// +/// +/// Subgraph1: Subgraph2: +/// +/// w1 w2 w3 w4 w5 +/// \ \ \ \ \ +/// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 +/// \ \ \ \ \ +/// const1 ----- add1 ----- add2 ----- subgraph1_out data1 ---- add3 ---- add4 ----- add5 ---- subgraph2_out +/// +/// +/// Memory distribution: +/// +/// |___w1__|__w2__|__w3__|__| +/// +/// |_____w4_____|_____w5____| +/// +ComputeGraphPtr BufferPoolGraphBuilder::BuildSubgraphWithInnerDependency() { + auto builder = InnerGraphBuilder(graph_name_); + + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + + // Subgraph1 + auto subgraph_builder1 = InnerGraphBuilder("Subgraph1"); + auto w1 = subgraph_builder1.AddNode("w1", VARIABLE, 0, 1); + auto w2 = subgraph_builder1.AddNode("w2", VARIABLE, 0, 1); + + auto prefetch1 = subgraph_builder1.AddNode("prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch2 = subgraph_builder1.AddNode("prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500}); + auto subgraph1_out = subgraph_builder1.AddNode("subgraph1_out", NETOUTPUT, 1, 0); + auto const1 = subgraph_builder1.AddNode("const1", CONSTANTOP, 0, 1); + + auto add1 = subgraph_builder1.AddNode("add1", ADD, 2, 1); + auto add2 = subgraph_builder1.AddNode("add2", ADD, 2, 1); + + subgraph_builder1.AddDataEdge(w1, 0, prefetch1, 0); + subgraph_builder1.AddDataEdge(w2, 0, prefetch2, 0); + subgraph_builder1.AddDataEdge(const1, 0, add1, 0); + subgraph_builder1.AddDataEdge(prefetch1, 0, add1, 1); + subgraph_builder1.AddDataEdge(add1, 0, add2, 0); + subgraph_builder1.AddDataEdge(prefetch2, 0, add2, 1); + subgraph_builder1.AddDataEdge(add2, 0, subgraph1_out, 0); + auto subgraph1 = subgraph_builder1.GetGraph(); + for (auto &node : subgraph1->GetDirectNode()) { + 
node->SetOwnerComputeGraph(subgraph1); + } + + // Subgraph2 + auto subgraph_builder2 = InnerGraphBuilder("Subgraph2"); + auto w3 = subgraph_builder2.AddNode("w3", VARIABLE, 0, 1); + auto w4 = subgraph_builder2.AddNode("w4", VARIABLE, 0, 1); + auto w5 = subgraph_builder2.AddNode("w5", VARIABLE, 0, 1); + + auto prefetch3 = subgraph_builder2.AddNode("prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch4 = subgraph_builder2.AddNode("prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024}); + auto prefetch5 = subgraph_builder2.AddNode("prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}); + + auto add3 = subgraph_builder2.AddNode("add3", ADD, 2, 1); + auto add4 = subgraph_builder2.AddNode("add4", ADD, 2, 1); + auto add5 = subgraph_builder2.AddNode("add5", ADD, 2, 1); + auto data1 = subgraph_builder2.AddNode("data1", DATA, 0, 1); + auto subgraph2_out = subgraph_builder2.AddNode("subgraph2_out", NETOUTPUT, 1, 1); + + subgraph_builder2.AddDataEdge(w3, 0, prefetch3, 0); + subgraph_builder2.AddDataEdge(w4, 0, prefetch4, 0); + subgraph_builder2.AddDataEdge(w5, 0, prefetch5, 0); + subgraph_builder2.AddDataEdge(data1, 0, add3, 0); + subgraph_builder2.AddDataEdge(prefetch3, 0, add3, 1); + subgraph_builder2.AddDataEdge(add3, 0, add4, 0); + subgraph_builder2.AddDataEdge(prefetch4, 0, add4, 1); + subgraph_builder2.AddDataEdge(add4, 0, add5, 0); + subgraph_builder2.AddDataEdge(prefetch5, 0, add5, 1); + subgraph_builder2.AddDataEdge(add5, 0, subgraph2_out, 0); + + auto subgraph2 = subgraph_builder2.GetGraph(); + for (auto &node : subgraph2->GetDirectNode()) { + node->SetOwnerComputeGraph(subgraph2); + } + + // root graph + auto call_node1 = builder.AddNode("call_node1", PARTITIONEDCALL, 0, 1); + auto call_node2 = builder.AddNode("call_node2", PARTITIONEDCALL, 1, 0); + auto net_output = 
subgraph_builder2.AddNode("net_output", NETOUTPUT, 1, 0); + builder.AddDataEdge(call_node1, 0, call_node2, 0); + builder.AddDataEdge(call_node2, 0, net_output, 0); + auto compute_graph = builder.GetGraph(); + call_node1->SetOwnerComputeGraph(compute_graph); + call_node1->GetOpDesc()->AddSubgraphName(subgraph1->GetName()); + call_node1->GetOpDesc()->SetSubgraphInstanceName(0, subgraph1->GetName()); + call_node2->SetOwnerComputeGraph(compute_graph); + call_node2->GetOpDesc()->AddSubgraphName(subgraph2->GetName()); + call_node2->GetOpDesc()->SetSubgraphInstanceName(0, subgraph2->GetName()); + + subgraph1->SetParentNode(call_node1); + subgraph1->SetParentGraph(compute_graph); + subgraph2->SetParentNode(call_node2); + subgraph2->SetParentGraph(compute_graph); + compute_graph->AddSubGraph(subgraph1); + compute_graph->AddSubGraph(subgraph2); + + return compute_graph; +} + +/// +/// BuildGraphWithMultiBatch: Different batch label +/// +/// +/// batch_label_128 +/// +/// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 --- +/// / / / / / / \ +/// /c prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 \ +/// const1 switch_false / / / / / \ +/// \ / / / / / / \ +/// switch1 w1 w2 w3 w4 w5 merge1 -- net_output +/// / \ \ \ \ \ \ / +/// const2 switch_true \ \ \ \ \ / +/// \c prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 / +/// \ \ \ \ \ \ / +/// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 --- +/// +/// batch_label_256 +/// +/// +/// Memory distribution: +/// +/// |___w1__|__w2__|__w3__|__| +/// +/// |_____w4_____|_____w5____| +/// +ComputeGraphPtr BufferPoolGraphBuilder::BuildGraphWithMultiBatch() { + auto builder = InnerGraphBuilder(graph_name_); + auto w1 = builder.AddNode("w1", VARIABLE, 0, 1); + auto w2 = builder.AddNode("w2", VARIABLE, 0, 1); + auto w3 = builder.AddNode("w3", VARIABLE, 0, 1); + auto w4 = builder.AddNode("w4", VARIABLE, 0, 1); + auto w5 = builder.AddNode("w5", VARIABLE, 0, 1); + + auto const1 = builder.AddNode("const1", 
CONSTANTOP, 0, 1); + auto const2 = builder.AddNode("const2", CONSTANTOP, 0, 1); + auto switch1 = builder.AddNode("switch1", SWITCH, 2, 2); + auto switch_false = builder.AddNode("switch_false", IDENTITY, 1, 1); + auto switch_true = builder.AddNode("switch_true", IDENTITY, 1, 1); + auto merge1 = builder.AddNode("merge1", MERGE, 2, 2); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 1, 0); + + builder.AddDataEdge(const1, 0, switch1, 0); + builder.AddDataEdge(const2, 0, switch1, 1); + builder.AddDataEdge(switch1, 0, switch_false, 0); + builder.AddDataEdge(switch1, 1, switch_true, 0); + builder.AddDataEdge(merge1, 0, net_output, 0); + + std::string batch_label_128 = "batch_128"; + std::string batch_label_256 = "batch_256"; + + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + + { + auto prefetch1 = builder.AddNode("batch_label_128/prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500}, {500}, batch_label_128); + auto prefetch2 = builder.AddNode("batch_label_128/prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500}, {500}, batch_label_128); + auto prefetch3 = builder.AddNode("batch_label_128/prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500}, {500}, batch_label_128); + auto prefetch4 = builder.AddNode("batch_label_128/prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024}, {1024}, batch_label_128); + auto prefetch5 = builder.AddNode("batch_label_128/prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}, {1024}, batch_label_128); + + auto add1 = builder.AddNode("batch_label_128/add1", ADD, 2, 1); + SetBatchLabel(add1, batch_label_128); + auto add2 = builder.AddNode("batch_label_128/add2", ADD, 2, 1); + SetBatchLabel(add2, batch_label_128); + auto add3 = 
builder.AddNode("batch_label_128/add3", ADD, 2, 1); + SetBatchLabel(add3, batch_label_128); + auto add4 = builder.AddNode("batch_label_128/add4", ADD, 2, 1); + SetBatchLabel(add4, batch_label_128); + auto add5 = builder.AddNode("batch_label_128/add5", ADD, 2, 1); + SetBatchLabel(add5, batch_label_128); + auto const1 = builder.AddNode("batch_label_128/const1", CONSTANTOP, 0, 1); + SetBatchLabel(const1, batch_label_128); + + builder.AddDataEdge(w1, 0, prefetch1, 0); + builder.AddDataEdge(w2, 0, prefetch2, 0); + builder.AddDataEdge(w3, 0, prefetch3, 0); + builder.AddDataEdge(w4, 0, prefetch4, 0); + builder.AddDataEdge(w5, 0, prefetch5, 0); + + builder.AddDataEdge(const1, 0, add1, 0); + builder.AddDataEdge(prefetch1, 0, add1, 1); + + builder.AddDataEdge(add1, 0, add2, 0); + builder.AddDataEdge(prefetch2, 0, add2, 1); + + builder.AddDataEdge(add2, 0, add3, 0); + builder.AddDataEdge(prefetch3, 0, add3, 1); + + builder.AddDataEdge(add3, 0, add4, 0); + builder.AddDataEdge(prefetch4, 0, add4, 1); + + builder.AddDataEdge(add4, 0, add5, 0); + builder.AddDataEdge(prefetch5, 0, add5, 1); + + builder.AddDataEdge(add5, 0, merge1, 0); + builder.AddControlEdge(switch_false, const1); + } + + { + auto prefetch1 = builder.AddNode("batch_label_256/prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500}, {500}, batch_label_256); + auto prefetch2 = builder.AddNode("batch_label_256/prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500}, {500}, batch_label_256); + auto prefetch3 = builder.AddNode("batch_label_256/prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500}, {500}, batch_label_256); + auto prefetch4 = builder.AddNode("batch_label_256/prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024}, {1024}, batch_label_256); + auto prefetch5 = 
builder.AddNode("batch_label_256/prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}, {1024}, batch_label_256); + + auto add1 = builder.AddNode("batch_label_256/add1", ADD, 2, 1); + SetBatchLabel(add1, batch_label_256); + auto add2 = builder.AddNode("batch_label_256/add2", ADD, 2, 1); + SetBatchLabel(add2, batch_label_256); + auto add3 = builder.AddNode("batch_label_256/add3", ADD, 2, 1); + SetBatchLabel(add3, batch_label_256); + auto add4 = builder.AddNode("batch_label_256/add4", ADD, 2, 1); + SetBatchLabel(add4, batch_label_256); + auto add5 = builder.AddNode("batch_label_256/add5", ADD, 2, 1); + SetBatchLabel(add5, batch_label_256); + auto const1 = builder.AddNode("batch_label_256/const1", CONSTANTOP, 0, 1); + SetBatchLabel(const1, batch_label_128); + + builder.AddDataEdge(w1, 0, prefetch1, 0); + builder.AddDataEdge(w2, 0, prefetch2, 0); + builder.AddDataEdge(w3, 0, prefetch3, 0); + builder.AddDataEdge(w4, 0, prefetch4, 0); + builder.AddDataEdge(w5, 0, prefetch5, 0); + + builder.AddDataEdge(const1, 0, add1, 0); + builder.AddDataEdge(prefetch1, 0, add1, 1); + + builder.AddDataEdge(add1, 0, add2, 0); + builder.AddDataEdge(prefetch2, 0, add2, 1); + + builder.AddDataEdge(add2, 0, add3, 0); + builder.AddDataEdge(prefetch3, 0, add3, 1); + + builder.AddDataEdge(add3, 0, add4, 0); + builder.AddDataEdge(prefetch4, 0, add4, 1); + + builder.AddDataEdge(add4, 0, add5, 0); + builder.AddDataEdge(prefetch5, 0, add5, 1); + + builder.AddDataEdge(add5, 0, merge1, 1); + + builder.AddControlEdge(switch_true, const1); + } + + auto compute_graph = builder.GetGraph(); + + return compute_graph; +} + +/// +/// GraphWithMultiOutputPrefetch: Prefetch has more than one output +/// +/// w1 w2 w3 w4 w5 +/// \ \ \ \ \ +/// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 +/// / \ / \ / \ / \ / +/// / \ / \ / \ / \ / +/// const1 ----- add1 add2 add3 add4 add5 +/// | \ | / | +/// | \ | / | +/// | \ | / | +/// | \ | / | +/// -------------- 
net_output --------------- +/// +/// Memory distribution: +/// +/// |___w1__|__w2__|__w3__|__| +/// +/// |_____w4_____|_____w5____| +/// +ComputeGraphPtr BufferPoolGraphBuilder::BuildGraphWithMultiOutputPrefetch() { + auto builder = InnerGraphBuilder(graph_name_); + auto w1 = builder.AddNode("w1", VARIABLE, 0, 1); + auto w2 = builder.AddNode("w2", VARIABLE, 0, 1); + auto w3 = builder.AddNode("w3", VARIABLE, 0, 1); + auto w4 = builder.AddNode("w4", VARIABLE, 0, 1); + auto w5 = builder.AddNode("w5", VARIABLE, 0, 1); + + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + + auto prefetch1 = builder.AddNode("prefetch1", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch2 = builder.AddNode("prefetch2", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch3 = builder.AddNode("prefetch3", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500}); + auto prefetch4 = builder.AddNode("prefetch4", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024}); + auto prefetch5 = builder.AddNode("prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}); + + auto const1 = builder.AddNode("const1", CONSTANTOP, 0, 1); + auto add1 = builder.AddNode("add1", ADD, 2, 1); + auto add2 = builder.AddNode("add2", ADD, 2, 1); + auto add3 = builder.AddNode("add3", ADD, 2, 1); + auto add4 = builder.AddNode("add4", ADD, 2, 1); + auto add5 = builder.AddNode("add5", ADD, 2, 1); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 5, 0); + + builder.AddDataEdge(w1, 0, prefetch1, 0); + builder.AddDataEdge(w2, 0, prefetch2, 0); + builder.AddDataEdge(w3, 0, prefetch3, 0); + builder.AddDataEdge(w4, 0, prefetch4, 0); + builder.AddDataEdge(w5, 0, prefetch5, 0); + + builder.AddDataEdge(const1, 0, add1, 0); + 
builder.AddDataEdge(prefetch1, 0, add1, 1); + + builder.AddDataEdge(prefetch1, 0, add2, 0); + builder.AddDataEdge(prefetch2, 0, add2, 1); + + builder.AddDataEdge(prefetch2, 0, add3, 0); + builder.AddDataEdge(prefetch3, 0, add3, 1); + + builder.AddDataEdge(prefetch3, 0, add4, 0); + builder.AddDataEdge(prefetch4, 0, add4, 1); + + builder.AddDataEdge(prefetch4, 0, add5, 0); + builder.AddDataEdge(prefetch5, 0, add5, 1); + + builder.AddDataEdge(add1, 0, net_output, 0); + builder.AddDataEdge(add2, 0, net_output, 1); + builder.AddDataEdge(add3, 0, net_output, 2); + builder.AddDataEdge(add4, 0, net_output, 3); + builder.AddDataEdge(add5, 0, net_output, 4); + + auto compute_graph = builder.GetGraph(); + + return compute_graph; +} + +/// +/// GraphWithMultiOutputPrefetch: Prefetch has more than one output +/// +/// w1 w2 w3 w4 w5 +/// \ / \ / \ / \ / \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 +/// / \ / \ / \ / \ / +/// / \ / \ / \ / \ / +/// const1 ----- add1 add2 add3 add4 add5 +/// | \ | / | +/// | \ | / | +/// | \ | / | +/// | \ | / | +/// -------------- net_output --------------- +/// +/// Memory distribution: +/// +/// |___w1__|__w2__|__w3__|__| +/// +/// |_____w4_____|_____w5____| +/// +ComputeGraphPtr BufferPoolGraphBuilder::BuildGraphWithMultiInputOutputPrefetch() { + auto builder = InnerGraphBuilder(graph_name_); + auto w1 = builder.AddNode("w1", VARIABLE, 0, 1); + auto w2 = builder.AddNode("w2", VARIABLE, 0, 1); + auto w3 = builder.AddNode("w3", VARIABLE, 0, 1); + auto w4 = builder.AddNode("w4", VARIABLE, 0, 1); + auto w5 = builder.AddNode("w5", VARIABLE, 0, 1); + + const int64_t buffer_pool_id = 0; + const int64_t buffer_pool_size = 5600; + + auto prefetch1 = builder.AddNode("prefetch1", HCOMALLGATHER, 2, 2); + SetPrefetchNodeInfo(prefetch1, buffer_pool_id, buffer_pool_size, {500, 500}); + auto prefetch2 = builder.AddNode("prefetch2", HCOMALLGATHER, 2, 2); + SetPrefetchNodeInfo(prefetch2, buffer_pool_id, buffer_pool_size, {500, 500}); + auto 
prefetch3 = builder.AddNode("prefetch3", HCOMALLGATHER, 2, 2); + SetPrefetchNodeInfo(prefetch3, buffer_pool_id, buffer_pool_size, {500, 1024}); + auto prefetch4 = builder.AddNode("prefetch4", HCOMALLGATHER, 2, 2); + SetPrefetchNodeInfo(prefetch4, buffer_pool_id, buffer_pool_size, {1024, 1024}); + auto prefetch5 = builder.AddNode("prefetch5", HCOMALLGATHER, 1, 1); + SetPrefetchNodeInfo(prefetch5, buffer_pool_id, buffer_pool_size, {1024}); + + auto const1 = builder.AddNode("const1", CONSTANTOP, 0, 1); + auto add1 = builder.AddNode("add1", ADD, 2, 1); + auto add2 = builder.AddNode("add2", ADD, 2, 1); + auto add3 = builder.AddNode("add3", ADD, 2, 1); + auto add4 = builder.AddNode("add4", ADD, 2, 1); + auto add5 = builder.AddNode("add5", ADD, 2, 1); + auto net_output = builder.AddNode("net_output", NETOUTPUT, 5, 0); + + builder.AddDataEdge(w1, 0, prefetch1, 0); + builder.AddDataEdge(w2, 0, prefetch1, 1); + builder.AddDataEdge(w2, 0, prefetch2, 0); + builder.AddDataEdge(w3, 0, prefetch2, 1); + builder.AddDataEdge(w3, 0, prefetch3, 0); + builder.AddDataEdge(w4, 0, prefetch3, 1); + builder.AddDataEdge(w4, 0, prefetch4, 0); + builder.AddDataEdge(w5, 0, prefetch4, 1); + builder.AddDataEdge(w5, 0, prefetch5, 0); + + builder.AddDataEdge(const1, 0, add1, 0); + builder.AddDataEdge(prefetch1, 0, add1, 1); + + builder.AddDataEdge(prefetch1, 1, add2, 0); + builder.AddDataEdge(prefetch2, 0, add2, 1); + + builder.AddDataEdge(prefetch2, 1, add3, 0); + builder.AddDataEdge(prefetch3, 0, add3, 1); + + builder.AddDataEdge(prefetch3, 1, add4, 0); + builder.AddDataEdge(prefetch4, 0, add4, 1); + + builder.AddDataEdge(prefetch4, 1, add5, 0); + builder.AddDataEdge(prefetch5, 0, add5, 1); + + builder.AddDataEdge(add1, 0, net_output, 0); + builder.AddDataEdge(add2, 0, net_output, 1); + builder.AddDataEdge(add3, 0, net_output, 2); + builder.AddDataEdge(add4, 0, net_output, 3); + builder.AddDataEdge(add5, 0, net_output, 4); + + auto compute_graph = builder.GetGraph(); + + return compute_graph; +} 
+} // namespace ut +} // namespace ge diff --git a/tests/ut/ge/graph/utils/buffer_pool_graph_builder.h b/tests/ut/ge/graph/utils/buffer_pool_graph_builder.h new file mode 100644 index 00000000..24382dd2 --- /dev/null +++ b/tests/ut/ge/graph/utils/buffer_pool_graph_builder.h @@ -0,0 +1,279 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPH_UTILS_BUFFER_POOL_GRAPH_BUILDER_H_ +#define GRAPH_UTILS_BUFFER_POOL_GRAPH_BUILDER_H_ + +#include +#include + +#include "graph/compute_graph.h" +#include "graph/graph.h" +#include "graph/node.h" + +namespace ge { +namespace ut { +class BufferPoolGraphBuilder { + public: + explicit BufferPoolGraphBuilder(const std::string &name = "BufferPoolGraph"); + ~BufferPoolGraphBuilder() {} + class InnerGraphBuilder { + public: + explicit InnerGraphBuilder(const std::string &name); + ~InnerGraphBuilder() {} + NodePtr AddNode(const std::string &name, const std::string &type, int in_cnt, int out_cnt, + Format format = FORMAT_NCHW, DataType data_type = DT_FLOAT, + std::vector shape = {1, 1, 224, 224}); + + void AddDataEdge(NodePtr &src_node, int src_idx, NodePtr &dst_node, int dst_idx); + + void AddControlEdge(NodePtr &src_node, NodePtr &dst_node); + + ComputeGraphPtr GetGraph() { + graph_->TopologicalSorting(); + return graph_; + } + private: + ComputeGraphPtr graph_; + }; + + /// + /// Normal graph + /// + /// w1 w2 w3 w4 w5 + /// \ \ \ \ \ + /// prefetch1 
prefetch2 prefetch3 prefetch4 prefetch5 + /// \ \ \ \ \ + /// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 ----- net_output + /// + /// + /// Memory distribution: + /// + /// |___w1__|__w2__|__w3__|__| + /// + /// |_____w4_____|_____w5____| + /// + ComputeGraphPtr BuildNormalGraph(); + + /// + /// Normal graph with multi buffer pool + /// + /// w1 w2 w3 w4 w5 + /// \ \ \ \ \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 + /// (pool0) (pool1) (pool0) (pool0) (pool1) + /// \ \ \ \ \ + /// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 ----- net_output + /// + /// + /// Memory distribution: + /// + /// |___w1__|__w3__|_________| + /// |_____w4_____|___________| + /// + /// |___w2__|_____w5___|_____| + /// + ComputeGraphPtr BuildNormalGraphWithMultiBufferPool(); + + /// + /// SerialGraph: Buffer pool size only can contain one prefetch node + /// + /// w1 w2 w3 w4 w5 + /// \ \ \ \ \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 + /// \ \ \ \ \ + /// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 ----- net_output + /// + /// + /// Memory distribution: + /// + /// |____w1_____|__| + /// + /// |____w2_____|__| + /// + /// |____w3_____|__| + /// + /// |______w4______| + /// + /// |______w5______| + /// + ComputeGraphPtr BuildSerialGraph(); + + /// + /// GraphWithMultiPrefetch: Calc node with more prefetch node + /// + /// w1 w2 w3 w4 w5 + /// \ \ \ \ \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 const1 + /// \ / \ / \ / + /// \ / \ / \ / + /// \ / \ / \ / + /// add1 ------ c ------- add2 ----- c ----- add3 + /// | | | + /// | | | + /// --------------- net_output ------------ + /// + /// Memory distribution: + /// + /// |___w1__|__w2__|__w3__|__| + /// + /// |_____w4_____|_____w5____| + /// + ComputeGraphPtr BuildGraphWithMultiPrefetch(); + + /// + /// GraphWithSubgraph: Calc node in different subgraph + /// + /// + /// call_node1(with Subgraph1) --------------- call_node2 (with Subgraph2) 
--------------- net_output + /// + /// + /// Subgraph1: Subgraph2: + /// + /// w1 w2 w3 w4 w5 + /// \ \ \ \ \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 + /// \ \ \ \ \ + /// const1 ----- add1 ----- add2 ----- add3 ---- subgraph1_out data1 ---- add4 ----- add5 ---- subgraph2_out + /// + /// + /// Memory distribution: + /// + /// |___w1__|__w2__|__w3__|__| + /// + /// |_____w4_____|_____w5____| + /// + ComputeGraphPtr BuildGraphWithSubgraph(); + + /// + /// SubgraphWithInnerDependency: Calc node in different subgraph with inner dependency + /// + /// + /// call_node1(with Subgraph1) --------------------- call_node2 (with Subgraph2) ---------- net_output + /// + /// + /// Subgraph1: Subgraph2: + /// + /// w1 w2 w3 w4 w5 + /// \ \ \ \ \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 + /// \ \ \ \ \ + /// const1 ----- add1 ----- add2 ----- subgraph1_out data1 ---- add3 ---- add4 ----- add5 ---- subgraph2_out + /// + /// + /// Memory distribution: + /// + /// |___w1__|__w2__|__w3__|__| + /// + /// |_____w4_____|_____w5____| + /// + ComputeGraphPtr BuildSubgraphWithInnerDependency(); + + /// + /// BuildGraphWithMultiBatch: Different batch label + /// + /// + /// batch_label_128 + /// + /// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 --- + /// / / / / / / \ + /// /c prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 \ + /// const1 switch_false / / / / / \ + /// \ / / / / / / \ + /// switch1 w1 w2 w3 w4 w5 merge1 -- net_output + /// / \ \ \ \ \ \ / + /// const2 switch_true \ \ \ \ \ / + /// \c prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 / + /// \ \ \ \ \ \ / + /// const1 ----- add1 ----- add2 ----- add3 ----- add4 ----- add5 --- + /// + /// batch_label_256 + /// + /// + /// Memory distribution: + /// + /// |___w1__|__w2__|__w3__|__| + /// + /// |_____w4_____|_____w5____| + /// + ComputeGraphPtr BuildGraphWithMultiBatch(); + + /// + /// GraphWithMultiOutputPrefetch: Prefetch has more than one output + /// + /// w1 w2 w3 w4 w5 + 
/// \ \ \ \ \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 + /// / \ / \ / \ / \ / + /// / \ / \ / \ / \ / + /// const1 ----- add1 add2 add3 add4 add5 + /// | \ | / | + /// | \ | / | + /// | \ | / | + /// | \ | / | + /// -------------- net_output --------------- + /// + /// Memory distribution: + /// + /// |___w1__|__w2__|__w3__|__| + /// + /// |_____w4_____|_____w5____| + /// + ComputeGraphPtr BuildGraphWithMultiOutputPrefetch(); + + /// + /// GraphWithMultiOutputPrefetch: Prefetch has more than one output + /// + /// w1 w2 w3 w4 w5 + /// \ / \ / \ / \ / \ + /// prefetch1 prefetch2 prefetch3 prefetch4 prefetch5 + /// / \ / \ / \ / \ / + /// / \ / \ / \ / \ / + /// const1 ----- add1 add2 add3 add4 add5 + /// | \ | / | + /// | \ | / | + /// | \ | / | + /// | \ | / | + /// -------------- net_output --------------- + /// + /// Memory distribution: + /// + /// |___w1__|__w2__|__w3__|__| + /// + /// |_____w4_____|_____w5____| + /// + ComputeGraphPtr BuildGraphWithMultiInputOutputPrefetch(); + + void SetBufferPool(NodePtr &node, int64_t pool_id, int64_t pool_size, const std::string &batch_label = ""); + + void SetBatchLabel(NodePtr &node, const std::string &batch_label = ""); + + void SetOutputMemSize(NodePtr &node, const std::vector &mem_size = {1024}); + + void SetWorkSpaceMemSize(NodePtr &node, const std::vector &ws_bytes = {1024}); + + void SetPrefetchNodeInfo(NodePtr &node, int64_t pool_id, int64_t pool_size, + const std::vector &mem_size = {1024}, + const std::vector &ws_bytes = {1024}, + const std::string &batch_label = ""); + + private: + std::string graph_name_; +}; +} // namespace ut +} // namespace ge + +#endif // GRAPH_UTILS_BUFFER_POOL_GRAPH_BUILDER_H_ diff --git a/tests/ut/ge/graph_ir/ge_ir_build_unittest.cc b/tests/ut/ge/graph_ir/ge_ir_build_unittest.cc new file mode 100644 index 00000000..dd6b1881 --- /dev/null +++ b/tests/ut/ge/graph_ir/ge_ir_build_unittest.cc @@ -0,0 +1,110 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "ir_build/atc_ir_common.h" +#include "graph/testcase/ge_graph/graph_builder_utils.h" + +#define protected public +#define private public + +#undef private +#undef protected + +const string DATA = "Data"; +const string AddNYes = "AddNYes"; +const string NETOUTPUT = "NetOutput"; + +using namespace ge; +class UtestIrCommon : public testing::Test { + protected: + void SetUp() {} + + void TearDown() {} +}; + +static ge::OpDescPtr CreateOpDesc(const std::string &name, const std::string &type) { + OpDescPtr op_desc = std::make_shared(name, type); + ge::GeTensorDesc ge_tensor_desc; + op_desc->AddInputDesc("input", ge_tensor_desc); + op_desc->AddOutputDesc("output", ge_tensor_desc); + + return op_desc; +} + +static ComputeGraphPtr BuildComputeGraph() { + auto builder = ut::GraphBuilder("test"); + auto data1 = builder.AddNode("input1", DATA, 1, 1, FORMAT_NCHW, DT_FLOAT, {1, 2, 3}); + auto data2 = builder.AddNode("input2", DATA, 1, 1, FORMAT_NCHW, DT_FLOAT, {4, 10}); + auto addn1 = builder.AddNode("addn1", AddNYes, 2, 1); + auto netoutput = builder.AddNode("netoutput", NETOUTPUT, 1, 0); + + builder.AddDataEdge(data1, 0, addn1, 0); + builder.AddDataEdge(data2, 0, addn1, 1); + builder.AddDataEdge(addn1, 0,netoutput, 0); + + return builder.GetGraph(); +} + +TEST(UtestIrCommon, update_data_op_shape) { + ge::OpDescPtr op_desc = CreateOpDesc("Data", "Data"); + map> shape_map; + shape_map["Data"] = {{1,2}}; + 
+ Status ret = UpdateDataOpShape(op_desc, shape_map); + EXPECT_EQ(ret, ge::SUCCESS); +} + +TEST(UtestIrCommon, update_dynamic_shape_range_success) { + ComputeGraphPtr graph = BuildComputeGraph(); + std::string input_shape_range = "input1:[1, 2~3, -1];input2:[3~5, 10]"; + + Status ret = UpdateDynamicInputShapeRange(graph, input_shape_range); + EXPECT_EQ(ret, ge::SUCCESS); +} + +TEST(UtestIrCommon, update_dynamic_shape_range_failed) { + ComputeGraphPtr graph = BuildComputeGraph(); + // 1 + std::string input_shape_range = "input1;[1, 2~3, -1]"; + Status ret = UpdateDynamicInputShapeRange(graph, input_shape_range); + EXPECT_EQ(ret, ge::PARAM_INVALID); + + // 2 + input_shape_range = "input1:[1, 2~3, -1)"; + ret = UpdateDynamicInputShapeRange(graph, input_shape_range); + EXPECT_EQ(ret, ge::PARAM_INVALID); + + //3 + input_shape_range = "input1:[1, 3~2, -1];input2:[3~5, 10]"; + ret = UpdateDynamicInputShapeRange(graph, input_shape_range); + EXPECT_EQ(ret, ge::FAILED); + + //4 + input_shape_range = "input1:[1, 2~-3, -1]"; + ret = UpdateDynamicInputShapeRange(graph, input_shape_range); + EXPECT_EQ(ret, ge::PARAM_INVALID); + + //5 + input_shape_range = "input:[1, 2~3, -1]"; + ret = UpdateDynamicInputShapeRange(graph, input_shape_range); + EXPECT_EQ(ret, ge::PARAM_INVALID); + + //6 + input_shape_range = "addn1:[1, 2~3, -1]"; + ret = UpdateDynamicInputShapeRange(graph, input_shape_range); + EXPECT_EQ(ret, ge::PARAM_INVALID); +} diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index 0b6ca271..b5aac527 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -15,26 +15,31 @@ */ #include +#include #include - #include "runtime/rt.h" +#include "graph/utils/node_utils.h" #define protected public #define private public #include "hybrid/model/hybrid_model_builder.h" #include "hybrid/model/hybrid_model.h" +#include "hybrid/node_executor/node_executor.h" #include "model/ge_model.h" #include 
"model/ge_root_model.h" - #include "hybrid/node_executor/aicore/aicore_op_task.h" #include "framework/common/taskdown_common.h" #include "framework/common/debug/log.h" #include "graph/ge_context.h" #include "hybrid/executor/hybrid_execution_context.h" +#include "hybrid/executor/hybrid_model_executor.h" #include "hybrid/node_executor/aicore/aicore_task_builder.h" #include "graph/load/model_manager/tbe_handle_store.h" +#include "graph/manager/graph_mem_allocator.h" +#include "hybrid/common/npu_memory_allocator.h" #include "graph/types.h" - +#include "graph/utils/tensor_utils.h" +#include "graph/testcase/ge_graph/graph_builder_utils.h" #undef private #undef protected @@ -43,11 +48,14 @@ using namespace testing; using namespace ge; using namespace hybrid; + class UtestGeHybrid : public testing::Test { protected: void SetUp() {} - void TearDown() {} + void TearDown() { + NpuMemoryAllocator::allocators_.clear(); + } }; static ge::OpDescPtr CreateOpDesc(string name = "", string type = "") { @@ -146,12 +154,58 @@ TEST_F(UtestGeHybrid, index_taskdefs_failed) { ComputeGraphPtr graph = std::make_shared("test"); GeRootModelPtr ge_root_model = make_shared(graph); + ge_root_model->SetModelName("test_name"); HybridModel hybrid_model(ge_root_model); HybridModelBuilder hybrid_model_builder(hybrid_model); + ASSERT_EQ(hybrid_model_builder.Build(), INTERNAL_ERROR); ASSERT_EQ(hybrid_model_builder.IndexTaskDefs(graph, ge_model), INTERNAL_ERROR); } +TEST_F(UtestGeHybrid, parse_force_infershape_nodes) { + const char *const kForceInfershape = "_force_infershape_when_running"; + auto graph = make_shared("graph"); + OpDescPtr op_desc = CreateOpDesc("Conv2D", "Conv2D"); + ge::AttrUtils::SetBool(op_desc, kForceInfershape, true); + auto node = graph->AddNode(op_desc); + std::unique_ptr new_node; + NodeItem::Create(node, new_node); + GeRootModelPtr ge_root_model = make_shared(graph); + HybridModel hybrid_model(ge_root_model); + HybridModelBuilder hybrid_model_builder(hybrid_model); + 
ASSERT_EQ(hybrid_model_builder.ParseForceInfershapeNodes(node, *new_node), SUCCESS); +} +static ComputeGraphPtr BuildDataDirectConnectGraph() { + const char *kRefIndex = "_parent_node_index"; + ge::ut::GraphBuilder builder("subgraph"); + auto data = builder.AddNode("Data", "Data", 1, 1); + auto netoutput = builder.AddNode("NetOutput", "NetOutput", 1, 1); + (void)AttrUtils::SetInt(netoutput->GetOpDesc()->MutableInputDesc(0), kRefIndex, 0); + + builder.AddDataEdge(data, 0, netoutput, 0); + return builder.GetGraph(); +} +TEST_F(UtestGeHybrid, data_direct_connect) { + std::unique_ptr node_item; + auto root_graph = make_shared("root_graph"); + OpDescPtr op_desc = CreateOpDesc("PartitionedCall", "PartitionedCall"); + auto node = root_graph->AddNode(op_desc); + node->SetOwnerComputeGraph(root_graph); + auto sub_graph = BuildDataDirectConnectGraph(); + sub_graph->SetParentGraph(root_graph); + sub_graph->SetParentNode(node); + node->GetOpDesc()->AddSubgraphName("subgraph"); + node->GetOpDesc()->SetSubgraphInstanceName(0, "subgraph"); + root_graph->AddSubgraph("subgraph", sub_graph); + std::unique_ptr new_node; + NodeItem::Create(node, new_node); + GeRootModelPtr ge_root_model = make_shared(root_graph); + HybridModel hybrid_model(ge_root_model); + HybridModelBuilder hybrid_model_builder(hybrid_model); + auto ret = hybrid_model_builder.IdentifyVariableOutputs(*new_node.get()); + ASSERT_EQ(ret, SUCCESS); +} + TEST_F(UtestGeHybrid, index_taskdefs_success) { // build aicore task domi::ModelTaskDef model_task_def; @@ -190,4 +244,417 @@ TEST_F(UtestGeHybrid, index_taskdefs_success) { HybridModelBuilder hybrid_model_builder(hybrid_model); ASSERT_EQ(hybrid_model_builder.IndexTaskDefs(graph, ge_model), SUCCESS); -} \ No newline at end of file +} + +TEST_F(UtestGeHybrid, init_weight_success) { + NpuMemoryAllocator::allocators_.emplace(make_pair(0, nullptr)); + // make graph with sub_graph + ComputeGraphPtr graph = std::make_shared("root_graph"); + OpDescPtr op_desc = 
CreateOpDesc("if", IF); + NodePtr node = graph->AddNode(op_desc); + // make sub graph + ComputeGraphPtr sub_graph = std::make_shared("if_sub_graph"); + OpDescPtr const_op_desc = CreateOpDesc("const", CONSTANT); + vector dims_vec_0 = {2, 1, 4, 1, 2}; + vector data_vec_0 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + GeTensorDesc tensor_desc_0(GeShape(dims_vec_0), FORMAT_NCHW, DT_INT32); + (void)TensorUtils::SetRealDimCnt(tensor_desc_0, dims_vec_0.size()); + ConstGeTensorPtr constTensor_0 = + std::make_shared(tensor_desc_0, (uint8_t *)&data_vec_0[0], data_vec_0.size() * sizeof(int32_t)); + AttrUtils::SetTensor(const_op_desc, ge::ATTR_NAME_WEIGHTS, constTensor_0); + const_op_desc->AddOutputDesc(tensor_desc_0); + NodePtr const_node = sub_graph->AddNode(const_op_desc); + graph->AddSubgraph("sub", sub_graph); + + GeRootModelPtr ge_root_model = make_shared(graph); + GeModelPtr ge_sub_model = make_shared(); + //Buffer weight_buffer = Buffer(128,0); + //ge_sub_model->SetWeight(weight_buffer); + ge_root_model->SetSubgraphInstanceNameToModel("sub",ge_sub_model); + HybridModel hybrid_model(ge_root_model); + HybridModelBuilder hybrid_model_builder(hybrid_model); + auto ret = hybrid_model_builder.InitWeights(); + ASSERT_EQ(ret,SUCCESS); + Buffer weight_buffer = Buffer(128,0); + ge_sub_model->SetWeight(weight_buffer); + ret = hybrid_model_builder.InitWeights(); + ASSERT_EQ(ret,PARAM_INVALID); +} + +TEST_F(UtestGeHybrid, hybrid_model_executor) { + ComputeGraphPtr compute_graph = MakeShared("abc"); + GeRootModelPtr root_model = MakeShared(compute_graph); + HybridModel model(root_model); + HybridModel *model_ptr = &model; + + uint32_t device_id = 0; + rtStream_t stream; + HybridModelExecutor executor(model_ptr, device_id, stream); + executor.Init(); +} + +TEST_F(UtestGeHybrid, test_parse_parallel_group) { + NodeExecutorManager::GetInstance().engine_mapping_.emplace("ops_kernel_info_hccl", + NodeExecutorManager::ExecutorType::HCCL); + ComputeGraphPtr compute_graph = 
MakeShared("test"); + OpDescPtr op_desc = CreateOpDesc("AllReduce", "AllReduce"); + op_desc->SetId(0); + ge::AttrUtils::SetStr(op_desc, ATTR_NAME_PARALLEL_GROUP, "group_1"); + auto node = compute_graph->AddNode(op_desc); + std::unique_ptr node_item; + NodeItem::Create(node, node_item); + node_item->node_id = 0; + + op_desc->SetOpKernelLibName("ops_kernel_info_hccl"); + GeRootModelPtr root_model = MakeShared(compute_graph); + HybridModel model(root_model); + model.root_graph_ = compute_graph; + + HybridModelBuilder builder(model); + ASSERT_EQ(builder.CollectParallelGroups(node_item.get()), SUCCESS); + + ASSERT_EQ(builder.node_to_parallel_groups_.size(), 1); + ASSERT_EQ(builder.parallel_group_to_nodes_.size(), 1); + + OpDescPtr op_desc_1 = CreateOpDesc("subgraph", "PartitionedCall"); + op_desc_1->AddSubgraphName("subgraph"); + auto node_1 = compute_graph->AddNode(op_desc_1); + + ComputeGraphPtr subgraph = MakeShared("subgraph"); + ASSERT_EQ(NodeUtils::SetSubgraph(*node_1, 0, subgraph), GRAPH_SUCCESS); + + std::unique_ptr node_item_1; + NodeItem::Create(node_1, node_item_1); + node_item_1->node_id = 1; + + ASSERT_EQ(builder.CollectParallelGroups(node_item_1.get()), SUCCESS); + ASSERT_EQ(builder.node_to_parallel_groups_.size(), 1); + ASSERT_EQ(builder.parallel_group_to_nodes_.size(), 1); + + OpDescPtr op_desc_2 = CreateOpDesc("sub_node_1", "AllReduce"); + ge::AttrUtils::SetStr(op_desc_2, ATTR_NAME_PARALLEL_GROUP, "group_1"); + auto node_2 = subgraph->AddNode(op_desc_2); + ASSERT_TRUE(node_2 != nullptr); + + OpDescPtr op_desc_3 = CreateOpDesc("sub_node_2", "AllReduce2"); + ge::AttrUtils::SetStr(op_desc_3, ATTR_NAME_PARALLEL_GROUP, "group_2"); + auto node_3 = subgraph->AddNode(op_desc_3); + ASSERT_TRUE(node_3 != nullptr); + + ASSERT_EQ(builder.CollectParallelGroups(node_item_1.get()), SUCCESS); + ASSERT_EQ(builder.node_to_parallel_groups_.size(), 2); + ASSERT_EQ(builder.parallel_group_to_nodes_.size(), 2); + ASSERT_EQ(builder.parallel_group_to_nodes_["group_1"].size(), 
2); + ASSERT_EQ(builder.parallel_group_to_nodes_["group_2"].size(), 1); + + builder.parallel_group_to_nodes_.clear(); + builder.node_ref_inputs_.clear(); + model.node_items_[node] = std::move(node_item); + model.node_items_[node_1] = std::move(node_item_1); + + ASSERT_FALSE(model.node_items_[node]->has_observer); + ASSERT_TRUE(model.node_items_[node_1]->dependents_for_execution.empty()); + ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); + ASSERT_TRUE(model.node_items_[node]->has_observer); + ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution.size(), 1); + ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution[0], node); + + // repeat parse + ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); + ASSERT_TRUE(model.node_items_[node]->has_observer); + ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution.size(), 1); + ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution[0], node); +} + +TEST_F(UtestGeHybrid, unfold_subgraphs_success) { + ComputeGraphPtr merged_graph = nullptr; + + ComputeGraphPtr sub_sub_graph1 = std::make_shared("while_cond"); + OpDescPtr sub_sub_graph_while_cond_data_op_desc = CreateOpDesc("cond_data", DATA); + NodePtr sub_sub_graph_while_cond_data_node = sub_sub_graph1->AddNode(sub_sub_graph_while_cond_data_op_desc); + + ComputeGraphPtr sub_sub_graph2 = std::make_shared("while_body"); + /*OpDescPtr sub_sub_graph_while_body_const_op_desc = CreateOpDesc("body_const", CONSTANT); + NodePtr sub_sub_graph_while_body_const_node = sub_sub_graph2->AddNode(sub_sub_graph_while_body_const_op_desc);*/ + OpDescPtr sub_sub_graph_while_body_data_op_desc = CreateOpDesc("body_data", DATA); + NodePtr sub_sub_graph_while_body_data_node = sub_sub_graph2->AddNode(sub_sub_graph_while_body_data_op_desc); + sub_sub_graph2->SetGraphUnknownFlag(true); + /*OpDescPtr sub_sub_graph_while_body_add_op_desc = CreateOpDesc("body_add", ADD); + NodePtr sub_sub_graph_while_body_add_node = 
sub_sub_graph2->AddNode(sub_sub_graph_while_body_add_node); + sub_sub_graph_while_body_add_node->AddLinkFrom(sub_sub_graph_while_body_data_node); + sub_sub_graph_while_body_add_node->AddLinkFrom(sub_sub_graph_while_body_const_node);*/ + + ComputeGraphPtr sub_graph = std::make_shared("sub_graph"); + OpDescPtr sub_graph_while_op_desc = CreateOpDesc("while", WHILE); + NodePtr sub_graph_while_node = sub_graph->AddNode(sub_graph_while_op_desc); + sub_graph->SetGraphUnknownFlag(true); + sub_graph_while_node->GetOpDesc()->AddSubgraphName("while_cond"); + sub_graph_while_node->GetOpDesc()->AddSubgraphName("while_body"); + sub_graph_while_node->GetOpDesc()->SetSubgraphInstanceName(0, "while_cond"); + sub_graph_while_node->GetOpDesc()->SetSubgraphInstanceName(1, "while_body"); + + ComputeGraphPtr root_graph = std::make_shared("root_graph"); + auto partitioned_call_op_desc = MakeShared("partitioned_call", PARTITIONEDCALL); + auto partitioned_call_node = root_graph->AddNode(partitioned_call_op_desc); + partitioned_call_node->GetOpDesc()->AddSubgraphName("sub_graph"); + partitioned_call_node->GetOpDesc()->SetSubgraphInstanceName(0, "sub_graph"); + + root_graph->AddSubGraph(sub_sub_graph1); + root_graph->AddSubGraph(sub_sub_graph2); + sub_sub_graph1->SetParentGraph(root_graph); + sub_sub_graph2->SetParentGraph(root_graph); + sub_sub_graph1->SetParentNode(sub_graph_while_node); + sub_sub_graph2->SetParentNode(sub_graph_while_node); + + root_graph->AddSubGraph(sub_graph); + sub_graph->SetParentNode(partitioned_call_node); + sub_graph->SetParentGraph(root_graph); + + GeRootModelPtr root_model = MakeShared(root_graph); + HybridModel hybrid_model(root_model); + HybridModelBuilder hybrid_model_builder(hybrid_model); + EXPECT_EQ(hybrid_model_builder.UnfoldSubgraphs(root_graph, merged_graph), SUCCESS); +} + +TEST_F(UtestGeHybrid, TestTaskContext) { + auto graph = make_shared("graph"); + OpDescPtr op_desc = CreateOpDesc("Add", "Add"); + GeShape shape({2, 16}); + GeTensorDesc 
tensor_desc(shape); + op_desc->AddInputDesc(tensor_desc); + op_desc->AddInputDesc(tensor_desc); + op_desc->AddOutputDesc(tensor_desc); + auto node = graph->AddNode(op_desc); + std::unique_ptr node_item; + NodeItem::Create(node, node_item); + node_item->input_start = 0; + node_item->output_start = 0; + + GraphExecutionContext execution_context; + SubgraphContext subgraph_context(nullptr, &execution_context); + subgraph_context.all_inputs_.resize(2); + subgraph_context.all_outputs_.resize(1); + + NodeState node_state(*node_item, &subgraph_context); + auto task_context = TaskContext::Create(&node_state, &execution_context, &subgraph_context); + ASSERT_TRUE(task_context != nullptr); + auto desc = task_context->MutableInputDesc(2); + ASSERT_TRUE(desc == nullptr); + desc = task_context->MutableOutputDesc(0); + ASSERT_TRUE(desc != nullptr); + ASSERT_EQ(desc->GetShape().GetDims(), shape.GetDims()); + GeTensorDesc output_desc; + ASSERT_EQ(task_context->GetOutputDesc(0, output_desc), SUCCESS); + ASSERT_EQ(output_desc.GetShape().GetDims(), shape.GetDims()); + + desc = task_context->MutableInputDesc(0); + ASSERT_TRUE(desc != nullptr); + ASSERT_EQ(desc->GetShape().GetDims(), shape.GetDims()); + GeShape new_shape({8, 2}); + tensor_desc.SetShape(new_shape); + task_context->UpdateInputDesc(1, tensor_desc); + GeTensorDesc new_desc; + ASSERT_EQ(task_context->GetInputDesc(1, new_desc), SUCCESS); + ASSERT_EQ(new_desc.GetShape().GetDims(), new_shape.GetDims()); +} + +TEST_F(UtestGeHybrid, hybrid_model_executor_update_args) { + auto aicore_task = std::unique_ptr(new(std::nothrow)hybrid::AiCoreOpTask()); + + auto graph = make_shared("graph"); + OpDescPtr op_desc = CreateOpDesc("Add", "Add"); + GeShape shape({2, 16}); + GeTensorDesc tensor_desc(shape); + op_desc->AddInputDesc(tensor_desc); + op_desc->AddInputDesc(tensor_desc); + op_desc->AddOutputDesc(tensor_desc); + auto node = graph->AddNode(op_desc); + + std::unique_ptr node_item; + NodeItem::Create(node, node_item); + 
node_item->input_start = 0; + node_item->output_start = 0; + + GraphExecutionContext execution_context; + SubgraphContext subgraph_context(nullptr, &execution_context); + subgraph_context.all_inputs_.resize(2); + subgraph_context.all_outputs_.resize(1); + + NodeState node_state(*node_item, &subgraph_context); + auto task_context = TaskContext::Create(&node_state, &execution_context, &subgraph_context); + + int32_t buffer[1]; + aicore_task->tiling_buffer_ = TensorBuffer::Create(buffer, sizeof(buffer)); + EXPECT_NE(aicore_task->tiling_buffer_, nullptr); + aicore_task->max_arg_count_ = 0; + EXPECT_EQ(aicore_task->UpdateArgs(*task_context), ACL_ERROR_GE_MEMORY_OPERATE_FAILED); + aicore_task->args_ = std::unique_ptr(new uint8_t[sizeof(uintptr_t) * 2]); + EXPECT_EQ(aicore_task->UpdateArgs(*task_context), SUCCESS); +} + +TEST_F(UtestGeHybrid, hybrid_model_executor_check_shape) { + HybridModelExecutor::ExecuteArgs args; + GeTensorDescPtr ge_tensor = make_shared(GeTensorDesc()); + vector dim = {2 , 3}; + ge_tensor->SetShape(GeShape(dim)); + args.input_desc.push_back(ge_tensor); + + // create node + ge::ComputeGraphPtr graph = std::make_shared("God"); + OpDescPtr op_desc = std::make_shared("data", DATA); + GeTensorDesc tensor_desc(GeShape({2, 3})); + std::vector> shape_range({std::pair(1, 3), + std::pair(2, 4)}); + tensor_desc.SetShapeRange(shape_range); + op_desc->AddInputDesc(tensor_desc); + op_desc->AddOutputDesc(tensor_desc); + + NodePtr node = graph->AddNode(op_desc); + std::unique_ptr new_node; + NodeItem::Create(node, new_node); + new_node->is_dynamic = true; + + GraphItem graph_item; + graph_item.input_nodes_.emplace_back(new_node.get()); + + Status ret = HybridModelExecutor::CheckInputShapeByShapeRange(&graph_item, args); + ASSERT_EQ(ret, ge::SUCCESS); + + HybridModelExecutor::ExecuteArgs args1; + ret = HybridModelExecutor::CheckInputShapeByShapeRange(&graph_item, args1); + ASSERT_EQ(ret, ge::INTERNAL_ERROR); + + HybridModelExecutor::ExecuteArgs args2; + 
GeTensorDescPtr ge_tensor2 = make_shared(GeTensorDesc()); + vector dim2 = {-1 , 3}; + ge_tensor2->SetShape(GeShape(dim2)); + args2.input_desc.push_back(ge_tensor2); + + ret = HybridModelExecutor::CheckInputShapeByShapeRange(&graph_item, args1); + ASSERT_EQ(ret, ge::INTERNAL_ERROR); + + HybridModelExecutor::ExecuteArgs args3; + ret = HybridModelExecutor::CheckInputShapeByShapeRange(&graph_item, args3); + ASSERT_EQ(ret, ge::INTERNAL_ERROR); +} + +TEST_F(UtestGeHybrid, TestOptimizeDependenciesForConstInputs) { + ComputeGraphPtr compute_graph = MakeShared("test"); + GeRootModelPtr root_model = MakeShared(compute_graph); + HybridModel model(root_model); + model.root_graph_ = compute_graph; + HybridModelBuilder builder(model); + + GeShape shape({2, 16}); + GeTensorDesc tensor_desc(shape); + std::unique_ptr const_node_item; + { + OpDescPtr const_op_desc = CreateOpDesc("Constant", "Const"); + const_op_desc->AddOutputDesc(tensor_desc); + auto const_node = compute_graph->AddNode(const_op_desc); + NodeItem::Create(const_node, const_node_item); + } + + std::unique_ptr non_const_node_item; + { + OpDescPtr op_desc = CreateOpDesc("Add", "Add"); + op_desc->AddOutputDesc(tensor_desc); + auto const_node = compute_graph->AddNode(op_desc); + NodeItem::Create(const_node, non_const_node_item); + } + + std::unique_ptr known_node_item; + { + OpDescPtr known_op_desc = CreateOpDesc("known", "PartitionedCall"); + known_op_desc->AddOutputDesc(tensor_desc); + known_op_desc->AddOutputDesc(tensor_desc); + auto known_node = compute_graph->AddNode(known_op_desc); + NodeItem::Create(known_node, known_node_item); + } + + std::unique_ptr dst_node_item; + { + OpDescPtr known_op_desc = CreateOpDesc("SomeOp", "SomeOpType "); + known_op_desc->AddOutputDesc(tensor_desc); + known_op_desc->AddOutputDesc(tensor_desc); + auto known_node = compute_graph->AddNode(known_op_desc); + NodeItem::Create(known_node, dst_node_item); + } + + float buffer[2 * 16]; + unique_ptr tensor_value(new TensorValue(buffer, 
sizeof(buffer))); + model.constant_tensors_[const_node_item->node] = std::move(tensor_value); + + // Case 1. connect to Const + auto output_id = 1; + builder.host_input_value_dependencies_[dst_node_item.get()].emplace_back(output_id, const_node_item.get()); + builder.host_input_value_dependencies_[dst_node_item.get()].emplace_back(0, non_const_node_item.get()); + dst_node_item->dependents_for_shape_inference.emplace_back(const_node_item->node); + dst_node_item->dependents_for_shape_inference.emplace_back(non_const_node_item->node); + + ASSERT_EQ(builder.OptimizeDependenciesForConstantInputs(), SUCCESS); + ASSERT_EQ(dst_node_item->dependents_for_shape_inference.size(), 1); + ASSERT_EQ(dst_node_item->dependents_for_shape_inference[0], non_const_node_item->node); + + // Case 2. connect to known-subgraph, netoutput connect to Const + builder.host_input_value_dependencies_.clear(); + dst_node_item->dependents_for_shape_inference.clear(); + + builder.known_subgraph_constant_output_refs_[known_node_item.get()].emplace(output_id, const_node_item->node); + builder.host_input_value_dependencies_[dst_node_item.get()].emplace_back(output_id, known_node_item.get()); + builder.host_input_value_dependencies_[dst_node_item.get()].emplace_back(0, non_const_node_item.get()); + + dst_node_item->dependents_for_shape_inference.emplace_back(known_node_item->node); + dst_node_item->dependents_for_shape_inference.emplace_back(non_const_node_item->node); + + ASSERT_EQ(builder.OptimizeDependenciesForConstantInputs(), SUCCESS); + ASSERT_EQ(dst_node_item->dependents_for_shape_inference.size(), 1); + ASSERT_EQ(dst_node_item->dependents_for_shape_inference[0], non_const_node_item->node); +} + +TEST_F(UtestGeHybrid, test_key_for_kernel_bin) { + auto aicore_task = std::unique_ptr(new(std::nothrow)hybrid::AiCoreOpTask()); + OpDesc op_desc("Sum", "Sum"); + EXPECT_EQ(aicore_task->GetKeyForTbeKernel(), OP_EXTATTR_NAME_TBE_KERNEL); + EXPECT_EQ(aicore_task->GetKeyForTvmMagic(), TVM_ATTR_NAME_MAGIC); + 
EXPECT_EQ(aicore_task->GetKeyForTvmMetaData(), TVM_ATTR_NAME_METADATA); + EXPECT_EQ(aicore_task->GetKeyForKernelName(op_desc), "Sum_kernelname"); + + auto atomic_task = std::unique_ptr(new(std::nothrow)hybrid::AtomicAddrCleanOpTask()); + EXPECT_EQ(atomic_task->GetKeyForTbeKernel(), EXT_ATTR_ATOMIC_TBE_KERNEL); + EXPECT_EQ(atomic_task->GetKeyForTvmMagic(), ATOMIC_ATTR_TVM_MAGIC); + EXPECT_EQ(atomic_task->GetKeyForTvmMetaData(), ATOMIC_ATTR_TVM_METADATA); + EXPECT_EQ(atomic_task->GetKeyForKernelName(op_desc), "Sum_atomic_kernelname"); +} + +TEST_F(UtestGeHybrid, TestParseDependentInputNodesForHccl) { + NodeExecutorManager::GetInstance().engine_mapping_.emplace("ops_kernel_info_hccl", + NodeExecutorManager::ExecutorType::HCCL); + ComputeGraphPtr compute_graph = MakeShared("test"); + + OpDescPtr op_desc = CreateOpDesc("Add", "Add"); + auto node = compute_graph->AddNode(op_desc); + std::unique_ptr node_item; + NodeItem::Create(node, node_item); + node_item->node_id = 0; + + OpDescPtr op_desc_1 = CreateOpDesc("AllReduce", "AllReduce"); + op_desc_1->SetOpKernelLibName("ops_kernel_info_hccl"); + auto node_1 = compute_graph->AddNode(op_desc_1); + std::unique_ptr node_item_1; + NodeItem::Create(node_1, node_item_1); + node_item_1->node_id = 1; + + node->GetOutControlAnchor()->LinkTo(node_1->GetInControlAnchor()); + + GeRootModelPtr root_model = MakeShared(compute_graph); + HybridModel model(root_model); + model.root_graph_ = compute_graph; + model.node_items_.emplace(node, std::move(node_item)); + + HybridModelBuilder builder(model); + std::vector deps; + ASSERT_EQ(builder.ParseDependentInputNodes(*node_item_1, deps), SUCCESS); + ASSERT_TRUE(model.GetNodeItem(node)->has_observer); + ASSERT_EQ(node_item_1->dependents_for_execution.size(), 1); +} diff --git a/tests/ut/ge/hybrid/known_node_executor_unittest.cc b/tests/ut/ge/hybrid/known_node_executor_unittest.cc new file mode 100644 index 00000000..16bbe3a0 --- /dev/null +++ b/tests/ut/ge/hybrid/known_node_executor_unittest.cc 
@@ -0,0 +1,71 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#define protected public +#define private public +#include "hybrid/node_executor/compiledsubgraph/known_node_executor.h" +#include "common/dump/dump_manager.h" +#undef private +#undef protected +#include "graph/manager/graph_mem_allocator.h" + +using namespace std; +using namespace testing; +using namespace ge; +using namespace hybrid; + +class UnknownNodeExecutorTest : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +namespace { +class KnownNodeTaskMock : public KnownNodeTask { + public: + KnownNodeTaskMock(std::shared_ptr davinci_model): KnownNodeTask(davinci_model) {}; + ~KnownNodeTaskMock() override = default; + MOCK_METHOD2(DoInitDavinciModel, Status(void *, size_t)); +}; +} + +TEST_F(UnknownNodeExecutorTest, test_init_davinci_model) { + auto davinci_model = std::make_shared(0, nullptr); + davinci_model->SetDeviceId(0); + davinci_model->SetKnownNode(true); + + auto ge_model = make_shared(); + AttrUtils::SetInt(ge_model, ATTR_MODEL_VAR_SIZE, 0); + AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, 1024); + davinci_model->Assign(ge_model); + + HybridModel model(nullptr); + KnownNodeTaskMock mock(davinci_model); + DumpProperties dump_properties; + dump_properties.enable_dump_ = "1"; + DumpManager::GetInstance().AddDumpProperties(model.GetSessionId(), 
dump_properties); + EXPECT_CALL(mock, DoInitDavinciModel).WillRepeatedly(::testing::Return(SUCCESS)); + ASSERT_EQ(mock.InitDavinciModel(model, model.GetModelWeight("subgraph")), SUCCESS); + + int32_t buffer[8]; + model.weight_buffer_map_.emplace("subgraph", TensorBuffer::Create(buffer, sizeof(buffer))); + ASSERT_EQ(mock.InitDavinciModel(model, model.GetModelWeight("subgraph")), SUCCESS); +} diff --git a/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc b/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc index ef19b516..3dfbff41 100644 --- a/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc +++ b/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc @@ -37,6 +37,10 @@ class UtestGeProfilinganager : public testing::Test { void TearDown() override {} }; +int32_t ReporterCallback(uint32_t moduleId, uint32_t type, void *data, uint32_t len) { + return -1; +} + TEST_F(UtestGeProfilinganager, init_success) { setenv("PROFILING_MODE", "true", true); Options options; @@ -53,16 +57,24 @@ TEST_F(UtestGeProfilinganager, init_success) { } TEST_F(UtestGeProfilinganager, ParseOptions) { -setenv("PROFILING_MODE", "true", true); -Options options; -options.device_id = 0; -options.job_id = "0"; -options.profiling_mode = "1"; -options.profiling_options = R"({"result_path":"/data/profiling","training_trace":"on","task_trace":"on","aicpu_trace":"on","fp_point":"Data_0","bp_point":"addn","ai_core_metrics":"ResourceConflictRatio"})"; + setenv("PROFILING_MODE", "true", true); + Options options; + options.device_id = 0; + options.job_id = "0"; + options.profiling_mode = "1"; + options.profiling_options = R"({"result_path":"/data/profiling","training_trace":"on","task_trace":"on","aicpu_trace":"on","fp_point":"Data_0","bp_point":"addn","ai_core_metrics":"ResourceConflictRatio"})"; + + + struct MsprofGeOptions prof_conf = {{ 0 }}; + Status ret = ProfilingManager::Instance().ParseOptions(options.profiling_options); + EXPECT_EQ(ret, ge::SUCCESS); +} -struct MsprofGeOptions 
prof_conf = {{ 0 }}; +TEST_F(UtestGeProfilinganager, plungin_init_) { + ProfilingManager::Instance().prof_cb_.msprofReporterCallback = ReporterCallback; -Status ret = ProfilingManager::Instance().ParseOptions(options.profiling_options); -EXPECT_EQ(ret, ge::SUCCESS); + Status ret = ProfilingManager::Instance().PluginInit(); + EXPECT_EQ(ret, INTERNAL_ERROR); + ProfilingManager::Instance().prof_cb_.msprofReporterCallback = nullptr; } diff --git a/tests/ut/ge/single_op/single_op_model_unittest.cc b/tests/ut/ge/single_op/single_op_model_unittest.cc index eaf4564a..dadabaf6 100644 --- a/tests/ut/ge/single_op/single_op_model_unittest.cc +++ b/tests/ut/ge/single_op/single_op_model_unittest.cc @@ -17,7 +17,6 @@ #include #include -//#include "cce/taskdown_common.hpp" #include "graph/load/model_manager/model_utils.h" #include "graph/utils/graph_utils.h" #include "runtime/rt.h" @@ -196,4 +195,31 @@ TEST_F(UtestSingleOpModel, test_op_task_get_profiler_args) { ASSERT_EQ(model_id, 1); } +TEST_F(UtestSingleOpModel, test_build_dynamic_op) { + string model_data_str = "123456789"; + SingleOpModel model("model", model_data_str.c_str(), model_data_str.size()); + model.netoutput_op_ = make_shared("NetOutput", "NetOutput"); + model.model_helper_.model_ = ge::MakeShared(); + + // make graph + auto compute_graph = make_shared("graph"); + auto data_op = make_shared("Data", DATA); + auto data_node = compute_graph->AddNode(data_op); + auto graph = GraphUtils::CreateGraphFromComputeGraph(compute_graph); + model.model_helper_.model_->SetGraph(graph); + + // set task_def + auto model_task_def = make_shared(); + domi::TaskDef *task_def = model_task_def->add_task(); + task_def->set_type(RT_MODEL_TASK_KERNEL); + domi::KernelDef *kernel_def = task_def->mutable_kernel(); + domi::KernelContext *context = kernel_def->mutable_context(); + context->set_kernel_type(2); // ccKernelType::TE + model.model_helper_.model_->SetModelTaskDef(model_task_def); + + std::mutex stream_mu_; + DynamicSingleOp 
dynamic_single_op(0, &stream_mu_, nullptr); + StreamResource res((uintptr_t)1); + model.BuildDynamicOp(res, dynamic_single_op); +} diff --git a/tests/ut/ge/single_op/single_op_unittest.cc b/tests/ut/ge/single_op/single_op_unittest.cc new file mode 100644 index 00000000..8c2f6e51 --- /dev/null +++ b/tests/ut/ge/single_op/single_op_unittest.cc @@ -0,0 +1,163 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "runtime/rt.h" + +#define protected public +#define private public +#include "single_op/single_op.h" +#include "single_op/single_op_manager.h" +#undef private +#undef protected + +using namespace std; +using namespace ge; + +class UtestSingleOp : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +TEST_F(UtestSingleOp, test_dynamic_singleop_execute_async) { + uintptr_t resource_id = 0; + std::mutex stream_mu; + rtStream_t stream = nullptr; + rtStreamCreate(&stream, 0); + DynamicSingleOp dynamic_single_op(resource_id, &stream_mu, stream); + + vector dims_vec_0 = {2}; + vector input_desc; + GeTensorDesc tensor_desc_0(GeShape(dims_vec_0), FORMAT_NCHW, DT_INT32); + // input data from device + AttrUtils::SetInt(tensor_desc_0, ATTR_NAME_PLACEMENT, 0); + input_desc.emplace_back(tensor_desc_0); + + vector input_buffers; + ge::DataBuffer data_buffer; + data_buffer.data = new char[4]; + data_buffer.length = 4; + 
input_buffers.emplace_back(data_buffer); + + vector output_desc; + vector output_buffers; + + // UpdateRunInfo failed + EXPECT_EQ(dynamic_single_op.ExecuteAsync(input_desc, input_buffers, output_desc, output_buffers), ACL_ERROR_GE_PARAM_INVALID); +} + +TEST_F(UtestSingleOp, test_dynamic_singleop_execute_async1) { + uintptr_t resource_id = 0; + std::mutex stream_mu; + rtStream_t stream = nullptr; + rtStreamCreate(&stream, 0); + DynamicSingleOp dynamic_single_op(resource_id, &stream_mu, stream); + dynamic_single_op.num_inputs_ = 1; + + vector dims_vec_0 = {2}; + vector input_desc; + GeTensorDesc tensor_desc_0(GeShape(dims_vec_0), FORMAT_NCHW, DT_INT32); + // input data from host + AttrUtils::SetInt(tensor_desc_0, ATTR_NAME_PLACEMENT, 1); + input_desc.emplace_back(tensor_desc_0); + + int64_t input_size = 0; + EXPECT_EQ(TensorUtils::GetTensorMemorySizeInBytes(tensor_desc_0, input_size), SUCCESS); + EXPECT_EQ(input_size, 64); + EXPECT_NE(SingleOpManager::GetInstance().GetResource(resource_id, stream), nullptr); + + vector input_buffers; + ge::DataBuffer data_buffer; + data_buffer.data = new char[4]; + data_buffer.length = 4; + input_buffers.emplace_back(data_buffer); + + vector output_desc; + vector output_buffers; + + auto *tbe_task = new (std::nothrow) TbeOpTask(); + ge::OpDescPtr op_desc = std::make_shared("Mul", MATMUL); + ge::ComputeGraphPtr graph = std::make_shared("default"); + ge::NodePtr node = graph->AddNode(op_desc); + tbe_task->node_ = node; + + dynamic_single_op.op_task_.reset((OpTask *)(tbe_task)); + + OpDescPtr desc_ptr = MakeShared("name1", "type1"); + EXPECT_EQ(desc_ptr->AddInputDesc("x", GeTensorDesc(GeShape({2}), FORMAT_NCHW)), GRAPH_SUCCESS); + dynamic_single_op.op_task_->op_desc_ = desc_ptr; + // UpdateRunInfo failed + EXPECT_EQ(dynamic_single_op.ExecuteAsync(input_desc, input_buffers, output_desc, output_buffers), PARAM_INVALID); +} + + +TEST_F(UtestSingleOp, test_singleop_execute_async1) { + StreamResource *res = new (std::nothrow) 
StreamResource(1); + std::mutex stream_mu; + rtStream_t stream = nullptr; + rtStreamCreate(&stream, 0); + SingleOp single_op(res, &stream_mu, stream); + + vector input_buffers; + ge::DataBuffer data_buffer; + data_buffer.data = new char[4]; + data_buffer.length = 4; + data_buffer.placement = 1; + input_buffers.emplace_back(data_buffer); + vector output_buffers; + + single_op.input_sizes_.emplace_back(4); + SingleOpModelParam model_params; + single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params)); + single_op.args_.resize(1); + EXPECT_EQ(single_op.hybrid_model_executor_, nullptr); + EXPECT_EQ(single_op.running_param_->mem_base, nullptr); + EXPECT_EQ(single_op.tasks_.size(), 0); + EXPECT_EQ(single_op.ExecuteAsync(input_buffers, output_buffers), SUCCESS); +} + +TEST_F(UtestSingleOp, test_singleop_execute_async2) { + StreamResource *res = new (std::nothrow) StreamResource(1); + std::mutex stream_mu; + rtStream_t stream = nullptr; + rtStreamCreate(&stream, 0); + SingleOp single_op(res, &stream_mu, stream); + + vector input_buffers; + ge::DataBuffer data_buffer; + data_buffer.data = new char[4]; + data_buffer.length = 4; + data_buffer.placement = 1; + input_buffers.emplace_back(data_buffer); + vector output_buffers; + + single_op.input_sizes_.emplace_back(4); + SingleOpModelParam model_params; + single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params)); + single_op.args_.resize(1); + + GeTensorDesc tensor_desc(GeShape({1}), FORMAT_NHWC, DT_UINT64); + single_op.inputs_desc_.emplace_back(tensor_desc); + std::shared_ptr root_model = ge::MakeShared(); + single_op.hybrid_model_.reset(new (std::nothrow)hybrid::HybridModel(root_model)); + single_op.hybrid_model_executor_.reset(new (std::nothrow)hybrid::HybridModelExecutor(single_op.hybrid_model_.get(), 0, stream)); + EXPECT_EQ(single_op.running_param_->mem_base, nullptr); + EXPECT_EQ(single_op.tasks_.size(), 0); + EXPECT_EQ(single_op.ExecuteAsync(input_buffers, 
output_buffers), PARAM_INVALID); +} \ No newline at end of file diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h index 018f4e6c..2cf6712f 100644 --- a/third_party/fwkacllib/inc/runtime/dev.h +++ b/third_party/fwkacllib/inc/runtime/dev.h @@ -59,6 +59,7 @@ typedef enum tagRtAicpuDeployType { typedef enum tagRtFeatureType { FEATURE_TYPE_MEMCPY = 0, + FEATURE_TYPE_MEMORY = 1, FEATURE_TYPE_RSV } rtFeatureType_t; @@ -72,6 +73,11 @@ typedef enum tagMemcpyInfo { MEMCPY_INFO_RSV } rtMemcpyInfo_t; +typedef enum tagMemoryInfo { + MEMORY_INFO_TS_4G_LIMITED = 0, + MEMORY_INFO_RSV +} rtMemoryInfo_t; + /** * @ingroup dvrt_dev * @brief get total device number. diff --git a/third_party/fwkacllib/inc/toolchain/prof_callback.h b/third_party/fwkacllib/inc/toolchain/prof_callback.h index 3fad74bc..18550157 100644 --- a/third_party/fwkacllib/inc/toolchain/prof_callback.h +++ b/third_party/fwkacllib/inc/toolchain/prof_callback.h @@ -74,6 +74,7 @@ enum MsprofReporterCallbackType { MSPROF_REPORTER_REPORT = 0, // report data MSPROF_REPORTER_INIT, // init reporter MSPROF_REPORTER_UNINIT, // uninit reporter + MSPROF_REPORTER_DATA_MAX_LEN, // data max length for calling report callback }; /**