diff --git a/ge/common/ge/op_tiling_manager.cc b/ge/common/ge/op_tiling_manager.cc index 9b5ba2d7..db959368 100644 --- a/ge/common/ge/op_tiling_manager.cc +++ b/ge/common/ge/op_tiling_manager.cc @@ -88,4 +88,8 @@ void OpTilingManager::LoadSo() { } } +OpTilingManager &OpTilingManager::GetInstance() { + static OpTilingManager instance; + return instance; +} } // namespace ge diff --git a/ge/common/ge/op_tiling_manager.h b/ge/common/ge/op_tiling_manager.h index d4e7f34e..17761969 100644 --- a/ge/common/ge/op_tiling_manager.h +++ b/ge/common/ge/op_tiling_manager.h @@ -25,6 +25,7 @@ using SoToHandleMap = std::map; class OpTilingManager { public: OpTilingManager() = default; + static OpTilingManager &GetInstance(); ~OpTilingManager(); void LoadSo(); diff --git a/ge/executor/CMakeLists.txt b/ge/executor/CMakeLists.txt index de8025f3..d7dfdc84 100644 --- a/ge/executor/CMakeLists.txt +++ b/ge/executor/CMakeLists.txt @@ -72,7 +72,89 @@ set(SRC_LIST "../single_op/task/tbe_task_builder.cc" "../single_op/task/aicpu_task_builder.cc" "../single_op/task/aicpu_kernel_task_builder.cc" - "../hybrid/hybrid_davinci_model_stub.cc" + "../hybrid/common/tensor_value.cc" + "../hybrid/common/npu_memory_allocator.cc" + "../hybrid/executor/rt_callback_manager.cc" + "../hybrid/executor/node_state.cc" + "../hybrid/executor/node_done_manager.cc" + "../hybrid/executor/hybrid_profiler.cc" + "../hybrid/executor/hybrid_model_executor.cc" + "../hybrid/executor/hybrid_model_async_executor.cc" + "../hybrid/executor/hybrid_execution_context.cc" + "../hybrid/executor/subgraph_context.cc" + "../hybrid/executor/subgraph_executor.cc" + "../hybrid/executor/worker/task_compile_engine.cc" + "../hybrid/executor/worker/shape_inference_engine.cc" + "../hybrid/executor/worker/execution_engine.cc" + "../hybrid/model/hybrid_model.cc" + "../hybrid/model/hybrid_model_builder.cc" + "../hybrid/model/node_item.cc" + "../hybrid/model/graph_item.cc" + "../hybrid/node_executor/aicore/aicore_node_executor.cc" + 
"../hybrid/node_executor/aicore/aicore_op_task.cc" + "../hybrid/node_executor/aicore/aicore_task_builder.cc" + "../hybrid/node_executor/aicpu/aicpu_node_executor.cc" + "../hybrid/node_executor/compiledsubgraph/known_node_executor.cc" + "../hybrid/node_executor/ge_local/ge_local_node_executor.cc" + "../hybrid/node_executor/host_cpu/host_cpu_node_executor.cc" + "../hybrid/node_executor/host_cpu/kernel_factory.cc" + "../hybrid/node_executor/host_cpu/kernel/no_op_kernel.cc" + "../hybrid/node_executor/host_cpu/kernel/variable_kernel.cc" + "../hybrid/node_executor/host_cpu/kernel/assign_kernel.cc" + "../hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc" + "../hybrid/node_executor/controlop/control_op_executor.cc" + "../hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc" + "../hybrid/node_executor/rts/rts_node_executor.cc" + "../hybrid/node_executor/node_executor.cc" + "../hybrid/node_executor/task_context.cc" + "../hybrid/hybrid_davinci_model.cc" + "../ge_local_engine/engine/host_cpu_engine.cc" + "../graph/common/omg_util.cc" + "../graph/manager/host_mem_manager.cc" + "../graph/build/memory/var_mem_assign_util.cc" + "../host_kernels/transpose_kernel.cc" + "../host_kernels/add_kernel.cc" + "../host_kernels/broadcast_args_kernel.cc" + "../host_kernels/broadcast_gradient_args_kernel.cc" + "../host_kernels/cast_kernel.cc" + "../host_kernels/concat_offset_kernel.cc" + "../host_kernels/concat_v2_kernel.cc" + "../host_kernels/dynamic_stitch_kernel.cc" + "../host_kernels/identity_kernel.cc" + "../host_kernels/empty_kernel.cc" + "../host_kernels/expanddims_kernel.cc" + "../host_kernels/fill_kernel.cc" + "../host_kernels/floordiv_kernel.cc" + "../host_kernels/floormod_kernel.cc" + "../host_kernels/gather_v2_kernel.cc" + "../host_kernels/greater_kernel.cc" + "../host_kernels/kernel_utils.cc" + "../host_kernels/maximum_kernel.cc" + "../host_kernels/mul_kernel.cc" + "../host_kernels/pack_kernel.cc" + "../host_kernels/permute_kernel.cc" + 
"../host_kernels/range_kernel.cc" + "../host_kernels/rank_kernel.cc" + "../host_kernels/reduce_prod_kernel.cc" + "../host_kernels/reshape_kernel.cc" + "../host_kernels/rsqrt_kernel.cc" + "../host_kernels/shape_kernel.cc" + "../host_kernels/shape_n_kernel.cc" + "../host_kernels/size_kernel.cc" + "../host_kernels/slice_d_kernel.cc" + "../host_kernels/slice_kernel.cc" + "../host_kernels/squeeze_kernel.cc" + "../host_kernels/unsqueeze_kernel.cc" + "../host_kernels/ssd_prior_box_kernel.cc" + "../host_kernels/strided_slice_kernel.cc" + "../host_kernels/sub_kernel.cc" + "../host_kernels/transdata_kernel.cc" + "../host_kernels/unpack_kernel.cc" + "../graph/passes/pass_utils.cc" + "../graph/common/bcast.cc" + "../common/fp16_t.cc" + "../common/formats/format_transfers/format_transfer_transpose.cc" + "../common/formats/utils/formats_trans_utils.cc" ) ######## libge_executor.a ######## @@ -105,9 +187,9 @@ target_include_directories(ge_executor PRIVATE ${CMAKE_BINARY_DIR}/proto/ge #### yellow zone #### ${GE_CODE_DIR}/../inc - ${GE_CODE_DIR}/../inc/cce + ${GE_CODE_DIR}/../inc/cce #### blue zone #### - ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${GE_CODE_DIR}/third_party/fwkacllib/inc ) target_link_libraries(ge_executor PRIVATE @@ -147,9 +229,9 @@ target_include_directories(ge_executor_shared PRIVATE ${CMAKE_BINARY_DIR}/proto/ge #### yellow zone #### ${GE_CODE_DIR}/../inc - ${GE_CODE_DIR}/../inc/cce + ${GE_CODE_DIR}/../inc/cce #### blue zone #### - ${GE_CODE_DIR}/third_party/fwkacllib/inc + ${GE_CODE_DIR}/third_party/fwkacllib/inc ) target_link_libraries(ge_executor_shared PRIVATE @@ -158,7 +240,7 @@ target_link_libraries(ge_executor_shared PRIVATE -Wl,--no-as-needed ge_common runtime - slog + slog mmpa graph register diff --git a/ge/executor/ge_executor.cc b/ge/executor/ge_executor.cc index add95372..3e916916 100755 --- a/ge/executor/ge_executor.cc +++ b/ge/executor/ge_executor.cc @@ -39,6 +39,8 @@ #include "graph/manager/graph_var_manager.h" #include 
"graph/load/new_model_manager/davinci_model.h" #include "opskernel_manager/ops_kernel_builder_manager.h" +#include "graph/opsproto_manager.h" +#include "ge_local_engine/engine/host_cpu_engine.h" using std::string; using std::vector; @@ -221,6 +223,33 @@ class ModelListenerAdapter : public ModelListener { std::shared_ptr listener; }; +static void InitOpsProtoManger() { + string opsproto_path; + const char *path_env = std::getenv("ASCEND_OPP_PATH"); + if (path_env != nullptr) { + string path = path_env; + string file_path = RealPath(path.c_str()); + if (file_path.empty()) { + GELOGE(FAILED, "File path %s is invalid.", path.c_str()); + return; + } + opsproto_path = (path + "/op_proto/custom/" + ":") + (path + "/op_proto/built-in/"); + GELOGI("Get opsproto so path from env : %s", path.c_str()); + } else { + string path_base = PluginManager::GetPath(); + GELOGI("path_base is %s", path_base.c_str()); + path_base = path_base.substr(0, path_base.rfind('/')); + path_base = path_base.substr(0, path_base.rfind('/') + 1); + opsproto_path = (path_base + "ops/op_proto/custom/" + ":") + (path_base + "ops/op_proto/built-in/"); + } + + GELOGI("Get opsproto path is %s", opsproto_path.c_str()); + OpsProtoManager *manager = OpsProtoManager::Instance(); + map option_tmp; + option_tmp.emplace(std::pair(string("ge.opsProtoLibPath"), opsproto_path)); + (void)manager->Initialize(option_tmp); +} + GeExecutor::GeExecutor() {} Status GeExecutor::Initialize() { @@ -230,6 +259,16 @@ Status GeExecutor::Initialize() { return ge::SUCCESS; } + OpTilingManager::GetInstance().LoadSo(); + + Status initHostCpuEngineStatus = HostCpuEngine::GetInstance().Initialize(); + if (initHostCpuEngineStatus != SUCCESS) { + GELOGE(initHostCpuEngineStatus, "Failed to initialize HostCpuEngine"); + return initHostCpuEngineStatus; + } + + InitOpsProtoManger(); + std::vector mem_type(1, RT_MEMORY_HBM); mem_type.push_back(RT_MEMORY_P2P_DDR); auto ret = MemManager::Instance().Initialize(mem_type); @@ -600,10 +639,16 @@ 
Status GeExecutor::UnloadModel(uint32_t model_id) { return ACL_ERROR_GE_INTERNAL_ERROR; } - std::shared_ptr davinci_model = ModelManager::GetInstance()->GetModel(model_id); - if (davinci_model != nullptr) { - uint64_t session_id = davinci_model->GetSessionId(); + std::shared_ptr hybrid_davinci_model = ModelManager::GetInstance()->GetHybridModel(model_id); + if (hybrid_davinci_model != nullptr) { + uint64_t session_id = hybrid_davinci_model->GetSessionId(); VarManagerPool::Instance().RemoveVarManager(session_id); + } else { + std::shared_ptr davinci_model = ModelManager::GetInstance()->GetModel(model_id); + if (davinci_model != nullptr) { + uint64_t session_id = davinci_model->GetSessionId(); + VarManagerPool::Instance().RemoveVarManager(session_id); + } } ret = GraphLoader::UnloadModel(model_id); if (ret != SUCCESS) { @@ -933,6 +978,26 @@ Status GeExecutor::LoadModelWithQ(uint32_t &model_id, const ModelData &model_dat */ Status GeExecutor::ExecModel(uint32_t model_id, void *stream, const ge::RunModelData &run_input_data, ge::RunModelData &run_output_data, bool async_mode) { + std::vector input_desc = {}; + std::vector output_desc = {}; + return ExecModel(model_id, stream, run_input_data, input_desc, run_output_data, output_desc, async_mode); +} + +/** +* @ingroup ge +* @brief Synchronous execution of offline model(Do not create thread) +* @param [in] uint32_t model_id: Model ID to execute + void* stream: stream to execute + const domi::InputData *input_data: Model input data + const std::vector &input_desc: Description of model input data + bool async_mode: is asynchronize mode +* @param [out] domi::OutputData *output_data: Model output data +* @param [out] std::vector &output_desc: Description of model output data +* @return SUCCESS handle successfully / others handle failed +*/ +Status GeExecutor::ExecModel(uint32_t model_id, void *stream, const ge::RunModelData &run_input_data, + const std::vector &input_desc, ge::RunModelData &run_output_data, + std::vector 
&output_desc, bool async_mode) { if (!isInit_) { GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!"); return ACL_ERROR_GE_EXEC_NOT_INIT; @@ -957,7 +1022,7 @@ Status GeExecutor::ExecModel(uint32_t model_id, void *stream, const ge::RunModel } } - return GraphLoader::ExecuteModel(model_id, stream, async_mode, input_data, output_data); + return GraphLoader::ExecuteModel(model_id, stream, async_mode, input_data, input_desc, output_data, output_desc); } /** diff --git a/ge/executor/module.mk b/ge/executor/module.mk index 4a0188be..9566ca64 100644 --- a/ge/executor/module.mk +++ b/ge/executor/module.mk @@ -61,9 +61,91 @@ local_ge_executor_src_files := \ ../single_op/task/tbe_task_builder.cc \ ../single_op/task/aicpu_task_builder.cc \ ../single_op/task/aicpu_kernel_task_builder.cc \ - ../hybrid/hybrid_davinci_model_stub.cc\ ../hybrid/node_executor/aicpu/aicpu_ext_info.cc \ ../graph/common/local_context.cc \ + ../hybrid/common/tensor_value.cc \ + ../hybrid/common/npu_memory_allocator.cc \ + ../hybrid/executor/rt_callback_manager.cc \ + ../hybrid/executor/node_state.cc \ + ../hybrid/executor/node_done_manager.cc \ + ../hybrid/executor/hybrid_profiler.cc \ + ../hybrid/executor/hybrid_model_executor.cc \ + ../hybrid/executor/hybrid_model_async_executor.cc \ + ../hybrid/executor/hybrid_execution_context.cc \ + ../hybrid/executor/subgraph_context.cc \ + ../hybrid/executor/subgraph_executor.cc \ + ../hybrid/executor/worker/task_compile_engine.cc \ + ../hybrid/executor/worker/shape_inference_engine.cc \ + ../hybrid/executor/worker/execution_engine.cc \ + ../hybrid/model/hybrid_model.cc \ + ../hybrid/model/hybrid_model_builder.cc \ + ../hybrid/model/node_item.cc \ + ../hybrid/model/graph_item.cc \ + ../hybrid/node_executor/aicore/aicore_node_executor.cc \ + ../hybrid/node_executor/aicore/aicore_op_task.cc \ + ../hybrid/node_executor/aicore/aicore_task_builder.cc \ + ../hybrid/node_executor/aicpu/aicpu_node_executor.cc \ + 
../hybrid/node_executor/compiledsubgraph/known_node_executor.cc \ + ../hybrid/node_executor/ge_local/ge_local_node_executor.cc \ + ../hybrid/node_executor/host_cpu/host_cpu_node_executor.cc \ + ../hybrid/node_executor/host_cpu/kernel_factory.cc \ + ../hybrid/node_executor/host_cpu/kernel/no_op_kernel.cc \ + ../hybrid/node_executor/host_cpu/kernel/variable_kernel.cc \ + ../hybrid/node_executor/host_cpu/kernel/assign_kernel.cc \ + ../hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc \ + ../hybrid/node_executor/controlop/control_op_executor.cc \ + ../hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc \ + ../hybrid/node_executor/rts/rts_node_executor.cc \ + ../hybrid/node_executor/node_executor.cc \ + ../hybrid/node_executor/task_context.cc \ + ../hybrid/hybrid_davinci_model.cc \ + ../ge_local_engine/engine/host_cpu_engine.cc \ + ../graph/common/omg_util.cc \ + ../graph/manager/host_mem_manager.cc \ + ../graph/build/memory/var_mem_assign_util.cc \ + ../host_kernels/transpose_kernel.cc \ + ../host_kernels/add_kernel.cc \ + ../host_kernels/broadcast_args_kernel.cc \ + ../host_kernels/broadcast_gradient_args_kernel.cc \ + ../host_kernels/cast_kernel.cc \ + ../host_kernels/concat_offset_kernel.cc \ + ../host_kernels/concat_v2_kernel.cc \ + ../host_kernels/dynamic_stitch_kernel.cc \ + ../host_kernels/identity_kernel.cc \ + ../host_kernels/empty_kernel.cc \ + ../host_kernels/expanddims_kernel.cc \ + ../host_kernels/fill_kernel.cc \ + ../host_kernels/floordiv_kernel.cc \ + ../host_kernels/floormod_kernel.cc \ + ../host_kernels/gather_v2_kernel.cc \ + ../host_kernels/greater_kernel.cc \ + ../host_kernels/kernel_utils.cc \ + ../host_kernels/maximum_kernel.cc \ + ../host_kernels/mul_kernel.cc \ + ../host_kernels/pack_kernel.cc \ + ../host_kernels/permute_kernel.cc \ + ../host_kernels/range_kernel.cc \ + ../host_kernels/rank_kernel.cc \ + ../host_kernels/reduce_prod_kernel.cc \ + ../host_kernels/reshape_kernel.cc \ + 
../host_kernels/rsqrt_kernel.cc \ + ../host_kernels/shape_kernel.cc \ + ../host_kernels/shape_n_kernel.cc \ + ../host_kernels/size_kernel.cc \ + ../host_kernels/slice_d_kernel.cc \ + ../host_kernels/slice_kernel.cc \ + ../host_kernels/squeeze_kernel.cc \ + ../host_kernels/unsqueeze_kernel.cc \ + ../host_kernels/ssd_prior_box_kernel.cc \ + ../host_kernels/strided_slice_kernel.cc \ + ../host_kernels/sub_kernel.cc \ + ../host_kernels/transdata_kernel.cc \ + ../host_kernels/unpack_kernel.cc \ + ../graph/passes/pass_utils.cc \ + ../graph/common/bcast.cc \ + ../common/fp16_t.cc \ + ../common/formats/format_transfers/format_transfer_transpose.cc \ + ../common/formats/utils/formats_trans_utils.cc \ local_ge_executor_c_include := \ proto/insert_op.proto \ diff --git a/ge/ge_local_engine/CMakeLists.txt b/ge/ge_local_engine/CMakeLists.txt index 76590172..615a968f 100755 --- a/ge/ge_local_engine/CMakeLists.txt +++ b/ge/ge_local_engine/CMakeLists.txt @@ -195,7 +195,7 @@ set_target_properties(atc_ge_local_opskernel_builder PROPERTIES ) ############ libge_local_opskernel_builder.a ############ -add_library(ge_local_opskernel_builder_static SHARED ${OPS_KERNEL_SRC_LIST} ${PROTO_HDRS}) +add_library(ge_local_opskernel_builder_static STATIC ${OPS_KERNEL_SRC_LIST} ${PROTO_HDRS}) target_compile_options(ge_local_opskernel_builder_static PRIVATE -Werror diff --git a/ge/ge_local_engine/engine/host_cpu_engine.cc b/ge/ge_local_engine/engine/host_cpu_engine.cc index b14cbb3d..c836d4d6 100755 --- a/ge/ge_local_engine/engine/host_cpu_engine.cc +++ b/ge/ge_local_engine/engine/host_cpu_engine.cc @@ -95,8 +95,8 @@ Status GetDataNumber(const GeTensorDesc &out_desc, uint64_t &data_num) { void HostCpuEngine::CloseSo() { for (auto handle : lib_handles_) { - if (dlclose(handle) != 0) { - GELOGW("failed to close handle, message: %s", dlerror()); + if (mmDlclose(handle) != 0) { + GELOGW("failed to close handle, message: %s", mmDlerror()); } } lib_handles_.clear(); @@ -322,13 +322,13 @@ Status 
HostCpuEngine::LoadLibs(std::vector &lib_paths) { Status HostCpuEngine::LoadLib(const std::string &lib_path) { GELOGI("To invoke dlopen on lib: %s", lib_path.c_str()); - auto handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + auto handle = mmDlopen(lib_path.c_str(), MMPA_RTLD_NOW | MMPA_RTLD_GLOBAL); if (handle == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to invoke dlopen. path = %s, error = %s", lib_path.c_str(), dlerror()); + GELOGE(INTERNAL_ERROR, "Failed to invoke dlopen. path = %s, error = %s", lib_path.c_str(), mmDlerror()); return INTERNAL_ERROR; } - auto initialize = (Status (*)(const HostCpuContext &))dlsym(handle, "Initialize"); + auto initialize = (Status (*)(const HostCpuContext &))mmDlsym(handle, "Initialize"); if (initialize != nullptr) { GELOGI("Invoke function Initialize in lib: %s", lib_path.c_str()); if (initialize(HostCpuContext()) != SUCCESS) { diff --git a/ge/ge_local_engine/engine/host_cpu_engine.h b/ge/ge_local_engine/engine/host_cpu_engine.h index cc6b578c..0b99ecac 100644 --- a/ge/ge_local_engine/engine/host_cpu_engine.h +++ b/ge/ge_local_engine/engine/host_cpu_engine.h @@ -20,7 +20,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "graph/node.h" #include "graph/operator.h" -#include "register/register.h" +#include "external/../register/register.h" namespace ge { class HostCpuEngine { diff --git a/ge/graph/build/graph_builder.cc b/ge/graph/build/graph_builder.cc index 0fa1e1ee..19c0083c 100644 --- a/ge/graph/build/graph_builder.cc +++ b/ge/graph/build/graph_builder.cc @@ -30,6 +30,7 @@ #include "model/ge_model.h" #include "graph/ge_context.h" #include "opskernel_manager/ops_kernel_builder_manager.h" +#include "graph/utils/op_desc_utils.h" using domi::BuildMode; @@ -311,6 +312,53 @@ Status GraphBuilder::BuildForHostCpuGraph(ComputeGraphPtr &comp_graph, GeModelPt return BuildForUnknownShapeGraph(comp_graph, ge_model_ptr, session_id); } +static Status InsertMemcpyNode(const ComputeGraphPtr &graph, const 
OutDataAnchorPtr &out_anchor, + const std::vector &in_anchors, const std::string &name) { + GE_CHECK_NOTNULL(out_anchor); + NodePtr in_node = out_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(in_node); + OpDescBuilder op_desc_builder(name, MEMCPYADDRASYNC); + OpDescPtr op_desc = op_desc_builder.AddInput("x", in_node->GetOpDesc()->GetOutputDesc(0)) + .AddOutput("y", in_node->GetOpDesc()->GetOutputDesc(0)) + .Build(); + (void)AttrUtils::SetBool(op_desc, ATTR_NO_NEED_CONSTANT_FOLDING, false); + if (GraphUtils::InsertNodeAfter(out_anchor, in_anchors, graph->AddNode(op_desc)) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Insert IDENTITY node %s after %s failed.", name.c_str(), in_node->GetName().c_str()); + return FAILED; + } + return SUCCESS; +} + +static Status GenerateTaskForConstant(const std::shared_ptr &graph) { + for (auto &node : graph->GetDirectNode()) { + // CONSTANT not generate task, so insert IDENTITY between CONSTANT and NETOUTPUT + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + auto op_type = op_desc->GetType(); + if (op_type == NETOUTPUT) { + for (InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { + const OutDataAnchorPtr &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + NodePtr in_node = peer_out_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(in_node); + + std::string in_node_op_type = in_node->GetType(); + if (in_node_op_type == CONSTANT) { + GELOGD("Insert MemcpyAsync node between %s and %s.", in_node->GetName().c_str(), node->GetName().c_str()); + std::string name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()) + "_Memcpy"; + if (InsertMemcpyNode(graph, peer_out_anchor, {in_data_anchor}, name) != SUCCESS) { + GELOGE(FAILED, "Insert memcpy between %s and %s failed.", in_node->GetName().c_str(), node->GetName().c_str()); + return FAILED; + } + } + } + } + } + return SUCCESS; +} + Status 
GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list, GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr, @@ -332,6 +380,9 @@ Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, if (sub_graph->GetParentGraph() != comp_graph && !sub_graph->GetParentGraph()->GetGraphUnknownFlag()) { continue; } + + GE_CHK_STATUS_RET(GenerateTaskForConstant(sub_graph), "Generate task For constant node in subgraph failed."); + if (sub_graph->GetGraphUnknownFlag()) { // unknown shape build flow GE_CHK_STATUS_RET(BuildForUnknownShapeGraph(sub_graph, ge_model_ptr, session_id), diff --git a/ge/graph/load/graph_loader.cc b/ge/graph/load/graph_loader.cc index 2eeecc0f..aa825a5d 100755 --- a/ge/graph/load/graph_loader.cc +++ b/ge/graph/load/graph_loader.cc @@ -274,13 +274,16 @@ Status GraphLoader::LoadModelWithQ(uint32_t &model_id, const ModelData &model_da /// @param [in] stream stream to execute model on /// @param [in] async_mode is asynchronize mode. 
/// @param [in] input_data model input data +/// @param [in] input_desc description of model input data /// @param [out] output_data model output data +/// @param [out] output_desc description of model output data /// Status GraphLoader::ExecuteModel(uint32_t model_id, rtStream_t stream, bool async_mode, const InputData &input_data, - OutputData &output_data) { + const std::vector &input_desc, OutputData &output_data, + std::vector &output_desc) { auto model_manager = ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); - Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, input_data, output_data); + Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, input_data, input_desc, output_data, output_desc); if (ret != SUCCESS) { GELOGE(ret, "Execute model failed, model_id:%u.", model_id); return ret; diff --git a/ge/graph/load/graph_loader.h b/ge/graph/load/graph_loader.h index b581f2fa..974af5c1 100755 --- a/ge/graph/load/graph_loader.h +++ b/ge/graph/load/graph_loader.h @@ -65,7 +65,8 @@ class GraphLoader { const std::vector &output_queue_ids); static Status ExecuteModel(uint32_t model_id, rtStream_t stream, bool async_mode, const InputData &input_data, - OutputData &output_data); + const std::vector &input_desc, OutputData &output_data, + std::vector &output_desc); static Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id); diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc index c660f797..37b1fb4f 100755 --- a/ge/graph/load/new_model_manager/davinci_model.cc +++ b/ge/graph/load/new_model_manager/davinci_model.cc @@ -117,7 +117,8 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptrGetWeight(); std::size_t weights_size = weights.GetSize(); GE_CHECK_LE(weights_size, ALLOC_MEMORY_MAX_SIZE); - if ((dev_ptr != nullptr) && (mem_size < TotalMemSize())) { - GELOGE(FAILED, "Invalid mem param: mem_size=%zu totalsize=%zu.", mem_size, 
TotalMemSize()); + if ((weight_ptr != nullptr) && (weight_size < weights_size)) { + GELOGE(FAILED, "Invalid mem param: weight_size=%zu totalsize=%zu.", weight_size, weights_size); return FAILED; } - if ((weight_ptr != nullptr) && (weight_size < weights_size)) { - GELOGE(FAILED, "Invalid mem param: weight_size=%zu totalsize=%zu.", weight_size, weights_size); + weights_mem_base_ = static_cast(dev_ptr); + is_inner_weight_base_ = false; + + if (weights_size != 0) { + weights_mem_base_ = static_cast(weight_ptr); + is_inner_weight_base_ = false; + if (weight_ptr == nullptr) { + weights_mem_base_ = MallocWeightsMem(weights_size); + if (weights_mem_base_ == nullptr) { + GELOGE(GE_EXEC_ALLOC_WEIGHT_MEM_FAILED, "Alloc weight memory failed. size: %zu", weights_size); + return GE_EXEC_ALLOC_WEIGHT_MEM_FAILED; + } + is_inner_weight_base_ = true; + } + GELOGI("[IMAS]InitWeightMem graph_%u MallocMemory type[W] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + weights_mem_base_, weights_size); + GE_CHK_RT_RET(rtMemcpy(weights_mem_base_, weights_size, weights.GetData(), weights_size, RT_MEMCPY_HOST_TO_DEVICE)); + GELOGI("copy weights data to device"); + } + + runtime_param_.weight_base = weights_mem_base_; + return SUCCESS; +} + + +Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { + if (is_feature_map_mem_has_inited_) { + GELOGE(FAILED, "call InitFeatureMapMem more than once ."); + return FAILED; + } + is_feature_map_mem_has_inited_ = true; + + std::size_t data_size = TotalMemSize(); + std::size_t p2p_data_size = P2PMemInfos().at(RT_MEMORY_P2P_DDR).memory_size; + + if ((dev_ptr != nullptr) && (mem_size < TotalMemSize())) { + GELOGE(FAILED, "Invalid mem param: mem_size=%zu totalsize=%zu.", mem_size, TotalMemSize()); return FAILED; } mem_base_ = static_cast(dev_ptr); p2p_mem_base_ = static_cast(dev_ptr); - weights_mem_base_ = static_cast(dev_ptr); is_inner_mem_base_ = false; - is_inner_weight_base_ = false; if (TotalMemSize() && mem_base_ == nullptr) 
{ mem_base_ = MallocFeatureMapMem(data_size); @@ -298,12 +330,14 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p GELOGE(GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED, "Alloc feature map memory failed. size: %zu", data_size); return GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED; } - GEEVENT("[IMAS]InitModelMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + GEEVENT("[IMAS]InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, mem_base_, data_size); - weights_mem_base_ = mem_base_; + if (!is_inner_weight_base_) { + weights_mem_base_ = mem_base_; + is_inner_weight_base_ = true; + } is_inner_mem_base_ = true; - is_inner_weight_base_ = true; } if (p2p_data_size != 0) { @@ -312,27 +346,11 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p GELOGE(GE_EXEC_ALLOC_P2P_MEM_FAILED, "Alloc p2p memory failed,size: %zu", p2p_data_size); return GE_EXEC_ALLOC_P2P_MEM_FAILED; } - GELOGI("InitModelMem graph_%u MallocMemory type[P] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + GELOGI("InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, p2p_mem_base_, p2p_data_size); is_inner_p2p_mem_base_ = true; } - if (weights_size != 0) { - weights_mem_base_ = static_cast(weight_ptr); - is_inner_weight_base_ = false; - if (weight_ptr == nullptr) { - weights_mem_base_ = MallocWeightsMem(weights_size); - if (weights_mem_base_ == nullptr) { - GELOGE(GE_EXEC_ALLOC_WEIGHT_MEM_FAILED, "Alloc weight memory failed. 
size: %zu", weights_size); - return GE_EXEC_ALLOC_WEIGHT_MEM_FAILED; - } - is_inner_weight_base_ = true; - } - GELOGI("[IMAS]InitModelMem graph_%u MallocMemory type[W] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, - weights_mem_base_, weights_size); - GE_CHK_RT_RET(rtMemcpy(weights_mem_base_, weights_size, weights.GetData(), weights_size, RT_MEMCPY_HOST_TO_DEVICE)); - } - GE_CHK_STATUS_RET(InitVariableMem(), "Init variable memory failed."); runtime_param_.mem_base = mem_base_; runtime_param_.weight_base = weights_mem_base_; @@ -642,8 +660,9 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size GE_TIMESTAMP_START(InitModelMem); GELOGD("Known node is %d", known_node_); + GE_CHK_STATUS_RET_NOLOG(InitWeightMem(dev_ptr, weight_ptr, weight_size)); if (!known_node_) { - GE_CHK_STATUS_RET_NOLOG(InitModelMem(dev_ptr, mem_size, weight_ptr, weight_size)); + GE_CHK_STATUS_RET_NOLOG(InitFeatureMapAndP2PMem(dev_ptr, mem_size)); data_inputer_ = new (std::nothrow) DataInputer(); GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, MEMALLOC_FAILED, "data_inputer_ is nullptr."); } @@ -1140,6 +1159,7 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) { GE_IF_BOOL_EXEC(GetGearAndRealOutShapeInfo(input_count, op_desc) != SUCCESS, GELOGE(PARAM_INVALID, "Failed to get gear and real out shape info."); return PARAM_INVALID;); } + return SUCCESS; } diff --git a/ge/graph/load/new_model_manager/davinci_model.h b/ge/graph/load/new_model_manager/davinci_model.h index 893c3d49..650f19eb 100755 --- a/ge/graph/load/new_model_manager/davinci_model.h +++ b/ge/graph/load/new_model_manager/davinci_model.h @@ -584,7 +584,8 @@ class DavinciModel { Status SyncVarData(); - Status InitModelMem(void *dev_ptr, size_t memsize, void *weight_ptr, size_t weightsize); + Status InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weight_size); + Status InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size); void CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, 
InputOutputDescInfo &input); @@ -850,7 +851,9 @@ class DavinciModel { Status GetRealOutputSizeOfMerge(size_t input_index, const NodePtr &merge_node); Status GetGearAndRealOutShapeInfo(size_t input_count, const OpDescPtr &op_desc); - bool is_model_has_inited_; + bool is_weight_mem_has_inited_; + bool is_feature_map_mem_has_inited_; + uint32_t model_id_; uint32_t runtime_model_id_; string name_; diff --git a/ge/graph/load/new_model_manager/model_manager.cc b/ge/graph/load/new_model_manager/model_manager.cc index 080ca889..6f20f63d 100755 --- a/ge/graph/load/new_model_manager/model_manager.cc +++ b/ge/graph/load/new_model_manager/model_manager.cc @@ -31,6 +31,7 @@ #include "model/ge_root_model.h" #include "graph/common/local_context.h" #include "common/formats/utils/formats_trans_utils.h" +#include "hybrid/hybrid_davinci_model.h" namespace ge { thread_local uint32_t device_count = 0; @@ -204,6 +205,13 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) { ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) { std::lock_guard lock(map_mutex_); + auto hybrid_davinci_model = hybrid_model_map_.find(model_id); + if (hybrid_davinci_model != hybrid_model_map_.end()) { + uint64_t session_id = hybrid_davinci_model->second->GetSessionId(); + DestroyAicpuSession(session_id); + return SUCCESS; + } + auto it = model_map_.find(model_id); if (it == model_map_.end()) { GELOGE(GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id); @@ -925,6 +933,12 @@ Status ModelManager::GetInputOutputDescInfo(const uint32_t model_id, vector &output_desc, std::vector &inputFormats, std::vector &outputFormats, bool new_model_desc) { + std::shared_ptr hybrid_davinci_model = GetHybridModel(model_id); + if (hybrid_davinci_model != nullptr) { + hybrid_davinci_model->SetModelDescVersion(new_model_desc); + return hybrid_davinci_model->GetInputOutputDescInfo(input_desc, output_desc, inputFormats, outputFormats); + } + std::shared_ptr davinci_model = 
GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, GE_EXEC_MODEL_ID_INVALID, "GetInputOutputDescInfo Failed, Invalid model id %u!", model_id); @@ -943,6 +957,11 @@ Status ModelManager::GetInputOutputDescInfo(const uint32_t model_id, vector> &batch_info, int32_t &dynamic_type) { + std::shared_ptr hybrid_davinci_model = GetHybridModel(model_id); + if (hybrid_davinci_model != nullptr) { + return hybrid_davinci_model->GetDynamicBatchInfo(batch_info, dynamic_type); + } + std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "GetDynamicBatchInfo failed, Invalid model id %u!", model_id); @@ -975,6 +994,12 @@ Status ModelManager::GetCombinedDynamicDims(const uint32_t model_id, vector &user_input_shape_order) { + auto hybrid_davinci_model = GetHybridModel(model_id); + if (hybrid_davinci_model != nullptr) { + hybrid_davinci_model->GetUserDesignateShapeOrder(user_input_shape_order); + return SUCCESS; + } + auto davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "GetUserDesignateShapeOrder Failed, Invalid Model ID %u!", model_id) @@ -990,6 +1015,12 @@ Status ModelManager::GetCurShape(const uint32_t model_id, std::vector & } Status ModelManager::GetModelAttr(uint32_t model_id, std::vector &dynamic_output_shape_info) { + std::shared_ptr hybrid_davinci_model = GetHybridModel(model_id); + if (hybrid_davinci_model != nullptr) { + hybrid_davinci_model->GetModelAttr(dynamic_output_shape_info); + return SUCCESS; + } + std::shared_ptr davinci_model = GetModel(model_id); GE_CHECK_NOTNULL(davinci_model); davinci_model->GetModelAttr(dynamic_output_shape_info); @@ -1201,10 +1232,25 @@ Status ModelManager::LoadModelWithQ(uint32_t &model_id, const ModelData &model_d /// @param [in] stream model stream /// @param [in] async_mode is asynchronize mode. 
/// @param [in] input_data input data +/// @param [in] input_desc description of input data /// @param [out] output_data output data +/// @param [out] output_desc description of output data /// Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool async_mode, const InputData &input_data, - OutputData &output_data) { + const std::vector &input_desc, OutputData &output_data, + std::vector &output_desc) { + std::shared_ptr hybrid_davinci_model = GetHybridModel(model_id); + if (hybrid_davinci_model != nullptr) { + auto inputs = input_data.blobs; + auto outputs = output_data.blobs; + + Status status = hybrid_davinci_model->Execute(inputs, input_desc, outputs, output_desc, stream); + if (status == SUCCESS) { + GELOGI("Execute model %u success.", model_id); + } + return status; + } + std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "Invalid model id %u.", model_id); diff --git a/ge/graph/load/new_model_manager/model_manager.h b/ge/graph/load/new_model_manager/model_manager.h index 9821a4ab..e3780d5b 100755 --- a/ge/graph/load/new_model_manager/model_manager.h +++ b/ge/graph/load/new_model_manager/model_manager.h @@ -148,10 +148,13 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { /// @param [in] stream model stream /// @param [in] async_mode is asynchronize mode. 
/// @param [in] input_data model input data + /// @param [in] input_desc description of model input data /// @param [out] output_data model output data + /// @param [out] output_desc description of model output data /// ge::Status ExecuteModel(uint32_t model_id, rtStream_t stream, bool async_mode, const InputData &input_data, - OutputData &output_data); + const std::vector &input_desc, OutputData &output_data, + std::vector &output_desc); ge::Status SyncExecuteModel(uint32_t model_id, const std::vector &inputs, std::vector &outputs); diff --git a/ge/graph/partition/dynamic_shape_partition.cc b/ge/graph/partition/dynamic_shape_partition.cc index 87fac994..95f13b6f 100755 --- a/ge/graph/partition/dynamic_shape_partition.cc +++ b/ge/graph/partition/dynamic_shape_partition.cc @@ -26,6 +26,7 @@ #include #include "common/ge/ge_util.h" #include "framework/common/debug/ge_log.h" +#include "framework/common/debug/log.h" #include "framework/common/types.h" #include "graph/debug/ge_attr_define.h" #include "graph/utils/graph_utils.h" @@ -72,7 +73,7 @@ Status DynamicShapePartitioner::Partition() { } REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, true), "Failed set dynamic shape partitioned flag on root graph %s.", root_graph_->GetName().c_str()); - + REQUIRE_SUCCESS(CtrlEdgeTransfer(), "Failed do ctrl edge transfer!"); DumpGraph("_Before_DSP"); auto status = PartitionImpl(); GELOGD("%s.", DebugString().c_str()); @@ -86,6 +87,50 @@ Status DynamicShapePartitioner::Partition() { return status; } +Status DynamicShapePartitioner::CtrlEdgeTransfer() { + GELOGD("Do ctrl edge transfer start!"); + GE_CHECK_NOTNULL(root_graph_); + + bool is_dynamic_shape = false; + (void)AttrUtils::GetBool(root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dynamic_shape); + if (!is_dynamic_shape) { + return SUCCESS; + } + for (auto &subgraph : root_graph_->GetAllSubgraphs()) { + for (ge::NodePtr &n : subgraph->GetDirectNode()) { + auto op_desc = n->GetOpDesc(); + if 
(op_desc == nullptr) { + continue; + } + auto op_type = op_desc->GetType(); + if (op_type == CONSTANT || op_type == CONSTANTOP) { + if (n->GetInAllNodes().empty()) { + GELOGD("[CtrlEdgeTransferPass] node [%s] in nodes is empty", n->GetName().c_str()); + continue; + } + + GELOGD("start to tranfer ctrl edge for const node [%s]", n->GetName().c_str()); + + for (auto &in_control_node : n->GetInControlNodes()) { + GE_CHECK_NOTNULL(in_control_node); + GE_CHK_STATUS_RET(ge::GraphUtils::RemoveEdge(in_control_node->GetOutControlAnchor(), + n->GetInControlAnchor()), "remove edge failed"); + for (auto &out_node : n->GetOutNodes()) { + if (out_node == nullptr) { + continue; + } + GE_CHK_STATUS_RET(ge::GraphUtils::AddEdge(in_control_node->GetOutControlAnchor(), + out_node->GetInControlAnchor()), "add edge failed."); + } + } + } + } + } + + GELOGD("Do ctrl edge transfer end!"); + return SUCCESS; +} + Status DynamicShapePartitioner::PartitionImpl() { REQUIRE_SUCCESS(root_graph_->TopologicalSorting(), "Graph topological sort failed."); REQUIRE_SUCCESS(InitClusters(), "Failed init cluster nodes."); diff --git a/ge/graph/partition/dynamic_shape_partition.h b/ge/graph/partition/dynamic_shape_partition.h index b0477ae8..9772615e 100644 --- a/ge/graph/partition/dynamic_shape_partition.h +++ b/ge/graph/partition/dynamic_shape_partition.h @@ -151,6 +151,7 @@ class DynamicShapePartitioner { Status IsUnknownShapeGraph(ge::ComputeGraphPtr graph, bool &is_unknow); Status IsUnknownShapeNode(ge::NodePtr node, bool &is_unknow); bool IsUnknownShapeTensor(const ge::GeTensorDesc &tensor); + Status CtrlEdgeTransfer(); ge::ComputeGraphPtr root_graph_; // The original graph to partition std::unordered_map> node_2_cluster_; // Record nodes and the cluster it belongs to // topological sorted clusters, this field will change with the splitting. 
diff --git a/ge/graph/passes/pass_utils.cc b/ge/graph/passes/pass_utils.cc index 5359ff63..3adfbde3 100644 --- a/ge/graph/passes/pass_utils.cc +++ b/ge/graph/passes/pass_utils.cc @@ -37,10 +37,6 @@ #include "graph/utils/type_utils.h" namespace ge { -namespace { -const uint32_t kShapeDimSize = 1; -const uint32_t DIM_SIZE_TWO = 2; -} // namespace Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, std::vector &data, std::vector &v_output, const bool scalar_output) { diff --git a/ge/graph/passes/transop_breadth_fusion_pass.cc b/ge/graph/passes/transop_breadth_fusion_pass.cc index 21fb1eaf..689510f0 100644 --- a/ge/graph/passes/transop_breadth_fusion_pass.cc +++ b/ge/graph/passes/transop_breadth_fusion_pass.cc @@ -63,7 +63,7 @@ std::string TransOpBreadthFusionPass::GetNodeId(const int anchor_index, const No GE_IF_BOOL_EXEC(node == nullptr || node->GetOpDesc() == nullptr, GELOGE(FAILED, "node is null"); return ""); if (node->GetType() == CAST) { trans_data_type = true; - } else if (node->GetType() == TRANSPOSE || node->GetType() == TRANSPOSED) { + } else if (node->GetType() == TRANSPOSE || node->GetType() == TRANSPOSED || node->GetType() == EXPANDDIMS) { trans_format = true; trans_shape = true; } else if (node->GetType() == TRANSDATA) { diff --git a/ge/host_cpu_engine/CMakeLists.txt b/ge/host_cpu_engine/CMakeLists.txt index 02b5f996..97b5a0f5 100644 --- a/ge/host_cpu_engine/CMakeLists.txt +++ b/ge/host_cpu_engine/CMakeLists.txt @@ -8,7 +8,7 @@ set(SRC_LIST "engine/host_cpu_engine.cc" "ops_kernel_store/host_cpu_ops_kernel_info.cc" "ops_kernel_store/op/op_factory.cc" - "ops_kernel_store/op/host_op.cc" + "ops_kernel_store/op/host_op.cc" ) set(CPU_OPS_KERNEL_LIST @@ -98,7 +98,7 @@ target_link_libraries(atc_host_cpu_engine PRIVATE set_target_properties(atc_host_cpu_engine PROPERTIES OUTPUT_NAME host_cpu_engine - LIBRARY_OUTPUT_DIRECTORY atclib + LIBRARY_OUTPUT_DIRECTORY atclib ) ############ libhost_cpu_opskernel_builder.so ############ @@ -185,7 
+185,7 @@ set_target_properties(atc_host_cpu_opskernel_builder PROPERTIES ) ############ libhost_cpu_opskernel_builder.a ############ -add_library(host_cpu_opskernel_builder_static SHARED ${CPU_OPS_KERNEL_LIST}) +add_library(host_cpu_opskernel_builder_static STATIC ${CPU_OPS_KERNEL_LIST}) target_compile_options(host_cpu_opskernel_builder_static PRIVATE -Werror diff --git a/ge/host_kernels/floordiv_kernel.cc b/ge/host_kernels/floordiv_kernel.cc index e254af09..df381212 100644 --- a/ge/host_kernels/floordiv_kernel.cc +++ b/ge/host_kernels/floordiv_kernel.cc @@ -112,8 +112,8 @@ void FloorDivKernel::ShapeCal(const std::vector &input, Ge template T FloorDivKernel::DivCal(const T &x_i, const T &y_i) { if ((x_i < static_cast(0)) != (y_i < static_cast(0))) { - T abs_x_i = std::abs(x_i); - T abs_y_i = std::abs(y_i); + T abs_x_i = x_i < 0 ? -x_i : x_i; + T abs_y_i = y_i < 0 ? -y_i : y_i; return static_cast(static_cast(-(abs_x_i + abs_y_i - 1) / abs_y_i)); } else { return static_cast(static_cast(x_i / y_i)); diff --git a/ge/host_kernels/floordiv_kernel.h b/ge/host_kernels/floordiv_kernel.h index d3dc3ff7..b8f6dd12 100755 --- a/ge/host_kernels/floordiv_kernel.h +++ b/ge/host_kernels/floordiv_kernel.h @@ -40,10 +40,6 @@ class FloorDivKernel : public Kernel { template Status DataCal(const std::vector &input, ge::GeTensorPtr output_ptr); Status ComputeByDataType(DataType data_type, const std::vector &input, GeTensorPtr output_ptr); - - int64_t axis_dim_; - int64_t head_dim_; - int64_t end_dim_; }; } // namespace ge diff --git a/ge/host_kernels/ssd_prior_box_kernel.cc b/ge/host_kernels/ssd_prior_box_kernel.cc index b93a4047..57af4026 100644 --- a/ge/host_kernels/ssd_prior_box_kernel.cc +++ b/ge/host_kernels/ssd_prior_box_kernel.cc @@ -187,7 +187,7 @@ Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint32_t aspect_ratios_size, uin return PARAM_INVALID; } - uint tmp_value = aspect_ratios_size * min_sizes_size; + uint32_t tmp_value = aspect_ratios_size * min_sizes_size; if 
(ge::CheckUint32AddOverflow(tmp_value, max_sizes_size) != SUCCESS) { GELOGW("Failed to get list param."); return PARAM_INVALID; @@ -199,7 +199,7 @@ Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint32_t aspect_ratios_size, uin return PARAM_INVALID; } num_priors = static_cast(tmp_value); - + if (ge::CheckIntMulOverflow(layer_width, layer_height) != SUCCESS) { GELOGW("Failed to get list param."); return PARAM_INVALID; @@ -288,7 +288,7 @@ std::unique_ptr SsdPriorboxKernel::BoundaryCalulate(int dim_size, int l } } - return std::move(output_data); + return output_data; } Status SsdPriorboxKernel::Compute(const NodePtr &node, std::vector &v_output) { diff --git a/ge/hybrid/executor/hybrid_execution_context.h b/ge/hybrid/executor/hybrid_execution_context.h index 0910d2c7..0fa5a5d7 100644 --- a/ge/hybrid/executor/hybrid_execution_context.h +++ b/ge/hybrid/executor/hybrid_execution_context.h @@ -77,7 +77,7 @@ do { \ RECORD_PROFILING_EVENT((context), HybridProfiler::EXECUTION, fmt, "Execution", name, ##__VA_ARGS__) #define RECORD_CALLBACK_EVENT(context, name, fmt, ...) 
\ - RECORD_PROFILING_EVENT((context), HybridProfiler::CALLBACK, fmt, "Callback", name, ##__VA_ARGS__) + RECORD_PROFILING_EVENT((context), HybridProfiler::CALLBACKS, fmt, "Callback", name, ##__VA_ARGS__) } // namespace hybrid } // namespace ge #endif // GE_HYBRID_EXECUTOR_HYBRID_EXECUTION_CONTEXT_H_ diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index 468a7014..91996ab3 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -353,6 +353,44 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a return SUCCESS; } +Status HybridModelAsyncExecutor::Execute(const std::vector &inputs, + const std::vector &input_desc, + std::vector &outputs, + std::vector &output_desc) { + GELOGI("Start to execute model."); + + HybridModelExecutor::ExecuteArgs args; + args.inputs.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++i) { + TensorValue tensor_value(inputs[i].data, inputs[i].length); + args.inputs[i] = tensor_value; + } + GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model."); + for (const auto &output_tensor_desc : args.output_desc) { + output_desc.emplace_back(*output_tensor_desc); + } + + for (size_t i = 0; i < args.outputs.size(); ++i) { + int64_t output_real_size = 0; + ge::graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(output_desc[i], output_real_size); + if (graph_status != GRAPH_SUCCESS) { + GELOGE(FAILED, "Get tensor size in bytes failed."); + return FAILED; + } + if (output_real_size > 0) { + if (outputs[i].length < static_cast(output_real_size)) { + GELOGE(FAILED, "output idx[%zu], the memory size of output[%lu] given by user should be greater than or equal to the real size of output[%ld]", + i, outputs[i].length, output_real_size); + return FAILED; + } + GE_CHK_RT_RET(rtMemcpy(outputs[i].data, outputs[i].length, args.outputs[i].GetData(), output_real_size, 
RT_MEMCPY_DEVICE_TO_DEVICE)); + } + outputs[i].length = output_real_size; + } + + return SUCCESS; +} + Status HybridModelAsyncExecutor::Execute(const vector &inputs, vector &outputs) { GELOGD("Start to execute model."); // prepare inputs diff --git a/ge/hybrid/executor/hybrid_model_async_executor.h b/ge/hybrid/executor/hybrid_model_async_executor.h index 8de2beb6..21833b0b 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.h +++ b/ge/hybrid/executor/hybrid_model_async_executor.h @@ -35,6 +35,11 @@ class HybridModelAsyncExecutor { Status Init(); + Status Execute(const std::vector &inputs, + const std::vector &input_desc, + std::vector &outputs, + std::vector &output_desc); + Status Execute(const vector &inputs, vector &outputs); Status Start(const std::shared_ptr &listener); diff --git a/ge/hybrid/executor/hybrid_profiler.h b/ge/hybrid/executor/hybrid_profiler.h index f6027a0b..94a042e4 100644 --- a/ge/hybrid/executor/hybrid_profiler.h +++ b/ge/hybrid/executor/hybrid_profiler.h @@ -33,7 +33,7 @@ class HybridProfiler { SHAPE_INFERENCE, COMPILE, EXECUTION, - CALLBACK + CALLBACKS }; struct Event { diff --git a/ge/hybrid/executor/node_state.h b/ge/hybrid/executor/node_state.h index 48b2ed72..04f1ee4b 100644 --- a/ge/hybrid/executor/node_state.h +++ b/ge/hybrid/executor/node_state.h @@ -27,7 +27,7 @@ namespace ge { namespace hybrid { class NodeTask; -class GraphExecutionContext; +struct GraphExecutionContext; class SubgraphContext; class ShapeFuture { diff --git a/ge/hybrid/hybrid_davinci_model.cc b/ge/hybrid/hybrid_davinci_model.cc index b6f5bb84..7009331c 100755 --- a/ge/hybrid/hybrid_davinci_model.cc +++ b/ge/hybrid/hybrid_davinci_model.cc @@ -38,6 +38,14 @@ class HybridDavinciModel::Impl { return SUCCESS; } + Status Execute(const std::vector &inputs, + const std::vector &input_desc, + std::vector &outputs, + std::vector &output_desc, + rtStream_t stream) { + return executor_.Execute(inputs, input_desc, outputs, output_desc); + } + Status Execute(const vector 
&inputs, vector &outputs) { return executor_.Execute(inputs, outputs); } @@ -68,6 +76,33 @@ class HybridDavinciModel::Impl { executor_.SetDeviceId(device_id); } + uint64_t GetSessionId() { + return model_.GetSessionId(); + } + + Status GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { + return model_.GetDynamicBatchInfo(batch_info, dynamic_type); + } + + void GetUserDesignateShapeOrder(std::vector &user_input_shape_order) { + model_.GetUserDesignateShapeOrder(user_input_shape_order); + } + + void GetModelAttr(std::vector &dynamic_output_shape_info) { + model_.GetModelAttr(dynamic_output_shape_info); + } + + Status GetInputOutputDescInfo(vector &input_desc, + vector &output_desc, + std::vector &input_formats, + std::vector &output_formats) { + return model_.GetInputOutputDescInfo(input_desc, output_desc, input_formats, output_formats); + } + + void SetModelDescVersion(bool is_new_model_desc) { + model_.SetModelDescVersion(is_new_model_desc); + } + private: std::shared_ptr listener_; HybridModel model_; @@ -95,6 +130,14 @@ Status HybridDavinciModel::Init() { return impl_->Init(); } +Status HybridDavinciModel::Execute(const std::vector &inputs, + const std::vector &input_desc, + std::vector &outputs, + std::vector &output_desc, rtStream_t stream) { + GE_CHECK_NOTNULL(impl_); + return impl_->Execute(inputs, input_desc, outputs, output_desc, stream); +} + Status HybridDavinciModel::Execute(const vector &inputs, vector &outputs) { GE_CHECK_NOTNULL(impl_); return impl_->Execute(inputs, outputs); @@ -132,5 +175,41 @@ void HybridDavinciModel::SetDeviceId(uint32_t device_id) { impl_->SetDeviceId(device_id); } } + +Status HybridDavinciModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { + GE_CHECK_NOTNULL(impl_); + return impl_->GetDynamicBatchInfo(batch_info, dynamic_type); +} + +void HybridDavinciModel::GetUserDesignateShapeOrder(std::vector &user_input_shape_order) { + if (impl_ != nullptr) { + 
impl_->GetUserDesignateShapeOrder(user_input_shape_order); + } +} + +void HybridDavinciModel::GetModelAttr(std::vector &dynamic_output_shape_info) { + if (impl_ != nullptr) { + impl_->GetModelAttr(dynamic_output_shape_info); + } +} + +Status HybridDavinciModel::GetInputOutputDescInfo(vector &input_desc, + vector &output_desc, + std::vector &input_formats, + std::vector &output_formats) { + GE_CHECK_NOTNULL(impl_); + return impl_->GetInputOutputDescInfo(input_desc, output_desc, input_formats, output_formats); +} + +void HybridDavinciModel::SetModelDescVersion(bool is_new_model_desc) { + if (impl_ != nullptr) { + impl_->SetModelDescVersion(is_new_model_desc); + } +} + +uint64_t HybridDavinciModel::GetSessionId() { + GE_CHECK_NOTNULL(impl_); + return impl_->GetSessionId(); +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/hybrid_davinci_model.h b/ge/hybrid/hybrid_davinci_model.h index 00a48c1e..5349390c 100644 --- a/ge/hybrid/hybrid_davinci_model.h +++ b/ge/hybrid/hybrid_davinci_model.h @@ -37,6 +37,12 @@ class HybridDavinciModel { Status Init(); + Status Execute(const std::vector &inputs, + const std::vector &input_desc, + std::vector &outputs, + std::vector &output_desc, + rtStream_t stream); + Status Execute(const vector &inputs, vector &outputs); Status ModelRunStart(); @@ -51,6 +57,21 @@ class HybridDavinciModel { void SetDeviceId(uint32_t device_id); + uint64_t GetSessionId(); + + Status GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type); + + void GetUserDesignateShapeOrder(std::vector &user_input_shape_order); + + void GetModelAttr(std::vector &dynamic_output_shape_info); + + Status GetInputOutputDescInfo(vector &input_desc, + vector &output_desc, + std::vector &input_formats, + std::vector &output_formats); + + void SetModelDescVersion(bool is_new_model_desc); + private: HybridDavinciModel() = default; class Impl; diff --git a/ge/hybrid/hybrid_davinci_model_stub.cc b/ge/hybrid/hybrid_davinci_model_stub.cc index 
b95b9efc..366845c5 100644 --- a/ge/hybrid/hybrid_davinci_model_stub.cc +++ b/ge/hybrid/hybrid_davinci_model_stub.cc @@ -28,6 +28,14 @@ Status HybridDavinciModel::Init() { return UNSUPPORTED; } +Status HybridDavinciModel::Execute(const std::vector &inputs, + const std::vector &input_desc, + std::vector &outputs, + std::vector &output_desc, + rtStream_t stream) { + return UNSUPPORTED; +} + Status HybridDavinciModel::Execute(const vector &inputs, vector &outputs) { return UNSUPPORTED; } @@ -52,5 +60,29 @@ void HybridDavinciModel::SetModelId(uint32_t model_id) { void HybridDavinciModel::SetDeviceId(uint32_t device_id) { } + +uint64_t HybridDavinciModel::GetSessionId() { + return 0; +} + +Status HybridDavinciModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { + return UNSUPPORTED; +} + +void HybridDavinciModel::GetUserDesignateShapeOrder(std::vector &user_input_shape_order) { +} + +void HybridDavinciModel::GetModelAttr(std::vector &dynamic_output_shape_info) { +} + +Status HybridDavinciModel::GetInputOutputDescInfo(vector &input_desc, + vector &output_desc, + std::vector &input_formats, + std::vector &output_formats) { + return UNSUPPORTED; +} + +void HybridDavinciModel::SetModelDescVersion(bool is_new_model_desc) { +} } // namespace hybrid } // namespace ge \ No newline at end of file diff --git a/ge/hybrid/model/hybrid_model.cc b/ge/hybrid/model/hybrid_model.cc index 59c7be9a..c319b06b 100644 --- a/ge/hybrid/model/hybrid_model.cc +++ b/ge/hybrid/model/hybrid_model.cc @@ -21,12 +21,18 @@ #include "graph/utils/graph_utils.h" #include "graph/utils/node_utils.h" #include "graph/utils/tensor_utils.h" +#include "graph/utils/type_utils.h" #include "hybrid/common/npu_memory_allocator.h" #include "hybrid/model/hybrid_model_builder.h" #include "hybrid/node_executor/node_executor.h" +#include "common/op/ge_op_utils.h" namespace ge { namespace hybrid { +namespace { +const int64_t kMemSizeUnknownShape = -1; // Unknown shape mem size +} + 
HybridModel::HybridModel(GeRootModelPtr ge_model) : ge_root_model_(std::move(ge_model)) { } @@ -128,7 +134,187 @@ const GraphItem *HybridModel::GetSubgraphItem(const ComputeGraphPtr &subgraph) c } const string &HybridModel::GetModelName() const { - return model_name_; + return model_name_; +} + +Status HybridModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { + // dynamic shape do not need dynamic batch + batch_info = {}; + dynamic_type = -1; + return SUCCESS; +} + +void HybridModel::GetUserDesignateShapeOrder(std::vector &user_input_shape_order) { + // dynamic shape do not need dynamic batch + user_input_shape_order = {}; +} + +void HybridModel::GetModelAttr(std::vector &dynamic_output_shape_info) { + dynamic_output_shape_info = {}; +} + +Status HybridModel::GetInputOutputDescInfo(vector &input_desc, + vector &output_desc, + std::vector &input_formats, + std::vector &output_formats) { + auto node_item_list = root_graph_item_->GetInputNodes(); + if (node_item_list.empty()) { + GELOGE(FAILED, "node item list is empty!"); + return FAILED; + } + + GE_CHECK_NOTNULL(node_item_list[0]->node); + GE_CHECK_NOTNULL(node_item_list[0]->node->GetOpDesc()); + if (node_item_list[0]->node->GetOpDesc()->GetInputsSize() != 1) { + GELOGE(FAILED, "input size of op is not 1!"); + return FAILED; + } + + GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed"); + GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get ouput desc info failed"); + + return SUCCESS; +} + +void HybridModel::SetInputDimsAndShapeRangesInfo(const vector &model_input_dims, std::vector> &shape_ranges, + InputOutputDescInfo &input) { + for (auto model_input_dim : model_input_dims) { + input.shape_info.dims.push_back(model_input_dim); + } + input.shape_info.shape_ranges = shape_ranges; + return; +} + +void HybridModel::CreateInputDimsInfo(const OpDescPtr &op_desc, InputOutputDescInfo &input) { + std::vector> shape_ranges; + if 
(is_new_model_desc_ && op_desc->HasAttr(ATTR_NAME_INPUT_DIMS)) { + // When static aipp is set, need to get the model input dims which processed by aipp + vector model_input_dims; + (void)AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_DIMS, model_input_dims); + SetInputDimsAndShapeRangesInfo(model_input_dims, shape_ranges, input); + return; + } + // judge if this data is linked dynamic aipp first, multiply batch has been considered + if (op_desc->HasAttr("_dynamic_aipp_input_dims")) { + vector dynamic_aipp_input_dims; + (void)AttrUtils::GetListInt(op_desc, "_dynamic_aipp_input_dims", dynamic_aipp_input_dims); + SetInputDimsAndShapeRangesInfo(dynamic_aipp_input_dims, shape_ranges, input); + return; + } else { + vector input_dims = op_desc->GetInputDescPtr(0)->GetShape().GetDims(); + op_desc->GetInputDescPtr(0)->GetShapeRange(shape_ranges); + SetInputDimsAndShapeRangesInfo(input_dims, shape_ranges, input); + return; + } +} + +Status HybridModel::GetInputDescInfo(vector &input_desc, std::vector &formats) { + auto node_item_list = root_graph_item_->GetInputNodes(); + for (auto &node_item : node_item_list) { + InputOutputDescInfo input; + + GE_CHECK_NOTNULL(node_item->node); + auto op_desc = node_item->node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(0)); + + Format format = op_desc->GetInputDescPtr(0)->GetFormat(); + input.data_type = op_desc->GetInputDescPtr(0)->GetDataType(); + input.name = op_desc->GetName(); + + int64_t input_size = 0; + GE_CHK_STATUS_RET(TensorUtils::GetSize(*op_desc->GetInputDescPtr(0), input_size), "get input size failed."); + + // support dynamic shape + if (input_size < 0) { + GELOGD("dynamic shape scene, input size is unknown. " + "format=%d, data_type=%d, input_size=%ld", + format, input.data_type, input_size); + input_size = kMemSizeUnknownShape; // -1 + } + + // not support dynamic shape input for now, so input_size here will be not less than zero. 
+ input.size = input_size; + + CreateInputDimsInfo(op_desc, input); + + formats.push_back(format); + input_desc.push_back(input); + } + is_new_model_desc_ = false; + return SUCCESS; +} + +void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, InputOutputDescInfo &output_desc_info, uint32_t &format_result) { + GE_IF_BOOL_EXEC(output_desc == nullptr, GELOGE(FAILED, "output desc ptr is nullptr"); return ); + Format format = output_desc->GetFormat(); + GeShape shape = output_desc->GetShape(); + std::vector> shape_ranges; + output_desc->GetShapeRange(shape_ranges); + DataType data_type = output_desc->GetDataType(); + format_result = format; + if (format == FORMAT_FRACTAL_Z) { // FraczToHWCK + int64_t k = shape.GetDim(0); // 0: first dim + int64_t c = shape.GetDim(1); // 1: second dim + int64_t h = shape.GetDim(2); // 2: third dim + int64_t w = shape.GetDim(3); // 3: forth dim + output_desc_info.shape_info.dims.push_back(h); + output_desc_info.shape_info.dims.push_back(w); + output_desc_info.shape_info.dims.push_back(c); + output_desc_info.shape_info.dims.push_back(k); + if (shape_ranges.size() == 4) { // 4 dims + output_desc_info.shape_info.shape_ranges.push_back(shape_ranges[2]); // h:2 + output_desc_info.shape_info.shape_ranges.push_back(shape_ranges[3]); // w:3 + output_desc_info.shape_info.shape_ranges.push_back(shape_ranges[1]); // c:1 + output_desc_info.shape_info.shape_ranges.push_back(shape_ranges[0]); // k:0 + } + format_result = FORMAT_HWCN; + } else { + for (size_t j = 0; j < shape.GetDimNum(); j++) { + output_desc_info.shape_info.dims.push_back(shape.GetDim(j)); + } + output_desc_info.shape_info.shape_ranges = shape_ranges; + } + int64_t tensor_size = 0; + (void)TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size); + output_desc_info.size = static_cast(tensor_size); + output_desc_info.data_type = output_desc->GetDataType(); +} + +Status HybridModel::GetOutputDescInfo(vector &output_desc, std::vector &formats) { + std::vector 
output_desc_list; + GE_CHK_STATUS_RET(root_graph_item_->GetOutputDescList(output_desc_list), "get output desc info failed"); // output_desc_list contains vaild input desc + + vector out_node_names; + (void)ge::AttrUtils::GetListStr(ge_root_model_->GetRootGraph(), ATTR_MODEL_OUT_NODES_NAME, out_node_names); + + GE_CHECK_NOTNULL(root_graph_item_->GetOutputNode()); + auto op_desc = root_graph_item_->GetOutputNode()->op_desc; + GE_CHECK_NOTNULL(op_desc); + + auto out_size = static_cast(op_desc->GetInputsSize()); + GE_CHK_BOOL_RET_STATUS(out_size == output_desc_list.size(), FAILED, "output size[%u] not match output_desc_list size[%zu]", out_size, output_desc_list.size()); + + for (uint32_t index = 0; index < out_size; ++index) { + string output_name; + std::vector src_name = op_desc->GetSrcName(); + std::vector src_index = op_desc->GetSrcIndex(); + if (out_size == out_node_names.size()) { + bool contains_colon = out_node_names[index].find(":") != std::string::npos; + output_name = contains_colon ? 
out_node_names[index] : out_node_names[index] + ":" + std::to_string(src_index[index]); + } else { + output_name = std::string("output_") + std::to_string(index) + "_" + src_name[index] + "_" + std::to_string(src_index[index]); + } + + InputOutputDescInfo output_desc_info; + output_desc_info.name = output_name; + + uint32_t format_result; + CreateOutput(output_desc_list[index], output_desc_info, format_result); + output_desc.push_back(output_desc_info); + formats.push_back(format_result); + } + return SUCCESS; } } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/model/hybrid_model.h b/ge/hybrid/model/hybrid_model.h index 11311968..1bc08053 100644 --- a/ge/hybrid/model/hybrid_model.h +++ b/ge/hybrid/model/hybrid_model.h @@ -83,6 +83,30 @@ class HybridModel { const string &GetModelName() const; + Status GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type); + + void GetUserDesignateShapeOrder(std::vector &user_input_shape_order); + + void GetModelAttr(std::vector &dynamic_output_shape_info); + + Status GetInputOutputDescInfo(vector &input_desc, + vector &output_desc, + std::vector &input_formats, + std::vector &outputFormats); + + Status GetInputDescInfo(vector &input_desc, std::vector &formats); + + void CreateOutput(ConstGeTensorDescPtr &output_desc, InputOutputDescInfo &output, uint32_t &format_result); + + Status GetOutputDescInfo(vector &output_desc, std::vector &formats); + + void CreateInputDimsInfo(const OpDescPtr &op_desc, InputOutputDescInfo &input); + + void SetModelDescVersion(bool is_new_model_desc) { is_new_model_desc_ = is_new_model_desc; } + + void SetInputDimsAndShapeRangesInfo(const vector &model_input_dims, std::vector> &shape_ranges, + InputOutputDescInfo &input); + private: friend class HybridModelBuilder; friend class HybridModelAsyncExecutor; @@ -101,6 +125,8 @@ class HybridModel { std::map> subgraph_items_; std::map> node_items_; + bool is_new_model_desc_ = false; // support aipp + // runtime fields uint32_t 
device_id_ = 0; uint32_t model_id_ = 0; diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index cd4c0a83..d519c35b 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -27,16 +27,41 @@ #include "graph/utils/graph_utils.h" #include "hybrid/common/npu_memory_allocator.h" #include "hybrid/node_executor/node_executor.h" +#include "framework/common/debug/ge_log.h" +#include "graph/utils/attr_utils.h" namespace ge { namespace hybrid { namespace { const uint32_t kSubgraphIndex = 0U; const uint32_t kVarOutputIndex = 0U; -const uint32_t kAlignment = 32; const int kBytes = 8; const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown"; +Status SetOutputNameAttr(ComputeGraph &graph) { + vector output_names; + for (const auto &node : graph.GetDirectNode()) { + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + auto op_type = op_desc->GetType(); + if (op_type == NETOUTPUT) { + for (InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { + const OutDataAnchorPtr &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + NodePtr in_node = peer_out_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(in_node); + output_names.push_back(in_node->GetName()); + } + } + } + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&graph, ATTR_MODEL_OUT_NODES_NAME, output_names), + GELOGE(FAILED, "SetListStr of ATTR_MODEL_OUT_NODES_NAME failed."); + return FAILED); + return SUCCESS; +} + int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) { int64_t var_size = 0; auto data_type = desc.GetDataType(); @@ -939,6 +964,10 @@ Status HybridModelBuilder::LoadGeModel(ComputeGraph &sub_graph, const GeModelPtr Status HybridModelBuilder::IndexTaskDefs() { const auto &root_graph = ge_root_model_->GetRootGraph(); + if (SetOutputNameAttr(*root_graph) != SUCCESS) { + GELOGW("Set output name attr failed."); + } + for (auto &it : 
ge_root_model_->GetSubgraphInstanceNameToModel()) { auto &name = it.first; auto &ge_model = it.second; diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.cc b/ge/hybrid/node_executor/aicore/aicore_op_task.cc index 998afd02..80ea579b 100644 --- a/ge/hybrid/node_executor/aicore/aicore_op_task.cc +++ b/ge/hybrid/node_executor/aicore/aicore_op_task.cc @@ -19,6 +19,7 @@ #include "framework/common/debug/log.h" #include "hybrid/executor/hybrid_execution_context.h" #include "hybrid/node_executor/aicore/aicore_task_builder.h" +#include "graph/load/new_model_manager/tbe_handle_store.h" using optiling::OpRunInfo; @@ -36,6 +37,58 @@ Status AiCoreOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def) return SUCCESS; } +Status AiCoreOpTask::RegisterTbeHandle(const OpDesc &op_desc) { + auto op_desc_ptr = std::make_shared(op_desc); + GE_CHECK_NOTNULL(op_desc_ptr); + auto tbe_kernel = op_desc_ptr->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); + if (tbe_kernel == nullptr) { + GELOGE(INTERNAL_ERROR, "TBE: %s can't find tvm bin file!", op_desc_ptr->GetName().c_str()); + return INTERNAL_ERROR; + } + TBEHandleStore &kernel_store = TBEHandleStore::GetInstance(); + rtError_t rt_ret = rtQueryFunctionRegistered(stub_name_.c_str()); + if (rt_ret != RT_ERROR_NONE) { + void *bin_handle = nullptr; + if (!kernel_store.FindTBEHandle(stub_name_.c_str(), bin_handle)) { + GELOGI("TBE: can't find the kernel_name[%s] in HandleMap", stub_name_.c_str()); + rtDevBinary_t binary; + std::string json_string; + GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, TVM_ATTR_NAME_MAGIC, json_string), + GELOGI("Get original type of session_graph_id.")); + if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AICPU") { + binary.magic = RT_DEV_BINARY_MAGIC_ELF_AICPU; + } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF") { + binary.magic = RT_DEV_BINARY_MAGIC_ELF; + } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AIVEC") { + binary.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC; + } else { + 
GELOGE(PARAM_INVALID, "TBE: Invalid parameter magic number! json: %s", json_string.c_str()); + return PARAM_INVALID; + } + binary.version = 0; + binary.data = tbe_kernel->GetBinData(); + binary.length = tbe_kernel->GetBinDataSize(); + GELOGI("TBE: binary.length: %lu", binary.length); + GE_CHK_RT_RET(rtDevBinaryRegister(&binary, &bin_handle)); + std::string meta_data; + GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, TVM_ATTR_NAME_METADATA, meta_data), + GELOGI("Get original type of json_string")); + GELOGI("TBE: meta data: %s", meta_data.empty() ? "null" : meta_data.c_str()); + GE_IF_BOOL_EXEC(!meta_data.empty(), GE_CHK_RT_RET(rtMetadataRegister(bin_handle, meta_data.c_str()))); + kernel_store.StoreTBEHandle(stub_name_.c_str(), bin_handle, tbe_kernel); + } else { + GELOGI("TBE: find the kernel_name[%s] in HandleMap", stub_name_.c_str()); + kernel_store.ReferTBEHandle(stub_name_.c_str()); + } + std::string kernel_name; + GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_ptr, op_desc_ptr->GetName() + "_kernelname", kernel_name), + GELOGI("Get original type of kernel_name")); + GELOGI("TBE: binfile_key=%s, kernel_name=%s", stub_name_.c_str(), kernel_name.c_str()); + GE_CHK_RT_RET(rtFunctionRegister(bin_handle, stub_name_.c_str(), stub_name_.c_str(), kernel_name.c_str(), 0)); + } + return SUCCESS; +} + Status AiCoreOpTask::InitWithTaskDef(const OpDesc &op_desc, const domi::TaskDef &task_def) { GE_CHK_STATUS_RET(ValidateTaskDef(task_def), "[%s] Failed to validate task def: [%s]", @@ -45,6 +98,9 @@ Status AiCoreOpTask::InitWithTaskDef(const OpDesc &op_desc, const domi::TaskDef const domi::KernelDef &kernel_def = task_def.kernel(); const domi::KernelContext &context = kernel_def.context(); stub_name_ = kernel_def.stub_func(); + + GE_CHK_STATUS_RET(RegisterTbeHandle(op_desc)); + GE_CHK_RT_RET(rtGetFunctionByName(stub_name_.c_str(), &stub_func_)); args_size_ = kernel_def.args_size(); block_dim_ = kernel_def.block_dim(); diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.h 
b/ge/hybrid/node_executor/aicore/aicore_op_task.h index 0447ade7..5818f384 100755 --- a/ge/hybrid/node_executor/aicore/aicore_op_task.h +++ b/ge/hybrid/node_executor/aicore/aicore_op_task.h @@ -62,6 +62,7 @@ class AiCoreOpTask { static Status ValidateTaskDef(const domi::TaskDef &task_def); Status InitWithTaskDef(const OpDesc &node, const domi::TaskDef &task_def); Status InitTilingInfo(const OpDesc &op_desc); + Status RegisterTbeHandle(const OpDesc &op_desc); std::string stub_name_; void *stub_func_ = nullptr; diff --git a/ge/hybrid/node_executor/aicore/aicore_task_compiler.h b/ge/hybrid/node_executor/aicore/aicore_task_compiler.h index bf948349..b6dfd82b 100755 --- a/ge/hybrid/node_executor/aicore/aicore_task_compiler.h +++ b/ge/hybrid/node_executor/aicore/aicore_task_compiler.h @@ -26,7 +26,7 @@ namespace hybrid { class AiCoreTaskCompiler : public TaskCompiler { public: AiCoreTaskCompiler() = default; - ~AiCoreTaskCompiler() = default; + ~AiCoreTaskCompiler() override = default; Status CompileOp(const NodePtr &node, std::vector &tasks) override; Status Initialize() override; diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h index b984cc86..1205b190 100644 --- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h +++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h @@ -37,6 +37,8 @@ class AicpuNodeTaskBase : public NodeTask { ~AicpuNodeTaskBase() override = default; + using NodeTask::Init; + virtual Status Init(const HybridModel &model) = 0; Status UpdateArgs(TaskContext &context) override; diff --git a/ge/hybrid/node_executor/controlop/control_op_executor.h b/ge/hybrid/node_executor/controlop/control_op_executor.h index 7520afd1..3becfaaa 100644 --- a/ge/hybrid/node_executor/controlop/control_op_executor.h +++ b/ge/hybrid/node_executor/controlop/control_op_executor.h @@ -25,6 +25,7 @@ namespace ge { namespace hybrid { class ControlOpNodeTask : public NodeTask { public: + using 
NodeTask::Init; virtual Status Init(const NodePtr &node, const HybridModel &model) = 0; Status UpdateArgs(TaskContext &context) override; diff --git a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc index 7a83641d..a52e5670 100755 --- a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc +++ b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc @@ -68,7 +68,7 @@ Status RefInputTask::RefOneByOne(TaskContext &context) { node_name_.c_str(), node_type_.c_str(), output_num, input_num); return INTERNAL_ERROR; } - for (uint32_t out_index = 0; out_index < output_num; ++out_index) { + for (uint32_t out_index = 0; out_index < static_cast(output_num); ++out_index) { auto input = context.GetInput(out_index); GE_CHECK_NOTNULL(input); GE_CHK_STATUS_RET(context.SetOutput(out_index, *input)); diff --git a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc index 3bf71013..01fd391d 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc @@ -20,7 +20,6 @@ #include "hybrid/node_executor/host_cpu/kernel_factory.h" namespace { -const size_t kAssignInputNum = 2; const size_t kAssignRefInputIndex = 0; const size_t kAssignValueInputIndex = 1; const size_t kAssignRefOutputIndex = 0; diff --git a/ge/hybrid/node_executor/node_executor.cc b/ge/hybrid/node_executor/node_executor.cc index e577f09b..95e50c31 100755 --- a/ge/hybrid/node_executor/node_executor.cc +++ b/ge/hybrid/node_executor/node_executor.cc @@ -34,7 +34,6 @@ const char *const kEngineNameAiCpuTf = "aicpu_tf_kernel"; const char *const kEngineNameHccl = "ops_kernel_info_hccl"; const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE"; const char *const kEngineNameHostCpu = "DNN_VM_HOST_CPU_OP_STORE"; -const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown"; } Status NodeExecutor::PrepareTask(NodeTask 
&task, TaskContext &context) const { GE_CHK_STATUS_RET_NOLOG(context.AllocateOutputs()); diff --git a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h index 9ea544a1..73873002 100644 --- a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h +++ b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h @@ -41,7 +41,6 @@ class PartitionedCallNodeTask : public NodeTask { const GraphItem *graph_item_; std::unique_ptr subgraph_executor_; - GraphExecutionContext *context_ = nullptr; }; class PartitionedCallNodeExecutor : public NodeExecutor { diff --git a/ge/hybrid/node_executor/task_context.h b/ge/hybrid/node_executor/task_context.h index 2cff0536..0549a1dc 100644 --- a/ge/hybrid/node_executor/task_context.h +++ b/ge/hybrid/node_executor/task_context.h @@ -29,7 +29,7 @@ namespace ge { namespace hybrid { -class GraphExecutionContext; +struct GraphExecutionContext; class SubgraphContext; class TaskContext { diff --git a/inc/framework/executor/ge_executor.h b/inc/framework/executor/ge_executor.h index 17dbf928..5a73126f 100644 --- a/inc/framework/executor/ge_executor.h +++ b/inc/framework/executor/ge_executor.h @@ -234,6 +234,22 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { ge::Status ExecModel(uint32_t model_id, void *stream, const ge::RunModelData &input_data, ge::RunModelData &output_data, bool async_mode = false); + /// + /// @ingroup ge + /// @brief Synchronous execution of offline model (does not create a thread) + /// @param [in] uint32_t model_id: Model ID to execute + /// @param [in] void* stream: stream to execute + /// @param [in] bool async_mode: whether to execute in asynchronous mode
+ /// @param [in] const ge::RunModelData &run_input_data: Model input data + /// @param [in] const std::vector &input_desc: description of model input data + /// @param [out] ge::RunModelData &run_output_data: Model output data + /// @param [out] std::vector &output_desc: description of model output data + /// @return SUCCESS handle successfully / others handle failed + /// + ge::Status ExecModel(uint32_t model_id, void *stream, const ge::RunModelData &run_input_data, + const std::vector &input_desc, ge::RunModelData &run_output_data, + std::vector &output_desc, bool async_mode = false); + /// /// @ingroup ge /// @brief Get weight memory size from model file