| @@ -302,6 +302,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin | |||||
| } | } | ||||
| data.append(" model_id:").append(std::to_string(model_id)); | data.append(" model_id:").append(std::to_string(model_id)); | ||||
| data.append(" task_id:").append(std::to_string(graph.task_id)); | |||||
| data.append(" stream_id:").append(std::to_string(graph.stream_id)); | |||||
| data.append("\n"); | data.append("\n"); | ||||
| GraphDescReport(device_id, data); | GraphDescReport(device_id, data); | ||||
| @@ -480,6 +480,9 @@ REGISTER_OPTYPE_DEFINE(HVDWAIT, "HorovodWait"); | |||||
| // aicpu op for online_infer dynamic_dims | // aicpu op for online_infer dynamic_dims | ||||
| REGISTER_OPTYPE_DEFINE(GETDYNAMICDIMS, "GetDynamicDims"); | REGISTER_OPTYPE_DEFINE(GETDYNAMICDIMS, "GetDynamicDims"); | ||||
| // profiling training trace node | |||||
| REGISTER_OPTYPE_DEFINE(PROFILINGTRAININGTRACE, "ProfilingTrainingTrace"); | |||||
| const std::string MODEL_ATTR_TASKS = "tasks"; | const std::string MODEL_ATTR_TASKS = "tasks"; | ||||
| const std::string MODEL_ATTR_TASK_GEN_BASE_ADDR = "task_gen_base_addr"; | const std::string MODEL_ATTR_TASK_GEN_BASE_ADDR = "task_gen_base_addr"; | ||||
| const std::string MODEL_ATTR_TASK_GEN_WEIGHT_ADDR = "task_gen_weight_addr"; | const std::string MODEL_ATTR_TASK_GEN_WEIGHT_ADDR = "task_gen_weight_addr"; | ||||
| @@ -421,6 +421,52 @@ static Status GenerateTaskForConstant(const std::shared_ptr<ComputeGraph> &graph | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) { | |||||
| bool original_unknown_shape_flag = com_graph->GetGraphUnknownFlag(); | |||||
| com_graph->SetGraphUnknownFlag(false); | |||||
| GELOGD("Start to mark profiling task attr for fp and bp."); | |||||
| TaskGenerator task_generator; | |||||
| ProfilingPoint profiling_point; | |||||
| std::vector<uint32_t> all_reduce_node_index; | |||||
| Status ret = task_generator.FindProfilingNodeIndex(com_graph, profiling_point, all_reduce_node_index); | |||||
| com_graph->SetGraphUnknownFlag(original_unknown_shape_flag); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGW("Find profiling node index failed."); | |||||
| } | |||||
| if (profiling_point.fp_index == 0 || profiling_point.bp_index == 0 || profiling_point.end_index.empty()) { | |||||
| GELOGD("No need to mark fp bp profiling task attr."); | |||||
| return SUCCESS; | |||||
| } | |||||
| // mark profiling task attr for node | |||||
| uint32_t node_index = 0; | |||||
| for (const auto &node : com_graph->GetAllNodes()) { | |||||
| OpDescPtr op_desc = node->GetOpDesc(); | |||||
| GE_CHECK_NOTNULL(node->GetOpDesc()); | |||||
| node_index++; | |||||
| if (profiling_point.fp_index == node_index) { | |||||
| GELOGI("The first fp node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index); | |||||
| (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, true); | |||||
| } | |||||
| if (profiling_point.bp_index == node_index) { | |||||
| GELOGI("The bp node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index); | |||||
| (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true); | |||||
| } | |||||
| for (size_t i = 0; i < all_reduce_node_index.size(); i++) { | |||||
| if (all_reduce_node_index[i] == node_index) { | |||||
| GELOGI("The all reduce node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index); | |||||
| (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true); | |||||
| continue; | |||||
| } | |||||
| } | |||||
| if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) { | |||||
| GELOGI("The end node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index); | |||||
| (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, true); | |||||
| } | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, | Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, | ||||
| std::vector<SubGraphInfoPtr> &subgraph_ptr_list, | std::vector<SubGraphInfoPtr> &subgraph_ptr_list, | ||||
| GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr, | GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr, | ||||
| @@ -437,6 +483,12 @@ Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, | |||||
| } | } | ||||
| } | } | ||||
| // Set fp bp profiling task attr for graph | |||||
| if (MarkFpBpProfilingTaskAttr(comp_graph) != SUCCESS) { | |||||
| GELOGE(FAILED, "Set fp bp profiling task attr for graph."); | |||||
| return FAILED; | |||||
| } | |||||
| auto all_graphs = comp_graph->GetAllSubgraphs(); | auto all_graphs = comp_graph->GetAllSubgraphs(); | ||||
| if (all_graphs.empty()) { | if (all_graphs.empty()) { | ||||
| all_graphs.push_back(comp_graph); | all_graphs.push_back(comp_graph); | ||||
| @@ -60,6 +60,7 @@ class GraphBuilder { | |||||
| Status UpdateParentNodeOutputSize(const ge::ComputeGraphPtr &graph, ge::NodePtr &parent_node_ptr); | Status UpdateParentNodeOutputSize(const ge::ComputeGraphPtr &graph, ge::NodePtr &parent_node_ptr); | ||||
| Status CalcDynShapeRootGraphDataSize(const ge::OpDescPtr &op_desc); | Status CalcDynShapeRootGraphDataSize(const ge::OpDescPtr &op_desc); | ||||
| Status SecondPartition(ge::ComputeGraphPtr &comp_graph, vector<ge::SubGraphInfoPtr> &subgraph_ptr_list); | Status SecondPartition(ge::ComputeGraphPtr &comp_graph, vector<ge::SubGraphInfoPtr> &subgraph_ptr_list); | ||||
| Status MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph); | |||||
| Status BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list, | Status BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list, | ||||
| GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr, | GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr, | ||||
| uint64_t session_id = INVALID_SESSION_ID); | uint64_t session_id = INVALID_SESSION_ID); | ||||
| @@ -274,6 +274,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
| }; | }; | ||||
| GE_MAKE_GUARD(release, callback); | GE_MAKE_GUARD(release, callback); | ||||
| uint64_t all_reduce_node_idx = 0; | |||||
| for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | ||||
| OpDescPtr op_desc = node->GetOpDesc(); | OpDescPtr op_desc = node->GetOpDesc(); | ||||
| GE_CHECK_NOTNULL(op_desc); | GE_CHECK_NOTNULL(op_desc); | ||||
| @@ -292,7 +293,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
| // Part2: Call | // Part2: Call | ||||
| auto fusion_task_info = | auto fusion_task_info = | ||||
| FusionTaskInfo{run_context, graph, node, op_desc, node_index, ge_lib, | FusionTaskInfo{run_context, graph, node, op_desc, node_index, ge_lib, | ||||
| ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes}; | |||||
| ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes, all_reduce_node_idx}; | |||||
| GE_CHK_STATUS_RET(GenerateTaskForFusionNode(fusion_task_info, fusion_nodes, fusion_nodes_seen), | GE_CHK_STATUS_RET(GenerateTaskForFusionNode(fusion_task_info, fusion_nodes, fusion_nodes_seen), | ||||
| "Call GenerateTaskForFusionNode node:%s(%s) failed", name.c_str(), type.c_str()); | "Call GenerateTaskForFusionNode node:%s(%s) failed", name.c_str(), type.c_str()); | ||||
| // continue directly | // continue directly | ||||
| @@ -316,7 +317,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
| type.c_str()); | type.c_str()); | ||||
| // Profiling task | // Profiling task | ||||
| size_t task_list_size_before = task_def_list.size(); | size_t task_list_size_before = task_def_list.size(); | ||||
| GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list)); | |||||
| GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, | |||||
| node_index, task_def_list, all_reduce_node_idx)); | |||||
| int64_t op_id = op_desc->GetId(); | int64_t op_id = op_desc->GetId(); | ||||
| // Compatible with dynamic shape scenes, the default is 0 | // Compatible with dynamic shape scenes, the default is 0 | ||||
| int64_t stream_id = 0; | int64_t stream_id = 0; | ||||
| @@ -336,8 +338,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| // Profiling task | // Profiling task | ||||
| GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list)); | |||||
| GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, | |||||
| node_index, task_def_list, all_reduce_node_idx)); | |||||
| size_t task_list_size_after = task_def_list.size(); | size_t task_list_size_after = task_def_list.size(); | ||||
| // If tasks is reduced | // If tasks is reduced | ||||
| if (task_list_size_after < task_list_size_before) { | if (task_list_size_after < task_list_size_before) { | ||||
| @@ -380,6 +382,7 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||||
| auto &op_name_map = fusion_task_info.op_name_map; | auto &op_name_map = fusion_task_info.op_name_map; | ||||
| auto &profiling_point = fusion_task_info.profiling_point; | auto &profiling_point = fusion_task_info.profiling_point; | ||||
| auto &all_reduce_nodes = fusion_task_info.all_reduce_nodes; | auto &all_reduce_nodes = fusion_task_info.all_reduce_nodes; | ||||
| auto &all_reduce_idx = fusion_task_info.all_reduce_node_idx; | |||||
| // If op_desc have this attr, call nodes with same group key in a stream together | // If op_desc have this attr, call nodes with same group key in a stream together | ||||
| if (ge::AttrUtils::GetInt(fusion_op_desc, ATTR_NAME_FUSION_GROUP_KEY, group_key) && | if (ge::AttrUtils::GetInt(fusion_op_desc, ATTR_NAME_FUSION_GROUP_KEY, group_key) && | ||||
| (fusion_nodes_seen.count(node.get()) == 0)) { | (fusion_nodes_seen.count(node.get()) == 0)) { | ||||
| @@ -426,7 +429,8 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||||
| return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
| } | } | ||||
| // profiling task | // profiling task | ||||
| (void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list); | |||||
| (void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, | |||||
| node_index, task_def_list, all_reduce_idx); | |||||
| run_context.stream = run_context.graphStreamList[stream_id]; | run_context.stream = run_context.graphStreamList[stream_id]; | ||||
| GELOGI("Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.", | GELOGI("Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.", | ||||
| op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); | op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); | ||||
| @@ -439,7 +443,8 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| // profiling task | // profiling task | ||||
| (void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list); | |||||
| (void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, | |||||
| node_index, task_def_list, all_reduce_idx); | |||||
| size_t task_list_size_after = task_def_list.size(); | size_t task_list_size_after = task_def_list.size(); | ||||
| // if tasks is reduced | // if tasks is reduced | ||||
| if (task_list_size_after < task_list_size_before) { | if (task_list_size_after < task_list_size_before) { | ||||
| @@ -830,6 +835,11 @@ Status TaskGenerator::GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status TaskGenerator::FindProfilingNodeIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, | |||||
| std::vector<uint32_t> &all_reduce_nodes) { | |||||
| return FindProfilingTaskIndex(graph, profiling_point, all_reduce_nodes); | |||||
| } | |||||
| Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, | Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, | ||||
| vector<uint32_t> &all_reduce_nodes) const { | vector<uint32_t> &all_reduce_nodes) const { | ||||
| GE_CHECK_NOTNULL(graph); | GE_CHECK_NOTNULL(graph); | ||||
| @@ -840,7 +850,6 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi | |||||
| GELOGD("Profiling is not open."); | GELOGD("Profiling is not open."); | ||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| GELOGI("Start get FP/BP index."); | GELOGI("Start get FP/BP index."); | ||||
| std::string fp_point_str; | std::string fp_point_str; | ||||
| std::string bp_point_str; | std::string bp_point_str; | ||||
| @@ -878,18 +887,27 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | ||||
| vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | ||||
| vector<domi::TaskDef> &task_def_list) { | |||||
| vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx) { | |||||
| const char *profiling_mode = std::getenv(kProfilingMode); | const char *profiling_mode = std::getenv(kProfilingMode); | ||||
| bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | ||||
| ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ||||
| if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) || | |||||
| (profiling_point.end_index.empty())) { | |||||
| bool is_insert_fp_profiling_task = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task); | |||||
| bool is_insert_bp_profiling_task = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | |||||
| bool no_insert_profiling_task = ((profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) || | |||||
| (profiling_point.end_index.empty())) && | |||||
| (!(is_insert_fp_profiling_task || is_insert_bp_profiling_task)); | |||||
| if (!is_profiling || no_insert_profiling_task) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| if (profiling_point.fp_index == node_index) { | |||||
| GELOGD("Insert fp profiling task: %d, insert bp profiling task: %d, fp index: %u, bp index: %u, end index size: %zu", | |||||
| is_insert_fp_profiling_task, is_insert_bp_profiling_task, profiling_point.fp_index, profiling_point.bp_index, | |||||
| profiling_point.end_index.size()); | |||||
| if ((profiling_point.fp_index == node_index) || is_insert_fp_profiling_task) { | |||||
| uint64_t jobid_log_id = ge::GetContext().TraceId(); | uint64_t jobid_log_id = ge::GetContext().TraceId(); | ||||
| GELOGI("The first FP operator is %s, idx %u, job_id %lu", op_desc->GetName().c_str(), node_index, jobid_log_id); | GELOGI("The first FP operator is %s, idx %u, job_id %lu", op_desc->GetName().c_str(), node_index, jobid_log_id); | ||||
| @@ -913,22 +931,40 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const | |||||
| task_def_list.emplace_back(fp_task_def); | task_def_list.emplace_back(fp_task_def); | ||||
| } | } | ||||
| for (size_t i = 0; i < all_reduce_nodes.size(); i++) { | |||||
| if (all_reduce_nodes[i] != node_index) { | |||||
| continue; | |||||
| bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | |||||
| uint64_t all_reduce_task_idx = 0; | |||||
| bool is_insert_all_reduce_task = false; | |||||
| if (is_all_reduce && is_insert_bp_profiling_task) { | |||||
| all_reduce_task_idx = all_reduce_node_idx; | |||||
| is_insert_all_reduce_task = true; | |||||
| } | |||||
| if (is_all_reduce) { | |||||
| all_reduce_node_idx++; | |||||
| } | |||||
| if (!is_insert_all_reduce_task) { | |||||
| for (size_t i = 0; i < all_reduce_nodes.size(); i++) { | |||||
| if (all_reduce_nodes[i] == node_index) { | |||||
| all_reduce_task_idx = i; | |||||
| is_insert_all_reduce_task = true; | |||||
| break; | |||||
| } | |||||
| } | } | ||||
| } | |||||
| if (is_insert_all_reduce_task) { | |||||
| GELOGI("The start allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | GELOGI("The start allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | ||||
| TaskDef ar_task_def; | TaskDef ar_task_def; | ||||
| ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | ||||
| ar_task_def.set_stream_id(op_desc->GetStreamId()); | ar_task_def.set_stream_id(op_desc->GetStreamId()); | ||||
| LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp(); | LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp(); | ||||
| if (ar_log_def != nullptr) { | if (ar_log_def != nullptr) { | ||||
| GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep), | |||||
| GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep), | |||||
| GELOGE(FAILED, "Multiply result is out of range."); | GELOGE(FAILED, "Multiply result is out of range."); | ||||
| return FAILED); | return FAILED); | ||||
| auto log_id = i * kProfilingArStep + kProfilingArStartLogid; | |||||
| auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArStartLogid; | |||||
| ar_log_def->set_logid(log_id); | ar_log_def->set_logid(log_id); | ||||
| ar_log_def->set_notify(false); | ar_log_def->set_notify(false); | ||||
| (void)ge::AttrUtils::SetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | |||||
| } | } | ||||
| task_def_list.push_back(ar_task_def); | task_def_list.push_back(ar_task_def); | ||||
| } | } | ||||
| @@ -937,16 +973,27 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const | |||||
| Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | ||||
| vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | ||||
| vector<domi::TaskDef> &task_def_list) { | |||||
| vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx) { | |||||
| GE_CHECK_NOTNULL(op_desc); | GE_CHECK_NOTNULL(op_desc); | ||||
| const char *profiling_mode = std::getenv(kProfilingMode); | const char *profiling_mode = std::getenv(kProfilingMode); | ||||
| bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | ||||
| ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ||||
| if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) || | |||||
| (profiling_point.end_index.empty())) { | |||||
| bool is_insert_bp_profiling_task = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | |||||
| bool is_insert_end_profiling_task = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, is_insert_end_profiling_task); | |||||
| bool no_insert_profiling_task = ((profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) || | |||||
| (profiling_point.end_index.empty())) && | |||||
| (!(is_insert_bp_profiling_task || is_insert_end_profiling_task)); | |||||
| if (!is_profiling || no_insert_profiling_task) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| if (profiling_point.bp_index == node_index) { | |||||
| GELOGD("Insert bp profiling task: %d, insert end profiling task: %d, fp index: %u, bp index: %u, end index size: %zu", | |||||
| is_insert_bp_profiling_task, is_insert_end_profiling_task, profiling_point.fp_index, profiling_point.bp_index, | |||||
| profiling_point.end_index.size() ); | |||||
| bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | |||||
| if ((profiling_point.bp_index == node_index) || (!is_all_reduce && is_insert_bp_profiling_task)) { | |||||
| GELOGI("The last BP operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | GELOGI("The last BP operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | ||||
| TaskDef bp_task_def; | TaskDef bp_task_def; | ||||
| bp_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | bp_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | ||||
| @@ -957,7 +1004,9 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P | |||||
| bp_log_def->set_notify(false); | bp_log_def->set_notify(false); | ||||
| task_def_list.emplace_back(bp_task_def); | task_def_list.emplace_back(bp_task_def); | ||||
| } | } | ||||
| if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) { | |||||
| if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end() || | |||||
| is_insert_end_profiling_task) { | |||||
| GELOGI("The iteration end operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | GELOGI("The iteration end operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | ||||
| TaskDef end_task_def; | TaskDef end_task_def; | ||||
| end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | ||||
| @@ -969,20 +1018,32 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P | |||||
| task_def_list.emplace_back(end_task_def); | task_def_list.emplace_back(end_task_def); | ||||
| } | } | ||||
| uint32_t all_reduce_task_idx = 0; | |||||
| bool is_insert_all_reduce_task = false; | |||||
| if (is_all_reduce && is_insert_bp_profiling_task) { | |||||
| all_reduce_task_idx = all_reduce_node_idx; | |||||
| is_insert_all_reduce_task = true; | |||||
| } | |||||
| for (size_t i = 0; i < all_reduce_nodes.size(); i++) { | for (size_t i = 0; i < all_reduce_nodes.size(); i++) { | ||||
| if (all_reduce_nodes[i] != node_index) { | |||||
| continue; | |||||
| if (all_reduce_nodes[i] == node_index) { | |||||
| all_reduce_task_idx = i; | |||||
| is_insert_all_reduce_task = true; | |||||
| break; | |||||
| } | } | ||||
| } | |||||
| if (is_insert_all_reduce_task) { | |||||
| GELOGI("The end allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | GELOGI("The end allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | ||||
| TaskDef ar_task_def; | TaskDef ar_task_def; | ||||
| ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | ||||
| ar_task_def.set_stream_id(op_desc->GetStreamId()); | ar_task_def.set_stream_id(op_desc->GetStreamId()); | ||||
| LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp(); | LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp(); | ||||
| GE_CHECK_NOTNULL(ar_log_def); | GE_CHECK_NOTNULL(ar_log_def); | ||||
| GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep), | |||||
| GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep), | |||||
| GELOGE(FAILED, "Multiply result is out of range."); | GELOGE(FAILED, "Multiply result is out of range."); | ||||
| return FAILED); | return FAILED); | ||||
| auto log_id = i * kProfilingArStep + kProfilingArEndLogid; | |||||
| auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArEndLogid; | |||||
| ar_log_def->set_logid(log_id); | ar_log_def->set_logid(log_id); | ||||
| ar_log_def->set_notify(false); | ar_log_def->set_notify(false); | ||||
| task_def_list.emplace_back(ar_task_def); | task_def_list.emplace_back(ar_task_def); | ||||
| @@ -51,6 +51,7 @@ struct FusionTaskInfo { | |||||
| std::map<uint32_t, string> &op_name_map; | std::map<uint32_t, string> &op_name_map; | ||||
| ProfilingPoint &profiling_point; | ProfilingPoint &profiling_point; | ||||
| vector<uint32_t> all_reduce_nodes; | vector<uint32_t> all_reduce_nodes; | ||||
| uint64_t all_reduce_node_idx; | |||||
| }; | }; | ||||
| class TaskGenerator { | class TaskGenerator { | ||||
| @@ -76,6 +77,8 @@ class TaskGenerator { | |||||
| /// | /// | ||||
| Status GetTaskInfo(Model &model, ComputeGraphPtr &graph, uint64_t session_id, RunContext &run_context); | Status GetTaskInfo(Model &model, ComputeGraphPtr &graph, uint64_t session_id, RunContext &run_context); | ||||
| Status FindProfilingNodeIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point, | |||||
| std::vector<uint32_t> &all_reduce_nodes); | |||||
| private: | private: | ||||
| Status UpdateAnchorStatus(const NodePtr &node); | Status UpdateAnchorStatus(const NodePtr &node); | ||||
| @@ -126,10 +129,10 @@ class TaskGenerator { | |||||
| std::vector<uint32_t> &all_reduce_nodes) const; | std::vector<uint32_t> &all_reduce_nodes) const; | ||||
| Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | ||||
| std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | ||||
| std::vector<domi::TaskDef> &task_def_list); | |||||
| std::vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx); | |||||
| Status InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | Status InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | ||||
| std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | ||||
| std::vector<domi::TaskDef> &task_def_list); | |||||
| std::vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx); | |||||
| static bool IsProfPoint(const OpDescPtr &op, const std::string &name); | static bool IsProfPoint(const OpDescPtr &op, const std::string &name); | ||||
| @@ -3113,6 +3113,8 @@ Status DavinciModel::DistributeTask() { | |||||
| task_desc_info.stream_id = task->GetStreamId(); | task_desc_info.stream_id = task->GetStreamId(); | ||||
| task_desc_info.shape_type = "static"; | task_desc_info.shape_type = "static"; | ||||
| task_desc_info.cur_iter_num = 0; | task_desc_info.cur_iter_num = 0; | ||||
| profiler_report_op_info_[task_desc_info.op_name] = | |||||
| std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id); | |||||
| task_desc_info_.emplace_back(task_desc_info); | task_desc_info_.emplace_back(task_desc_info); | ||||
| if (flag) { | if (flag) { | ||||
| if (task->GetSktTaskID() != 0xFFFFFFFF) { | if (task->GetSktTaskID() != 0xFFFFFFFF) { | ||||
| @@ -3120,6 +3122,8 @@ Status DavinciModel::DistributeTask() { | |||||
| string op_name = "super_kernel_" + to_string(task_index); | string op_name = "super_kernel_" + to_string(task_index); | ||||
| task_desc_info.op_name = op_name; | task_desc_info.op_name = op_name; | ||||
| task_desc_info.task_id = task->GetSktTaskID(); | task_desc_info.task_id = task->GetSktTaskID(); | ||||
| profiler_report_op_info_[task_desc_info.op_name] = | |||||
| std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id); | |||||
| task_desc_info_.emplace_back(task_desc_info); | task_desc_info_.emplace_back(task_desc_info); | ||||
| } | } | ||||
| } | } | ||||
| @@ -3991,7 +3995,15 @@ Status DavinciModel::GetComputeGraphInfo(vector<ComputeGraphDescInfo> &graph_des | |||||
| compute_graph_info.output_format = op_desc.output_format; | compute_graph_info.output_format = op_desc.output_format; | ||||
| compute_graph_info.output_shape = op_desc.output_shape; | compute_graph_info.output_shape = op_desc.output_shape; | ||||
| compute_graph_info.output_data_type = op_desc.output_data_type; | compute_graph_info.output_data_type = op_desc.output_data_type; | ||||
| uint32_t task_id = 0; | |||||
| uint32_t stream_id = 0; | |||||
| auto iter = profiler_report_op_info_.find(op_desc.op_name); | |||||
| if (iter != profiler_report_op_info_.end()) { | |||||
| task_id = iter->second.first; | |||||
| stream_id = iter->second.second; | |||||
| } | |||||
| compute_graph_info.task_id = task_id; | |||||
| compute_graph_info.stream_id = stream_id; | |||||
| graph_desc_info.emplace_back(compute_graph_info); | graph_desc_info.emplace_back(compute_graph_info); | ||||
| } | } | ||||
| return SUCCESS; | return SUCCESS; | ||||
| @@ -978,6 +978,8 @@ class DavinciModel { | |||||
| // for profiling task and graph info | // for profiling task and graph info | ||||
| vector<TaskDescInfo> task_desc_info_; | vector<TaskDescInfo> task_desc_info_; | ||||
| std::map<std::string, std::pair<uint32_t, uint32_t>> profiler_report_op_info_; | |||||
| int64_t maxDumpOpNum_; | int64_t maxDumpOpNum_; | ||||
| // for data dump | // for data dump | ||||
| DataDumper data_dumper_; | DataDumper data_dumper_; | ||||
| @@ -221,6 +221,8 @@ Status NodeDoneCallback::GetGraphDescInfo(const NodePtr node, const HybridModel | |||||
| tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); | tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); | ||||
| tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType()); | tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType()); | ||||
| } | } | ||||
| tmp_compute_graph_info.task_id = context_->GetTaskId(); | |||||
| tmp_compute_graph_info.stream_id = context_->GetStreamId(); | |||||
| compute_graph_info.emplace_back(tmp_compute_graph_info); | compute_graph_info.emplace_back(tmp_compute_graph_info); | ||||
| GELOGD("GetComputeGraphInfo of node [%s] end.", node->GetName().c_str()); | GELOGD("GetComputeGraphInfo of node [%s] end.", node->GetName().c_str()); | ||||
| } | } | ||||
| @@ -35,11 +35,22 @@ | |||||
| namespace ge { | namespace ge { | ||||
| namespace hybrid { | namespace hybrid { | ||||
| using domi::LogTimeStampDef; | |||||
| using domi::TaskDef; | |||||
| namespace { | namespace { | ||||
| const uint32_t kSubgraphIndex = 0U; | const uint32_t kSubgraphIndex = 0U; | ||||
| const uint32_t kVarOutputIndex = 0U; | const uint32_t kVarOutputIndex = 0U; | ||||
| const uint64_t kProfilingFpStartLogid = 1U; | |||||
| const uint64_t kProfilingBpEndLogid = 2U; | |||||
| const uint64_t kProfilingIterEndLogid = 65535U; | |||||
| const int kBytes = 8; | const int kBytes = 8; | ||||
| const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown"; | const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown"; | ||||
| const char *const kProfilingGraph = "ProfilingGraph"; | |||||
| const char *const kProfilingFpNode = "ProfilingFpNode"; | |||||
| const char *const kProfilingBpNode = "ProfilingBpNode"; | |||||
| const char *const kProfilingEndNode = "ProfilingEndNode"; | |||||
| const char *const kProfilingArNode = "ProfilingAllReduceNode"; | |||||
| const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE"; | |||||
| Status SetOutputNameAttr(ComputeGraph &graph) { | Status SetOutputNameAttr(ComputeGraph &graph) { | ||||
| vector<string> output_names; | vector<string> output_names; | ||||
| @@ -1531,6 +1542,188 @@ Status HybridModelBuilder::RecoverGraphUnknownFlag() { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status HybridModelBuilder::GenerateFpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) { | |||||
| uint64_t jobid_log_id = ge::GetContext().TraceId(); | |||||
| GELOGD("The first FP operator is %s,, job_id %lu", op_desc->GetName().c_str(), jobid_log_id); | |||||
| TaskDef job_task_def; | |||||
| job_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | |||||
| job_task_def.set_stream_id(op_desc->GetStreamId()); | |||||
| LogTimeStampDef *job_log_def = job_task_def.mutable_log_timestamp(); | |||||
| if (job_log_def != nullptr) { | |||||
| job_log_def->set_logid(jobid_log_id); | |||||
| job_log_def->set_notify(false); | |||||
| } | |||||
| task_def_list.emplace_back(job_task_def); | |||||
| TaskDef fp_task_def; | |||||
| fp_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | |||||
| fp_task_def.set_stream_id(op_desc->GetStreamId()); | |||||
| LogTimeStampDef *fp_log_def = fp_task_def.mutable_log_timestamp(); | |||||
| if (fp_log_def != nullptr) { | |||||
| fp_log_def->set_logid(kProfilingFpStartLogid); | |||||
| fp_log_def->set_notify(false); | |||||
| } | |||||
| task_def_list.emplace_back(fp_task_def); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status HybridModelBuilder::GenerateArProfilingTask(const OpDescPtr &op_desc, int64_t log_id, | |||||
| vector<domi::TaskDef> &task_def_list) { | |||||
| TaskDef ar_task_def; | |||||
| ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | |||||
| ar_task_def.set_stream_id(op_desc->GetStreamId()); | |||||
| LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp(); | |||||
| if (ar_log_def != nullptr) { | |||||
| ar_log_def->set_logid(log_id); | |||||
| ar_log_def->set_notify(false); | |||||
| } | |||||
| task_def_list.emplace_back(ar_task_def); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status HybridModelBuilder::GenerateBpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) { | |||||
| TaskDef bp_task_def; | |||||
| bp_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | |||||
| bp_task_def.set_stream_id(op_desc->GetStreamId()); | |||||
| LogTimeStampDef *bp_log_def = bp_task_def.mutable_log_timestamp(); | |||||
| GE_CHECK_NOTNULL(bp_log_def); | |||||
| bp_log_def->set_logid(kProfilingBpEndLogid); | |||||
| bp_log_def->set_notify(false); | |||||
| task_def_list.emplace_back(bp_task_def); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status HybridModelBuilder::GenerateEndProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) { | |||||
| TaskDef end_task_def; | |||||
| end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | |||||
| end_task_def.set_stream_id(op_desc->GetStreamId()); | |||||
| LogTimeStampDef *end_log_def = end_task_def.mutable_log_timestamp(); | |||||
| GE_CHECK_NOTNULL(end_log_def); | |||||
| end_log_def->set_logid(kProfilingIterEndLogid); | |||||
| end_log_def->set_notify(true); | |||||
| task_def_list.emplace_back(end_task_def); | |||||
| return SUCCESS; | |||||
| } | |||||
| Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, const NodePtr &node) { | |||||
| GE_CHECK_NOTNULL(node); | |||||
| const OpDescPtr &op_desc = node->GetOpDesc(); | |||||
| GE_CHECK_NOTNULL(op_desc); | |||||
| const auto &compute_graph = MakeShared<ComputeGraph>(kProfilingGraph); | |||||
| GE_CHECK_NOTNULL(compute_graph); | |||||
| NodePtr node_ptr = nullptr; | |||||
| vector<domi::TaskDef> task_def_list; | |||||
| // create fp node | |||||
| bool is_insert_fp_profiling_task = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task); | |||||
| if (is_insert_fp_profiling_task) { | |||||
| (void)GenerateFpProfilingTask(op_desc, task_def_list); | |||||
| auto fp_desc = MakeShared<OpDesc>(kProfilingFpNode, PROFILINGTRAININGTRACE); | |||||
| GE_CHECK_NOTNULL(fp_desc); | |||||
| fp_desc->SetOpKernelLibName(kEngineNameRts); | |||||
| node_ptr = compute_graph->AddNode(fp_desc); | |||||
| GELOGD("Create fp profiling node success before."); | |||||
| } | |||||
| // creat all reduce start node | |||||
| bool is_insert_bp_profiling_task = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | |||||
| bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | |||||
| if (is_all_reduce && is_insert_bp_profiling_task) { | |||||
| int64_t log_id = 0; | |||||
| (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | |||||
| GELOGD("All reduce node profiling task log id: %ld before", log_id); | |||||
| (void) GenerateArProfilingTask(op_desc, log_id, task_def_list); | |||||
| string op_name = string(kProfilingArNode) + std::to_string(log_id); | |||||
| auto ar_desc_start = MakeShared<OpDesc>(op_name, PROFILINGTRAININGTRACE); | |||||
| GE_CHECK_NOTNULL(ar_desc_start); | |||||
| ar_desc_start->SetOpKernelLibName(kEngineNameRts); | |||||
| node_ptr = compute_graph->AddNode(ar_desc_start); | |||||
| GELOGD("Create all reduce start profiling node success before."); | |||||
| } | |||||
| if (node_ptr != nullptr) { | |||||
| for (const auto &task_def : task_def_list) { | |||||
| hybrid_model_.task_defs_[node_ptr].emplace_back(task_def); | |||||
| } | |||||
| NodeItem *node_item = nullptr; | |||||
| GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node_ptr, &node_item)); | |||||
| node_item->input_start = 0; | |||||
| node_item->output_start = 0; | |||||
| graph_item.node_items_.emplace_back(node_item); | |||||
| } else { | |||||
| GELOGD("No need to create profiling node before."); | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| Status HybridModelBuilder::CreateProfilingNodeAfter(GraphItem &graph_item, const NodePtr &node) { | |||||
| GE_CHECK_NOTNULL(node); | |||||
| const OpDescPtr &op_desc = node->GetOpDesc(); | |||||
| GE_CHECK_NOTNULL(op_desc); | |||||
| const auto &compute_graph = MakeShared<ComputeGraph>(kProfilingGraph); | |||||
| GE_CHECK_NOTNULL(compute_graph); | |||||
| NodePtr node_ptr = nullptr; | |||||
| vector<domi::TaskDef> task_def_list; | |||||
| // Create all reduce end node | |||||
| bool is_insert_bp_profiling_task = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | |||||
| bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | |||||
| if (is_all_reduce && is_insert_bp_profiling_task) { | |||||
| int64_t log_id = 0; | |||||
| (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | |||||
| GELOGD("All reduce node profiling task log id: %ld after", log_id); | |||||
| (void) GenerateArProfilingTask(op_desc, log_id + 1, task_def_list); | |||||
| string op_name = string(kProfilingArNode) + std::to_string(log_id + 1); | |||||
| auto ar_desc_end = MakeShared<OpDesc>(op_name, PROFILINGTRAININGTRACE); | |||||
| GE_CHECK_NOTNULL(ar_desc_end); | |||||
| ar_desc_end->SetOpKernelLibName(kEngineNameRts); | |||||
| node_ptr = compute_graph->AddNode(ar_desc_end); | |||||
| GELOGD("Create all reduce end profiling node success after."); | |||||
| } | |||||
| // create bp node | |||||
| if (!is_all_reduce && is_insert_bp_profiling_task) { | |||||
| (void) GenerateBpProfilingTask(op_desc, task_def_list); | |||||
| auto bp_op_desc = MakeShared<OpDesc>(kProfilingBpNode, PROFILINGTRAININGTRACE); | |||||
| GE_CHECK_NOTNULL(bp_op_desc); | |||||
| bp_op_desc->SetOpKernelLibName(kEngineNameRts); | |||||
| node_ptr = compute_graph->AddNode(bp_op_desc); | |||||
| GELOGD("Create bp profiling node success after."); | |||||
| } | |||||
| // create end node | |||||
| bool is_insert_end_profiling_task = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, is_insert_end_profiling_task); | |||||
| if (is_insert_end_profiling_task) { | |||||
| (void)GenerateEndProfilingTask(op_desc, task_def_list); | |||||
| auto end_desc = MakeShared<OpDesc>(kProfilingEndNode, PROFILINGTRAININGTRACE); | |||||
| GE_CHECK_NOTNULL(end_desc); | |||||
| end_desc->SetOpKernelLibName(kEngineNameRts); | |||||
| node_ptr = compute_graph->AddNode(end_desc); | |||||
| GELOGD("Create end profiling node success after."); | |||||
| } | |||||
| if (node_ptr != nullptr) { | |||||
| for (const auto &task_def : task_def_list) { | |||||
| hybrid_model_.task_defs_[node_ptr].emplace_back(task_def); | |||||
| } | |||||
| NodeItem *node_item = nullptr; | |||||
| GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node_ptr, &node_item)); | |||||
| node_item->input_start = 0; | |||||
| node_item->output_start = 0; | |||||
| graph_item.node_items_.emplace_back(node_item); | |||||
| } else { | |||||
| GELOGD("No need to create profiling node after."); | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root_graph) { | Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root_graph) { | ||||
| GELOGD("Start to load subgraph [%s]", graph.GetName().c_str()); | GELOGD("Start to load subgraph [%s]", graph.GetName().c_str()); | ||||
| // for known partitioned call, load all nodes | // for known partitioned call, load all nodes | ||||
| @@ -1567,8 +1760,9 @@ Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root | |||||
| graph_item->output_node_ = node_item; | graph_item->output_node_ = node_item; | ||||
| GE_CHK_STATUS_RET_NOLOG(BuildOutputMapping(*graph_item, *node_item, is_root_graph)); | GE_CHK_STATUS_RET_NOLOG(BuildOutputMapping(*graph_item, *node_item, is_root_graph)); | ||||
| } | } | ||||
| GE_CHK_STATUS_RET_NOLOG(CreateProfilingNodeBefore(*graph_item, node)); | |||||
| graph_item->node_items_.emplace_back(node_item); | graph_item->node_items_.emplace_back(node_item); | ||||
| GE_CHK_STATUS_RET_NOLOG(CreateProfilingNodeAfter(*graph_item, node)); | |||||
| // parse var outputs | // parse var outputs | ||||
| GE_CHK_STATUS_RET_NOLOG(ParseVarOutputs(*node_item)); | GE_CHK_STATUS_RET_NOLOG(ParseVarOutputs(*node_item)); | ||||
| GELOGD("NodeItem created: %s", node_item->DebugString().c_str()); | GELOGD("NodeItem created: %s", node_item->DebugString().c_str()); | ||||
| @@ -79,6 +79,12 @@ class HybridModelBuilder { | |||||
| Status LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem *parent_node_item); | Status LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem *parent_node_item); | ||||
| Status RecoverGraphUnknownFlag(); | Status RecoverGraphUnknownFlag(); | ||||
| Status CheckAicpuOpList(); | Status CheckAicpuOpList(); | ||||
| Status CreateProfilingNodeBefore(GraphItem &graph_item, const NodePtr &node); | |||||
| Status CreateProfilingNodeAfter(GraphItem &graph_item, const NodePtr &node); | |||||
| Status GenerateFpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list); | |||||
| Status GenerateBpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list); | |||||
| Status GenerateEndProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list); | |||||
| Status GenerateArProfilingTask(const OpDescPtr &op_desc, int64_t log_id, vector<domi::TaskDef> &task_def_list); | |||||
| const char* GetGraphName() const { | const char* GetGraphName() const { | ||||
| return hybrid_model_.model_name_.c_str(); | return hybrid_model_.model_name_.c_str(); | ||||
| @@ -18,6 +18,7 @@ | |||||
| #include "common/debug/log.h" | #include "common/debug/log.h" | ||||
| #include "common/ge/ge_util.h" | #include "common/ge/ge_util.h" | ||||
| #include "graph/utils/tensor_utils.h" | #include "graph/utils/tensor_utils.h" | ||||
| #include "hybrid/model/hybrid_model.h" | |||||
| #include "runtime/rt.h" | #include "runtime/rt.h" | ||||
| namespace ge { | namespace ge { | ||||
| @@ -79,12 +80,44 @@ Status IdentityNNodeTask::ExecuteAsync(TaskContext &context, std::function<void( | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| Status ProfilingTraceNodeTask::UpdateArgs(TaskContext &context) { | |||||
| return SUCCESS; | |||||
| } | |||||
| Status ProfilingTraceNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) { | |||||
| for (const auto &task_def : task_defs_) { | |||||
| auto log_time_stamp_def = task_def.log_timestamp(); | |||||
| uint64_t log_id = log_time_stamp_def.logid(); | |||||
| bool notify = log_time_stamp_def.notify(); | |||||
| uint32_t flat = log_time_stamp_def.flat(); | |||||
| GELOGD("ProfilingTraceTask execute async start. logid = %lu, notify = %d.", log_id, notify); | |||||
| rtError_t rt_ret = rtProfilerTrace(log_id, notify, flat, context.GetStream()); | |||||
| if (rt_ret != RT_ERROR_NONE) { | |||||
| GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||||
| return RT_ERROR_TO_GE_STATUS(rt_ret); | |||||
| } | |||||
| GELOGD("[%s] ProfilingTraceTask[%lu] execute success.", context.GetNodeName(), log_id); | |||||
| } | |||||
| return SUCCESS; | |||||
| }; | |||||
| Status RtsNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const { | Status RtsNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const { | ||||
| GE_CHECK_NOTNULL(node); | |||||
| auto op_type = node->GetType(); | auto op_type = node->GetType(); | ||||
| if (op_type == IDENTITY) { | if (op_type == IDENTITY) { | ||||
| task = MakeShared<IdentityNodeTask>(); | task = MakeShared<IdentityNodeTask>(); | ||||
| } else if (op_type == IDENTITYN) { | } else if (op_type == IDENTITYN) { | ||||
| task = MakeShared<IdentityNNodeTask>(); | task = MakeShared<IdentityNNodeTask>(); | ||||
| } else if (op_type == PROFILINGTRAININGTRACE) { | |||||
| auto *task_defs = model.GetTaskDefs(node); | |||||
| if (task_defs == nullptr || task_defs->empty()) { | |||||
| GELOGE(INTERNAL_ERROR, "Profiling node has no task to execute."); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| task = MakeShared<ProfilingTraceNodeTask>(*task_defs); | |||||
| } else { | } else { | ||||
| GELOGE(INTERNAL_ERROR, "[%s] Unsupported RTS op type: %s", node->GetName().c_str(), op_type.c_str()); | GELOGE(INTERNAL_ERROR, "[%s] Unsupported RTS op type: %s", node->GetName().c_str(), op_type.c_str()); | ||||
| return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
| @@ -18,6 +18,7 @@ | |||||
| #define GE_HYBRID_NODE_EXECUTOR_RTS_RTS_NODE_EXECUTOR_H_ | #define GE_HYBRID_NODE_EXECUTOR_RTS_RTS_NODE_EXECUTOR_H_ | ||||
| #include "hybrid/node_executor/node_executor.h" | #include "hybrid/node_executor/node_executor.h" | ||||
| #include "proto/task.pb.h" | |||||
| namespace ge { | namespace ge { | ||||
| namespace hybrid { | namespace hybrid { | ||||
| @@ -35,6 +36,18 @@ class IdentityNNodeTask : public IdentityNodeTask { | |||||
| Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override; | Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override; | ||||
| }; | }; | ||||
| class ProfilingTraceNodeTask : public NodeTask { | |||||
| public: | |||||
| explicit ProfilingTraceNodeTask(const std::vector<domi::TaskDef> &task_defs) : task_defs_(task_defs) {} | |||||
| ~ProfilingTraceNodeTask() override = default; | |||||
| Status UpdateArgs(TaskContext &context) override; | |||||
| Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override; | |||||
| private: | |||||
| std::vector<domi::TaskDef> task_defs_; | |||||
| }; | |||||
| class RtsNodeExecutor : public NodeExecutor { | class RtsNodeExecutor : public NodeExecutor { | ||||
| public: | public: | ||||
| Status LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const override; | Status LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const override; | ||||
| @@ -123,7 +123,7 @@ class TaskContext { | |||||
| Status status_ = SUCCESS; | Status status_ = SUCCESS; | ||||
| std::vector<void *> workspaces_; | std::vector<void *> workspaces_; | ||||
| uint64_t iteration_ = 0; | uint64_t iteration_ = 0; | ||||
| uint32_t task_id_= 0; | |||||
| uint32_t task_id_ = 0; | |||||
| uint32_t stream_id_ = 0; | uint32_t stream_id_ = 0; | ||||
| }; | }; | ||||
| } // namespace hybrid | } // namespace hybrid | ||||
| @@ -263,6 +263,8 @@ struct ComputeGraphDescInfo { | |||||
| std::vector<Format> output_format; | std::vector<Format> output_format; | ||||
| std::vector<std::vector<int64_t>> output_shape; | std::vector<std::vector<int64_t>> output_shape; | ||||
| std::vector<DataType> output_data_type; | std::vector<DataType> output_data_type; | ||||
| uint32_t task_id; | |||||
| uint32_t stream_id; | |||||
| }; | }; | ||||
| struct OpDescInfo { | struct OpDescInfo { | ||||
| @@ -529,6 +529,9 @@ REGISTER_OPTYPE_DECLARE(HVDWAIT, "HorovodWait"); | |||||
| // aicpu op for online_infer dynamic_dims | // aicpu op for online_infer dynamic_dims | ||||
| REGISTER_OPTYPE_DECLARE(GETDYNAMICDIMS, "GetDynamicDims"); | REGISTER_OPTYPE_DECLARE(GETDYNAMICDIMS, "GetDynamicDims"); | ||||
| // profiling training trace node | |||||
| REGISTER_OPTYPE_DECLARE(PROFILINGTRAININGTRACE, "ProfilingTrainingTrace"); | |||||
| enum InputMode { INPUT = 0, CONST_INPUT }; | enum InputMode { INPUT = 0, CONST_INPUT }; | ||||
| // Definition of the processing status enum of the process module | // Definition of the processing status enum of the process module | ||||