From: @zhengyuanhua Reviewed-by: @youui,@xchu42,@xchu42 Signed-off-by: @youui tags/v1.2.0
@@ -37,6 +37,8 @@ using domi::BuildMode; | |||
namespace {
// NOTE(review): presumably a "not set" sentinel for a perf level; its use is outside this view.
constexpr int32_t kInvalidPerfLevel = -1;
// Multiplier applied to an all-reduce node's position when its profiling log id is derived.
constexpr int64_t kProfilingArStep = 2;
// Offset added to (position * kProfilingArStep) to form the log id stored on the node.
constexpr int64_t kProfilingArStartLogid = 3;
// NOTE(review): looks like a node classification for subgraph handling; usage is outside this view.
enum NodeType { kSubgraphData, kSubgraphNode, kOthers };
}  // namespace
namespace ge { | |||
@@ -457,6 +459,11 @@ Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) { | |||
if (all_reduce_node_index[i] == node_index) { | |||
GELOGI("The all reduce node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index); | |||
(void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true); | |||
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep), | |||
GELOGE(FAILED, "Multiply result is out of range."); | |||
return FAILED); | |||
int64_t log_id = i * kProfilingArStep + kProfilingArStartLogid; | |||
(void)ge::AttrUtils::SetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | |||
continue; | |||
} | |||
} | |||
@@ -234,6 +234,19 @@ Status TaskGenerator::SaveFusionNodes(map<int64_t, std::vector<NodePtr>> &fusion | |||
return SUCCESS; | |||
} | |||
bool TaskGenerator::IsSubGraphOfDynamicGraph(const ComputeGraphPtr &graph) const { | |||
auto parent_graph_ptr = graph->GetParentGraph(); | |||
if (parent_graph_ptr == nullptr) { | |||
return false; | |||
} | |||
auto root_graph_ptr = GraphUtils::FindRootGraph(parent_graph_ptr); | |||
if (root_graph_ptr == nullptr) { | |||
return false; | |||
} | |||
return root_graph_ptr->GetGraphUnknownFlag(); | |||
} | |||
Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &graph, | |||
vector<domi::TaskDef> &task_def_list, map<uint32_t, string> &op_name_map) { | |||
GELOGD("Beign to generate task, graph name is %s.", graph->GetName().c_str()); | |||
@@ -274,7 +287,6 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||
}; | |||
GE_MAKE_GUARD(release, callback); | |||
uint64_t all_reduce_node_idx = 0; | |||
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | |||
OpDescPtr op_desc = node->GetOpDesc(); | |||
GE_CHECK_NOTNULL(op_desc); | |||
@@ -293,7 +305,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||
// Part2: Call | |||
auto fusion_task_info = | |||
FusionTaskInfo{run_context, graph, node, op_desc, node_index, ge_lib, | |||
ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes, all_reduce_node_idx}; | |||
ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes}; | |||
GE_CHK_STATUS_RET(GenerateTaskForFusionNode(fusion_task_info, fusion_nodes, fusion_nodes_seen), | |||
"Call GenerateTaskForFusionNode node:%s(%s) failed", name.c_str(), type.c_str()); | |||
// continue directly | |||
@@ -317,8 +329,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||
type.c_str()); | |||
// Profiling task | |||
size_t task_list_size_before = task_def_list.size(); | |||
GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, | |||
node_index, task_def_list, all_reduce_node_idx)); | |||
GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list)); | |||
int64_t op_id = op_desc->GetId(); | |||
// Compatible with dynamic shape scenes, the default is 0 | |||
int64_t stream_id = 0; | |||
@@ -338,8 +349,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||
return ret; | |||
} | |||
// Profiling task | |||
GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, | |||
node_index, task_def_list, all_reduce_node_idx)); | |||
GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list)); | |||
size_t task_list_size_after = task_def_list.size(); | |||
// If tasks is reduced | |||
if (task_list_size_after < task_list_size_before) { | |||
@@ -382,7 +392,6 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||
auto &op_name_map = fusion_task_info.op_name_map; | |||
auto &profiling_point = fusion_task_info.profiling_point; | |||
auto &all_reduce_nodes = fusion_task_info.all_reduce_nodes; | |||
auto &all_reduce_idx = fusion_task_info.all_reduce_node_idx; | |||
// If op_desc have this attr, call nodes with same group key in a stream together | |||
if (ge::AttrUtils::GetInt(fusion_op_desc, ATTR_NAME_FUSION_GROUP_KEY, group_key) && | |||
(fusion_nodes_seen.count(node.get()) == 0)) { | |||
@@ -429,8 +438,7 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||
return INTERNAL_ERROR; | |||
} | |||
// profiling task | |||
(void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, | |||
node_index, task_def_list, all_reduce_idx); | |||
(void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list); | |||
run_context.stream = run_context.graphStreamList[stream_id]; | |||
GELOGI("Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.", | |||
op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); | |||
@@ -443,8 +451,7 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||
return ret; | |||
} | |||
// profiling task | |||
(void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, | |||
node_index, task_def_list, all_reduce_idx); | |||
(void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list); | |||
size_t task_list_size_after = task_def_list.size(); | |||
// if tasks is reduced | |||
if (task_list_size_after < task_list_size_before) { | |||
@@ -849,6 +856,13 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi | |||
GELOGD("Profiling is not open."); | |||
return SUCCESS; | |||
} | |||
// subgraph of dynamic graph no need to find index, has been found in parent graph | |||
if (IsSubGraphOfDynamicGraph(graph)) { | |||
GELOGI("Graph[%s] is subgraph of dynamic graph, no nned to find index.", graph->GetName().c_str()); | |||
return SUCCESS; | |||
} | |||
GELOGI("Start get FP/BP index."); | |||
std::string fp_point_str; | |||
std::string bp_point_str; | |||
@@ -886,9 +900,47 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi | |||
return SUCCESS; | |||
} | |||
/// Appends a profiler-trace task for an all-reduce operator to `task_def_list`.
///
/// A task is appended when `is_insert_bp_profiling_task` is true, or when
/// `node_index` is present in `all_reduce_nodes`; otherwise nothing is added.
///
/// @param op_desc                      Operator whose name, stream id and log-id attribute are read.
/// @param all_reduce_nodes             Node indices searched for `node_index`.
/// @param node_index                   Index of the current node.
/// @param task_def_list                Receives the new task, if any.
/// @param is_insert_bp_profiling_task  When true, the log id is read from the operator attribute.
/// @return FAILED when the multiply-overflow check trips; SUCCESS otherwise.
Status TaskGenerator::InsertProfilingArTaskBefore(const OpDescPtr &op_desc, std::vector<uint32_t> &all_reduce_nodes,
                                                  uint32_t node_index, std::vector<domi::TaskDef> &task_def_list,
                                                  bool is_insert_bp_profiling_task) {
  int64_t ar_log_id = 0xFFFF;
  bool need_ar_task = is_insert_bp_profiling_task;
  if (need_ar_task) {
    // The result of GetInt is ignored; ar_log_id keeps its initial value unless GetInt writes it.
    (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, ar_log_id);
  } else {
    // Derive the log id from the position of this node in all_reduce_nodes.
    for (size_t pos = 0; pos < all_reduce_nodes.size(); ++pos) {
      if (all_reduce_nodes[pos] != node_index) {
        continue;
      }
      GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(pos, kProfilingArStep),
                      GELOGE(FAILED, "Multiply result is out of range.");
                      return FAILED);
      ar_log_id = pos * kProfilingArStep + kProfilingArStartLogid;
      need_ar_task = true;
      break;
    }
  }

  if (!need_ar_task) {
    return SUCCESS;
  }

  GELOGI("The start allreduce operator is %s, idx %u, log_id %ld", op_desc->GetName().c_str(), node_index, ar_log_id);
  TaskDef ar_task_def;
  ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
  ar_task_def.set_stream_id(op_desc->GetStreamId());
  LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp();
  if (ar_log_def != nullptr) {
    ar_log_def->set_logid(ar_log_id);
    ar_log_def->set_notify(false);
  }
  // The task is appended even if the timestamp sub-message could not be obtained.
  task_def_list.push_back(ar_task_def);
  return SUCCESS;
}
Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | |||
vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | |||
vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx) { | |||
vector<domi::TaskDef> &task_def_list) { | |||
const char *profiling_mode = std::getenv(kProfilingMode); | |||
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | |||
ProfilingManager::Instance().ProfilingTrainingTraceOn(); | |||
@@ -931,19 +983,31 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const | |||
} | |||
bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | |||
uint64_t all_reduce_task_idx = 0; | |||
if (is_all_reduce) { | |||
(void)InsertProfilingArTaskBefore(op_desc, all_reduce_nodes, node_index, | |||
task_def_list, is_insert_bp_profiling_task); | |||
} | |||
return SUCCESS; | |||
} | |||
Status TaskGenerator::InsertProfilingArTaskAfter(const OpDescPtr &op_desc, std::vector<uint32_t> &all_reduce_nodes, | |||
uint32_t node_index, std::vector<domi::TaskDef> &task_def_list, | |||
bool is_insert_bp_profiling_task) { | |||
bool is_insert_all_reduce_task = false; | |||
if (is_all_reduce && is_insert_bp_profiling_task) { | |||
all_reduce_task_idx = all_reduce_node_idx; | |||
int64_t ar_log_id = 0xFFFF; | |||
if (is_insert_bp_profiling_task) { | |||
(void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, ar_log_id); | |||
ar_log_id += 1; | |||
is_insert_all_reduce_task = true; | |||
} | |||
if (is_all_reduce) { | |||
all_reduce_node_idx++; | |||
} | |||
if (!is_insert_all_reduce_task) { | |||
for (size_t i = 0; i < all_reduce_nodes.size(); i++) { | |||
if (all_reduce_nodes[i] == node_index) { | |||
all_reduce_task_idx = i; | |||
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep), | |||
GELOGE(FAILED, "Multiply result is out of range."); | |||
return FAILED); | |||
ar_log_id = i * kProfilingArStep + kProfilingArEndLogid; | |||
is_insert_all_reduce_task = true; | |||
break; | |||
} | |||
@@ -951,28 +1015,24 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const | |||
} | |||
if (is_insert_all_reduce_task) { | |||
GELOGI("The start allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | |||
GELOGI("The start allreduce operator is %s, idx %u, log_id %ld", op_desc->GetName().c_str(), node_index, ar_log_id); | |||
TaskDef ar_task_def; | |||
ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | |||
ar_task_def.set_stream_id(op_desc->GetStreamId()); | |||
LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp(); | |||
if (ar_log_def != nullptr) { | |||
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep), | |||
GELOGE(FAILED, "Multiply result is out of range."); | |||
return FAILED); | |||
auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArStartLogid; | |||
ar_log_def->set_logid(log_id); | |||
ar_log_def->set_logid(ar_log_id); | |||
ar_log_def->set_notify(false); | |||
(void)ge::AttrUtils::SetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | |||
} | |||
task_def_list.push_back(ar_task_def); | |||
} | |||
return SUCCESS; | |||
} | |||
Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | |||
vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | |||
vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx) { | |||
vector<domi::TaskDef> &task_def_list) { | |||
GE_CHECK_NOTNULL(op_desc); | |||
const char *profiling_mode = std::getenv(kProfilingMode); | |||
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | |||
@@ -1017,36 +1077,11 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P | |||
task_def_list.emplace_back(end_task_def); | |||
} | |||
uint32_t all_reduce_task_idx = 0; | |||
bool is_insert_all_reduce_task = false; | |||
if (is_all_reduce && is_insert_bp_profiling_task) { | |||
all_reduce_task_idx = all_reduce_node_idx; | |||
is_insert_all_reduce_task = true; | |||
} | |||
for (size_t i = 0; i < all_reduce_nodes.size(); i++) { | |||
if (all_reduce_nodes[i] == node_index) { | |||
all_reduce_task_idx = i; | |||
is_insert_all_reduce_task = true; | |||
break; | |||
} | |||
if (is_all_reduce) { | |||
(void)InsertProfilingArTaskAfter(op_desc, all_reduce_nodes, node_index, | |||
task_def_list, is_insert_bp_profiling_task); | |||
} | |||
if (is_insert_all_reduce_task) { | |||
GELOGI("The end allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | |||
TaskDef ar_task_def; | |||
ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | |||
ar_task_def.set_stream_id(op_desc->GetStreamId()); | |||
LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp(); | |||
GE_CHECK_NOTNULL(ar_log_def); | |||
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep), | |||
GELOGE(FAILED, "Multiply result is out of range."); | |||
return FAILED); | |||
auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArEndLogid; | |||
ar_log_def->set_logid(log_id); | |||
ar_log_def->set_notify(false); | |||
task_def_list.emplace_back(ar_task_def); | |||
} | |||
return SUCCESS; | |||
} | |||
@@ -129,10 +129,16 @@ class TaskGenerator { | |||
std::vector<uint32_t> &all_reduce_nodes) const; | |||
Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | |||
std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | |||
std::vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx); | |||
std::vector<domi::TaskDef> &task_def_list); | |||
Status InsertProfilingArTaskBefore(const OpDescPtr &op_desc, std::vector<uint32_t> &all_reduce_nodes, | |||
uint32_t node_index, std::vector<domi::TaskDef> &task_def_listy, | |||
bool is_insert_bp_profiling_task); | |||
Status InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | |||
std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | |||
std::vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx); | |||
std::vector<domi::TaskDef> &task_def_list); | |||
Status InsertProfilingArTaskAfter(const OpDescPtr &op_desc, std::vector<uint32_t> &all_reduce_nodes, | |||
uint32_t node_index, std::vector<domi::TaskDef> &task_def_list, | |||
bool is_insert_bp_profiling_task); | |||
static bool IsProfPoint(const OpDescPtr &op, const std::string &name); | |||
@@ -155,6 +161,8 @@ class TaskGenerator { | |||
Status SetKnownShapeStream(RunContext &run_context, int64_t stream_id); | |||
bool IsSubGraphOfDynamicGraph(const ComputeGraphPtr &graph) const; | |||
uint8_t *var_mem_base_ = nullptr; | |||
uint64_t var_mem_size_ = 0; | |||
}; | |||
@@ -174,6 +174,38 @@ Status NodeDoneCallback::GetGraphDescInfo(const NodePtr node, const HybridModel | |||
compute_graph_info = context_->GetProfilingGraphDescInfo(); | |||
context_->ClearProfilingGraphDescInfo(); | |||
auto op_desc = node->GetOpDesc(); | |||
GE_CHECK_NOTNULL(op_desc); | |||
for (auto &tmp_compute_graph_info : compute_graph_info) { | |||
// default | |||
if (op_desc->GetAllInputsSize() == 0) { | |||
tmp_compute_graph_info.input_format = { FORMAT_NULL }; | |||
tmp_compute_graph_info.input_shape = { {0} }; | |||
tmp_compute_graph_info.input_data_type = { DT_UNDEFINED }; | |||
} | |||
for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { | |||
GeTensorDescPtr input_desc = op_desc->MutableInputDesc(i); | |||
if (input_desc == nullptr) { | |||
continue; | |||
} | |||
tmp_compute_graph_info.input_format.emplace_back(input_desc->GetFormat()); | |||
tmp_compute_graph_info.input_shape.emplace_back(input_desc->GetShape().GetDims()); | |||
tmp_compute_graph_info.input_data_type.emplace_back(input_desc->GetDataType()); | |||
} | |||
if (op_desc->GetOutputsSize() == 0) { | |||
tmp_compute_graph_info.output_format = { FORMAT_NULL }; | |||
tmp_compute_graph_info.output_shape = { {0} }; | |||
tmp_compute_graph_info.output_data_type = { DT_UNDEFINED }; | |||
} | |||
for (size_t j = 0; j < op_desc->GetOutputsSize(); ++j) { | |||
GeTensorDesc output_desc = op_desc->GetOutputDesc(j); | |||
tmp_compute_graph_info.output_format.emplace_back(output_desc.GetFormat()); | |||
tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); | |||
tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType()); | |||
} | |||
} | |||
return SUCCESS; | |||
} | |||
@@ -1608,16 +1608,19 @@ Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, cons | |||
GE_CHECK_NOTNULL(compute_graph); | |||
NodePtr node_ptr = nullptr; | |||
vector<domi::TaskDef> task_def_list; | |||
map<NodePtr, vector<domi::TaskDef>> node_task_map; | |||
// create fp node | |||
bool is_insert_fp_profiling_task = false; | |||
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task); | |||
if (is_insert_fp_profiling_task) { | |||
vector<domi::TaskDef> task_def_list; | |||
(void)GenerateFpProfilingTask(op_desc, task_def_list); | |||
auto fp_desc = MakeShared<OpDesc>(kProfilingFpNode, PROFILINGTRAININGTRACE); | |||
GE_CHECK_NOTNULL(fp_desc); | |||
fp_desc->SetOpKernelLibName(kEngineNameRts); | |||
node_ptr = compute_graph->AddNode(fp_desc); | |||
GE_CHECK_NOTNULL(node_ptr); | |||
node_task_map[node_ptr] = task_def_list; | |||
GELOGD("Create fp profiling node success before."); | |||
} | |||
// creat all reduce start node | |||
@@ -1625,6 +1628,7 @@ Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, cons | |||
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | |||
bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | |||
if (is_all_reduce && is_insert_bp_profiling_task) { | |||
vector<domi::TaskDef> task_def_list; | |||
int64_t log_id = 0; | |||
(void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | |||
GELOGD("All reduce node profiling task log id: %ld before", log_id); | |||
@@ -1634,18 +1638,24 @@ Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, cons | |||
GE_CHECK_NOTNULL(ar_desc_start); | |||
ar_desc_start->SetOpKernelLibName(kEngineNameRts); | |||
node_ptr = compute_graph->AddNode(ar_desc_start); | |||
GE_CHECK_NOTNULL(node_ptr); | |||
node_task_map[node_ptr] = task_def_list; | |||
GELOGD("Create all reduce start profiling node success before."); | |||
} | |||
if (node_ptr != nullptr) { | |||
for (const auto &task_def : task_def_list) { | |||
hybrid_model_.task_defs_[node_ptr].emplace_back(task_def); | |||
if (!node_task_map.empty()) { | |||
for (const auto &node_task : node_task_map) { | |||
NodePtr profiling_node = node_task.first; | |||
vector<domi::TaskDef> task_def_lists = node_task.second; | |||
for (const auto &task_def : task_def_lists) { | |||
hybrid_model_.task_defs_[profiling_node].emplace_back(task_def); | |||
} | |||
NodeItem *node_item = nullptr; | |||
GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(profiling_node, &node_item)); | |||
node_item->input_start = 0; | |||
node_item->output_start = 0; | |||
graph_item.node_items_.emplace_back(node_item); | |||
} | |||
NodeItem *node_item = nullptr; | |||
GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node_ptr, &node_item)); | |||
node_item->input_start = 0; | |||
node_item->output_start = 0; | |||
graph_item.node_items_.emplace_back(node_item); | |||
} else { | |||
GELOGD("No need to create profiling node before."); | |||
} | |||
@@ -1661,12 +1671,13 @@ Status HybridModelBuilder::CreateProfilingNodeAfter(GraphItem &graph_item, const | |||
GE_CHECK_NOTNULL(compute_graph); | |||
NodePtr node_ptr = nullptr; | |||
vector<domi::TaskDef> task_def_list; | |||
map<NodePtr, vector<domi::TaskDef>> node_task_map; | |||
// Create all reduce end node | |||
bool is_insert_bp_profiling_task = false; | |||
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | |||
bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | |||
if (is_all_reduce && is_insert_bp_profiling_task) { | |||
vector<domi::TaskDef> task_def_list; | |||
int64_t log_id = 0; | |||
(void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | |||
GELOGD("All reduce node profiling task log id: %ld after", log_id); | |||
@@ -1676,38 +1687,50 @@ Status HybridModelBuilder::CreateProfilingNodeAfter(GraphItem &graph_item, const | |||
GE_CHECK_NOTNULL(ar_desc_end); | |||
ar_desc_end->SetOpKernelLibName(kEngineNameRts); | |||
node_ptr = compute_graph->AddNode(ar_desc_end); | |||
GE_CHECK_NOTNULL(node_ptr); | |||
node_task_map[node_ptr] = task_def_list; | |||
GELOGD("Create all reduce end profiling node success after."); | |||
} | |||
// create bp node | |||
if (!is_all_reduce && is_insert_bp_profiling_task) { | |||
vector<domi::TaskDef> task_def_list; | |||
(void) GenerateBpProfilingTask(op_desc, task_def_list); | |||
auto bp_op_desc = MakeShared<OpDesc>(kProfilingBpNode, PROFILINGTRAININGTRACE); | |||
GE_CHECK_NOTNULL(bp_op_desc); | |||
bp_op_desc->SetOpKernelLibName(kEngineNameRts); | |||
node_ptr = compute_graph->AddNode(bp_op_desc); | |||
GE_CHECK_NOTNULL(node_ptr); | |||
node_task_map[node_ptr] = task_def_list; | |||
GELOGD("Create bp profiling node success after."); | |||
} | |||
// create end node | |||
bool is_insert_end_profiling_task = false; | |||
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, is_insert_end_profiling_task); | |||
if (is_insert_end_profiling_task) { | |||
vector<domi::TaskDef> task_def_list; | |||
(void)GenerateEndProfilingTask(op_desc, task_def_list); | |||
auto end_desc = MakeShared<OpDesc>(kProfilingEndNode, PROFILINGTRAININGTRACE); | |||
GE_CHECK_NOTNULL(end_desc); | |||
end_desc->SetOpKernelLibName(kEngineNameRts); | |||
node_ptr = compute_graph->AddNode(end_desc); | |||
GE_CHECK_NOTNULL(node_ptr); | |||
node_task_map[node_ptr] = task_def_list; | |||
GELOGD("Create end profiling node success after."); | |||
} | |||
if (node_ptr != nullptr) { | |||
for (const auto &task_def : task_def_list) { | |||
hybrid_model_.task_defs_[node_ptr].emplace_back(task_def); | |||
if (!node_task_map.empty()) { | |||
for (const auto &node_task : node_task_map) { | |||
NodePtr profiling_node = node_task.first; | |||
vector<domi::TaskDef> task_def_lists = node_task.second; | |||
for (const auto &task_def : task_def_lists) { | |||
hybrid_model_.task_defs_[profiling_node].emplace_back(task_def); | |||
} | |||
NodeItem *node_item = nullptr; | |||
GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(profiling_node, &node_item)); | |||
node_item->input_start = 0; | |||
node_item->output_start = 0; | |||
graph_item.node_items_.emplace_back(node_item); | |||
} | |||
NodeItem *node_item = nullptr; | |||
GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node_ptr, &node_item)); | |||
node_item->input_start = 0; | |||
node_item->output_start = 0; | |||
graph_item.node_items_.emplace_back(node_item); | |||
} else { | |||
GELOGD("No need to create profiling node after."); | |||
} | |||
@@ -554,33 +554,6 @@ Status TaskContext::SaveProfilingGraphDescInfo(uint32_t task_id, uint32_t stream | |||
tmp_compute_graph_info.model_name = dynamic_model_name; | |||
tmp_compute_graph_info.op_name = op_desc->GetName(); | |||
tmp_compute_graph_info.op_type = op_desc->GetType(); | |||
// default | |||
if (op_desc->GetAllInputsSize() == 0) { | |||
tmp_compute_graph_info.input_format = { FORMAT_NULL }; | |||
tmp_compute_graph_info.input_shape = { {0} }; | |||
tmp_compute_graph_info.input_data_type = { DT_UNDEFINED }; | |||
} | |||
for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { | |||
GeTensorDescPtr input_desc = op_desc->MutableInputDesc(i); | |||
if (input_desc == nullptr) { | |||
continue; | |||
} | |||
tmp_compute_graph_info.input_format.emplace_back(input_desc->GetFormat()); | |||
tmp_compute_graph_info.input_shape.emplace_back(input_desc->GetShape().GetDims()); | |||
tmp_compute_graph_info.input_data_type.emplace_back(input_desc->GetDataType()); | |||
} | |||
if (op_desc->GetOutputsSize() == 0) { | |||
tmp_compute_graph_info.output_format = { FORMAT_NULL }; | |||
tmp_compute_graph_info.output_shape = { {0} }; | |||
tmp_compute_graph_info.output_data_type = { DT_UNDEFINED }; | |||
} | |||
for (size_t j = 0; j < op_desc->GetOutputsSize(); ++j) { | |||
GeTensorDesc output_desc = op_desc->GetOutputDesc(j); | |||
tmp_compute_graph_info.output_format.emplace_back(output_desc.GetFormat()); | |||
tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); | |||
tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType()); | |||
} | |||
tmp_compute_graph_info.task_id = task_id; | |||
tmp_compute_graph_info.stream_id = stream_id; | |||
compute_graph_info.emplace_back(tmp_compute_graph_info); | |||