@@ -37,6 +37,8 @@ using domi::BuildMode; | |||||
namespace {
// File-local constants and helper enum; nothing here is visible outside this translation unit.
constexpr int32_t kInvalidPerfLevel = -1;

// All-reduce profiling log ids are derived as: index * kProfilingArStep + kProfilingArStartLogid.
constexpr int64_t kProfilingArStep = 2;
constexpr int64_t kProfilingArStartLogid = 3;

// Node classification; enumerators keep their implicit values 0, 1, 2.
enum NodeType {
  kSubgraphData,
  kSubgraphNode,
  kOthers
};
}  // namespace
namespace ge { | namespace ge { | ||||
@@ -457,6 +459,11 @@ Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) { | |||||
if (all_reduce_node_index[i] == node_index) { | if (all_reduce_node_index[i] == node_index) { | ||||
GELOGI("The all reduce node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index); | GELOGI("The all reduce node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index); | ||||
(void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true); | (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true); | ||||
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep), | |||||
GELOGE(FAILED, "Multiply result is out of range."); | |||||
return FAILED); | |||||
int64_t log_id = i * kProfilingArStep + kProfilingArStartLogid; | |||||
(void)ge::AttrUtils::SetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | |||||
continue; | continue; | ||||
} | } | ||||
} | } | ||||
@@ -234,6 +234,19 @@ Status TaskGenerator::SaveFusionNodes(map<int64_t, std::vector<NodePtr>> &fusion | |||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
bool TaskGenerator::IsSubGraphOfDynamicGraph(const ComputeGraphPtr &graph) const { | |||||
auto parent_graph_ptr = graph->GetParentGraph(); | |||||
if (parent_graph_ptr == nullptr) { | |||||
return false; | |||||
} | |||||
auto root_graph_ptr = GraphUtils::FindRootGraph(parent_graph_ptr); | |||||
if (root_graph_ptr == nullptr) { | |||||
return false; | |||||
} | |||||
return root_graph_ptr->GetGraphUnknownFlag(); | |||||
} | |||||
Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &graph, | Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &graph, | ||||
vector<domi::TaskDef> &task_def_list, map<uint32_t, string> &op_name_map) { | vector<domi::TaskDef> &task_def_list, map<uint32_t, string> &op_name_map) { | ||||
GELOGD("Beign to generate task, graph name is %s.", graph->GetName().c_str()); | GELOGD("Beign to generate task, graph name is %s.", graph->GetName().c_str()); | ||||
@@ -274,7 +287,6 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
}; | }; | ||||
GE_MAKE_GUARD(release, callback); | GE_MAKE_GUARD(release, callback); | ||||
uint64_t all_reduce_node_idx = 0; | |||||
for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { | ||||
OpDescPtr op_desc = node->GetOpDesc(); | OpDescPtr op_desc = node->GetOpDesc(); | ||||
GE_CHECK_NOTNULL(op_desc); | GE_CHECK_NOTNULL(op_desc); | ||||
@@ -293,7 +305,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
// Part2: Call | // Part2: Call | ||||
auto fusion_task_info = | auto fusion_task_info = | ||||
FusionTaskInfo{run_context, graph, node, op_desc, node_index, ge_lib, | FusionTaskInfo{run_context, graph, node, op_desc, node_index, ge_lib, | ||||
ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes, all_reduce_node_idx}; | |||||
ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes}; | |||||
GE_CHK_STATUS_RET(GenerateTaskForFusionNode(fusion_task_info, fusion_nodes, fusion_nodes_seen), | GE_CHK_STATUS_RET(GenerateTaskForFusionNode(fusion_task_info, fusion_nodes, fusion_nodes_seen), | ||||
"Call GenerateTaskForFusionNode node:%s(%s) failed", name.c_str(), type.c_str()); | "Call GenerateTaskForFusionNode node:%s(%s) failed", name.c_str(), type.c_str()); | ||||
// continue directly | // continue directly | ||||
@@ -317,8 +329,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
type.c_str()); | type.c_str()); | ||||
// Profiling task | // Profiling task | ||||
size_t task_list_size_before = task_def_list.size(); | size_t task_list_size_before = task_def_list.size(); | ||||
GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, | |||||
node_index, task_def_list, all_reduce_node_idx)); | |||||
GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list)); | |||||
int64_t op_id = op_desc->GetId(); | int64_t op_id = op_desc->GetId(); | ||||
// Compatible with dynamic shape scenes, the default is 0 | // Compatible with dynamic shape scenes, the default is 0 | ||||
int64_t stream_id = 0; | int64_t stream_id = 0; | ||||
@@ -338,8 +349,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra | |||||
return ret; | return ret; | ||||
} | } | ||||
// Profiling task | // Profiling task | ||||
GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, | |||||
node_index, task_def_list, all_reduce_node_idx)); | |||||
GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list)); | |||||
size_t task_list_size_after = task_def_list.size(); | size_t task_list_size_after = task_def_list.size(); | ||||
// If tasks is reduced | // If tasks is reduced | ||||
if (task_list_size_after < task_list_size_before) { | if (task_list_size_after < task_list_size_before) { | ||||
@@ -382,7 +392,6 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||||
auto &op_name_map = fusion_task_info.op_name_map; | auto &op_name_map = fusion_task_info.op_name_map; | ||||
auto &profiling_point = fusion_task_info.profiling_point; | auto &profiling_point = fusion_task_info.profiling_point; | ||||
auto &all_reduce_nodes = fusion_task_info.all_reduce_nodes; | auto &all_reduce_nodes = fusion_task_info.all_reduce_nodes; | ||||
auto &all_reduce_idx = fusion_task_info.all_reduce_node_idx; | |||||
// If op_desc have this attr, call nodes with same group key in a stream together | // If op_desc have this attr, call nodes with same group key in a stream together | ||||
if (ge::AttrUtils::GetInt(fusion_op_desc, ATTR_NAME_FUSION_GROUP_KEY, group_key) && | if (ge::AttrUtils::GetInt(fusion_op_desc, ATTR_NAME_FUSION_GROUP_KEY, group_key) && | ||||
(fusion_nodes_seen.count(node.get()) == 0)) { | (fusion_nodes_seen.count(node.get()) == 0)) { | ||||
@@ -429,8 +438,7 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
} | } | ||||
// profiling task | // profiling task | ||||
(void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, | |||||
node_index, task_def_list, all_reduce_idx); | |||||
(void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list); | |||||
run_context.stream = run_context.graphStreamList[stream_id]; | run_context.stream = run_context.graphStreamList[stream_id]; | ||||
GELOGI("Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.", | GELOGI("Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.", | ||||
op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); | op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); | ||||
@@ -443,8 +451,7 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info | |||||
return ret; | return ret; | ||||
} | } | ||||
// profiling task | // profiling task | ||||
(void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, | |||||
node_index, task_def_list, all_reduce_idx); | |||||
(void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list); | |||||
size_t task_list_size_after = task_def_list.size(); | size_t task_list_size_after = task_def_list.size(); | ||||
// if tasks is reduced | // if tasks is reduced | ||||
if (task_list_size_after < task_list_size_before) { | if (task_list_size_after < task_list_size_before) { | ||||
@@ -850,6 +857,13 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi | |||||
GELOGD("Profiling is not open."); | GELOGD("Profiling is not open."); | ||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
// subgraph of dynamic graph no need to find index, has been found in parent graph | |||||
if (IsSubGraphOfDynamicGraph(graph)) { | |||||
GELOGI("Graph[%s] is subgraph of dynamic graph, no nned to find index.", graph->GetName().c_str()); | |||||
return SUCCESS; | |||||
} | |||||
GELOGI("Start get FP/BP index."); | GELOGI("Start get FP/BP index."); | ||||
std::string fp_point_str; | std::string fp_point_str; | ||||
std::string bp_point_str; | std::string bp_point_str; | ||||
@@ -887,9 +901,47 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi | |||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
// Appends the profiler-trace task that marks the start of an all-reduce operator.
//
// The log id is chosen as follows:
//   - is_insert_bp_profiling_task == true: read from the op's
//     ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID attribute. The read result is
//     ignored, so the initial value 0xFFFF is used if the read does not overwrite it.
//   - otherwise: derived from the position `i` of `node_index` inside
//     `all_reduce_nodes` as i * kProfilingArStep + kProfilingArStartLogid.
// When neither applies (flag is false and `node_index` is not listed), no task is added.
//
// Returns FAILED when the overflow check on the multiplication trips, SUCCESS otherwise.
Status TaskGenerator::InsertProfilingArTaskBefore(const OpDescPtr &op_desc, std::vector<uint32_t> &all_reduce_nodes,
                                                  uint32_t node_index, std::vector<domi::TaskDef> &task_def_list,
                                                  bool is_insert_bp_profiling_task) {
  int64_t ar_log_id = 0xFFFF;
  bool need_ar_task = is_insert_bp_profiling_task;

  if (need_ar_task) {
    (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, ar_log_id);
  } else {
    for (size_t pos = 0; pos < all_reduce_nodes.size(); ++pos) {
      if (all_reduce_nodes[pos] != node_index) {
        continue;
      }
      GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(pos, kProfilingArStep),
                      GELOGE(FAILED, "Multiply result is out of range.");
                      return FAILED);
      ar_log_id = pos * kProfilingArStep + kProfilingArStartLogid;
      need_ar_task = true;
      break;
    }
  }

  if (!need_ar_task) {
    return SUCCESS;
  }

  GELOGI("The start allreduce operator is %s, idx %u, log_id %ld", op_desc->GetName().c_str(), node_index, ar_log_id);
  TaskDef trace_task;
  trace_task.set_type(RT_MODEL_TASK_PROFILER_TRACE);
  trace_task.set_stream_id(op_desc->GetStreamId());
  auto *log_def = trace_task.mutable_log_timestamp();
  if (log_def != nullptr) {
    log_def->set_logid(ar_log_id);
    log_def->set_notify(false);
  }
  // The task is appended even if the timestamp sub-message could not be obtained.
  task_def_list.push_back(trace_task);
  return SUCCESS;
}
Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | ||||
vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | ||||
vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx) { | |||||
vector<domi::TaskDef> &task_def_list) { | |||||
const char *profiling_mode = std::getenv(kProfilingMode); | const char *profiling_mode = std::getenv(kProfilingMode); | ||||
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | ||||
ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ProfilingManager::Instance().ProfilingTrainingTraceOn(); | ||||
@@ -932,19 +984,31 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const | |||||
} | } | ||||
bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | ||||
uint64_t all_reduce_task_idx = 0; | |||||
if (is_all_reduce) { | |||||
(void)InsertProfilingArTaskBefore(op_desc, all_reduce_nodes, node_index, | |||||
task_def_list, is_insert_bp_profiling_task); | |||||
} | |||||
return SUCCESS; | |||||
} | |||||
// Appends the profiler-trace task that marks the end of an all-reduce operator.
//
// The log id is chosen as follows:
//   - is_insert_bp_profiling_task == true: the value of the op's
//     ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID attribute plus one. The read
//     result is ignored, so 0xFFFF + 1 is used if the read does not overwrite
//     the initial value.
//   - otherwise: derived from the position `i` of `node_index` inside
//     `all_reduce_nodes` as i * kProfilingArStep + kProfilingArEndLogid.
// When neither applies (flag is false and `node_index` is not listed), no task is added.
//
// Returns FAILED when the overflow check on the multiplication trips, SUCCESS otherwise.
Status TaskGenerator::InsertProfilingArTaskAfter(const OpDescPtr &op_desc, std::vector<uint32_t> &all_reduce_nodes,
                                                 uint32_t node_index, std::vector<domi::TaskDef> &task_def_list,
                                                 bool is_insert_bp_profiling_task) {
  bool is_insert_all_reduce_task = false;
  int64_t ar_log_id = 0xFFFF;
  if (is_insert_bp_profiling_task) {
    (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, ar_log_id);
    // The attribute holds the id used before the operator; the id after it is the next one.
    ar_log_id += 1;
    is_insert_all_reduce_task = true;
  }
  if (!is_insert_all_reduce_task) {
    for (size_t i = 0; i < all_reduce_nodes.size(); i++) {
      if (all_reduce_nodes[i] == node_index) {
        GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep),
                        GELOGE(FAILED, "Multiply result is out of range.");
                        return FAILED);
        ar_log_id = i * kProfilingArStep + kProfilingArEndLogid;
        is_insert_all_reduce_task = true;
        break;
      }
    }
  }

  if (is_insert_all_reduce_task) {
    // This is the trace emitted after the operator, so the message says "end" rather than "start".
    GELOGI("The end allreduce operator is %s, idx %u, log_id %ld", op_desc->GetName().c_str(), node_index, ar_log_id);
    TaskDef ar_task_def;
    ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
    ar_task_def.set_stream_id(op_desc->GetStreamId());
    LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp();
    if (ar_log_def != nullptr) {
      ar_log_def->set_logid(ar_log_id);
      ar_log_def->set_notify(false);
    }
    task_def_list.push_back(ar_task_def);
  }
  return SUCCESS;
}
Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | ||||
vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | ||||
vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx) { | |||||
vector<domi::TaskDef> &task_def_list) { | |||||
GE_CHECK_NOTNULL(op_desc); | GE_CHECK_NOTNULL(op_desc); | ||||
const char *profiling_mode = std::getenv(kProfilingMode); | const char *profiling_mode = std::getenv(kProfilingMode); | ||||
bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || | ||||
@@ -1018,36 +1078,11 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P | |||||
task_def_list.emplace_back(end_task_def); | task_def_list.emplace_back(end_task_def); | ||||
} | } | ||||
uint32_t all_reduce_task_idx = 0; | |||||
bool is_insert_all_reduce_task = false; | |||||
if (is_all_reduce && is_insert_bp_profiling_task) { | |||||
all_reduce_task_idx = all_reduce_node_idx; | |||||
is_insert_all_reduce_task = true; | |||||
} | |||||
for (size_t i = 0; i < all_reduce_nodes.size(); i++) { | |||||
if (all_reduce_nodes[i] == node_index) { | |||||
all_reduce_task_idx = i; | |||||
is_insert_all_reduce_task = true; | |||||
break; | |||||
} | |||||
if (is_all_reduce) { | |||||
(void)InsertProfilingArTaskAfter(op_desc, all_reduce_nodes, node_index, | |||||
task_def_list, is_insert_bp_profiling_task); | |||||
} | } | ||||
if (is_insert_all_reduce_task) { | |||||
GELOGI("The end allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index); | |||||
TaskDef ar_task_def; | |||||
ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE); | |||||
ar_task_def.set_stream_id(op_desc->GetStreamId()); | |||||
LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp(); | |||||
GE_CHECK_NOTNULL(ar_log_def); | |||||
GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep), | |||||
GELOGE(FAILED, "Multiply result is out of range."); | |||||
return FAILED); | |||||
auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArEndLogid; | |||||
ar_log_def->set_logid(log_id); | |||||
ar_log_def->set_notify(false); | |||||
task_def_list.emplace_back(ar_task_def); | |||||
} | |||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
@@ -129,10 +129,16 @@ class TaskGenerator { | |||||
std::vector<uint32_t> &all_reduce_nodes) const; | std::vector<uint32_t> &all_reduce_nodes) const; | ||||
Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | ||||
std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | ||||
std::vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx); | |||||
std::vector<domi::TaskDef> &task_def_list); | |||||
Status InsertProfilingArTaskBefore(const OpDescPtr &op_desc, std::vector<uint32_t> &all_reduce_nodes, | |||||
uint32_t node_index, std::vector<domi::TaskDef> &task_def_listy, | |||||
bool is_insert_bp_profiling_task); | |||||
Status InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | Status InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point, | ||||
std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index, | ||||
std::vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx); | |||||
std::vector<domi::TaskDef> &task_def_list); | |||||
Status InsertProfilingArTaskAfter(const OpDescPtr &op_desc, std::vector<uint32_t> &all_reduce_nodes, | |||||
uint32_t node_index, std::vector<domi::TaskDef> &task_def_list, | |||||
bool is_insert_bp_profiling_task); | |||||
static bool IsProfPoint(const OpDescPtr &op, const std::string &name); | static bool IsProfPoint(const OpDescPtr &op, const std::string &name); | ||||
@@ -155,6 +161,8 @@ class TaskGenerator { | |||||
Status SetKnownShapeStream(RunContext &run_context, int64_t stream_id); | Status SetKnownShapeStream(RunContext &run_context, int64_t stream_id); | ||||
bool IsSubGraphOfDynamicGraph(const ComputeGraphPtr &graph) const; | |||||
uint8_t *var_mem_base_ = nullptr; | uint8_t *var_mem_base_ = nullptr; | ||||
uint64_t var_mem_size_ = 0; | uint64_t var_mem_size_ = 0; | ||||
}; | }; | ||||
@@ -174,6 +174,38 @@ Status NodeDoneCallback::GetGraphDescInfo(const NodePtr node, const HybridModel | |||||
compute_graph_info = context_->GetProfilingGraphDescInfo(); | compute_graph_info = context_->GetProfilingGraphDescInfo(); | ||||
context_->ClearProfilingGraphDescInfo(); | context_->ClearProfilingGraphDescInfo(); | ||||
auto op_desc = node->GetOpDesc(); | |||||
GE_CHECK_NOTNULL(op_desc); | |||||
for (auto &tmp_compute_graph_info : compute_graph_info) { | |||||
// default | |||||
if (op_desc->GetAllInputsSize() == 0) { | |||||
tmp_compute_graph_info.input_format = { FORMAT_NULL }; | |||||
tmp_compute_graph_info.input_shape = { {0} }; | |||||
tmp_compute_graph_info.input_data_type = { DT_UNDEFINED }; | |||||
} | |||||
for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { | |||||
GeTensorDescPtr input_desc = op_desc->MutableInputDesc(i); | |||||
if (input_desc == nullptr) { | |||||
continue; | |||||
} | |||||
tmp_compute_graph_info.input_format.emplace_back(input_desc->GetFormat()); | |||||
tmp_compute_graph_info.input_shape.emplace_back(input_desc->GetShape().GetDims()); | |||||
tmp_compute_graph_info.input_data_type.emplace_back(input_desc->GetDataType()); | |||||
} | |||||
if (op_desc->GetOutputsSize() == 0) { | |||||
tmp_compute_graph_info.output_format = { FORMAT_NULL }; | |||||
tmp_compute_graph_info.output_shape = { {0} }; | |||||
tmp_compute_graph_info.output_data_type = { DT_UNDEFINED }; | |||||
} | |||||
for (size_t j = 0; j < op_desc->GetOutputsSize(); ++j) { | |||||
GeTensorDesc output_desc = op_desc->GetOutputDesc(j); | |||||
tmp_compute_graph_info.output_format.emplace_back(output_desc.GetFormat()); | |||||
tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); | |||||
tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType()); | |||||
} | |||||
} | |||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
@@ -1608,16 +1608,19 @@ Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, cons | |||||
GE_CHECK_NOTNULL(compute_graph); | GE_CHECK_NOTNULL(compute_graph); | ||||
NodePtr node_ptr = nullptr; | NodePtr node_ptr = nullptr; | ||||
vector<domi::TaskDef> task_def_list; | |||||
map<NodePtr, vector<domi::TaskDef>> node_task_map; | |||||
// create fp node | // create fp node | ||||
bool is_insert_fp_profiling_task = false; | bool is_insert_fp_profiling_task = false; | ||||
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task); | (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task); | ||||
if (is_insert_fp_profiling_task) { | if (is_insert_fp_profiling_task) { | ||||
vector<domi::TaskDef> task_def_list; | |||||
(void)GenerateFpProfilingTask(op_desc, task_def_list); | (void)GenerateFpProfilingTask(op_desc, task_def_list); | ||||
auto fp_desc = MakeShared<OpDesc>(kProfilingFpNode, PROFILINGTRAININGTRACE); | auto fp_desc = MakeShared<OpDesc>(kProfilingFpNode, PROFILINGTRAININGTRACE); | ||||
GE_CHECK_NOTNULL(fp_desc); | GE_CHECK_NOTNULL(fp_desc); | ||||
fp_desc->SetOpKernelLibName(kEngineNameRts); | fp_desc->SetOpKernelLibName(kEngineNameRts); | ||||
node_ptr = compute_graph->AddNode(fp_desc); | node_ptr = compute_graph->AddNode(fp_desc); | ||||
GE_CHECK_NOTNULL(node_ptr); | |||||
node_task_map[node_ptr] = task_def_list; | |||||
GELOGD("Create fp profiling node success before."); | GELOGD("Create fp profiling node success before."); | ||||
} | } | ||||
// creat all reduce start node | // creat all reduce start node | ||||
@@ -1625,6 +1628,7 @@ Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, cons | |||||
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | ||||
bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | ||||
if (is_all_reduce && is_insert_bp_profiling_task) { | if (is_all_reduce && is_insert_bp_profiling_task) { | ||||
vector<domi::TaskDef> task_def_list; | |||||
int64_t log_id = 0; | int64_t log_id = 0; | ||||
(void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | ||||
GELOGD("All reduce node profiling task log id: %ld before", log_id); | GELOGD("All reduce node profiling task log id: %ld before", log_id); | ||||
@@ -1634,18 +1638,24 @@ Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, cons | |||||
GE_CHECK_NOTNULL(ar_desc_start); | GE_CHECK_NOTNULL(ar_desc_start); | ||||
ar_desc_start->SetOpKernelLibName(kEngineNameRts); | ar_desc_start->SetOpKernelLibName(kEngineNameRts); | ||||
node_ptr = compute_graph->AddNode(ar_desc_start); | node_ptr = compute_graph->AddNode(ar_desc_start); | ||||
GE_CHECK_NOTNULL(node_ptr); | |||||
node_task_map[node_ptr] = task_def_list; | |||||
GELOGD("Create all reduce start profiling node success before."); | GELOGD("Create all reduce start profiling node success before."); | ||||
} | } | ||||
if (node_ptr != nullptr) { | |||||
for (const auto &task_def : task_def_list) { | |||||
hybrid_model_.task_defs_[node_ptr].emplace_back(task_def); | |||||
if (!node_task_map.empty()) { | |||||
for (const auto &node_task : node_task_map) { | |||||
NodePtr profiling_node = node_task.first; | |||||
vector<domi::TaskDef> task_def_lists = node_task.second; | |||||
for (const auto &task_def : task_def_lists) { | |||||
hybrid_model_.task_defs_[profiling_node].emplace_back(task_def); | |||||
} | |||||
NodeItem *node_item = nullptr; | |||||
GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(profiling_node, &node_item)); | |||||
node_item->input_start = 0; | |||||
node_item->output_start = 0; | |||||
graph_item.node_items_.emplace_back(node_item); | |||||
} | } | ||||
NodeItem *node_item = nullptr; | |||||
GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node_ptr, &node_item)); | |||||
node_item->input_start = 0; | |||||
node_item->output_start = 0; | |||||
graph_item.node_items_.emplace_back(node_item); | |||||
} else { | } else { | ||||
GELOGD("No need to create profiling node before."); | GELOGD("No need to create profiling node before."); | ||||
} | } | ||||
@@ -1661,12 +1671,13 @@ Status HybridModelBuilder::CreateProfilingNodeAfter(GraphItem &graph_item, const | |||||
GE_CHECK_NOTNULL(compute_graph); | GE_CHECK_NOTNULL(compute_graph); | ||||
NodePtr node_ptr = nullptr; | NodePtr node_ptr = nullptr; | ||||
vector<domi::TaskDef> task_def_list; | |||||
map<NodePtr, vector<domi::TaskDef>> node_task_map; | |||||
// Create all reduce end node | // Create all reduce end node | ||||
bool is_insert_bp_profiling_task = false; | bool is_insert_bp_profiling_task = false; | ||||
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); | ||||
bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE); | ||||
if (is_all_reduce && is_insert_bp_profiling_task) { | if (is_all_reduce && is_insert_bp_profiling_task) { | ||||
vector<domi::TaskDef> task_def_list; | |||||
int64_t log_id = 0; | int64_t log_id = 0; | ||||
(void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id); | ||||
GELOGD("All reduce node profiling task log id: %ld after", log_id); | GELOGD("All reduce node profiling task log id: %ld after", log_id); | ||||
@@ -1676,38 +1687,50 @@ Status HybridModelBuilder::CreateProfilingNodeAfter(GraphItem &graph_item, const | |||||
GE_CHECK_NOTNULL(ar_desc_end); | GE_CHECK_NOTNULL(ar_desc_end); | ||||
ar_desc_end->SetOpKernelLibName(kEngineNameRts); | ar_desc_end->SetOpKernelLibName(kEngineNameRts); | ||||
node_ptr = compute_graph->AddNode(ar_desc_end); | node_ptr = compute_graph->AddNode(ar_desc_end); | ||||
GE_CHECK_NOTNULL(node_ptr); | |||||
node_task_map[node_ptr] = task_def_list; | |||||
GELOGD("Create all reduce end profiling node success after."); | GELOGD("Create all reduce end profiling node success after."); | ||||
} | } | ||||
// create bp node | // create bp node | ||||
if (!is_all_reduce && is_insert_bp_profiling_task) { | if (!is_all_reduce && is_insert_bp_profiling_task) { | ||||
vector<domi::TaskDef> task_def_list; | |||||
(void) GenerateBpProfilingTask(op_desc, task_def_list); | (void) GenerateBpProfilingTask(op_desc, task_def_list); | ||||
auto bp_op_desc = MakeShared<OpDesc>(kProfilingBpNode, PROFILINGTRAININGTRACE); | auto bp_op_desc = MakeShared<OpDesc>(kProfilingBpNode, PROFILINGTRAININGTRACE); | ||||
GE_CHECK_NOTNULL(bp_op_desc); | GE_CHECK_NOTNULL(bp_op_desc); | ||||
bp_op_desc->SetOpKernelLibName(kEngineNameRts); | bp_op_desc->SetOpKernelLibName(kEngineNameRts); | ||||
node_ptr = compute_graph->AddNode(bp_op_desc); | node_ptr = compute_graph->AddNode(bp_op_desc); | ||||
GE_CHECK_NOTNULL(node_ptr); | |||||
node_task_map[node_ptr] = task_def_list; | |||||
GELOGD("Create bp profiling node success after."); | GELOGD("Create bp profiling node success after."); | ||||
} | } | ||||
// create end node | // create end node | ||||
bool is_insert_end_profiling_task = false; | bool is_insert_end_profiling_task = false; | ||||
(void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, is_insert_end_profiling_task); | (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, is_insert_end_profiling_task); | ||||
if (is_insert_end_profiling_task) { | if (is_insert_end_profiling_task) { | ||||
vector<domi::TaskDef> task_def_list; | |||||
(void)GenerateEndProfilingTask(op_desc, task_def_list); | (void)GenerateEndProfilingTask(op_desc, task_def_list); | ||||
auto end_desc = MakeShared<OpDesc>(kProfilingEndNode, PROFILINGTRAININGTRACE); | auto end_desc = MakeShared<OpDesc>(kProfilingEndNode, PROFILINGTRAININGTRACE); | ||||
GE_CHECK_NOTNULL(end_desc); | GE_CHECK_NOTNULL(end_desc); | ||||
end_desc->SetOpKernelLibName(kEngineNameRts); | end_desc->SetOpKernelLibName(kEngineNameRts); | ||||
node_ptr = compute_graph->AddNode(end_desc); | node_ptr = compute_graph->AddNode(end_desc); | ||||
GE_CHECK_NOTNULL(node_ptr); | |||||
node_task_map[node_ptr] = task_def_list; | |||||
GELOGD("Create end profiling node success after."); | GELOGD("Create end profiling node success after."); | ||||
} | } | ||||
if (node_ptr != nullptr) { | |||||
for (const auto &task_def : task_def_list) { | |||||
hybrid_model_.task_defs_[node_ptr].emplace_back(task_def); | |||||
if (!node_task_map.empty()) { | |||||
for (const auto &node_task : node_task_map) { | |||||
NodePtr profiling_node = node_task.first; | |||||
vector<domi::TaskDef> task_def_lists = node_task.second; | |||||
for (const auto &task_def : task_def_lists) { | |||||
hybrid_model_.task_defs_[profiling_node].emplace_back(task_def); | |||||
} | |||||
NodeItem *node_item = nullptr; | |||||
GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(profiling_node, &node_item)); | |||||
node_item->input_start = 0; | |||||
node_item->output_start = 0; | |||||
graph_item.node_items_.emplace_back(node_item); | |||||
} | } | ||||
NodeItem *node_item = nullptr; | |||||
GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node_ptr, &node_item)); | |||||
node_item->input_start = 0; | |||||
node_item->output_start = 0; | |||||
graph_item.node_items_.emplace_back(node_item); | |||||
} else { | } else { | ||||
GELOGD("No need to create profiling node after."); | GELOGD("No need to create profiling node after."); | ||||
} | } | ||||
@@ -554,33 +554,6 @@ Status TaskContext::SaveProfilingGraphDescInfo(uint32_t task_id, uint32_t stream | |||||
tmp_compute_graph_info.model_name = dynamic_model_name; | tmp_compute_graph_info.model_name = dynamic_model_name; | ||||
tmp_compute_graph_info.op_name = op_desc->GetName(); | tmp_compute_graph_info.op_name = op_desc->GetName(); | ||||
tmp_compute_graph_info.op_type = op_desc->GetType(); | tmp_compute_graph_info.op_type = op_desc->GetType(); | ||||
// default | |||||
if (op_desc->GetAllInputsSize() == 0) { | |||||
tmp_compute_graph_info.input_format = { FORMAT_NULL }; | |||||
tmp_compute_graph_info.input_shape = { {0} }; | |||||
tmp_compute_graph_info.input_data_type = { DT_UNDEFINED }; | |||||
} | |||||
for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { | |||||
GeTensorDescPtr input_desc = op_desc->MutableInputDesc(i); | |||||
if (input_desc == nullptr) { | |||||
continue; | |||||
} | |||||
tmp_compute_graph_info.input_format.emplace_back(input_desc->GetFormat()); | |||||
tmp_compute_graph_info.input_shape.emplace_back(input_desc->GetShape().GetDims()); | |||||
tmp_compute_graph_info.input_data_type.emplace_back(input_desc->GetDataType()); | |||||
} | |||||
if (op_desc->GetOutputsSize() == 0) { | |||||
tmp_compute_graph_info.output_format = { FORMAT_NULL }; | |||||
tmp_compute_graph_info.output_shape = { {0} }; | |||||
tmp_compute_graph_info.output_data_type = { DT_UNDEFINED }; | |||||
} | |||||
for (size_t j = 0; j < op_desc->GetOutputsSize(); ++j) { | |||||
GeTensorDesc output_desc = op_desc->GetOutputDesc(j); | |||||
tmp_compute_graph_info.output_format.emplace_back(output_desc.GetFormat()); | |||||
tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); | |||||
tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType()); | |||||
} | |||||
tmp_compute_graph_info.task_id = task_id; | tmp_compute_graph_info.task_id = task_id; | ||||
tmp_compute_graph_info.stream_id = stream_id; | tmp_compute_graph_info.stream_id = stream_id; | ||||
compute_graph_info.emplace_back(tmp_compute_graph_info); | compute_graph_info.emplace_back(tmp_compute_graph_info); | ||||