From ceaec233002ff0e9e90dfeada120c6efdd50d9a8 Mon Sep 17 00:00:00 2001 From: chuxing Date: Thu, 25 Mar 2021 19:55:51 +0800 Subject: [PATCH 1/4] serialize hccl ops --- ge/hybrid/model/hybrid_model_builder.cc | 191 +++++++++++++----- ge/hybrid/model/hybrid_model_builder.h | 8 +- ge/hybrid/model/node_item.cc | 4 + ge/hybrid/model/node_item.h | 2 + .../compiledsubgraph/known_node_executor.cc | 43 ++-- .../compiledsubgraph/known_node_executor.h | 8 +- tests/ut/ge/CMakeLists.txt | 1 + tests/ut/ge/hybrid/ge_hybrid_unittest.cc | 4 +- .../ge/hybrid/known_node_executor_unittest.cc | 62 ++++++ 9 files changed, 244 insertions(+), 79 deletions(-) create mode 100644 tests/ut/ge/hybrid/known_node_executor_unittest.cc diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index a3b1da20..55f3c4dd 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -255,9 +255,7 @@ Status HybridModelBuilder::GetOrCreateNodeItem(const NodePtr &node, NodeItem **n (void) AttrUtils::SetBool(new_node->op_desc, kIsFirstNode, false); (void) AttrUtils::SetBool(new_node->op_desc, kIsLastNode, false); - new_node->node_id = node_index; - new_node->op_desc->SetId(node_index); - node_index += 1; + new_node->node_id = static_cast(new_node->op_desc->GetId()); NodeExecutorManager::ExecutorType executor_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node); new_node->is_profiling_report = (executor_type == NodeExecutorManager::ExecutorType::AICORE) || (executor_type == NodeExecutorManager::ExecutorType::AICPU_TF) || @@ -279,10 +277,10 @@ Status HybridModelBuilder::ParseForceInfershapeNodes(const NodePtr &node, NodeIt } Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const std::vector &dependencies) { - std::set dependent_input_nodes; + std::set dependent_for_shape_inference; + std::set dependent_for_execution; auto &ge_node = node_item.node; - bool is_hccl_op = - NodeExecutorManager::GetInstance().ResolveExecutorType(*ge_node) == NodeExecutorManager::ExecutorType::HCCL; + bool is_hccl_op = node_item.IsHcclOp(); // The input tensors become valid after computation is done for parent nodes of type DEPEND_COMPUTE. // Wait for these parent nodes before execution. @@ -297,29 +295,15 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s auto src_node_item = MutableNodeItem(src_node); GE_CHECK_NOTNULL(src_node_item); - if (is_hccl_op) { - GELOGD("[%s] Add input data dependent node [%s] due to engine type is HCCL", - node_item.NodeName().c_str(), - src_node_item->NodeName().c_str()); - src_node_item->has_observer = true; - node_item.dependents_for_execution.emplace_back(src_node); - node_item.has_observer = true; - for (auto &dst_node : ge_node->GetOutNodes()) { - if (dst_node == nullptr) { - continue; - } - - NodeItem *dst_node_item = nullptr; - GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(dst_node, &dst_node_item)); - dst_node_item->dependents_for_execution.emplace_back(ge_node); - } - } else if (src_node_item->shape_inference_type == DEPEND_COMPUTE) { - GELOGD("[%s] Add input data dependent node [%s] due to inference type = DEPEND_COMPUTE", - node_item.NodeName().c_str(), - src_node_item->NodeName().c_str()); - + if (src_node_item->shape_inference_type == DEPEND_COMPUTE || is_hccl_op || src_node_item->IsHcclOp()) { + GELOGD("[%s](%s) Add input data dependent node [%s](%s), shape inference type = %d", + ge_node->GetName().c_str(), + ge_node->GetType().c_str(), + src_node->GetName().c_str(), + src_node->GetType().c_str(), + static_cast(src_node_item->shape_inference_type)); src_node_item->has_observer = true; - node_item.dependents_for_execution.emplace_back(src_node); + dependent_for_execution.emplace(src_node); } if (src_node_item->shape_inference_type == DEPEND_SHAPE_RANGE) { @@ -327,22 +311,17 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s node_item.NodeName().c_str(), src_node_item->NodeName().c_str()); src_node_item->has_observer = true; - dependent_input_nodes.emplace(src_node); + dependent_for_shape_inference.emplace(src_node); } } // cond or branch need to be prepared before the execution of IF or CASE if (node_item.node_type == IF || node_item.node_type == STATELESSIF || node_item.node_type == CASE) { - const auto &in_anchor = ge_node->GetInDataAnchor(0); - GE_CHECK_NOTNULL(in_anchor); - const auto &peer_anchor = in_anchor->GetPeerOutAnchor(); - GE_CHECK_NOTNULL(peer_anchor); - auto src_node = peer_anchor->GetOwnerNode(); + auto src_node = NodeUtils::GetInDataNodeByIndex(*ge_node, 0); // cond input GE_CHECK_NOTNULL(src_node); auto src_node_item = MutableNodeItem(src_node); GE_CHECK_NOTNULL(src_node_item); - src_node_item->has_observer = true; - node_item.dependents_for_execution.emplace_back(src_node); + dependent_for_execution.emplace(src_node); GELOGD("[%s] Dependent added from %s for control op's cond/branch", node_item.NodeName().c_str(), src_node_item->NodeName().c_str()); @@ -366,24 +345,32 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s GE_CHECK_NOTNULL(src_node); auto src_node_item = MutableNodeItem(src_node); src_node_item->to_const_output_id_list.emplace(peer_out_anchor->GetIdx()); - src_node_item->has_observer = true; - - dependent_input_nodes.emplace(src_node); + dependent_for_shape_inference.emplace(src_node); GELOGD("[%s] Dependent added from output of [%s:%d]", node_item.NodeName().c_str(), src_node_item->NodeName().c_str(), peer_out_anchor->GetIdx()); } - for (const auto &dep_node : dependent_input_nodes) { + GE_CHK_STATUS_RET(ParseDependentForFusedSubgraph(node_item, dependent_for_shape_inference)); + for (const auto &dep_node : dependent_for_shape_inference) { + auto src_node_item = MutableNodeItem(dep_node); + GE_CHECK_NOTNULL(src_node_item); + src_node_item->has_observer = true; node_item.dependents_for_shape_inference.emplace_back(dep_node); } - GE_CHK_STATUS_RET(ParseDependentForFusedSubgraph(node_item)); + for (const auto &dep_node : dependent_for_execution) { + auto src_node_item = MutableNodeItem(dep_node); + GE_CHECK_NOTNULL(src_node_item); + src_node_item->has_observer = true; + node_item.dependents_for_execution.emplace_back(dep_node); + } + return SUCCESS; } -Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item) { +Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item, std::set &dependencies) { if (node_item.fused_subgraph == nullptr) { return SUCCESS; } @@ -413,17 +400,12 @@ Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item) { node_item.NodeName().c_str(), op_desc->GetName().c_str(), src_node_item->NodeName().c_str()); - src_node_item->has_observer = true; src_node_item->to_const_output_id_list.emplace(peer_out_anchor->GetIdx()); - - auto &depends = node_item.dependents_for_shape_inference; - if (std::find(depends.begin(), depends.end(), src_node) == depends.end()) { - depends.emplace_back(src_node); - GELOGD("[%s] Dependent added from output of [%s:%d]", - node_item.NodeName().c_str(), - src_node_item->NodeName().c_str(), - peer_out_anchor->GetIdx()); - } + dependencies.emplace(src_node); + GELOGD("[%s] Dependent added from output of [%s:%d]", + node_item.NodeName().c_str(), + src_node_item->NodeName().c_str(), + peer_out_anchor->GetIdx()); } return SUCCESS; @@ -770,9 +752,23 @@ Status HybridModelBuilder::LoadGraph() { GELOGI("After merging subgraphs DirectNodesSize = %zu, GetAllNodesSize = %zu", root_graph->GetDirectNodesSize(), root_graph->GetAllNodesSize()); - GE_DUMP(root_graph, "hybrid_merged_graph"); } + root_graph_ = root_graph; + // Reset node id by topological order across all subgraphs + int64_t index = 0; + for (const auto &node : root_graph->GetAllNodes()) { + GE_CHECK_NOTNULL(node); + auto parent_graph = node->GetOwnerComputeGraph(); + // No need to update nodes in known subgraph + if (parent_graph != nullptr && !parent_graph->GetGraphUnknownFlag()) { + continue; + } + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + op_desc->SetId(index++); + } + GE_DUMP(root_graph, "hybrid_merged_graph"); GE_CHK_STATUS_RET(LoadDynamicSubgraph(*root_graph, true), "Failed to load root graph."); GELOGD("Done loading root graph successfully."); GE_CHK_STATUS_RET(hybrid_model_.root_graph_item_->GroupNodes(), "Failed to group nodes for root graph"); @@ -810,6 +806,7 @@ Status HybridModelBuilder::LoadGraph() { } } + GE_CHK_STATUS_RET(ParseDependentForHcclNodes(), "Failed to establish dependencies for hccl ops"); GELOGI("Done loading all subgraphs successfully."); return SUCCESS; } @@ -1077,7 +1074,12 @@ Status HybridModelBuilder::InitWeights() { Status HybridModelBuilder::LoadTasks() { GE_CHK_STATUS_RET(CheckAicpuOpList(), "Check Aicpu op failed."); + std::map ordered_node_items; for (auto &it : hybrid_model_.node_items_) { + auto &node_item = it.second; + ordered_node_items.emplace(node_item->node_id, node_item.get()); + } + for (auto &it : ordered_node_items) { auto &node_item = it.second; auto &node_ptr = node_item->node; if (node_item->node_type == NETOUTPUT) { @@ -1905,6 +1907,7 @@ Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root NodeItem *node_item = nullptr; GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node, &node_item)); GE_CHK_STATUS_RET_NOLOG(BuildNodeItem(node, *node_item)); + GE_CHK_STATUS_RET_NOLOG(ParseParallelGroups(node_item)); GE_CHK_STATUS_RET_NOLOG(UpdateAnchorStatus(node)); // needed by FE generate task node_item->input_start = input_start; @@ -2011,5 +2014,87 @@ Status HybridModelBuilder::CheckAicpuOpList() { "Launch check aicpu op type failed."); return SUCCESS; } + +Status HybridModelBuilder::ParseParallelGroups(NodeItem *node_item) { + const auto &node = node_item->node; + auto executor_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node); + if (executor_type == NodeExecutorManager::ExecutorType::HCCL) { + std::string parallel_group; + if (AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, parallel_group)) { + GELOGD("[%s] Got parallel group = %s", node_item->NodeName().c_str(), parallel_group.c_str()); + group_to_nodes_[parallel_group].emplace(node_item); + std::set group{parallel_group}; + node_to_groups_[node_item].emplace(parallel_group); + } + } else if (executor_type == NodeExecutorManager::ExecutorType::COMPILED_SUBGRAPH) { + std::set parallel_groups; + GELOGD("[%s] Parse parallel group for known-shaped subgraph", node_item->NodeName().c_str()); + for (const auto &subgraph_name : node->GetOpDesc()->GetSubgraphInstanceNames()) { + GELOGD("[%s] Start to get parallel group from subgraph: %s", + node_item->NodeName().c_str(), + subgraph_name.c_str()); + auto subgraph = root_graph_->GetSubgraph(subgraph_name); + GE_CHECK_NOTNULL(subgraph); + for (const auto &sub_node : subgraph->GetAllNodes()) { + std::string parallel_group; + if (AttrUtils::GetStr(sub_node->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, parallel_group)) { + GELOGD("[%s::%s] Got parallel group = %s", + subgraph_name.c_str(), + sub_node->GetName().c_str(), + parallel_group.c_str()); + parallel_groups.emplace(parallel_group); + } + } + } + + if (!parallel_groups.empty()) { + for (const auto ¶llel_group : parallel_groups) { + group_to_nodes_[parallel_group].emplace(node_item); + GELOGD("[%s] has parallel group: %s", node_item->NodeName().c_str(), parallel_group.c_str()); + } + node_to_groups_.emplace(node_item, std::move(parallel_groups)); + } + } + + return SUCCESS; +} + +Status HybridModelBuilder::ParseDependentForHcclNodes() { + for (const auto &it : node_to_groups_) { + auto node_item = it.first; + auto dst_engine_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node_item->node); + for (const auto ¶llel_group : it.second) { + auto &dependent_nodes = group_to_nodes_[parallel_group]; + NodeItem *nearest_dep_node = nullptr; + int max_id = -1; + for (auto &dep_node : dependent_nodes) { + auto src_engine_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*dep_node->node); + if (src_engine_type == dst_engine_type) { + continue; + } + + if (dep_node->node_id < node_item->node_id && dep_node->node_id > max_id) { + nearest_dep_node = dep_node; + max_id = dep_node->node_id; + } + } + + if (nearest_dep_node != nullptr) { + GELOGD("Add dependency for nodes of same parallel group[%s], src = [%s], dst = [%s]", + parallel_group.c_str(), + nearest_dep_node->NodeName().c_str(), + node_item->NodeName().c_str()); + auto &deps = node_item->dependents_for_execution; + if (std::find(deps.begin(), deps.end(), nearest_dep_node->node) != deps.end()) { + GELOGD("Already has dependency, skip it"); + continue; + } + nearest_dep_node->has_observer = true; + deps.emplace_back(nearest_dep_node->node); + } + } + } + return SUCCESS; +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/model/hybrid_model_builder.h b/ge/hybrid/model/hybrid_model_builder.h index 313d5ca6..0b91afbe 100644 --- a/ge/hybrid/model/hybrid_model_builder.h +++ b/ge/hybrid/model/hybrid_model_builder.h @@ -63,8 +63,10 @@ class HybridModelBuilder { Status BuildNodeItem(const NodePtr &node, NodeItem &node_item); Status GetOrCreateNodeItem(const NodePtr &node, NodeItem **node_item); Status ParseForceInfershapeNodes(const NodePtr &node, NodeItem &node_item); + Status ParseParallelGroups(NodeItem *node_item); Status ParseDependentInputNodes(NodeItem &node_item, const std::vector &dependencies); - Status ParseDependentForFusedSubgraph(NodeItem &node_item); + Status ParseDependentForFusedSubgraph(NodeItem &node_item, std::set &dependencies); + Status ParseDependentForHcclNodes(); Status IndexTaskDefs(); Status IndexTaskDefs(const ComputeGraphPtr &sub_graph, const GeModelPtr &ge_model); Status IndexSpecialNodes(); @@ -97,12 +99,14 @@ class HybridModelBuilder { NodeItem *MutableNodeItem(const NodePtr &node); GeRootModelPtr ge_root_model_; + ComputeGraphPtr root_graph_; std::map subgraph_models_; std::map constant_op_nodes_; + std::map> group_to_nodes_; + std::map> node_to_groups_; HybridModel &hybrid_model_; std::map>> node_ref_inputs_; - int node_index = 0; RuntimeParam &runtime_param_; VarManager *var_manager_ = nullptr; diff --git a/ge/hybrid/model/node_item.cc b/ge/hybrid/model/node_item.cc index 805064be..06d654cf 100644 --- a/ge/hybrid/model/node_item.cc +++ b/ge/hybrid/model/node_item.cc @@ -251,6 +251,10 @@ bool NodeItem::IsControlOp() const { return ge::hybrid::IsControlOp(op_desc->GetType()); } +bool NodeItem::IsHcclOp() const { + return NodeExecutorManager::GetInstance().ResolveExecutorType(*node) == NodeExecutorManager::ExecutorType::HCCL; +} + std::string NodeItem::DebugString() const { std::stringstream ss; ss << "Node: "; diff --git a/ge/hybrid/model/node_item.h b/ge/hybrid/model/node_item.h index 631dbd9e..474a1da4 100644 --- a/ge/hybrid/model/node_item.h +++ b/ge/hybrid/model/node_item.h @@ -67,6 +67,8 @@ struct NodeItem { bool IsControlOp() const; + bool IsHcclOp() const; + void SetToDynamic(); std::string DebugString() const; diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc index bb96c275..45882343 100755 --- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc +++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc @@ -95,13 +95,6 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) { Status KnownNodeTask::Init(TaskContext &context) { // allocate output mem GE_CHK_STATUS_RET(context.AllocateOutputs(), "known node task allocate output failed."); - - // init davinicmodel - if (!load_flag_) { - davinci_model_->InitRuntimeParams(); - GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), "init variable mem failed."); - } - // allocate mem base void *buffer = nullptr; if (davinci_model_->TotalMemSize() != 0) { @@ -129,23 +122,31 @@ Status KnownNodeTask::Init(TaskContext &context) { void *global_step = context.GetExecutionContext()->global_step; davinci_model_->SetKnownShapeGlobalStep(global_step); } - int32_t device_id = 0; - rtError_t rt_ret = rtGetDevice(&device_id); - if (rt_ret != RT_ERROR_NONE || device_id < 0) { - GELOGE(rt_ret, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); - return RT_ERROR_TO_GE_STATUS(rt_ret); - } - davinci_model_->SetDeviceId(device_id); - GE_CHK_STATUS_RET(davinci_model_->Init(), "KnownNodeExecutor::InitDavinciModel failed."); load_flag_ = true; - } else { - GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(), - davinci_model_->Id(), davinci_model_->SubModelId()), "KnownNodeTask::Init destroy aicpu kernel failed."); } + GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(), + davinci_model_->Id(), davinci_model_->SubModelId()), + "KnownNodeTask::Init destroy aicpu kernel failed."); GELOGI("[%s] KnownNodeExecutor::Init success.", context.GetNodeName()); return SUCCESS; } +Status KnownNodeTask::InitDavinciModel() { + GELOGD("[Init][Model] start"); + davinci_model_->InitRuntimeParams(); + GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), "init variable mem failed"); + int32_t device_id = 0; + GE_CHK_RT_RET(rtGetDevice(&device_id)); + davinci_model_->SetDeviceId(static_cast(device_id)); + GE_CHK_STATUS_RET(DoInitDavinciModel(), "[Init][Model] Failed to init davinci model."); + GELOGD("[Init][Model] success"); + return SUCCESS; +} + +Status KnownNodeTask::DoInitDavinciModel() { + return davinci_model_->Init(); +} + Status KnownNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { GELOGD("[%s] KnownNodeExecutor::PrepareTask in.", context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] Start"); @@ -182,9 +183,11 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node GE_CHK_STATUS_RET(davinci_model->Assign(ge_model), "KnownNodeExecutor::LoadTask davincimodel assign failed."); - task = MakeShared(davinci_model); - GE_CHECK_NOTNULL(task); + auto known_node_task = MakeShared(davinci_model); + GE_CHECK_NOTNULL(known_node_task); + GE_CHK_STATUS_RET_NOLOG(known_node_task->InitDavinciModel()); GELOGI("[%s] KnownNodeExecutor::LoadTask success.", node->GetName().c_str()); + task = std::move(known_node_task); return SUCCESS; } diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h index 6e9740ad..5eed528a 100644 --- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h +++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h @@ -31,11 +31,15 @@ class KnownNodeTask : public NodeTask { : davinci_model_(davinci_model) {} - ~KnownNodeTask() {} + ~KnownNodeTask() = default; Status UpdateArgs(TaskContext &context) override; Status ExecuteAsync(TaskContext &context, std::function done_callback) override; Status Init(TaskContext &context) override; + Status InitDavinciModel(); + + protected: + virtual Status DoInitDavinciModel(); private: std::shared_ptr davinci_model_ = nullptr; bool load_flag_ = false; @@ -47,8 +51,6 @@ class KnownNodeExecutor : public NodeExecutor { Status PrepareTask(NodeTask &task, TaskContext &context) const; Status ExecuteTask(NodeTask &task, TaskContext &context, const std::function &callback) const; ~KnownNodeExecutor() {} - private: - std::shared_ptr davinci_model_ = nullptr; }; } // namespace hybrid } // namespace ge diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt index 3caba788..4c4a72a3 100755 --- a/tests/ut/ge/CMakeLists.txt +++ b/tests/ut/ge/CMakeLists.txt @@ -793,6 +793,7 @@ set(PROFILING_MNG_TEST_FILES set(HYBRID_TEST_FILES "hybrid/ge_hybrid_unittest.cc" + "hybrid/known_node_executor_unittest.cc" ) set(OTHERS_TEST_FILES diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index 3b5d19e6..e5669d15 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -51,7 +51,9 @@ class UtestGeHybrid : public testing::Test { protected: void SetUp() {} - void TearDown() {} + void TearDown() { + NpuMemoryAllocator::allocators_.clear(); + } }; static ge::OpDescPtr CreateOpDesc(string name = "", string type = "") { diff --git a/tests/ut/ge/hybrid/known_node_executor_unittest.cc b/tests/ut/ge/hybrid/known_node_executor_unittest.cc new file mode 100644 index 00000000..67a8e323 --- /dev/null +++ b/tests/ut/ge/hybrid/known_node_executor_unittest.cc @@ -0,0 +1,62 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#define protected public +#define private public +#include "hybrid/node_executor/compiledsubgraph/known_node_executor.h" +#undef private +#undef protected +#include "graph/manager/graph_mem_allocator.h" + +using namespace std; +using namespace testing; +using namespace ge; +using namespace hybrid; + +class UnknownNodeExecutorTest : public testing::Test { + protected: + void SetUp() {} + void TearDown() {} +}; + +namespace { +class KnownNodeTaskMock : public KnownNodeTask { + public: + KnownNodeTaskMock(std::shared_ptr davinci_model): KnownNodeTask(davinci_model) {}; + ~KnownNodeTaskMock() override = default; + MOCK_METHOD0(DoInitDavinciModel, Status()); +}; +} + +TEST_F(UnknownNodeExecutorTest, test_init_davinci_model) { + auto davinci_model = std::make_shared(0, nullptr); + davinci_model->SetDeviceId(0); + davinci_model->SetKnownNode(true); + + auto ge_model = make_shared(); + AttrUtils::SetInt(ge_model, ATTR_MODEL_VAR_SIZE, 0); + AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, 1024); + davinci_model->Assign(ge_model); + + KnownNodeTaskMock mock(davinci_model); + EXPECT_CALL(mock, DoInitDavinciModel).WillOnce(::testing::Return(SUCCESS)); + ASSERT_EQ(mock.InitDavinciModel(), SUCCESS); +} \ No newline at end of file From deebe05906c51155d37faf1ac476a80b899e4b42 Mon Sep 17 00:00:00 2001 From: chuxing Date: Fri, 26 Mar 2021 10:39:45 +0800 Subject: [PATCH 2/4] update UT --- ge/hybrid/model/hybrid_model_builder.cc | 23 ++++---- ge/hybrid/model/hybrid_model_builder.h | 8 +-- tests/ut/ge/hybrid/ge_hybrid_unittest.cc | 72 +++++++++++++++++++++++- 3 files changed, 88 insertions(+), 15 deletions(-) diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index 55f3c4dd..316b94de 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -806,7 +806,7 @@ Status HybridModelBuilder::LoadGraph() { } } - GE_CHK_STATUS_RET(ParseDependentForHcclNodes(), "Failed to establish dependencies for hccl ops"); + GE_CHK_STATUS_RET(ParseDependentByParallelGroup(), "Failed to establish dependencies for hccl ops"); GELOGI("Done loading all subgraphs successfully."); return SUCCESS; } @@ -1907,7 +1907,7 @@ Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root NodeItem *node_item = nullptr; GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node, &node_item)); GE_CHK_STATUS_RET_NOLOG(BuildNodeItem(node, *node_item)); - GE_CHK_STATUS_RET_NOLOG(ParseParallelGroups(node_item)); + GE_CHK_STATUS_RET_NOLOG(CollectParallelGroups(node_item)); GE_CHK_STATUS_RET_NOLOG(UpdateAnchorStatus(node)); // needed by FE generate task node_item->input_start = input_start; @@ -2015,16 +2015,16 @@ Status HybridModelBuilder::CheckAicpuOpList() { return SUCCESS; } -Status HybridModelBuilder::ParseParallelGroups(NodeItem *node_item) { +Status HybridModelBuilder::CollectParallelGroups(NodeItem *node_item) { const auto &node = node_item->node; auto executor_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node); if (executor_type == NodeExecutorManager::ExecutorType::HCCL) { std::string parallel_group; if (AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_PARALLEL_GROUP, parallel_group)) { GELOGD("[%s] Got parallel group = %s", node_item->NodeName().c_str(), parallel_group.c_str()); - group_to_nodes_[parallel_group].emplace(node_item); + parallel_group_to_nodes_[parallel_group].emplace(node_item); std::set group{parallel_group}; - node_to_groups_[node_item].emplace(parallel_group); + node_to_parallel_groups_[node_item].emplace(parallel_group); } } else if (executor_type == NodeExecutorManager::ExecutorType::COMPILED_SUBGRAPH) { std::set parallel_groups; @@ -2049,25 +2049,28 @@ Status HybridModelBuilder::ParseParallelGroups(NodeItem *node_item) { if (!parallel_groups.empty()) { for (const auto ¶llel_group : parallel_groups) { - group_to_nodes_[parallel_group].emplace(node_item); + parallel_group_to_nodes_[parallel_group].emplace(node_item); GELOGD("[%s] has parallel group: %s", node_item->NodeName().c_str(), parallel_group.c_str()); } - node_to_groups_.emplace(node_item, std::move(parallel_groups)); + node_to_parallel_groups_.emplace(node_item, std::move(parallel_groups)); } } return SUCCESS; } -Status HybridModelBuilder::ParseDependentForHcclNodes() { - for (const auto &it : node_to_groups_) { +Status HybridModelBuilder::ParseDependentByParallelGroup() { + for (const auto &it : node_to_parallel_groups_) { auto node_item = it.first; auto dst_engine_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node_item->node); for (const auto ¶llel_group : it.second) { - auto &dependent_nodes = group_to_nodes_[parallel_group]; + auto &dependent_nodes = parallel_group_to_nodes_[parallel_group]; NodeItem *nearest_dep_node = nullptr; int max_id = -1; for (auto &dep_node : dependent_nodes) { + if (node_item == dep_node) { + continue; + } auto src_engine_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*dep_node->node); if (src_engine_type == dst_engine_type) { continue; diff --git a/ge/hybrid/model/hybrid_model_builder.h b/ge/hybrid/model/hybrid_model_builder.h index 0b91afbe..1481d61e 100644 --- a/ge/hybrid/model/hybrid_model_builder.h +++ b/ge/hybrid/model/hybrid_model_builder.h @@ -63,10 +63,10 @@ class HybridModelBuilder { Status BuildNodeItem(const NodePtr &node, NodeItem &node_item); Status GetOrCreateNodeItem(const NodePtr &node, NodeItem **node_item); Status ParseForceInfershapeNodes(const NodePtr &node, NodeItem &node_item); - Status ParseParallelGroups(NodeItem *node_item); + Status CollectParallelGroups(NodeItem *node_item); Status ParseDependentInputNodes(NodeItem &node_item, const std::vector &dependencies); Status ParseDependentForFusedSubgraph(NodeItem &node_item, std::set &dependencies); - Status ParseDependentForHcclNodes(); + Status ParseDependentByParallelGroup(); Status IndexTaskDefs(); Status IndexTaskDefs(const ComputeGraphPtr &sub_graph, const GeModelPtr &ge_model); Status IndexSpecialNodes(); @@ -102,8 +102,8 @@ class HybridModelBuilder { ComputeGraphPtr root_graph_; std::map subgraph_models_; std::map constant_op_nodes_; - std::map> group_to_nodes_; - std::map> node_to_groups_; + std::map> parallel_group_to_nodes_; + std::map> node_to_parallel_groups_; HybridModel &hybrid_model_; std::map>> node_ref_inputs_; diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index e5669d15..2166b274 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -19,10 +19,12 @@ #include #include "runtime/rt.h" +#include "graph/utils/node_utils.h" #define protected public #define private public #include "hybrid/model/hybrid_model_builder.h" #include "hybrid/model/hybrid_model.h" +#include "hybrid/node_executor/node_executor.h" #include "model/ge_model.h" #include "model/ge_root_model.h" #include "hybrid/node_executor/aicore/aicore_op_task.h" @@ -247,7 +249,7 @@ TEST_F(UtestGeHybrid, init_weight_success) { ASSERT_EQ(ret,PARAM_INVALID); } - TEST_F(UtestGeHybrid, hybrid_model_executor) { +TEST_F(UtestGeHybrid, hybrid_model_executor) { ComputeGraphPtr compute_graph = MakeShared("abc"); GeRootModelPtr root_model = MakeShared(compute_graph); HybridModel model(root_model); @@ -258,3 +260,71 @@ TEST_F(UtestGeHybrid, init_weight_success) { HybridModelExecutor executor(model_ptr, device_id, stream); executor.Init(); } + +TEST_F(UtestGeHybrid, test_parse_parallel_group) { + NodeExecutorManager::GetInstance().engine_mapping_.emplace("ops_kernel_info_hccl", + NodeExecutorManager::ExecutorType::HCCL); + ComputeGraphPtr compute_graph = MakeShared("test"); + OpDescPtr op_desc = CreateOpDesc("AllReduce", "AllReduce"); + op_desc->SetId(0); + ge::AttrUtils::SetStr(op_desc, ATTR_NAME_PARALLEL_GROUP, "group_1"); + auto node = compute_graph->AddNode(op_desc); + std::unique_ptr node_item; + NodeItem::Create(node, node_item); + node_item->node_id = 0; + + op_desc->SetOpKernelLibName("ops_kernel_info_hccl"); + GeRootModelPtr root_model = MakeShared(compute_graph); + HybridModel model(root_model); + + HybridModelBuilder builder(model); + builder.root_graph_ = compute_graph; + ASSERT_EQ(builder.CollectParallelGroups(node_item.get()), SUCCESS); + + ASSERT_EQ(builder.node_to_parallel_groups_.size(), 1); + ASSERT_EQ(builder.parallel_group_to_nodes_.size(), 1); + + OpDescPtr op_desc_1 = CreateOpDesc("subgraph", "PartitionedCall"); + op_desc_1->AddSubgraphName("subgraph"); + auto node_1 = compute_graph->AddNode(op_desc_1); + + ComputeGraphPtr subgraph = MakeShared("subgraph"); + ASSERT_EQ(NodeUtils::SetSubgraph(*node_1, 0, subgraph), GRAPH_SUCCESS); + + std::unique_ptr node_item_1; + NodeItem::Create(node_1, node_item_1); + node_item_1->node_id = 1; + + ASSERT_EQ(builder.CollectParallelGroups(node_item_1.get()), SUCCESS); + ASSERT_EQ(builder.node_to_parallel_groups_.size(), 1); + ASSERT_EQ(builder.parallel_group_to_nodes_.size(), 1); + + OpDescPtr op_desc_2 = CreateOpDesc("sub_node_1", "AllReduce"); + ge::AttrUtils::SetStr(op_desc_2, ATTR_NAME_PARALLEL_GROUP, "group_1"); + auto node_2 = subgraph->AddNode(op_desc_2); + ASSERT_TRUE(node_2 != nullptr); + + OpDescPtr op_desc_3 = CreateOpDesc("sub_node_2", "AllReduce2"); + ge::AttrUtils::SetStr(op_desc_3, ATTR_NAME_PARALLEL_GROUP, "group_2"); + auto node_3 = subgraph->AddNode(op_desc_3); + ASSERT_TRUE(node_3 != nullptr); + + ASSERT_EQ(builder.CollectParallelGroups(node_item_1.get()), SUCCESS); + ASSERT_EQ(builder.node_to_parallel_groups_.size(), 2); + ASSERT_EQ(builder.parallel_group_to_nodes_.size(), 2); + ASSERT_EQ(builder.parallel_group_to_nodes_["group_1"].size(), 2); + ASSERT_EQ(builder.parallel_group_to_nodes_["group_2"].size(), 1); + + ASSERT_FALSE(node_item->has_observer); + ASSERT_TRUE(node_item_1->dependents_for_execution.empty()); + ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); + ASSERT_TRUE(node_item->has_observer); + ASSERT_EQ(node_item_1->dependents_for_execution.size(), 1); + ASSERT_EQ(node_item_1->dependents_for_execution[0], node); + + // repeat parse + ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); + ASSERT_TRUE(node_item->has_observer); + ASSERT_EQ(node_item_1->dependents_for_execution.size(), 1); + ASSERT_EQ(node_item_1->dependents_for_execution[0], node); +} \ No newline at end of file From 795a935d34b3d296dd64efd456d16ed72fc15561 Mon Sep 17 00:00:00 2001 From: chuxing Date: Fri, 26 Mar 2021 20:40:56 +0800 Subject: [PATCH 3/4] fix Profiling --- ge/hybrid/model/hybrid_model_builder.cc | 37 +++++++++++++++---------- ge/hybrid/model/hybrid_model_builder.h | 1 + 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index 316b94de..dfd6ac6b 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -1072,30 +1072,39 @@ Status HybridModelBuilder::InitWeights() { return SUCCESS; } +Status HybridModelBuilder::LoadTask(NodeItem &node_item) { + auto &node_ptr = node_item.node; + GELOGD("[%s] Start to build kernel task", node_ptr->GetName().c_str()); + auto load_ret = node_item.node_executor->LoadTask(hybrid_model_, + node_ptr, + node_item.kernel_task); + if (load_ret != UNSUPPORTED && load_ret != SUCCESS) { + GELOGE(load_ret, "[%s] Failed to load task", node_ptr->GetName().c_str()); + return load_ret; + } + + GELOGD("[%s] Done loading task successfully.", node_ptr->GetName().c_str()); + return SUCCESS; +} + Status HybridModelBuilder::LoadTasks() { GE_CHK_STATUS_RET(CheckAicpuOpList(), "Check Aicpu op failed."); - std::map ordered_node_items; + std::map ordered_partitioned_calls; for (auto &it : hybrid_model_.node_items_) { - auto &node_item = it.second; - ordered_node_items.emplace(node_item->node_id, node_item.get()); - } - for (auto &it : ordered_node_items) { auto &node_item = it.second; auto &node_ptr = node_item->node; if (node_item->node_type == NETOUTPUT) { continue; } - - GELOGD("[%s] Start to build kernel task", node_ptr->GetName().c_str()); - auto load_ret = node_item->node_executor->LoadTask(hybrid_model_, - node_ptr, - node_item->kernel_task); - if (load_ret != UNSUPPORTED && load_ret != SUCCESS) { - GELOGE(load_ret, "[%s] Failed to load task", node_ptr->GetName().c_str()); - return load_ret; + if (node_item->node_type == PARTITIONEDCALL) { + ordered_partitioned_calls.emplace(node_item->node_id, node_item.get()); } + GE_CHK_STATUS_RET_NOLOG(LoadTask(*node_item)); + } - GELOGD("[%s] Done loading task successfully.", node_ptr->GetName().c_str()); + // HCCL operators need to be loaded in the same order across different processes + for (auto &it : ordered_partitioned_calls) { + GE_CHK_STATUS_RET_NOLOG(LoadTask(*it.second)); } return SUCCESS; diff --git a/ge/hybrid/model/hybrid_model_builder.h b/ge/hybrid/model/hybrid_model_builder.h index 1481d61e..a59a282a 100644 --- a/ge/hybrid/model/hybrid_model_builder.h +++ b/ge/hybrid/model/hybrid_model_builder.h @@ -57,6 +57,7 @@ class HybridModelBuilder { Status ValidateParams(); Status LoadGraph(); Status LoadGeModel(ComputeGraph &graph, const GeModelPtr &ge_model); + Status LoadTask(NodeItem &node_item); Status LoadTasks(); Status IdentifyVariableOutputs(NodeItem &node_item); Status IdentifySameInputs(NodeItem &node_item); From 91d57d07e385b01b863d8920a85f52c8bf42b89b Mon Sep 17 00:00:00 2001 From: chuxing Date: Fri, 26 Mar 2021 20:58:11 +0800 Subject: [PATCH 4/4] fix sc --- ge/hybrid/model/hybrid_model_builder.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index dfd6ac6b..f1f28010 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -1092,7 +1092,6 @@ Status HybridModelBuilder::LoadTasks() { std::map ordered_partitioned_calls; for (auto &it : hybrid_model_.node_items_) { auto &node_item = it.second; - auto &node_ptr = node_item->node; if (node_item->node_type == NETOUTPUT) { continue; }