2. If a broadcast node has more than one input and an input comes from a variable, add a memcpy node between them. Delete the step that moves variable data to the broadcast input during davinci model run. pull/277/head
| @@ -425,6 +425,13 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) { | |||||
| atomic_addr_clean_id_ = node_op_desc->GetId(); | atomic_addr_clean_id_ = node_op_desc->GetId(); | ||||
| } | } | ||||
| // if input size just one, no need to reassign continuous memory | |||||
| bool is_input_continuous = false; | |||||
| (void)ge::AttrUtils::GetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); | |||||
| if (is_input_continuous && (node_op_desc->GetInputSize() <= 1)) { | |||||
| (void)ge::AttrUtils::SetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true); | |||||
| } | |||||
| for (auto &out_anchor : n->GetAllOutDataAnchors()) { | for (auto &out_anchor : n->GetAllOutDataAnchors()) { | ||||
| GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); | GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); | ||||
| bool reuse_input = false; | bool reuse_input = false; | ||||
| @@ -928,6 +935,13 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec | |||||
| GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); | GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); | ||||
| auto node_op_desc = n->GetOpDesc(); | auto node_op_desc = n->GetOpDesc(); | ||||
| GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); | GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); | ||||
| // if output size just one, no need to reassign continuous memory | |||||
| if (node_op_desc->GetOutputsSize() == 1) { | |||||
| zero_memory_list_.emplace_back(n, kOutput, 0); | |||||
| return nullptr; | |||||
| } | |||||
| MemoryBlock *block = nullptr; | MemoryBlock *block = nullptr; | ||||
| int64_t total_size = 0; | int64_t total_size = 0; | ||||
| int64_t memory_type = RT_MEMORY_HBM; | int64_t memory_type = RT_MEMORY_HBM; | ||||
| @@ -1746,9 +1760,8 @@ Status BlockMemAssigner::Assign() { | |||||
| bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const { | bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const { | ||||
| return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) || | return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) || | ||||
| (node_type == HCOMBROADCAST) || (node_type == CONSTANTOP) || | |||||
| (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || (node_type == ASSIGN) || (node_type == HVDWAIT) || | |||||
| (node_type == HVDCALLBACKBROADCAST); | |||||
| (node_type == CONSTANTOP) || (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || | |||||
| (node_type == ASSIGN) || (node_type == HVDWAIT); | |||||
| } | } | ||||
| bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) { | bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) { | ||||
| @@ -1993,12 +1993,6 @@ Status DavinciModel::SyncVarData() { | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | RT_MEMCPY_HOST_TO_DEVICE)); | ||||
| } | } | ||||
| for (auto op_desc : variable_op_list_) { | |||||
| ret = | |||||
| VarManager::Instance(session_id_)->SyncVarData(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_); | |||||
| GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_, | |||||
| op_desc->GetName().c_str()); | |||||
| } | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -37,6 +37,12 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { | |||||
| auto op_desc = node->GetOpDesc(); | auto op_desc = node->GetOpDesc(); | ||||
| GE_IF_BOOL_EXEC(op_desc == nullptr, continue); | GE_IF_BOOL_EXEC(op_desc == nullptr, continue); | ||||
| Status ret = ProcessBroadcastMemcpy(graph, node); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "failed ProcessBroadcastMemcpy."); | |||||
| return ret; | |||||
| } | |||||
| bool node_input_mutable = false; | bool node_input_mutable = false; | ||||
| if (!AttrUtils::HasAttr(op_desc, kInputMutable)) { | if (!AttrUtils::HasAttr(op_desc, kInputMutable)) { | ||||
| continue; | continue; | ||||
| @@ -61,7 +67,7 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { | |||||
| // Memcpyasync needs to be inserted between constant (/data) and hcomallreduce to avoid constant being cleared. | // Memcpyasync needs to be inserted between constant (/data) and hcomallreduce to avoid constant being cleared. | ||||
| NodePtr src_node = src_out_anchor->GetOwnerNode(); | NodePtr src_node = src_out_anchor->GetOwnerNode(); | ||||
| std::string src_type = src_node->GetType(); | std::string src_type = src_node->GetType(); | ||||
| bool check_src_type = (src_type == CONSTANTOP) || (src_type == DATA) || (src_type == CONSTANT); | |||||
| bool check_src_type = (src_type == CONSTANTOP) || (src_type == VARIABLE) || (src_type == DATA) || (src_type == CONSTANT); | |||||
| if (check_src_type) { | if (check_src_type) { | ||||
| Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | ||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| @@ -82,6 +88,44 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| // If broadcast input size is bigger than 1, and input from variable, | |||||
| // cause by broadcast input memory should be continuous, | |||||
| // another featuremap mem will be allocated for broadcast input. | |||||
| // In this condition, move data from variable mem to broadcast input featuremap mem will be executed each step. | |||||
| // In order to avoid move action out of model, use memcpy node instead of move action code. | |||||
| Status HcclMemcpyPass::ProcessBroadcastMemcpy(const ComputeGraphPtr &graph, const NodePtr node) { | |||||
| auto op_desc = node->GetOpDesc(); | |||||
| if (op_desc == nullptr) { | |||||
| GELOGE(INTERNAL_ERROR, "node has no op_desc, node_name : %s.", node->GetName().c_str()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| if ((node->GetType() == HCOMBROADCAST || node->GetType() == HVDCALLBACKBROADCAST) && op_desc->GetInputSize() > 1) { | |||||
| for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) { | |||||
| if (hccl_in_anchor == nullptr) { | |||||
| continue; | |||||
| } | |||||
| auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); | |||||
| if (src_out_anchor == nullptr) { | |||||
| GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| NodePtr src_node = src_out_anchor->GetOwnerNode(); | |||||
| std::string src_type = src_node->GetType(); | |||||
| bool check_src_type = (src_type == CONSTANTOP) || (src_type == VARIABLE) || (src_type == DATA) || (src_type == CONSTANT); | |||||
| if (check_src_type) { | |||||
| Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | |||||
| return ret; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| /// | /// | ||||
| /// @brief Add MemcpyAsync Node | /// @brief Add MemcpyAsync Node | ||||
| /// @param [in] ge::ComputeGraphPtr graph | /// @param [in] ge::ComputeGraphPtr graph | ||||
| @@ -37,6 +37,8 @@ class HcclMemcpyPass : public GraphPass { | |||||
| Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | ||||
| const InDataAnchorPtr &hccl_in_anchor); | const InDataAnchorPtr &hccl_in_anchor); | ||||
| Status ProcessBroadcastMemcpy(const ComputeGraphPtr &graph, const NodePtr node); | |||||
| std::unordered_map<std::string, uint32_t> node_num_map_; | std::unordered_map<std::string, uint32_t> node_num_map_; | ||||
| }; | }; | ||||
| } // namespace ge | } // namespace ge | ||||