| @@ -318,7 +318,11 @@ void MemoryBlock::AddDependLifeBegin(DependStreamLife &total_node_depend_stream_ | |||||
| AddDependLife(node, node, stream_id_, depend_stream_life_, total_node_depend_stream_life); | AddDependLife(node, node, stream_id_, depend_stream_life_, total_node_depend_stream_life); | ||||
| } | } | ||||
| } | } | ||||
| depend_stream_life_[stream_id_] = GetLifeBegin(); | |||||
| // a block that is not same-stream can't be reused by life time directly; it should be reused by dependence | |||||
| if (same_stream_) { | |||||
| depend_stream_life_[stream_id_] = GetLifeBegin(); | |||||
| } | |||||
| } | } | ||||
| size_t MemoryBlock::GetLifeEnd() { | size_t MemoryBlock::GetLifeEnd() { | ||||
| @@ -415,6 +419,15 @@ BlockMemAssigner::~BlockMemAssigner() { | |||||
| } | } | ||||
| } | } | ||||
| void BlockMemAssigner::MarkContinuousAllocedForOneInput(OpDescPtr &node_op_desc) { | |||||
| // if the node has at most one input, there is no need to reassign continuous memory | |||||
| bool is_input_continuous = false; | |||||
| (void)ge::AttrUtils::GetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); | |||||
| if (is_input_continuous && (node_op_desc->GetInputsSize() <= 1)) { | |||||
| (void)ge::AttrUtils::SetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true); | |||||
| } | |||||
| } | |||||
| void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) { | void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) { | ||||
| vector<int64_t> temp; | vector<int64_t> temp; | ||||
| for (const NodePtr &n : compute_graph_->GetAllNodes()) { | for (const NodePtr &n : compute_graph_->GetAllNodes()) { | ||||
| @@ -425,6 +438,8 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) { | |||||
| atomic_addr_clean_id_ = node_op_desc->GetId(); | atomic_addr_clean_id_ = node_op_desc->GetId(); | ||||
| } | } | ||||
| MarkContinuousAllocedForOneInput(node_op_desc); | |||||
| for (auto &out_anchor : n->GetAllOutDataAnchors()) { | for (auto &out_anchor : n->GetAllOutDataAnchors()) { | ||||
| GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); | GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); | ||||
| bool reuse_input = false; | bool reuse_input = false; | ||||
| @@ -815,14 +830,21 @@ bool BlockMemAssigner::IsContinuousOutput(const NodePtr &n) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| // Get the continuous output type of the node, default is false | |||||
| bool is_output_continuous = false; | |||||
| auto node_desc = n->GetOpDesc(); | auto node_desc = n->GetOpDesc(); | ||||
| if (node_desc == nullptr) { | if (node_desc == nullptr) { | ||||
| GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str()); | GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str()); | ||||
| return false; | return false; | ||||
| } | } | ||||
| // if there is just one output, there is no need to reassign continuous memory | |||||
| if (node_desc->GetOutputsSize() == 1) { | |||||
| GELOGI("op %s output size is one, no need to continuous process.", n->GetName().c_str()); | |||||
| return false; | |||||
| } | |||||
| // Get the continuous output type of the node, default is false | |||||
| bool is_output_continuous = false; | |||||
| // If GetBool fail, is_output_continuous is false. | // If GetBool fail, is_output_continuous is false. | ||||
| (void)ge::AttrUtils::GetBool(node_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous); | (void)ge::AttrUtils::GetBool(node_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous); | ||||
| if (is_output_continuous) { | if (is_output_continuous) { | ||||
| @@ -928,6 +950,7 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec | |||||
| GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); | GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); | ||||
| auto node_op_desc = n->GetOpDesc(); | auto node_op_desc = n->GetOpDesc(); | ||||
| GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); | GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); | ||||
| MemoryBlock *block = nullptr; | MemoryBlock *block = nullptr; | ||||
| int64_t total_size = 0; | int64_t total_size = 0; | ||||
| int64_t memory_type = RT_MEMORY_HBM; | int64_t memory_type = RT_MEMORY_HBM; | ||||
| @@ -1111,15 +1134,21 @@ bool IsKnownSubgraphData(const NodePtr &node) { | |||||
| return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX); | return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX); | ||||
| } | } | ||||
| void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory) { | |||||
| void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory, | |||||
| bool same_stream) { | |||||
| GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null."); | GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null."); | ||||
| GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory"); | GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory"); | ||||
| GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory"); | GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory"); | ||||
| --to_release->ref_count_; | --to_release->ref_count_; | ||||
| if (!same_stream) { | |||||
| to_release->same_stream_ = false; | |||||
| } | |||||
| if (to_release->ref_count_ == 0) { | if (to_release->ref_count_ == 0) { | ||||
| to_release->SetLifeTimeEnd(life_time_); | to_release->SetLifeTimeEnd(life_time_); | ||||
| reusable_memory.emplace_back(to_release); | |||||
| AddReusableBlockCount(*to_release, reusable_block_counts_); | |||||
| if (to_release->same_stream_) { | |||||
| reusable_memory.emplace_back(to_release); | |||||
| AddReusableBlockCount(*to_release, reusable_block_counts_); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -1159,10 +1188,9 @@ void BlockMemAssigner::ReleaseInputNodeOutMemory(const unordered_map<string, vec | |||||
| node_type_indexs.back().node->GetName().c_str()); | node_type_indexs.back().node->GetName().c_str()); | ||||
| if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) && | if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) && | ||||
| (node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx())) && | |||||
| (node->GetOpDesc()->GetStreamId() == block->stream_id_)) { | |||||
| ReleaseMemory(block, reusable_memory); | |||||
| if (block->ref_count_ == 0) { | |||||
| (node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx()))) { | |||||
| ReleaseMemory(block, reusable_memory, (node->GetOpDesc()->GetStreamId() == block->stream_id_)); | |||||
| if (block->ref_count_ == 0 && block->same_stream_) { | |||||
| SetLastUsedInputMemAttr(node, in_anchor->GetIdx()); | SetLastUsedInputMemAttr(node, in_anchor->GetIdx()); | ||||
| } | } | ||||
| } | } | ||||
| @@ -1682,10 +1710,10 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block, | |||||
| op_desc->SetWorkspace(workspace_list); | op_desc->SetWorkspace(workspace_list); | ||||
| } | } | ||||
| GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]" | GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]" | ||||
| " noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d] isref[%d].", graph_name.c_str(), | |||||
| " noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d].", graph_name.c_str(), | |||||
| op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(), | op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(), | ||||
| block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block, block->reuse_mem_, | block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block, block->reuse_mem_, | ||||
| block->continuous_block_, block->deleted_block_, node_type.ref_input); | |||||
| block->continuous_block_, block->deleted_block_, block->same_stream_, node_type.ref_input); | |||||
| } | } | ||||
| void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) { | void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) { | ||||
| @@ -1746,9 +1774,8 @@ Status BlockMemAssigner::Assign() { | |||||
| bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const { | bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const { | ||||
| return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) || | return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) || | ||||
| (node_type == HCOMBROADCAST) || (node_type == CONSTANTOP) || | |||||
| (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || (node_type == ASSIGN) || (node_type == HVDWAIT) || | |||||
| (node_type == HVDCALLBACKBROADCAST); | |||||
| (node_type == CONSTANTOP) || (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || | |||||
| (node_type == ASSIGN) || (node_type == HVDWAIT); | |||||
| } | } | ||||
| bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) { | bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) { | ||||
| @@ -65,6 +65,7 @@ class MemoryBlock { | |||||
| stream_id_(stream_id), | stream_id_(stream_id), | ||||
| deleted_block_(false), | deleted_block_(false), | ||||
| reuse_mem_(reuse_mem), | reuse_mem_(reuse_mem), | ||||
| same_stream_(true), | |||||
| input_index_(0), | input_index_(0), | ||||
| continuous_block_(false), | continuous_block_(false), | ||||
| first_continuous_block_(false), | first_continuous_block_(false), | ||||
| @@ -142,6 +143,7 @@ class MemoryBlock { | |||||
| int64_t stream_id_; | int64_t stream_id_; | ||||
| bool deleted_block_; | bool deleted_block_; | ||||
| bool reuse_mem_; | bool reuse_mem_; | ||||
| bool same_stream_; | |||||
| uint32_t input_index_; | uint32_t input_index_; | ||||
| bool continuous_block_; | bool continuous_block_; | ||||
| bool first_continuous_block_; | bool first_continuous_block_; | ||||
| @@ -353,7 +355,7 @@ class BlockMemAssigner : public MemAssigner { | |||||
| /// @return void | /// @return void | ||||
| /// @author | /// @author | ||||
| /// | /// | ||||
| void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory); | |||||
| void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory, bool same_stream = true); | |||||
| /// | /// | ||||
| /// @ingroup GE | /// @ingroup GE | ||||
| @@ -409,6 +411,8 @@ class BlockMemAssigner : public MemAssigner { | |||||
| MemoryBlock *ApplyContinuousMemory(const NodePtr &n, const vector<int64_t> &ranges, const bool is_op_reuse_mem); | MemoryBlock *ApplyContinuousMemory(const NodePtr &n, const vector<int64_t> &ranges, const bool is_op_reuse_mem); | ||||
| void MarkContinuousAllocedForOneInput(OpDescPtr &node_op_desc); | |||||
| std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> reusable_blocks_; | std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> reusable_blocks_; | ||||
| std::map<std::string, uint64_t> reusable_block_counts_; | std::map<std::string, uint64_t> reusable_block_counts_; | ||||
| @@ -1993,12 +1993,6 @@ Status DavinciModel::SyncVarData() { | |||||
| RT_MEMCPY_HOST_TO_DEVICE)); | RT_MEMCPY_HOST_TO_DEVICE)); | ||||
| } | } | ||||
| for (auto op_desc : variable_op_list_) { | |||||
| ret = | |||||
| VarManager::Instance(session_id_)->SyncVarData(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_); | |||||
| GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_, | |||||
| op_desc->GetName().c_str()); | |||||
| } | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -1997,6 +1997,8 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { | |||||
| new (std::nothrow) TransOpWithoutReshapeFusionPass)) | new (std::nothrow) TransOpWithoutReshapeFusionPass)) | ||||
| GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::TransOpBreadthFusionPass", | GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::TransOpBreadthFusionPass", | ||||
| new (std::nothrow) TransOpBreadthFusionPass)) | new (std::nothrow) TransOpBreadthFusionPass)) | ||||
| GE_CHK_STATUS_RET( | |||||
| after_merge_passes.AddPass("OptimizeStage1_1::HcclMemcpyPass", new (std::nothrow) HcclMemcpyPass)); | |||||
| GE_TIMESTAMP_START(after_merge_passes); | GE_TIMESTAMP_START(after_merge_passes); | ||||
| auto ret = after_merge_passes.Run(compute_graph); | auto ret = after_merge_passes.Run(compute_graph); | ||||
| @@ -32,46 +32,152 @@ const char *const kInputMutable = "_input_mutable"; | |||||
| } // namespace | } // namespace | ||||
| namespace ge { | namespace ge { | ||||
| Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { | Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { | ||||
| Status ret = SUCCESS; | |||||
| GE_IF_BOOL_EXEC(graph == nullptr, GELOGE(PARAM_INVALID, "param [graph] must not be null."); return PARAM_INVALID); | GE_IF_BOOL_EXEC(graph == nullptr, GELOGE(PARAM_INVALID, "param [graph] must not be null."); return PARAM_INVALID); | ||||
| for (const auto &node : graph->GetDirectNode()) { | for (const auto &node : graph->GetDirectNode()) { | ||||
| auto op_desc = node->GetOpDesc(); | auto op_desc = node->GetOpDesc(); | ||||
| GE_IF_BOOL_EXEC(op_desc == nullptr, continue); | |||||
| if (op_desc == nullptr) { | |||||
| GELOGE(INTERNAL_ERROR, "node has no op_desc, node_name : %s.", node->GetName().c_str()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| ret = ContinuousInputProcess(graph, node); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "failed ProcessBroadcastMemcpy, node_name:%s.", node->GetName().c_str()); | |||||
| return ret; | |||||
| } | |||||
| ret = MutableInputProcess(graph, node); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "failed MutableInputProcess, node_name:%s.", node->GetName().c_str()); | |||||
| return ret; | |||||
| } | |||||
| ret = P2pmemInputProcess(graph, node); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "failed P2pmemInputProcess, node_name:%s.", node->GetName().c_str()); | |||||
| return ret; | |||||
| } | |||||
| } | |||||
| return ret; | |||||
| } | |||||
| // If a node has the _input_mutable attr, its input memory may be modified while the op executes. | |||||
| // To keep another op that shares the same input from being affected by that modification, | |||||
| // a memcpy node needs to be inserted in between. | |||||
| // This also applies when the input is a variable or a const. | |||||
| Status HcclMemcpyPass::MutableInputProcess(const ComputeGraphPtr &graph, const NodePtr node) { | |||||
| auto op_desc = node->GetOpDesc(); | |||||
| bool node_input_mutable = false; | |||||
| if (!AttrUtils::HasAttr(op_desc, kInputMutable)) { | |||||
| return SUCCESS; | |||||
| } | |||||
| if (!AttrUtils::GetBool(op_desc, kInputMutable, node_input_mutable)) { | |||||
| GELOGE(INTERNAL_ERROR, "node:%s get attr:_input_mutable failed.", node->GetName().c_str()); | |||||
| return FAILED; | |||||
| } | |||||
| if (!node_input_mutable) { | |||||
| return SUCCESS; | |||||
| } | |||||
| bool node_input_mutable = false; | |||||
| if (!AttrUtils::HasAttr(op_desc, kInputMutable)) { | |||||
| GELOGI("input mutable hcom op is:%s.", op_desc->GetName().c_str()); | |||||
| for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) { | |||||
| if (hccl_in_anchor == nullptr) { | |||||
| continue; | continue; | ||||
| } | } | ||||
| auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); | |||||
| GE_CHECK_NOTNULL(src_out_anchor); | |||||
| GE_IF_BOOL_EXEC(!AttrUtils::GetBool(op_desc, kInputMutable, node_input_mutable), | |||||
| GELOGE(INTERNAL_ERROR, "node:%s get attr:_input_mutable failed.", node->GetName().c_str()); return FAILED); | |||||
| if (!node_input_mutable) { | |||||
| int32_t src_out_anchor_size = src_out_anchor->GetPeerInDataAnchors().size(); | |||||
| if (src_out_anchor_size == kAnchorSize) { | |||||
| // Identity needs to be inserted between constant (/data) and hcomallreduce to avoid constant being cleared. | |||||
| if (IsDataNode(src_out_anchor->GetOwnerNode()->GetType())) { | |||||
| Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | |||||
| return ret; | |||||
| } | |||||
| } | |||||
| continue; | continue; | ||||
| } | } | ||||
| GELOGI("hcom op is:%s.", op_desc->GetName().c_str()); | |||||
| Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | |||||
| return ret; | |||||
| } | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| // If a broadcast op has more than one input and the inputs come from variables, | |||||
| // then, because broadcast input memory must be continuous, | |||||
| // separate featuremap memory is allocated for the broadcast input. | |||||
| // In that case, data is moved from variable memory to the broadcast input featuremap memory on every step. | |||||
| // To avoid performing that move outside the model, a memcpy node is used instead of move code. | |||||
| Status HcclMemcpyPass::ContinuousInputProcess(const ComputeGraphPtr &graph, const NodePtr node) { | |||||
| auto op_desc = node->GetOpDesc(); | |||||
| bool is_input_continuous = false; | |||||
| (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); | |||||
| if (is_input_continuous && op_desc->GetInputsSize() > 1) { | |||||
| GELOGI("continuous input op is:%s.", op_desc->GetName().c_str()); | |||||
| // if input size is bigger than one, insert memcpy between the var/data input and this node to support continuous mem alloc | |||||
| for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) { | for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) { | ||||
| if (hccl_in_anchor == nullptr) { | if (hccl_in_anchor == nullptr) { | ||||
| continue; | continue; | ||||
| } | } | ||||
| auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); | auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); | ||||
| GE_CHECK_NOTNULL(src_out_anchor); | |||||
| int32_t src_out_anchor_size = src_out_anchor->GetPeerInDataAnchors().size(); | |||||
| if (src_out_anchor_size == kAnchorSize) { | |||||
| // Memcpyasync needs to be inserted between constant (/data) and hcomallreduce to avoid constant being cleared. | |||||
| NodePtr src_node = src_out_anchor->GetOwnerNode(); | |||||
| std::string src_type = src_node->GetType(); | |||||
| bool check_src_type = (src_type == CONSTANTOP) || (src_type == DATA) || (src_type == CONSTANT); | |||||
| if (check_src_type) { | |||||
| Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | |||||
| return ret; | |||||
| } | |||||
| if (src_out_anchor == nullptr) { | |||||
| GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| if (IsDataNode(src_out_anchor->GetOwnerNode()->GetType())) { | |||||
| Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | |||||
| if (ret != SUCCESS) { | |||||
| GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | |||||
| return ret; | |||||
| } | } | ||||
| continue; | |||||
| } | } | ||||
| } | |||||
| } | |||||
| return SUCCESS; | |||||
| } | |||||
| // if the input is of var type and the node input needs p2p mem, a memcpy should be inserted between the two | |||||
| Status HcclMemcpyPass::P2pmemInputProcess(const ComputeGraphPtr &graph, const NodePtr node) { | |||||
| auto op_desc = node->GetOpDesc(); | |||||
| vector<int64_t> input_memory_types; | |||||
| (void) ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, input_memory_types); | |||||
| if (input_memory_types.empty()) { | |||||
| return SUCCESS; | |||||
| } | |||||
| for (uint32_t index = 0; index < input_memory_types.size() && index < op_desc->GetInputsSize(); index++) { | |||||
| if (input_memory_types[index] != RT_MEMORY_P2P_DDR) { | |||||
| continue; | |||||
| } | |||||
| GELOGI("p2p input op is:%s.", op_desc->GetName().c_str()); | |||||
| auto hccl_in_anchor = node->GetInDataAnchor(index); | |||||
| if (hccl_in_anchor == nullptr) { | |||||
| continue; | |||||
| } | |||||
| auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); | |||||
| if (src_out_anchor == nullptr) { | |||||
| GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| if (IsDataNode(src_out_anchor->GetOwnerNode()->GetType())) { | |||||
| Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | ||||
| if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
| GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | ||||
| @@ -82,8 +188,12 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| bool HcclMemcpyPass::IsDataNode(const std::string& node_type) { | |||||
| return (node_type == CONSTANTOP) || (node_type == VARIABLE) || (node_type == DATA) || (node_type == CONSTANT); | |||||
| } | |||||
| /// | /// | ||||
| /// @brief Add MemcpyAsync Node | |||||
| /// @brief Add Identity Node | |||||
| /// @param [in] ge::ComputeGraphPtr graph | /// @param [in] ge::ComputeGraphPtr graph | ||||
| /// @param [in] ge::OutDataAnchorPtr in_node | /// @param [in] ge::OutDataAnchorPtr in_node | ||||
| /// @return ge::NodePtr | /// @return ge::NodePtr | ||||
| @@ -101,20 +211,20 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O | |||||
| node_name = CheckDuplicateName(node_name); | node_name = CheckDuplicateName(node_name); | ||||
| OpDescPtr op_desc = MakeShared<OpDesc>(node_name.c_str(), IDENTITY); | OpDescPtr op_desc = MakeShared<OpDesc>(node_name.c_str(), IDENTITY); | ||||
| if (op_desc == nullptr) { | if (op_desc == nullptr) { | ||||
| GELOGE(INTERNAL_ERROR, "Create identity op: MakeShared op_desc fail."); | |||||
| GELOGE(INTERNAL_ERROR, "Create Identity op: MakeShared op_desc fail."); | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| GELOGI("Create identity op:%s.", op_desc->GetName().c_str()); | |||||
| GELOGI("Create Identity op:%s.", op_desc->GetName().c_str()); | |||||
| graphStatus ret = op_desc->AddInputDesc("x", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); | graphStatus ret = op_desc->AddInputDesc("x", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); | ||||
| if (ret != GRAPH_SUCCESS) { | if (ret != GRAPH_SUCCESS) { | ||||
| GELOGE(INTERNAL_ERROR, "Create identity op: add input desc fail."); | |||||
| GELOGE(INTERNAL_ERROR, "Create Identity op: add input desc fail."); | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| ret = op_desc->AddOutputDesc("y", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); | ret = op_desc->AddOutputDesc("y", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); | ||||
| if (ret != GRAPH_SUCCESS) { | if (ret != GRAPH_SUCCESS) { | ||||
| GELOGE(INTERNAL_ERROR, "Create identity op: add output desc fail."); | |||||
| GELOGE(INTERNAL_ERROR, "Create Identity op: add output desc fail."); | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| // for historical reasons, this pass cannot work after constant folding, so mark it | // for historical reasons, this pass cannot work after constant folding, so mark it | ||||
| @@ -122,7 +232,7 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O | |||||
| NodePtr memcpy_node = graph->AddNode(op_desc); | NodePtr memcpy_node = graph->AddNode(op_desc); | ||||
| if (memcpy_node == nullptr) { | if (memcpy_node == nullptr) { | ||||
| GELOGE(INTERNAL_ERROR, "Insert identity node fail."); | |||||
| GELOGE(INTERNAL_ERROR, "Insert Identity node fail."); | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| @@ -155,7 +265,8 @@ std::string HcclMemcpyPass::CheckDuplicateName(const std::string &node_name) { | |||||
| /// | /// | ||||
| Status HcclMemcpyPass::ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | Status HcclMemcpyPass::ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | ||||
| const InDataAnchorPtr &hccl_in_anchor) { | const InDataAnchorPtr &hccl_in_anchor) { | ||||
| GELOGI("The op %s need insert memcpy async op.", src_out_anchor->GetOwnerNode()->GetName().c_str()); | |||||
| GELOGI("Between op %s and op %s need insert memcpy async op.", src_out_anchor->GetOwnerNode()->GetName().c_str(), | |||||
| hccl_in_anchor->GetOwnerNode()->GetName().c_str()); | |||||
| NodePtr memcpy_node = CreateIdentityNode(graph, src_out_anchor); | NodePtr memcpy_node = CreateIdentityNode(graph, src_out_anchor); | ||||
| GE_CHECK_NOTNULL(memcpy_node); | GE_CHECK_NOTNULL(memcpy_node); | ||||
| @@ -37,6 +37,14 @@ class HcclMemcpyPass : public GraphPass { | |||||
| Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | ||||
| const InDataAnchorPtr &hccl_in_anchor); | const InDataAnchorPtr &hccl_in_anchor); | ||||
| Status ContinuousInputProcess(const ComputeGraphPtr &graph, const NodePtr node); | |||||
| Status MutableInputProcess(const ComputeGraphPtr &graph, const NodePtr node); | |||||
| Status P2pmemInputProcess(const ComputeGraphPtr &graph, const NodePtr node); | |||||
| bool IsDataNode(const std::string& node_type); | |||||
| std::unordered_map<std::string, uint32_t> node_num_map_; | std::unordered_map<std::string, uint32_t> node_num_map_; | ||||
| }; | }; | ||||
| } // namespace ge | } // namespace ge | ||||
| @@ -60,7 +60,6 @@ | |||||
| #include "graph/passes/get_original_format_pass.h" | #include "graph/passes/get_original_format_pass.h" | ||||
| #include "graph/passes/guarantee_const_pass.h" | #include "graph/passes/guarantee_const_pass.h" | ||||
| #include "graph/passes/hccl_group_pass.h" | #include "graph/passes/hccl_group_pass.h" | ||||
| #include "graph/passes/hccl_memcpy_pass.h" | |||||
| #include "graph/passes/identity_pass.h" | #include "graph/passes/identity_pass.h" | ||||
| #include "graph/passes/infershape_pass.h" | #include "graph/passes/infershape_pass.h" | ||||
| #include "graph/passes/iterator_op_pass.h" | #include "graph/passes/iterator_op_pass.h" | ||||
| @@ -1693,8 +1692,6 @@ Status GraphPrepare::PrepareOptimize() { | |||||
| PassManager graph_pass; | PassManager graph_pass; | ||||
| try { | try { | ||||
| (void)graph_pass.AddPass("PrepareOptimize::PrunePass", new PrunePass); | (void)graph_pass.AddPass("PrepareOptimize::PrunePass", new PrunePass); | ||||
| // todo: temporarily do the hccl memcpy insertion in graph prepare, to keep it from inserting extra memcpy nodes | |||||
| (void)graph_pass.AddPass("PrepareOptimize::HcclMemcpyPass", new (std::nothrow) HcclMemcpyPass); | |||||
| } catch (std::bad_alloc &e) { | } catch (std::bad_alloc &e) { | ||||
| GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); | GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); | ||||
| return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
| @@ -245,6 +245,11 @@ const std::string INPUT_FP16_NODES = "ge.INPUT_NODES_SET_FP16"; | |||||
| // 0: close debug; 1: open TBE compiler; 2: open ccec compiler | // 0: close debug; 1: open TBE compiler; 2: open ccec compiler | ||||
| const std::string OP_DEBUG_LEVEL = "ge.opDebugLevel"; | const std::string OP_DEBUG_LEVEL = "ge.opDebugLevel"; | ||||
| // Option for fixing the hcombroadcast format. | |||||
| // When configured as model multi, the broadcast format should be fixed. | |||||
| // 0: data multi; 1: model multi | |||||
| const std::string HCOM_MULTI_MODE = "ge.hcomMultiMode"; | |||||
| // Graph run mode | // Graph run mode | ||||
| enum GraphRunMode { PREDICTION = 0, TRAIN }; | enum GraphRunMode { PREDICTION = 0, TRAIN }; | ||||