From 14732acd6f17294ceae50de750792c62a1aaa143 Mon Sep 17 00:00:00 2001 From: TangQunzhang Date: Mon, 11 Jan 2021 11:01:41 +0800 Subject: [PATCH] Continuous memory optimization, code refactoring --- .../build/memory/binary_block_mem_assigner.cc | 4 +- ge/graph/build/memory/block_mem_assigner.cc | 264 ++++-- ge/graph/build/memory/block_mem_assigner.h | 54 +- ge/graph/build/memory/graph_mem_assigner.cc | 855 ++++++------------ ge/graph/build/memory/graph_mem_assigner.h | 24 +- .../load/new_model_manager/davinci_model.cc | 6 +- metadef | 2 +- parser | 2 +- 8 files changed, 503 insertions(+), 708 deletions(-) diff --git a/ge/graph/build/memory/binary_block_mem_assigner.cc b/ge/graph/build/memory/binary_block_mem_assigner.cc index fff589f3..97a0aed6 100644 --- a/ge/graph/build/memory/binary_block_mem_assigner.cc +++ b/ge/graph/build/memory/binary_block_mem_assigner.cc @@ -69,8 +69,8 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector &range_ceils) { GELOGW("Vector all_memory_size is empty!"); return SUCCESS; } - if ((all_memory_size.front() == 0) || (log(kLogBase) == 0)) { - GELOGE(FAILED, "dividend is 0!"); + if ((all_memory_size.front() <= 0) || (log(kLogBase) == 0)) { + GELOGE(FAILED, "Memory size:%ld is invalid.", all_memory_size.front()); return FAILED; } // Memory size is 512 aligned, so it is not necessary to take less than 512 diff --git a/ge/graph/build/memory/block_mem_assigner.cc b/ge/graph/build/memory/block_mem_assigner.cc index 76e7efbe..21d6a49e 100755 --- a/ge/graph/build/memory/block_mem_assigner.cc +++ b/ge/graph/build/memory/block_mem_assigner.cc @@ -65,10 +65,7 @@ void AlignMemOffset(size_t &mem_align_size) { } static bool CompareLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) { - auto left_node_op_desc = left.node->GetOpDesc(); - auto right_node_op_desc = right.node->GetOpDesc(); - if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr) - && (left_node_op_desc->GetId() < right_node_op_desc->GetId())) { + if (left.GetLifeBegin() < right.GetLifeBegin()) { return true; } return false; @@ -100,14 +97,14 @@ bool CrossLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) { auto left_node_op_desc = left.node->GetOpDesc(); auto right_node_op_desc = right.node->GetOpDesc(); if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)) { - if (left_node_op_desc->GetId() < right_node_op_desc->GetId()) { - if (left.life_time_end >= static_cast(right_node_op_desc->GetId())) { + if (left.GetLifeBegin() < right.GetLifeBegin()) { + if (left.life_time_end >= right.GetLifeBegin()) { return true; } - } else if (left_node_op_desc->GetId() == right_node_op_desc->GetId()) { + } else if (left.GetLifeBegin() == right.GetLifeBegin()) { return true; } else { - if (right.life_time_end >= static_cast(left_node_op_desc->GetId())) { + if (right.life_time_end >= left.GetLifeBegin()) { return true; } } @@ -325,12 +322,7 @@ void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_ size_t MemoryBlock::GetLifeBegin() { size_t life_time = 0; if (!node_type_index_list_.empty()) { - if (node_type_index_list_.front().node != nullptr) { - auto node_op_desc = node_type_index_list_.front().node->GetOpDesc(); - if (node_op_desc != nullptr) { - life_time = node_op_desc->GetId(); - } - } + life_time = node_type_index_list_.front().GetLifeBegin(); } return life_time; } @@ -417,7 +409,7 @@ void MemoryBlock::AddDependLifeBegin(DependStreamLife &total_node_depend_stream_ depend_stream_life_[stream_id_] = GetLifeBegin(); } -size_t 
MemoryBlock::GetLifeEnd() { +size_t MemoryBlock::GetLifeEnd() const { if (!node_type_index_list_.empty()) { return node_type_index_list_.back().life_time_end; } @@ -571,32 +563,29 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) { for (auto &out_anchor : n->GetAllOutDataAnchors()) { GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); - bool reuse_input = false; - GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInput(output_desc, reuse_input) != SUCCESS, - GELOGI("Get reuse_input failed")); - - if (!reuse_input) { - int64_t size = 0; - GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed")); - batch_all_memory_size[batch_label].emplace_back(size); - if (batch_total_size.find(batch_label) == batch_total_size.end()) { - batch_total_size[batch_label] = size; - } else { - batch_total_size[batch_label] += size; - } + int64_t size = 0; + GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed")); + GE_IF_BOOL_EXEC(size < 0, GELOGE(FAILED, "Node:%s size:%ld is invalid, maybe it is an unknown shape node.", + node_op_desc->GetName().c_str(), size); + return;); + batch_all_memory_size[batch_label].emplace_back(size); + if (batch_total_size.find(batch_label) == batch_total_size.end()) { + batch_total_size[batch_label] = size; + } else { + batch_total_size[batch_label] += size; + } - if (!anchor_to_symbol_.empty()) { - auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString()); - if (iter1 == anchor_to_symbol_.end()) { - continue; - } - const std::string &symbol = iter1->second; - auto iter2 = symbol_size_.find(symbol); - if (iter2 == symbol_size_.end()) { - symbol_size_[symbol] = size; - } else if (size > static_cast<int64_t>(iter2->second)) { - iter2->second = size; - } + if (!anchor_to_symbol_.empty()) { + auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString()); + if (iter1 == anchor_to_symbol_.end()) { + continue; + } + const std::string &symbol = iter1->second; + auto iter2 = symbol_size_.find(symbol); + if (iter2 == symbol_size_.end()) { + symbol_size_[symbol] = size; + } else if (size > static_cast<int64_t>(iter2->second)) { + iter2->second = size; } } } @@ -637,35 +626,17 @@ bool IsDirectOutputNode(const NodePtr &node, int idx) { return false; } -void AddReusableBlockCount(const MemoryBlock &mem_block, map<string, uint64_t> &reusable_block_counts) { - string key = std::to_string(mem_block.Size()); - key += "_" + std::to_string(mem_block.stream_id_); - key += "_" + std::to_string(mem_block.memory_type_); - auto it = reusable_block_counts.find(key); - if (it != reusable_block_counts.end()) { - it->second++; - } else { - reusable_block_counts[key] = 1; - } -} - -void ReduceReusableBlockCount(const MemoryBlock &mem_block, map<string, uint64_t> &reusable_block_counts) { - string key = std::to_string(mem_block.Size()); - key += "_" + std::to_string(mem_block.stream_id_); - key += "_" + std::to_string(mem_block.memory_type_); - auto it = reusable_block_counts.find(key); - if (it != reusable_block_counts.end()) { - if (it->second > 0) { - it->second--; - } - } -} - -bool CanReuseBySize(const map<string, uint64_t> &reusable_block_counts, const MemoryBlock &reusable_block, - size_t block_size, size_t real_size, bool continuous) { +bool CanReuseBlock(size_t continuous_life_begin, const MemoryBlock &reusable_block, size_t block_size) { bool can_reuse = false; if (reusable_block.Size() == block_size) { - can_reuse = true; + // in some continuous input cases, the continuous first input node's life time is
not the same as the topo-first node's. + if (continuous_life_begin > 0) { + if (continuous_life_begin > reusable_block.GetLifeEnd()) { + can_reuse = true; + } + } else { + can_reuse = true; + } } return can_reuse; } @@ -676,6 +647,13 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou if (n == nullptr || n->GetAllOutDataAnchors().size() <= 0) { return false; } + auto node_desc = n->GetOpDesc(); + GE_IF_BOOL_EXEC(node_desc == nullptr, GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str()); + return false;); + std::vector<int64_t> offsets_for_fusion = {}; + bool has_lx_fusion_attr = + AttrUtils::GetListInt(node_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion); + if (static_cast<size_t>(out_index) < n->GetAllOutDataAnchors().size()) { auto out_anchor = n->GetOutDataAnchor(out_index); GE_IF_BOOL_EXEC(out_anchor == nullptr, @@ -698,16 +676,17 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou return false;); // If GetBool fail, is_input_continuous is false. - bool is_input_continuous_no_padding = false; - (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, - is_input_continuous_no_padding); - if (is_input_continuous_no_padding) { + (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_input_continuous); + if (is_input_continuous) { reset_zero_copy_flag = true; - return false; + has_lx_fusion_attr = true; + } else { + (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); } - (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); - GE_IF_BOOL_EXEC(is_input_continuous && CheckIsZeroMemNodeType(peer_node->GetType()), + // lx_fusion memory is only assigned to the first input; some of broadcast's inputs are variable and some are not, so reassign later + GE_IF_BOOL_EXEC(is_input_continuous && + (CheckIsZeroMemNodeType(peer_node->GetType()) || (has_lx_fusion_attr && (peer_in_anchor->GetIdx() != 0))), GELOGI("Node[%s] output[%u] no_need_assign_memory.", n->GetName().c_str(), out_index); no_need_assign_memory = true; return false;);
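A minimal editorial sketch of the CanReuseBlock rule above (illustrative only, not part of the patch). SimpleBlock is a hypothetical stand-in for MemoryBlock, reduced to the two fields the rule reads: a freed block may back a continuous-input tensor only when the sizes match and, when a continuous life begin is recorded, the block's previous life has fully ended before that point.

#include <cstddef>

// Hypothetical stand-in for MemoryBlock with only the fields the rule uses.
struct SimpleBlock {
  size_t size;
  size_t life_end;  // topological id of the last node that used the block
};

// Mirrors CanReuseBlock: sizes must match; when the request comes from a
// continuous-input allocation (continuous_life_begin > 0), the candidate
// block's whole life span must end strictly before that allocation begins.
bool CanReuseSketch(size_t continuous_life_begin, const SimpleBlock &blk, size_t block_size) {
  if (blk.size != block_size) {
    return false;
  }
  return (continuous_life_begin == 0) || (continuous_life_begin > blk.life_end);
}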
@@ -721,6 +700,10 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou // Only set the attr once. if (node_continuous_input_blocks_[peer_in_node_desc->GetName()].size() == 0) { (void)ge::AttrUtils::SetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true); + // the lx fusion case assigns the max size to the first block, so it can be reused as non-continuous memory + GE_IF_BOOL_EXEC(has_lx_fusion_attr, + is_op_reuse_mem_ = IsContinuousMemoryReuse(n, peer_node, out_index); + return false;); node_continuous_input_counts_[peer_in_node_desc->GetName()] = peer_node->GetAllInDataAnchorsSize(); } peer_input_index = peer_in_anchor->GetIdx(); @@ -733,6 +716,95 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou return false; } +bool IsContinuousInputNodeMaxLife(const NodePtr &n, uint32_t out_index) { + if (n == nullptr) { + return false; + } + + int64_t max_node_life_time = 0; + int64_t continuous_input_node_life_time = 0; + if (static_cast<size_t>(out_index) < n->GetAllOutDataAnchors().size()) { + auto out_anchor = n->GetOutDataAnchor(out_index); + if (out_anchor == nullptr) { + return false; + } + + // the continuous input node's life time should be the max + for (auto const &peer_in_anchor : out_anchor->GetPeerInDataAnchors()) { + if ((peer_in_anchor == nullptr) || (peer_in_anchor->GetOwnerNode() == nullptr)) { + return false; + } + auto peer_in_node_desc = peer_in_anchor->GetOwnerNode()->GetOpDesc(); + GE_IF_BOOL_EXEC(peer_in_node_desc == nullptr, + GELOGE(FAILED, "Node[%s] output[%u] peer in node desc is null.", n->GetName().c_str(), out_index); + return false;); + + if (peer_in_node_desc->GetId() > max_node_life_time) { + max_node_life_time = peer_in_node_desc->GetId(); + } + + // If GetBool fail, is_input_continuous is false. + bool is_input_continuous = false; + (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_input_continuous); + if (!is_input_continuous) { + (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); + } + if (is_input_continuous) { + continuous_input_node_life_time = peer_in_node_desc->GetId(); + } + } + } + return ((max_node_life_time != 0) && (continuous_input_node_life_time == max_node_life_time)); +} + +/// +/// @ingroup GE +/// @brief Check whether continuous memory is reusable +/// @return bool +/// +bool BlockMemAssigner::IsContinuousMemoryReuse(const NodePtr &n, const NodePtr &peer_node, uint32_t out_index) { + // n and peer_node_desc have been checked + auto node_desc = n->GetOpDesc(); + auto peer_node_desc = peer_node->GetOpDesc(); + continuous_life_begin_ = static_cast<size_t>(node_desc->GetId()); + // lx fusion case: check all continuous input nodes; the first input node's life time should be the min + for (const auto &in_anchor : peer_node->GetAllInDataAnchors()) { + if ((in_anchor == nullptr) || (in_anchor->GetPeerOutAnchor() == nullptr) || + (in_anchor->GetPeerOutAnchor()->GetOwnerNode() == nullptr) || + (in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc() == nullptr)) { + GELOGE(FAILED, "Node[%s] output[%u] peer input node desc is null.", n->GetName().c_str(), out_index); + return false; + } + auto peer_out_node_desc = in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc(); + /// + /// node2 node1 node3 + /// | / / | + /// node5 node6 + /// first input node's life time is not the min + /// when node5's first input node2's life time is not the min (node2 > node1), use node1's life time for reuse + /// + if (static_cast<size_t>(peer_out_node_desc->GetId()) < continuous_life_begin_) { + continuous_life_begin_ = static_cast<size_t>(peer_out_node_desc->GetId()); + GELOGI( + "Node[%s] life[%ld] output[%u] is not the min life time of continuous input
node[%s] life[%ld]; " + "min is node[%s] life[%zu].", + n->GetName().c_str(), node_desc->GetId(), out_index, peer_node_desc->GetName().c_str(), + peer_node_desc->GetId(), peer_out_node_desc->GetName().c_str(), continuous_life_begin_); + } + // when node3's output node5's life time is not the max (node6 > node5), do not reuse + if (!IsContinuousInputNodeMaxLife(in_anchor->GetPeerOutAnchor()->GetOwnerNode(), + in_anchor->GetPeerOutAnchor()->GetIdx())) { + GELOGI( + "Node[%s] life[%ld] output[%u]'s continuous input node[%s] life[%ld] is not the max life node of " + "node[%s] output[%d].", + n->GetName().c_str(), node_desc->GetId(), out_index, peer_node_desc->GetName().c_str(), + peer_node_desc->GetId(), peer_out_node_desc->GetName().c_str(), in_anchor->GetPeerOutAnchor()->GetIdx()); + return false; + } + } + return true; +} + /// /// @ingroup GE /// @brief Check pre_reuse flag & post_reuse flag for each symbol /// @return void /// @@ -1018,8 +1090,9 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, GE_IF_BOOL_EXEC(reusable_block->batch_label_ != batch_label, continue); // A node can reuse blocks of the same stream and preorder streams - if (CanReuseBySize(reusable_block_counts_, *reusable_block, block_size, real_size, continuous)) { - reusable_block->AddNodeTypeIndex({n, mem_type, out_index, false}, real_size, no_align_size); + if (CanReuseBlock(continuous_life_begin_, *reusable_block, block_size)) { + reusable_block->AddNodeTypeIndex({n, mem_type, out_index, false, continuous_life_begin_}, + real_size, no_align_size); if (mem_type == kOutput) { auto iter = anchor_to_symbol_.find(NodeIndexIO(n, out_index, kOut).ToString()); if (iter != anchor_to_symbol_.end()) { @@ -1028,7 +1101,6 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, } reusable_block->continuous_block_ = continuous; reusable_block->ref_count_++; - ReduceReusableBlockCount(*reusable_block, reusable_block_counts_); reusable_blocks_[memory_type][stream_id].erase((++it).base()); return reusable_block; } @@ -1041,8 +1113,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, // Data and NetOutput need zero-copy blocks block->is_zero_copy_ = IsZeroCopyBlock(n, continuous); - - block->Init(real_size, mem_type, n, out_index, no_align_size, node_op_desc->GetStreamId()); + block->AddNodeTypeIndex({n, mem_type, out_index, false, continuous_life_begin_}, real_size, no_align_size); block->stream_id_ = node_op_desc->GetStreamId(); block->ref_count_++; block->continuous_block_ = continuous; @@ -1142,8 +1213,23 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index, std::string symbol; if (IsSymbolExist(node_index_io, symbol)) { block = symbol_blocks_[symbol]; - block->AddNodeTypeIndex({n, kOutput, index, true}, size, no_align_size); + GE_IF_BOOL_EXEC(block == nullptr, GELOGE(FAILED, "Node %s ref block is nullptr.", node_op_desc->GetName().c_str()); + return nullptr); + // reduce old size + size_t align_size = block->Size(); + AlignMemOffset(align_size); + theory_memory_size_ -= align_size; + + auto block_size = GetBlockSize(size, ranges); + block->SetSize(block_size); + block->SetLifeTimeEnd(life_time_); + block->AddNodeTypeIndex({n, kOutput, index, true, continuous_life_begin_}, size, no_align_size); + block->ref_count_++; + + // add new size + align_size = block_size; + AlignMemOffset(align_size); + theory_memory_size_ += align_size; } else { int64_t max_size = size; int64_t memory_type = RT_MEMORY_HBM; @@ -1196,7 +1282,6 @@ MemoryBlock
*BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index, GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInputIndex(*owner_node_op_desc, dst_reuse_input_index) != SUCCESS, GELOGI("Get dst_reuse_input_index failed")); if (dst_reuse_input && (dst_reuse_input_index == static_cast(in_anchor->GetIdx()))) { - block->AddNodeTypeIndex({owner_node, kOutput, i, true}, block->Size(), block->Size()); out_count_reuse_input += 1; reuse_input = true; } @@ -1237,7 +1322,7 @@ bool IsAtomicOutputMemory(const ge::NodePtr &node, uint32_t output_index, bool i if (static_cast(index) == output_index) { if (node->GetOwnerComputeGraph() != nullptr) { string graph_name = node->GetOwnerComputeGraph()->GetName(); - GELOGD("[IMAS]Atomic no assign %s name[%s] output[%ld] streamid[%ld].", graph_name.c_str(), + GELOGD("Atomic no assign %s name[%s] output[%ld] streamid[%ld].", graph_name.c_str(), op_desc->GetName().c_str(), index, op_desc->GetStreamId()); } return true; @@ -1275,7 +1360,6 @@ void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vectorsame_stream_) { to_release->SetLifeTimeEnd(life_time_); reusable_memory.emplace_back(to_release); - AddReusableBlockCount(*to_release, reusable_block_counts_); } } } @@ -1375,6 +1459,7 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector } is_op_reuse_mem_ = true; + continuous_life_begin_ = 0; if (op_reuse_env_valid_ == true) { vector::iterator it_name = std::find(op_no_reuse_mem_vec_.begin(), op_no_reuse_mem_vec_.end(), op_desc->GetName()); @@ -1426,7 +1511,7 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector continue; } // atomic can't be reused - bool need_change = is_op_reuse_mem_ && out_node_set_continuous_input && is_atomic; + bool need_change = is_op_reuse_mem_ && is_atomic; if (need_change) { is_op_reuse_mem_ = false; } @@ -1819,11 +1904,12 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block, } op_desc->SetWorkspace(workspace_list); } - GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu] noalignsize[%zu] " - "life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]", graph_name.c_str(), - op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(), - block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block_level, block->reuse_mem_, - block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input, + GELOGI("[IMAS]Set %s name[%s] optype[%s] %s[%u] offset to [%ld] streamid[%ld] memtype[%ld] size[%zu] realsize[%zu] " + "noalignsize[%zu] life time begin[%s] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]", + graph_name.c_str(), op_desc->GetName().c_str(), node_type.node->GetType().c_str(), + node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(),block->memory_type_, + block->Size(), real_size, no_align_size, node_type.GetLifeBeginDesc().c_str(), end, child_block_level, + block->reuse_mem_, block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input, block->batch_label_.c_str()); } diff --git a/ge/graph/build/memory/block_mem_assigner.h b/ge/graph/build/memory/block_mem_assigner.h index 58bcda75..78584078 100755 --- a/ge/graph/build/memory/block_mem_assigner.h +++ b/ge/graph/build/memory/block_mem_assigner.h @@ -39,14 +39,15 @@ using DependStreamLife = std::map>; enum OpMemoryType { kOutput, kWorkspace }; struct NodeTypeIndex { - 
NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false) - : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input) {} + NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false, size_t begin = 0) + : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input), life_time_begin(begin) {} ge::NodePtr node = nullptr; OpMemoryType mem_type = kOutput; uint32_t index = 0; - size_t life_time_end = kMaxLifeTime; bool ref_input = false; + size_t life_time_begin = 0; + size_t life_time_end = kMaxLifeTime; const string GetMemType() const { if (mem_type == kOutput) { return "output"; @@ -55,6 +56,34 @@ struct NodeTypeIndex { } return "unknown"; } + + size_t GetLifeBegin() const { + if ((node == nullptr) || (node->GetOpDesc() == nullptr)) { + return 0; + } + + if ((life_time_begin > 0) && (life_time_begin < static_cast(node->GetOpDesc()->GetId()))) { + return life_time_begin; + } else { + return node->GetOpDesc()->GetId(); + } + } + + std::string GetLifeBeginDesc() const { + if (node == nullptr) { + return ""; + } + auto node_op_desc = node->GetOpDesc(); + if (node_op_desc != nullptr) { + auto life_begin = GetLifeBegin(); + if (life_begin != static_cast(node_op_desc->GetId())) { + return std::to_string(life_begin) + "-" + std::to_string(node_op_desc->GetId()); + } else { + return std::to_string(node_op_desc->GetId()); + } + } + return ""; + } }; class MemoryBlock { @@ -86,16 +115,13 @@ class MemoryBlock { symbol_list_.clear(); } - void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size, - int64_t stream_id) { - real_size_list_.emplace_back(real_size); - no_align_size_list_.emplace_back(no_align_size); - node_type_index_list_.emplace_back(node, type, out_index, false); - if (stream_id != stream_id_) { - same_stream_ = false; + size_t Size() const { return block_size_; } + + void SetSize(size_t size) { + if (size > block_size_) { + block_size_ = size; } } - size_t Size() const { return block_size_; } size_t AlignSize() const; @@ -143,7 +169,7 @@ class MemoryBlock { size_t GetLifeBegin(); - size_t GetLifeEnd(); + size_t GetLifeEnd() const; void AddDependLifeBegin(DependStreamLife &node_depend_stream_life); @@ -406,6 +432,7 @@ class BlockMemAssigner : public MemAssigner { bool IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t out_index, std::string &peer_name, uint32_t &peer_input_index, bool &no_need_assign_memory, bool &reset_zero_copy_flag); + bool IsContinuousMemoryReuse(const NodePtr &n, const NodePtr &peer_node, uint32_t out_index); /// /// @ingroup GE /// @|+++++++++block1++++++++| |+++++++++block1++++++++| @@ -425,8 +452,6 @@ class BlockMemAssigner : public MemAssigner { std::unordered_map>> reusable_blocks_; - std::map reusable_block_counts_; - std::unordered_map>> stream_workspace_blocks_; std::unordered_map> node_out_blocks_; @@ -456,6 +481,7 @@ class BlockMemAssigner : public MemAssigner { std::string max_batch_label_; + size_t continuous_life_begin_ = 0; /// /// @ [stream1][nodeid] /// @[nodeid] [stream2][nodeid] diff --git a/ge/graph/build/memory/graph_mem_assigner.cc b/ge/graph/build/memory/graph_mem_assigner.cc index 98d073d4..f94eb275 100755 --- a/ge/graph/build/memory/graph_mem_assigner.cc +++ b/ge/graph/build/memory/graph_mem_assigner.cc @@ -35,10 +35,9 @@ namespace { const int kAllInputAddrIsAtomic = -1; const int kVirtualInputNodeMemoryReuse = 0; const int kVirtualOutputNodeMemoryReuse = 1; -const size_t 
kVirtualInputNodeOutputSize = 1; -const size_t kVirtualOutputNodeInputSize = 1; -const size_t kVirtualNodeDataIndex = 0; -const char *const kMbatchNodeNameFlag = "_ascend_mbatch_batch_"; +// Each type occupies its own bit, so the states can be combined without overlap +enum ContinuousType { kTypeInput = 1, kTypeInputNoPadding = 2, kTypeOutput = 4, kTypeOutputNoPadding = 8 }; + int64_t GetSymbolOutputOffset(const std::map<std::string, std::string> &anchor_to_symbol, const std::map<std::string, std::list<NodeIndexIO>> &symbol_to_anchors, const ge::NodePtr &node, const uint32_t i) { @@ -136,7 +135,7 @@ ge::Status GraphMemoryAssigner::AssignVarAttr2Nodes() { return ge::SUCCESS; } -ge::Status GraphMemoryAssigner::CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc, +ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc, int64_t dim_index, int64_t &output_mem_size, int64_t &batch_dim_num, int64_t &out_size) { graphStatus graph_status = ge::TensorUtils::GetSize(*output_desc, out_size); @@ -181,68 +180,6 @@ ge::Status GraphMemoryAssigner::CalculateTensorRealSizeAndOutSize(const ge::Cons return SUCCESS; } -Status GraphMemoryAssigner::GetMaxBatchLabel(const map<string, vector<NodePtr>> &mem_reuse_virtual_nodes_map, - int32_t mem_reuse_model, string &max_batch_label) { - for (auto &i_map : mem_reuse_virtual_nodes_map) { - vector<NodePtr> virtual_nodes_list = i_map.second; - vector<int64_t> max_shape_dims; - size_t max_batch_dim = 0; - bool max_batch_dim_find = false; - for (size_t i = 0; i < virtual_nodes_list.size(); ++i) { - GE_CHECK_NOTNULL(virtual_nodes_list[i]); - OpDescPtr op_desc = virtual_nodes_list[i]->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc); - - ge::ConstGeTensorDescPtr input_output_desc; - if (mem_reuse_model == kVirtualInputNodeMemoryReuse) { - input_output_desc = op_desc->GetOutputDescPtr(kVirtualNodeDataIndex); - } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) { - input_output_desc = op_desc->GetInputDescPtr(kVirtualNodeDataIndex); - } else { - std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model); - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - return FAILED; - } - GE_CHECK_NOTNULL(input_output_desc); - - if (i == 0) { - // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value. - (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, max_batch_label); - max_shape_dims = input_output_desc->GetShape().GetDims(); - } else { - vector<int64_t> current_shape_dims = input_output_desc->GetShape().GetDims(); - if (current_shape_dims.size() != max_shape_dims.size()) { - std::string error = "The shape of several nodes between multiple batches does not match."; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - return FAILED; - } - for (size_t j = 0; j < current_shape_dims.size(); ++j) { - if (current_shape_dims[j] == max_shape_dims[j]) { - continue; - } - if (max_batch_dim_find && max_batch_dim != j) { - std::string error = "The shape of several nodes between multiple batches does not match."; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - return FAILED; - } - max_batch_dim_find = true; - max_batch_dim = j; - if (current_shape_dims[j] > max_shape_dims[j]) { - max_shape_dims[j] = current_shape_dims[j]; - // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value. - (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, max_batch_label); - } - // Only compare the first different dim in shape. - break; - } - } - } - // In every element of virtual_input_nodes_map, the label of the max batch node is the same.
- break; - } - return SUCCESS; -} - Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map &mem_type_to_offset) { if (memory_offset_.empty()) { GELOGE(FAILED, "memory_offset_ is empty."); @@ -250,13 +187,6 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, mapGetGraphMemoryMaxSize())}); + GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memtype[%ld]", compute_graph_->GetName().c_str(), + iter.second, iter.first); } return ge::FAILED; } @@ -313,22 +245,137 @@ Status GraphMemoryAssigner::AssignZeroCopyMemory(map &mem_offse return SUCCESS; } +uint32_t GetContinuousMemoryType(const OpDescPtr &op_desc) { + if (op_desc == nullptr) { + return 0; + }; + + bool is_continuous = false; + uint32_t continuous_type = 0; + // If GetBool fail, is_continuous is false. + (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_continuous); + if (is_continuous) { + continuous_type |= kTypeInput; + } else { + (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_continuous); + if (is_continuous) { + bool attr_reuse = false; + (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse); + if (attr_reuse) { + continuous_type |= kTypeInputNoPadding; + } + } + } + + is_continuous = false; + (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_continuous); + if (is_continuous) { + continuous_type |= kTypeOutput; + } else { + (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, is_continuous); + if (is_continuous) { + bool attr_reuse = false; + (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse); + if (attr_reuse) { + continuous_type |= kTypeOutputNoPadding; + } + } + } + + if (continuous_type != 0) { + GELOGI("Current node %s continuous type %d.", op_desc->GetName().c_str(), continuous_type); + } + return continuous_type; +} + +Status GetMemorySize(const OpDescPtr &op_desc, const ge::ConstGeTensorDescPtr &output_desc, uint32_t continuous_type, + int64_t &tensor_size, int64_t &nopadding_size) { + if ((op_desc == nullptr) || (output_desc == nullptr)) { + GELOGE(FAILED, "Input para is nullptr."); + return FAILED; + } + tensor_size = 0; + nopadding_size = 0; + bool is_nopadding = ((continuous_type & kTypeInputNoPadding) != 0) || ((continuous_type & kTypeOutputNoPadding) != 0); + if (is_nopadding) { + int64_t attr_dim_index; + bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index); + if (!get_attr_dim_flag) { + GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed."); + return FAILED; + } + + // Calculate tensor real size of each piece of data and out size of complete data + int64_t batch_dim_num = 1; + if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, nopadding_size, batch_dim_num, tensor_size) != + SUCCESS) { + GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s.", op_desc->GetName().c_str()); + return FAILED; + } + } else { + if (ge::TensorUtils::GetSize(*output_desc, tensor_size) != ge::SUCCESS) { + GELOGE(FAILED, "GetSize failed."); + return FAILED; + } + } + if ((tensor_size < 0) || (nopadding_size < 0)) { + GELOGE(FAILED, "GetMemorySize for node %s failed.", op_desc->GetName().c_str()); + return FAILED; + } + return SUCCESS; +} + +void AlignMemOffset(int64_t &mem_align_size) { + if (mem_align_size <= 0) { + return; + } + mem_align_size = (mem_align_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE; +} + +bool IsContinuousInputConflict(const ge::NodePtr 
&node, const OpDescPtr &peer_op_desc) { + bool is_peer_output_continuous = false; + // If GetBool fail, is_peer_output_continuous is false. + (void) ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_peer_output_continuous); + + // Get peer node output size, if size == 1 (peer node has only one output), continuous input of the node and + // continuous output of the previous node is the same, we can support it. If size != 1, there may be + // conflict between the two, we can not support it. + auto peer_output_size = peer_op_desc->GetOutputsSize(); + GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1), + std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) + + " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) + + " requires continuous output. There may be conflict between the two. " + + "This node is not supported now."; + GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + return true;); + + bool is_peer_reference = false; + // If GetBool fail, is_peer_reference is false. + (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference); + GE_IF_BOOL_EXEC(is_peer_reference, + std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) + + " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) + + " requires continuous output. There may be conflict between the two. " + + "This node is not supported now."; + GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); + return true;); + return false; +} + Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { Status ret; for (auto &node : compute_graph_->GetAllNodes()) { - // Get the continuous input type of the node, default is false - bool is_input_continuous = false; - GE_CHECK_NOTNULL(node->GetOpDesc()); - // If GetBool fail, is_input_continuous is false. - (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); + GE_CHECK_NOTNULL(node); + auto continuous_type = GetContinuousMemoryType(node->GetOpDesc()); // Assign continuous input memory - if (is_input_continuous) { - int64_t memory_type = RT_MEMORY_HBM; - GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "input"), "Get node memory type failed."); + bool continuous_input = ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0); + int64_t memory_type = RT_MEMORY_HBM; + GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "input"), "Get node memory type failed."); + if (continuous_input) { int64_t mem_clean_start = 0; int64_t mem_clean_size = 0; - ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type); + ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type, continuous_type); if (ret != ge::SUCCESS) { GELOGE(ret, "Assign continuous input memory failed!"); return ret;
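A minimal editorial sketch of how the ContinuousType bits defined in graph_mem_assigner.cc compose (illustrative only, not part of the patch). Each flag occupies its own bit, so one op can carry an input and an output continuity requirement at the same time, and ReAssignContinuousMemory tests each bit with a mask.

#include <cassert>
#include <cstdint>

enum ContinuousTypeSketch : uint32_t {
  kTypeInput = 1,
  kTypeInputNoPadding = 2,
  kTypeOutput = 4,
  kTypeOutputNoPadding = 8
};

int main() {
  uint32_t continuous_type = 0;
  continuous_type |= kTypeInput;            // e.g. ATTR_NAME_CONTINUOUS_INPUT is set
  continuous_type |= kTypeOutputNoPadding;  // e.g. nopadding output reusing an input

  // The same bit tests ReAssignContinuousMemory performs:
  bool continuous_input =
      ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0);
  bool continuous_output =
      ((continuous_type & kTypeOutput) != 0) || ((continuous_type & kTypeOutputNoPadding) != 0);
  assert(continuous_input && continuous_output);
  return 0;
}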
@@ -338,7 +385,6 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { vector<int64_t> input_indexes; // If GetListInt fail, input_indexes is empty. (void) ge::AttrUtils::GetListInt(node->GetOpDesc(), ATOMIC_ATTR_INPUT_INDEX, input_indexes); - if (!input_indexes.empty() && input_indexes[0] == kAllInputAddrIsAtomic) { // check whether there is an atomic conflict between the current node and the peer out node if (!CheckInputIsSupportAtomic(node)) { return ge::FAILED; } const auto &in_control_anchor = node->GetInControlAnchor(); GE_CHECK_NOTNULL(in_control_anchor); for (const auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) { + GE_CHECK_NOTNULL(peer_out_control_anchor); auto peer_out_node = peer_out_control_anchor->GetOwnerNode(); if (peer_out_node->GetType() == ATOMICADDRCLEAN) { - ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size}); + ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size}, memory_type); if (ret != SUCCESS) { GELOGE(ret, "Failed to set attr for atomic addr clean node %s.", peer_out_node->GetName().c_str()); return ret; } } } } - // Get the reference type of the node, default is false - bool is_ref = false; - // If GetBool fail, is_ref is false. - (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_REFERENCE, is_ref); - - // Get the continuous output type of the node, default is false - bool is_output_continuous = false; - // If GetBool fail, is_output_continuous is false. - (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous); - - // If the output is ref type and refers to the ref of an input, the name of the output - // and the input are the same. Ge encounters ref type, finds matching relationship according - // to the names of input and output, and allocates the same memory address, eg: HCOMBroadcast - if (!is_ref && is_output_continuous) { // Assign continuous output memory - ret = AssignContinuousOutputMemory(node); + // Assign continuous output memory + bool continuous_output = ((continuous_type & kTypeOutput) != 0) || ((continuous_type & kTypeOutputNoPadding) != 0); + if (continuous_output) { + ret = AssignContinuousOutputMemory(node, memory_type, continuous_type); if (ret != ge::SUCCESS) { - GELOGE(ret, "Assign reference memory failed!"); + GELOGE(ret, "Assign continuous output memory failed!"); return ret; } } @@ -391,520 +427,181 @@ } Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start, - int64_t &continuous_mem_size, int64_t memory_type) { + int64_t &continuous_mem_size, int64_t memory_type, uint32_t continuous_type) { GELOGI("Current node %s needs continuous input.", node->GetName().c_str()); - bool continuous_input_alloc = false; - (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, continuous_input_alloc); auto iter = memory_offset_.find(memory_type); if (iter == memory_offset_.end()) { std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type); GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); return FAILED; } + // Reserve MEM_ALIGN_SIZE (512 bytes) at the head and tail of hcom continuous input + iter->second.mem_offset_ += MEM_ALIGN_SIZE; continuous_mem_start = iter->second.mem_offset_; + int64_t mem_offset = iter->second.mem_offset_; + int64_t extra_memory_size = 0; + bool is_continuous_input_allocated = false; + (void)
ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, is_continuous_input_allocated); for (auto &in_data_anchor : node->GetAllInDataAnchors()) { + GE_IF_BOOL_EXEC(in_data_anchor == nullptr, continue); auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, continue); - auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc(); GE_IF_BOOL_EXEC(peer_op_desc == nullptr, continue); - bool is_peer_output_continuous = false; - // If GetBool fail, is_peer_output_continuous is false. - (void) ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_peer_output_continuous); - - // Get peer node output size, if size == 1(peer node has only one output), continuous input of the node and - // continuous output of the previous node is the same, we can support it. If size != 1, there may be - // conflict between the two, we can not support it. - auto peer_output_size = peer_op_desc->GetOutputsSize(); - GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1), - std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) + - " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) + - " requires continuous output. There may be conflict between the two." + - "This node is not supported now."; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - return PARAM_INVALID;); - - bool is_peer_reference = false; - // If GetBool fail, is_peer_reference is false. - (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference); - GE_IF_BOOL_EXEC(is_peer_reference, - std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) + - " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) + - " requires continuous output. There may be conflict between the two." 
+ - "This node is not supported now."; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - return PARAM_INVALID;); - - vector output_list = peer_op_desc->GetOutputOffset(); - std::vector offsets_for_fusion = {}; - bool has_offset_attr = - AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion); - if (peer_out_data_anchor->GetIdx() < static_cast(output_list.size())) { - if (continuous_input_alloc && !has_offset_attr) { - if (in_data_anchor->GetIdx() == 0) { - continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx()); - } - // can not use else if, incase only one input - if (in_data_anchor->GetIdx() == static_cast(node->GetAllInDataAnchors().size()) - 1) { - int64_t tensor_desc_size = 0; - Status ret = ge::TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), - tensor_desc_size); - GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;); - - tensor_desc_size = (tensor_desc_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE; - continuous_mem_size = - output_list.at(peer_out_data_anchor->GetIdx()) - continuous_mem_start + tensor_desc_size + MEM_ALIGN_SIZE; - } - GELOGI( - "[IMAS]Check Continuous input : Set %s name[%s] output[%d] offset to [%ld] stream_id[%ld] size[%u] " - "real_size[%u].", - node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(), - peer_out_data_anchor->GetIdx(), output_list.at(peer_out_data_anchor->GetIdx()), peer_op_desc->GetStreamId(), - 0, 0); - continue; - } - - output_list.at(peer_out_data_anchor->GetIdx()) = iter->second.mem_offset_; - } else { - std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range."; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - GELOGE(FAILED, "index : %d is out of range.", peer_out_data_anchor->GetIdx()); - return FAILED; - } - peer_op_desc->SetOutputOffset(output_list); - size_t pre_mem_offset = iter->second.mem_offset_; + GE_IF_BOOL_EXEC(IsContinuousInputConflict(node, peer_op_desc), return PARAM_INVALID;); int64_t tensor_desc_size = 0; - if (has_offset_attr) { - if (peer_out_data_anchor->GetIdx() < static_cast(offsets_for_fusion.size())) { - auto offset_for_fusion = offsets_for_fusion[peer_out_data_anchor->GetIdx()]; - iter->second.mem_offset_ += offset_for_fusion; - } else { + int64_t nopadding_size = 0; + int64_t real_size = 0; + std::vector offsets_of_fusion = {}; + bool lx_fusion = AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_of_fusion); + lx_fusion = lx_fusion && !offsets_of_fusion.empty(); + if (lx_fusion) { + if (peer_out_data_anchor->GetIdx() >= static_cast(offsets_of_fusion.size())) { std::string error = "fusion: peer node" + FmtToStr(peer_op_desc->GetName()) + " index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range."; GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); return FAILED; } + nopadding_size = offsets_of_fusion[peer_out_data_anchor->GetIdx()]; + tensor_desc_size = nopadding_size; } else { - Status ret = - TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), tensor_desc_size); - GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;); - - iter->second.mem_offset_ += tensor_desc_size; - } - - // If set tensor_actual_size, Memory alignment is not required. 
- int32_t is_tensor_actual_size = 0; - ge::AttrUtils::GetInt(peer_op_desc, ATTR_NAME_GET_TENSOR_ACTUAL_SIZE, is_tensor_actual_size); - if (is_tensor_actual_size == 0) { - AlignMemOffset(MEM_ALIGN_SIZE, memory_type); + if (GetMemorySize(node->GetOpDesc(), peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx()), + continuous_type, tensor_desc_size, nopadding_size) != ge::SUCCESS) { + return FAILED; + } } - GELOGI( - "[IMAS]Continuous input : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] " - "real_size[%ld].", node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(), - peer_out_data_anchor->GetIdx(), pre_mem_offset, peer_op_desc->GetStreamId(), - (iter->second.mem_offset_ - pre_mem_offset), tensor_desc_size); - } - - iter->second.mem_offset_ += MEM_ALIGN_SIZE; - if (!continuous_input_alloc) { - continuous_mem_size = iter->second.mem_offset_ - continuous_mem_start; - } - return SUCCESS; -} - -Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node) { - GELOGI("Current node %s needs continuous output.", node->GetName().c_str()); - auto out_op_desc = node->GetOpDesc(); - GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED); - vector output_list = out_op_desc->GetOutputOffset(); - if ((out_op_desc->GetOutputsSize() > output_list.size()) || (output_list.size() == 0)) { - GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.", - out_op_desc->GetOutputsSize(), output_list.size()); - return ge::FAILED; - } - - size_t mem_offset = output_list[0]; - for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { - output_list[out_data_anchor->GetIdx()] = mem_offset; - int64_t tensor_desc_size = 0; - if (ge::TensorUtils::GetSize(*(out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx())), tensor_desc_size) != - ge::SUCCESS) { - GELOGE(FAILED, "GetSize failed."); - return FAILED; - } - mem_offset += tensor_desc_size; - if (mem_offset <= 0) { + bool is_nopadding = ((continuous_type & kTypeInputNoPadding) != 0) || lx_fusion; + vector output_list = peer_op_desc->GetOutputOffset(); + if (peer_out_data_anchor->GetIdx() >= static_cast(output_list.size())) { + std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range."; + GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); return FAILED; } - mem_offset = (mem_offset + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE; - GELOGI( - "[IMAS]Continuous output : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] " - "real_size[%ld].", - node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(), - output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId(), tensor_desc_size, tensor_desc_size); - } - out_op_desc->SetOutputOffset(output_list); - return ge::SUCCESS; -} -Status GraphMemoryAssigner::ReAssignVirtualInputNodeMemory(NodePtr node, size_t &mem_offset_reuse) { - OpDescPtr op_desc = node->GetOpDesc(); - vector output_list = op_desc->GetOutputOffset(); - if (output_list.empty()) { - GELOGE(FAILED, "Outputoffset is empty node name:%s", node->GetName().c_str()); - return FAILED; - } - output_list.at(0) = mem_offset_reuse; - op_desc->SetOutputOffset(output_list); - GELOGI("Set virtual input node %s output offset to %zu.", op_desc->GetName().c_str(), mem_offset_reuse); - - int64_t attr_dim_index; - bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index); 
- if (!get_attr_dim_flag) { - GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed."); - return FAILED; - } - - size_t extra_memory_size = 0; - for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { - auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); - GE_CHECK_NOTNULL(peer_out_data_anchor); - auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc(); - GE_CHECK_NOTNULL(peer_op_desc); - vector<int64_t> output_offsets = peer_op_desc->GetOutputOffset(); - if (peer_out_data_anchor->GetIdx() >= static_cast<int>(output_offsets.size())) { - GELOGE(ge::FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx()); - return ge::FAILED; + // when continuous input memory has already been allocated, the first input marks the beginning offset + bool is_allocated_first_input = is_continuous_input_allocated && (in_data_anchor->GetIdx() == 0); + if (is_allocated_first_input) { + mem_offset = output_list.at(peer_out_data_anchor->GetIdx()); + continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx()); + } else { + // set offset for input + output_list.at(peer_out_data_anchor->GetIdx()) = mem_offset; + peer_op_desc->SetOutputOffset(output_list); } - output_offsets.at(peer_out_data_anchor->GetIdx()) = mem_offset_reuse; - peer_op_desc->SetOutputOffset(output_offsets); - size_t pre_mem_offset = mem_offset_reuse; - // Calculate tensor real size of each piece of data and out size of complete data - ge::ConstGeTensorDescPtr output_desc = peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx()); - GE_CHECK_NOTNULL(output_desc); - int64_t output_mem_size; - int64_t batch_dim_num = 1; - int64_t out_size; - if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) != SUCCESS) { - GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].", - peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx()); - return FAILED; + int64_t align_size = tensor_desc_size; + if (is_nopadding) { + mem_offset += nopadding_size; + extra_memory_size += (tensor_desc_size - nopadding_size); + real_size = nopadding_size; + } else { + ge::AlignMemOffset(align_size); + mem_offset += align_size; + // reserve MEM_ALIGN_SIZE (512 bytes) at the tail of hcom continuous input + extra_memory_size = MEM_ALIGN_SIZE; + real_size = tensor_desc_size; } - mem_offset_reuse += output_mem_size; - extra_memory_size = extra_memory_size + out_size - output_mem_size; - - GELOGI("[IMAS]Virtual node optimize: set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] " "real_size[%ld].", - node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(), - peer_out_data_anchor->GetIdx(), pre_mem_offset, peer_op_desc->GetStreamId(), out_size, - output_mem_size); - } - mem_offset_reuse += extra_memory_size; - size_t after_mem_offset = mem_offset_reuse; - GELOGI("After reassign virtual input node[name: %s, type: %s] memory, memory offset = %zu.", - op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset); - return SUCCESS; -} - -Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() { - map<string, vector<NodePtr>> mem_reuse_virtual_input_nodes_map; - int64_t memory_type = RT_MEMORY_HBM; - for (const auto &n : compute_graph_->GetAllNodes()) { - OpDescPtr op_desc = n->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc); - bool attr_continuous = false; - bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, attr_continuous); - GE_IF_BOOL_EXEC(!get_continuous_flag, continue);
- bool attr_reuse = false; - bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse); - GE_IF_BOOL_EXEC(!get_reuse_flag, continue); - if (attr_reuse && attr_continuous) { - if (op_desc->GetOutputsSize() != kVirtualInputNodeOutputSize) { - // When current virtual node has several outputs, can't directly determine which input is the tensor for reuse. - std::string error = "Only one output is supported, current virtual node" + FmtToStr(n->GetName()) + - " has " + FmtToStr(op_desc->GetOutputsSize()) + " outputs."; - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - return FAILED; - } - GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "input"), "Get node memory type failed."); - auto iter = memory_offset_.find(memory_type); - if (iter == memory_offset_.end()) { - std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type); - GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); - return FAILED; - } - GELOGD("Start to reassign memory for virtual input node, memory offset = %zu, memory type = %ld.", - iter->second.mem_offset_, memory_type); - string batch_label_string; - // Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter - (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string); - if (batch_label_string.empty()) { - size_t node_mem_offset = iter->second.mem_offset_; - // No ATTR_NAME_BATCH_LABEL, no need to reuse memory. - Status status = ReAssignVirtualInputNodeMemory(n, node_mem_offset); - if (status != SUCCESS) { - GELOGE(FAILED, "Reassign memory of virtual input node failed, node name: %s.", n->GetName().c_str()); - return FAILED; - } - - iter->second.mem_offset_ = node_mem_offset; - AlignMemOffset(MEM_ALIGN_SIZE, memory_type); - GELOGD("After reassign memory for virtual input node, align memory = %zu, memory type = %ld.", - iter->second.mem_offset_, memory_type); - } else { - // Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory. - string current_node_full_name = op_desc->GetName(); - size_t pos = current_node_full_name.find(kMbatchNodeNameFlag); - if (pos == string::npos) { - GELOGE(FAILED, "Cannot find key string [%s] of multi-batch in name of virtual input node, node name: %s.", - kMbatchNodeNameFlag, n->GetName().c_str()); - return FAILED; - } - string fixed_name = current_node_full_name.substr(0, pos); - vector<NodePtr> parallel_virtual_input_nodes; - if (mem_reuse_virtual_input_nodes_map.count(fixed_name) != 0) { - parallel_virtual_input_nodes = mem_reuse_virtual_input_nodes_map[fixed_name]; - } - parallel_virtual_input_nodes.emplace_back(n); - mem_reuse_virtual_input_nodes_map[fixed_name] = parallel_virtual_input_nodes; - } - } + GELOGI("[IMAS]Continuous input : Set %s name[%s] optype[%s] output[%d] offset to [%zu] stream_id[%ld] memtype[%ld] " + "size[%zu] realsize[%ld] nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(), + peer_op_desc->GetName().c_str(), node->GetType().c_str(), peer_out_data_anchor->GetIdx(), + output_list.at(peer_out_data_anchor->GetIdx()), peer_op_desc->GetStreamId(), memory_type, + is_continuous_input_allocated ?
0UL : align_size, real_size, is_nopadding); } - int32_t mem_reuse_model = 0; - if (ReAssignVirtualNodesMemory(mem_reuse_virtual_input_nodes_map, mem_reuse_model) != SUCCESS) { - GELOGE(FAILED, "Reassign memory of virtual input nodes failed."); - return FAILED; + mem_offset += extra_memory_size; + ge::AlignMemOffset(mem_offset); + continuous_mem_size = mem_offset - continuous_mem_start; + if (is_continuous_input_allocated) { + // no memory is allocated here, so there is no need to add the 512-byte head + iter->second.mem_offset_ -= MEM_ALIGN_SIZE; + } else { + iter->second.mem_offset_ = mem_offset; } return SUCCESS; } -Status GraphMemoryAssigner::ReAssignVirtualOutputNodeMemory(NodePtr node, size_t &mem_offset_reuse) { - OpDescPtr op_desc = node->GetOpDesc(); - - // 1. set memory of to be reused input tensor +Status GetFirstInputPeerOutOutputOffset(const ge::NodePtr &node, int64_t &mem_offset) { auto in_data_anchor_list = node->GetAllInDataAnchors(); + if (in_data_anchor_list.empty()) { + GELOGE(FAILED, "Node %s's in data anchor is empty.", node->GetName().c_str()); + return FAILED; + } auto peer_out_data_anchor = in_data_anchor_list.at(0)->GetPeerOutAnchor(); - GE_CHECK_NOTNULL(peer_out_data_anchor); + GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, GELOGE(ge::FAILED, "peer_out_data_anchor is null."); + return ge::FAILED); auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc(); - GE_CHECK_NOTNULL(peer_op_desc); + GE_IF_BOOL_EXEC(peer_op_desc == nullptr, GELOGE(ge::FAILED, "peer_op_desc is null."); return ge::FAILED); vector<int64_t> in_node_output_offsets = peer_op_desc->GetOutputOffset(); if (peer_out_data_anchor->GetIdx() >= static_cast<int>(in_node_output_offsets.size())) { GELOGE(FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx()); return FAILED; } - in_node_output_offsets.at(peer_out_data_anchor->GetIdx()) = mem_offset_reuse; - peer_op_desc->SetOutputOffset(in_node_output_offsets); - GELOGI("Set virtual output node %s input data offset to %zu.", op_desc->GetName().c_str(), mem_offset_reuse); + mem_offset = in_node_output_offsets.at(peer_out_data_anchor->GetIdx()); + return SUCCESS; +}
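A minimal editorial sketch of the padded layout AssignContinuousInputMemory produces (illustrative only, not part of the patch; the tensor sizes are hypothetical and MEM_ALIGN_SIZE is 512). The head is reserved before the loop, every input advances the cursor by its aligned size, and the tail is accounted through extra_memory_size.

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int64_t kMemAlignSize = 512;

int64_t AlignUp(int64_t v) { return (v + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize; }

int main() {
  int64_t mem_offset = 0;
  mem_offset += kMemAlignSize;  // reserved head of the hcom continuous input
  const int64_t continuous_mem_start = mem_offset;
  const std::vector<int64_t> tensor_sizes = {1000, 2048};  // hypothetical inputs
  for (int64_t size : tensor_sizes) {
    std::printf("input offset: %lld\n", static_cast<long long>(mem_offset));  // 512, then 1536
    mem_offset += AlignUp(size);  // 1000 bytes occupy 1024 after alignment
  }
  mem_offset += kMemAlignSize;  // reserved tail (extra_memory_size)
  mem_offset = AlignUp(mem_offset);
  // prints 3584 = 1024 + 2048 + 512 (tail)
  std::printf("continuous size: %lld\n", static_cast<long long>(mem_offset - continuous_mem_start));
  return 0;
}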
- // 2. set memory of output tensor - vector<int64_t> output_list = op_desc->GetOutputOffset(); - if (output_list.empty()) { - GELOGE(FAILED, "Outputoffset is empty, node name: %s", node->GetName().c_str()); - return FAILED; - } - if (op_desc->GetOutputsSize() > output_list.size()) { - GELOGE(FAILED, "The size %zu of op_desc is more than output_list's size %zu.", op_desc->GetOutputsSize(), - output_list.size()); - return FAILED; - } - int64_t attr_dim_index; - bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index); - if (!get_attr_dim_flag) { - GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed."); - return FAILED; +Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node, int64_t memory_type, + uint32_t continuous_type) { + GELOGI("Current node %s needs continuous output.", node->GetName().c_str()); + auto out_op_desc = node->GetOpDesc(); + GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED); + vector<int64_t> output_list = out_op_desc->GetOutputOffset(); + if ((out_op_desc->GetOutputsSize() > output_list.size()) || (output_list.size() == 0)) { + GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.", + out_op_desc->GetOutputsSize(), output_list.size()); + return ge::FAILED; } - size_t extra_memory_size = 0; - for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { - output_list[out_data_anchor->GetIdx()] = mem_offset_reuse; - size_t pre_mem_offset = mem_offset_reuse; - - // calculate tensor real size of each piece of data and out size of complete data - ge::ConstGeTensorDescPtr output_desc = op_desc->GetOutputDescPtr(out_data_anchor->GetIdx()); - GE_CHECK_NOTNULL(output_desc); - int64_t output_mem_size; - int64_t batch_dim_num = 1; - int64_t out_size; - if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) != - SUCCESS) { - GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].", - op_desc->GetName().c_str(), out_data_anchor->GetIdx()); - return FAILED; + int64_t mem_offset = 0; + bool is_nopadding = ((continuous_type & kTypeOutputNoPadding) != 0); + if (is_nopadding) { + // the output tensor memory must reuse the input tensor memory + if (GetFirstInputPeerOutOutputOffset(node, mem_offset) != SUCCESS) { + return ge::FAILED; } + } else { + // Get the reference type of the node, default is false + bool is_ref = false; + // If GetBool fail, is_ref is false.
+    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_REFERENCE, is_ref);
 
-    mem_offset_reuse += output_mem_size;
-    extra_memory_size = extra_memory_size + out_size - output_mem_size;
-
-    GELOGI("[IMAS]Virtual node optimize: set %s name[%s] output[%d] offset to [%zu], size[%ld], real_size[%ld].",
-           node->GetOwnerComputeGraph()->GetName().c_str(), op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
-           pre_mem_offset, out_size, output_mem_size);
-  }
-  op_desc->SetOutputOffset(output_list);
-  mem_offset_reuse += extra_memory_size;
-  size_t after_mem_offset = mem_offset_reuse;
-  GELOGI("After reassign virtual output node[name: %s, type: %s] memory, memory offset = %zu.",
-         op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset);
-  return SUCCESS;
-}
-
-Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() {
-  map<string, vector<NodePtr>> mem_reuse_virtual_output_nodes_map;
-  int64_t memory_type = RT_MEMORY_HBM;
-  for (const auto &n : compute_graph_->GetAllNodes()) {
-    OpDescPtr op_desc = n->GetOpDesc();
-    GE_CHECK_NOTNULL(op_desc);
-    bool attr_continuous = false;
-    bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, attr_continuous);
-    GE_IF_BOOL_EXEC(!get_continuous_flag, continue);
-    bool attr_reuse = false;
-    bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
-    GE_IF_BOOL_EXEC(!get_reuse_flag, continue);
-
-    if (attr_reuse && attr_continuous) {
-      auto in_data_anchor_list = n->GetAllInDataAnchors();
-      if (in_data_anchor_list.size() != kVirtualOutputNodeInputSize) {
-        // When current virtual node has several inputs, can't directly determine which input is the tensor for reuse.
-        std::string error = "Only one input is supported, current virtual node" + FmtToStr(n->GetName()) +
-                            " has " + FmtToStr(in_data_anchor_list.size()) + " inputs.";
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-      GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "output"), "Get node memory type failed.");
-      auto iter = memory_offset_.find(memory_type);
-      if (iter == memory_offset_.end()) {
-        std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM);
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-      GELOGD("Start to reassign memory for virtual output node, memory offset = %zu, memory type = %ld.",
-             iter->second.mem_offset_, memory_type);
-      string batch_label_string;
-      // Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter
-      (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
-      if (batch_label_string.empty()) {
-        size_t node_mem_offset = iter->second.mem_offset_;
-        // No ATTR_NAME_BATCH_LABEL, no need to reuse memory.
-        Status status = ReAssignVirtualOutputNodeMemory(n, node_mem_offset);
-        if (status != SUCCESS) {
-          GELOGE(FAILED, "Reassign memory of virtual output node failed, node name: %s.", n->GetName().c_str());
-          return FAILED;
-        }
-        iter->second.mem_offset_ = node_mem_offset;
-        AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
-        GELOGD("After reassign memory for virtual output node, align memory = %zu, memory type = %ld.",
-               iter->second.mem_offset_, memory_type);
-      } else {
-        // Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory.
-        string current_node_full_name = op_desc->GetName();
-        size_t pos = current_node_full_name.find(kMbatchNodeNameFlag);
-        if (pos == string::npos) {
-          std::string error = "Cannot find key string" + FmtToStr(kMbatchNodeNameFlag) +
-                              " of multi-batch in name of virtual output node, the node name is " + FmtToStr(n->GetName());
-          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-          return FAILED;
-        }
-        string fixed_name = current_node_full_name.substr(0, pos);
-        vector<NodePtr> parallel_virtual_output_nodes;
-        if (mem_reuse_virtual_output_nodes_map.count(fixed_name) != 0) {
-          parallel_virtual_output_nodes = mem_reuse_virtual_output_nodes_map[fixed_name];
-        }
-        parallel_virtual_output_nodes.emplace_back(n);
-        mem_reuse_virtual_output_nodes_map[fixed_name] = parallel_virtual_output_nodes;
-      }
+    // If the output is of ref type and reuses the memory of an input, the output and that
+    // input share the same name. When GE encounters a ref-type output, it matches input and
+    // output by name and assigns them the same memory address, e.g. HCOMBroadcast.
+    if (is_ref) {
+      GELOGI("Node %s does not need continuous output assignment because its output references an input by name.",
+             node->GetName().c_str());
+      return SUCCESS;
+    }
+    mem_offset = output_list[0];
   }
-  int32_t mem_reuse_model = 1;
-  if (ReAssignVirtualNodesMemory(mem_reuse_virtual_output_nodes_map, mem_reuse_model) != SUCCESS) {
-    GELOGE(FAILED, "Reassign memory of virtual output nodes failed.");
-    return FAILED;
-  }
-  return SUCCESS;
-}
-
-Status GraphMemoryAssigner::ReAssignVirtualNodesMemory(map<string, vector<NodePtr>> &mem_reuse_nodes_map,
-                                                       int32_t mem_reuse_model) {
-  // Find max batch label value
-  string max_batch_label;
-  GE_CHK_STATUS_RET(GetMaxBatchLabel(mem_reuse_nodes_map, mem_reuse_model, max_batch_label),
-                    "Get max batch label failed.");
-  PrintMemoryOffset();
-  vector<size_t> nodes_mem_offset_list;
-  for (auto &i_map : mem_reuse_nodes_map) {
-    vector<NodePtr> virtual_nodes_list = i_map.second;
-    int64_t memory_type = RT_MEMORY_HBM;
-    GE_CHK_STATUS_RET(GetNodeListMemoryType(virtual_nodes_list, mem_reuse_model, memory_type),
-                      "Get node list memory type failed.");
-    auto iter = memory_offset_.find(memory_type);
-    if (iter == memory_offset_.end()) {
-      std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM);
-      GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+  for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
+    output_list[out_data_anchor->GetIdx()] = mem_offset;
+    int64_t tensor_desc_size = 0;
+    int64_t nopadding_size = 0;
+    if (GetMemorySize(out_op_desc, out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx()), continuous_type,
+                      tensor_desc_size, nopadding_size) != ge::SUCCESS) {
       return FAILED;
     }
-    size_t max_batch_node_mem_offset = iter->second.mem_offset_;
-    nodes_mem_offset_list.emplace_back(max_batch_node_mem_offset);
-    for (auto &i_node : virtual_nodes_list) {
-      // Op_desc is not nullptr, it has been checked.
-      OpDescPtr op_desc = i_node->GetOpDesc();
-      string batch_label_string;
-      // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
-      (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
-      if (batch_label_string == max_batch_label) {
-        Status status = SUCCESS;
-        if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
-          status = ReAssignVirtualInputNodeMemory(i_node, max_batch_node_mem_offset);
-        } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
-          status = ReAssignVirtualOutputNodeMemory(i_node, max_batch_node_mem_offset);
-        } else {
-          std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
-          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-          return FAILED;
-        }
-        if (status != SUCCESS) {
-          GELOGE(FAILED, "Reassign memory of virtual node failed, node name: %s.", i_node->GetName().c_str());
-          return FAILED;
-        }
-        iter->second.mem_offset_ = max_batch_node_mem_offset;
-        AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
-        GELOGD("After reassign memory for virtual node, align memory = %zu, memory type = %ld.",
-               iter->second.mem_offset_, memory_type);
-        // Only assign memory of max batch nodes.
-        break;
-      }
-    }
-  }
-  PrintMemoryOffset();
-  size_t memory_reuse_index = 0;
-  for (auto &i_map : mem_reuse_nodes_map) {
-    vector<NodePtr> virtual_nodes_list = i_map.second;
-    for (auto &i_node : virtual_nodes_list) {
-      size_t remaining_batch_node_mem_offset = nodes_mem_offset_list[memory_reuse_index];
-      Status status = SUCCESS;
-      if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
-        status = ReAssignVirtualInputNodeMemory(i_node, remaining_batch_node_mem_offset);
-      } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
-        status = ReAssignVirtualOutputNodeMemory(i_node, remaining_batch_node_mem_offset);
-      } else {
-        std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-
-      if (status != SUCCESS) {
-        GELOGE(FAILED, "Reassign memory of virtual node failed, node name: %s.", i_node->GetName().c_str());
-        return FAILED;
-      }
+    if (is_nopadding) {
+      mem_offset += nopadding_size;
+    } else {
+      mem_offset += tensor_desc_size;
+      ge::AlignMemOffset(mem_offset);
     }
-    memory_reuse_index++;
+    GELOGI("[IMAS]Continuous output : Set %s name[%s] optype[%s] output[%d] offset to [%zu] stream_id[%ld] memtype[%ld]"
+           " size[%zu] realsize[%ld] nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(),
+           out_op_desc->GetName().c_str(), node->GetType().c_str(), out_data_anchor->GetIdx(),
+           output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId(), memory_type, 0UL,
+           is_nopadding ? nopadding_size : tensor_desc_size, is_nopadding);
   }
-  return SUCCESS;
+  out_op_desc->SetOutputOffset(output_list);
+  return ge::SUCCESS;
 }
 
 Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
@@ -946,7 +643,7 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
     GE_CHECK_NOTNULL(mem_assigner_);
     GE_CHECK_NOTNULL(mem_assigner_->GetPriorityAssinger());
     if ((atomic_mem_size != 0) && (iter_batch.first == mem_assigner_->GetPriorityAssinger()->GetMaxBatchLabel())) {
-      GE_CHK_STATUS_RET(SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}),
+      GE_CHK_STATUS_RET(SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}, RT_MEMORY_HBM),
                         "Failed to set attr for atomic addr clean node %s.", iter.first->GetName().c_str());
     }
   }
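A side note on the ge::AlignMemOffset calls in the continuous-output loop above: the align-up arithmetic is the standard round-to-multiple pattern. A hedged standalone sketch, assuming MEM_ALIGN_SIZE is the 512-byte alignment this patch refers to:

#include <cstddef>
#include <iostream>

constexpr std::size_t kMemAlignSize = 512;  // assumed value of GE's MEM_ALIGN_SIZE

std::size_t AlignUp(std::size_t offset, std::size_t align) {
  return (offset + align - 1) / align * align;  // smallest multiple of align >= offset
}

int main() {
  std::cout << AlignUp(1000, kMemAlignSize) << "\n";  // prints 1024
  std::cout << AlignUp(1024, kMemAlignSize) << "\n";  // prints 1024, already aligned
  return 0;
}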
@@ -1084,7 +781,7 @@ Status GraphMemoryAssigner::AssignConnectNetOutputAtomicMemory(vector<NodePtr> &
     }
 
     // All atomic nodes use atomic_addr_clean op independently, so we need to set the attr separately.
-    if (SetIndependentAtomicAttr(node, original_atomic_mem_start, mem_offset_end) != SUCCESS) {
+    if (SetIndependentAtomicAttr(node, original_atomic_mem_start, mem_offset_end, RT_MEMORY_HBM) != SUCCESS) {
       GELOGE(FAILED, "Failed to set atomic attr separately.");
       return FAILED;
     }
@@ -1231,9 +928,10 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node, ve
     output_list[output_index] = iter->second.mem_offset_;
     std::string batch_label;
     (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-    GELOGI("[IMAS]Atomic output : Set %s name[%s] output[%ld] offset to [%zu] stream_id[%ld] size[%ld] real_size[%ld]"
-           " batch[%s].", compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), output_index,
-           iter->second.mem_offset_, op_desc->GetStreamId(), size, size, batch_label.c_str());
+    GELOGI("[IMAS]Atomic output : Set %s name[%s] optype[%s] output[%ld] offset to [%zu] stream_id[%ld] memtype[%ld] "
+           "size[%ld] real_size[%ld] batch[%s].", compute_graph_->GetName().c_str(), op_desc->GetName().c_str(),
+           node->GetType().c_str(), output_index, iter->second.mem_offset_, op_desc->GetStreamId(), RT_MEMORY_HBM,
+           size, size, batch_label.c_str());
 
     iter->second.mem_offset_ += size;
     AlignMemOffset(MEM_ALIGN_SIZE, RT_MEMORY_HBM);
@@ -1309,10 +1007,10 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc
       std::string batch_label;
       (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
       GELOGI(
-          "[IMAS]Atomic ordinary workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
-          "size[%ld] real_size[%ld] batch[%s].",
-          compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index,
-          mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), workspace_size, workspace_size,
+          "[IMAS]Atomic ordinary workspace : Set %s name[%s] optype[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
+          "memtype[%ld] size[%ld] real_size[%ld] batch[%s].",
+          compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), workspace_index,
+          mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), RT_MEMORY_HBM, workspace_size, workspace_size,
           batch_label.c_str());
 
       mem_type_iter->second.mem_offset_ += workspace_size;
@@ -1350,10 +1048,10 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt
       std::string batch_label;
       (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
       GELOGI(
-          "[IMAS]Atomic fusion workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] size[%ld] "
-          "real_size[%ld] batch[%s].", compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index,
-          mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), workspace_size, workspace_size,
-          batch_label.c_str());
+          "[IMAS]Atomic fusion workspace : Set %s name[%s] optype[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
+          "memtype[%ld] size[%ld] real_size[%ld] batch[%s].", compute_graph_->GetName().c_str(),
+          op_desc->GetName().c_str(), op_desc->GetType().c_str(), workspace_index, mem_type_iter->second.mem_offset_,
+          op_desc->GetStreamId(), RT_MEMORY_HBM, workspace_size, workspace_size, batch_label.c_str());
 
       mem_type_iter->second.mem_offset_ += workspace_size;
       mem_offset_end.emplace_back(mem_type_iter->second.mem_offset_);
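The SetIndependentAtomicAttr changes in the hunks that follow build offset and size vectors from atomic_mem_start and the running end offsets collected in mem_offset_end. An illustrative sketch of that derivation (hypothetical numbers, not the GE implementation): each cleaned range starts where the previous one ended, and its size is the difference between consecutive end offsets:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::int64_t atomic_mem_start = 1024;                           // hypothetical region start
  const std::vector<std::int64_t> mem_offset_end = {1536, 2048, 4096};  // hypothetical running ends
  std::vector<std::int64_t> offsets;
  std::vector<std::int64_t> sizes;
  std::int64_t prev = atomic_mem_start;
  for (std::int64_t end : mem_offset_end) {
    offsets.push_back(prev);      // where this atomic output/workspace begins
    sizes.push_back(end - prev);  // bytes the atomic clean op must zero
    prev = end;
  }
  for (std::size_t i = 0; i < offsets.size(); ++i) {
    std::cout << "clean offset " << offsets[i] << " size " << sizes[i] << "\n";
  }
  return 0;
}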
@@ -1429,7 +1127,7 @@ ge::Status GraphMemoryAssigner::SetInputOffset() {
     return FAILED;
   }
   for (auto pair : memory_offset_) {
-    GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memory type[%ld]", compute_graph_->GetName().c_str(),
+    GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memtype[%ld]", compute_graph_->GetName().c_str(),
             pair.second.mem_offset_, pair.first);
   }
@@ -1598,7 +1296,7 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const {
 }
 
 Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start,
-                                                     const vector<int64_t> &mem_offset_end) {
+                                                     const vector<int64_t> &mem_offset_end, int64_t memory_type) {
   GELOGD("Start to set independent atomic attr, atomic_addr_clean memory offset start is %ld", atomic_mem_start);
 
   // Parsing offset and size vectors
@@ -1627,7 +1325,7 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in
       GELOGD("Current node memory_offset vector size is %zu, node name %s, node type is %s.",
              memory_offset_size.size(), peer_out_node_desc->GetName().c_str(), peer_out_node_desc->GetType().c_str());
       if (peer_out_node_desc->GetType() == ATOMICADDRCLEAN) {
-        if (SetAtomicCleanAttr(peer_out_node, memory_offset_start, memory_offset_size) != SUCCESS) {
+        if (SetAtomicCleanAttr(peer_out_node, memory_offset_start, memory_offset_size, memory_type) != SUCCESS) {
          GELOGE(FAILED, "Set atomic clean attr failed.");
          return FAILED;
        }
@@ -1638,7 +1336,7 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in
 }
 
 ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const vector<int64_t> &atomic_mem_start,
-                                                   const vector<int64_t> &atomic_mem_size) {
+                                                   const vector<int64_t> &atomic_mem_size, int64_t memory_type) {
   auto node_op_desc = node->GetOpDesc();
   if (node_op_desc != nullptr) {
     GELOGD("Node %s, set atomic clean attr start.", node->GetName().c_str());
@@ -1677,9 +1375,10 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const ve
     }
     string atomic_mem_size_str = ss.str();
 
-    GELOGI("[IMAS]SetAtomicCleanAttr : Set %s atomic_node name[%s] output[0] offset to [%s] streamid[%ld] size[%s]",
-           node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
-           atomic_mem_start_str.c_str(), node->GetOpDesc()->GetStreamId(), atomic_mem_size_str.c_str());
+    GELOGI("[IMAS]SetAtomicCleanAttr : Set %s atomic_node name[%s] optype[%s] output[0] offset to [%s] streamid[%ld]"
+           " memtype[%ld] size[%s]", node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
+           node->GetType().c_str(), atomic_mem_start_str.c_str(), node->GetOpDesc()->GetStreamId(), memory_type,
+           atomic_mem_size_str.c_str());
   }
   return SUCCESS;
 }
diff --git a/ge/graph/build/memory/graph_mem_assigner.h b/ge/graph/build/memory/graph_mem_assigner.h
index def24287..a380e594 100755
--- a/ge/graph/build/memory/graph_mem_assigner.h
+++ b/ge/graph/build/memory/graph_mem_assigner.h
@@ -119,31 +119,15 @@ class GraphMemoryAssigner {
   ///
   ge::Status ReAssignContinuousMemory(bool is_loop_graph);
 
-  ge::Status ReAssignReuseAndNoPaddingContinuousInputMemory();
-
-  ge::Status ReAssignReuseAndNoPaddingContinuousOutputMemory();
-
-  ge::Status ReAssignVirtualInputNodeMemory(NodePtr node, size_t &mem_offset_reuse);
-
-  ge::Status ReAssignVirtualOutputNodeMemory(NodePtr node, size_t &mem_offset_reuse);
-
-  ge::Status ReAssignVirtualNodesMemory(map<string, vector<NodePtr>> &mem_reuse_nodes_map, int32_t mem_reuse_model);
-
-  ge::Status GetMaxBatchLabel(const map<string, vector<NodePtr>> &mem_reuse_virtual_nodes_map,
-                              int32_t mem_reuse_model, string &max_batch_label);
-
-  ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc, int64_t dim_index,
-                                               int64_t &output_mem_size, int64_t &batch_dim_num, int64_t &out_size);
-
   ge::Status ReAssignAtomicMemory(bool is_loop_graph);
 
   ge::Status FilterAtomicNodesForMemoryAssign(map<int64_t, map<NodePtr, vector<NodePtr>>> &normal_atomic_nodes_map,
                                               map<string, vector<NodePtr>> &connecting_output_atomic_nodes);
 
   ge::Status AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start,
-                                         int64_t &continuous_mem_size, int64_t memory_type);
+                                         int64_t &continuous_mem_size, int64_t memory_type, uint32_t continuous_type);
 
-  ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node);
+  ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node, int64_t memory_type, uint32_t continuous_type);
 
   ///
   /// @brief check the input of node whether support atomic attr
@@ -169,10 +153,10 @@ class GraphMemoryAssigner {
   ge::Status AssignConnectNetOutputAtomicMemory(vector<NodePtr> &connect_netoutput_nodes);
 
   ge::Status SetIndependentAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start,
-                                      const std::vector<int64_t> &mem_offset_end);
+                                      const std::vector<int64_t> &mem_offset_end, int64_t memory_type);
 
   ge::Status SetAtomicCleanAttr(const ge::NodePtr &node, const std::vector<int64_t> &atomic_mem_start,
-                                const std::vector<int64_t> &atomic_mem_size);
+                                const std::vector<int64_t> &atomic_mem_size, int64_t memory_type);
 
   ge::Status IsIndependentAtomicClean(const ge::NodePtr &node, bool &is_independent_atomic_clean_node);
 
diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc
index 49abe17c..988f8d0e 100755
--- a/ge/graph/load/new_model_manager/davinci_model.cc
+++ b/ge/graph/load/new_model_manager/davinci_model.cc
@@ -1820,7 +1820,7 @@ void DavinciModel::GetUserDesignateShapeOrder(std::vector<std::string> &user_inp
 ///
 Status DavinciModel::InitAippInfo(uint32_t index, const OpDescPtr &op_desc) {
   if (!op_desc->HasAttr(ATTR_NAME_AIPP)) {
-    GELOGW("there is not AIPP related with index %u.", index);
+    GELOGW("There is no AIPP related to index %u.", index);
     return SUCCESS;
   }
 
@@ -1829,7 +1829,7 @@ Status DavinciModel::InitAippInfo(uint32_t index, const OpDescPtr &op_desc) {
   GE_CHK_BOOL_RET_STATUS(AttrUtils::GetNamedAttrs(op_desc, ATTR_NAME_AIPP, aipp_attr), GE_AIPP_NOT_EXIST,
                          "Data node do not contain param aipp!");
   GE_CHK_STATUS_RET(OpUtils::ConvertAippParams(aipp_attr, &aipp_params), "get aipp params failed");
-  GELOGI("node data: %s, type: %s, current index: %u, current node related input rank: %u",
+  GELOGI("Node data: %s, type: %s, current index: %u, current node related input rank: %u",
          op_desc->GetName().c_str(), op_desc->GetType().c_str(), index, aipp_params.related_input_rank());
 
   AippConfigInfo aipp_info;
@@ -2492,7 +2492,7 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r
     uint64_t buffer_length = buffer.length;
     void *buffer_addr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(buffer.data));
 
-    GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%lu] datasize[%lu]",
+    GELOGI("CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%lu] datasize[%lu]",
           runtime_param_.graph_id, output.first, output.second.GetBasicAddr(), data_size, buffer_length);
     GE_CHK_RT_RET(rtMemcpy(buffer_addr, buffer_length, output.second.GetBasicAddr(), data_size, kind));
     idx++;
diff --git a/metadef b/metadef
index dc6cceb6..fcd0833c 160000
--- a/metadef
+++ b/metadef
@@ -1 +1 @@
-Subproject commit dc6cceb67bc82b567bcbd6f415776644253e1467
+Subproject commit fcd0833cffcd201701f71d17db0c696c1bb01715
diff --git a/parser b/parser
index 4e72aae4..1601d66b 160000
--- a/parser
+++ b/parser
@@ -1 +1 @@
-Subproject commit 4e72aae41e78af1a19cd965da4a45cbd988b9a75
+Subproject commit 1601d66b6187c83cbf38e762beb5538ce2c7c573