diff --git a/ge/graph/build/memory/binary_block_mem_assigner.cc b/ge/graph/build/memory/binary_block_mem_assigner.cc
index 97a0aed6..fff589f3 100644
--- a/ge/graph/build/memory/binary_block_mem_assigner.cc
+++ b/ge/graph/build/memory/binary_block_mem_assigner.cc
@@ -69,8 +69,8 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector<int64_t> &range_ceils) {
     GELOGW("Vector all_memory_size is empty!");
     return SUCCESS;
   }
-  if ((all_memory_size.front() <= 0) || (log(kLogBase) == 0)) {
-    GELOGE(FAILED, "Memory size:%ld is invalid.", all_memory_size.front());
+  if ((all_memory_size.front() == 0) || (log(kLogBase) == 0)) {
+    GELOGE(FAILED, "divisor is 0!");
     return FAILED;
   }
   // Memory size is 512 aligned, so it is not necessary to take less than 512
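Note on the guard above: `GetMemoryRanges` derives the number of binary size ranges from a log ratio, so both values checked here end up as divisors. A minimal sketch of that calculation under this assumption (the helper name and layout are illustrative, not the actual GE implementation):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Assumes sizes is sorted ascending and log_base > 1 (kLogBase in GE).
bool GetRangeCount(const std::vector<int64_t> &sizes, double log_base, size_t &range_count) {
  if (sizes.empty() || (sizes.front() == 0) || (std::log(log_base) == 0)) {
    return false;  // both divisors below must be non-zero
  }
  double ratio = static_cast<double>(sizes.back()) / static_cast<double>(sizes.front());
  range_count = static_cast<size_t>(std::ceil(std::log(ratio) / std::log(log_base)));
  return true;
}
```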
diff --git a/ge/graph/build/memory/block_mem_assigner.cc b/ge/graph/build/memory/block_mem_assigner.cc
index 21d6a49e..76e7efbe 100755
--- a/ge/graph/build/memory/block_mem_assigner.cc
+++ b/ge/graph/build/memory/block_mem_assigner.cc
@@ -65,7 +65,10 @@ void AlignMemOffset(size_t &mem_align_size) {
 }
 
 static bool CompareLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) {
-  if (left.GetLifeBegin() < right.GetLifeBegin()) {
+  auto left_node_op_desc = left.node->GetOpDesc();
+  auto right_node_op_desc = right.node->GetOpDesc();
+  if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)
+      && (left_node_op_desc->GetId() < right_node_op_desc->GetId())) {
     return true;
   }
   return false;
@@ -97,14 +100,14 @@ bool CrossLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) {
   auto left_node_op_desc = left.node->GetOpDesc();
   auto right_node_op_desc = right.node->GetOpDesc();
   if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)) {
-    if (left.GetLifeBegin() < right.GetLifeBegin()) {
-      if (left.life_time_end >= right.GetLifeBegin()) {
+    if (left_node_op_desc->GetId() < right_node_op_desc->GetId()) {
+      if (left.life_time_end >= static_cast<size_t>(right_node_op_desc->GetId())) {
         return true;
       }
-    } else if (left.GetLifeBegin() == right.GetLifeBegin()) {
+    } else if (left_node_op_desc->GetId() == right_node_op_desc->GetId()) {
       return true;
     } else {
-      if (right.life_time_end >= left.GetLifeBegin()) {
+      if (right.life_time_end >= static_cast<size_t>(left_node_op_desc->GetId())) {
         return true;
       }
     }
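For reference, the cross-life-time test above reduces to a closed-interval overlap check between `[begin_l, end_l]` and `[begin_r, end_r]`, where begin is now taken directly from the op id. A standalone sketch with the same logic (the `Life` type is a hypothetical stand-in for `NodeTypeIndex`):

```cpp
#include <cstddef>

struct Life { size_t begin; size_t end; };  // hypothetical stand-in for NodeTypeIndex life times

// Mirrors CrossLifeTime: two closed intervals overlap when neither ends
// strictly before the other begins (equal begins always overlap).
bool CrossLife(const Life &l, const Life &r) {
  if (l.begin < r.begin) return l.end >= r.begin;
  if (l.begin == r.begin) return true;
  return r.end >= l.begin;
}
```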
@@ -322,7 +325,12 @@ void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_
 size_t MemoryBlock::GetLifeBegin() {
   size_t life_time = 0;
   if (!node_type_index_list_.empty()) {
-    life_time = node_type_index_list_.front().GetLifeBegin();
+    if (node_type_index_list_.front().node != nullptr) {
+      auto node_op_desc = node_type_index_list_.front().node->GetOpDesc();
+      if (node_op_desc != nullptr) {
+        life_time = node_op_desc->GetId();
+      }
+    }
   }
   return life_time;
 }
@@ -409,7 +417,7 @@ void MemoryBlock::AddDependLifeBegin(DependStreamLife &total_node_depend_stream_
   depend_stream_life_[stream_id_] = GetLifeBegin();
 }
 
-size_t MemoryBlock::GetLifeEnd() const {
+size_t MemoryBlock::GetLifeEnd() {
   if (!node_type_index_list_.empty()) {
     return node_type_index_list_.back().life_time_end;
   }
@@ -563,29 +571,32 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) {
     for (auto &out_anchor : n->GetAllOutDataAnchors()) {
       GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx());
-      int64_t size = 0;
-      GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed"));
-      GE_IF_BOOL_EXEC(size < 0, GELOGE(FAILED, "Node:%s size:%ld is invalid, maybe it is unknown shape node.",
-                                       node_op_desc->GetName().c_str(), size);
-                      return;);
-      batch_all_memory_size[batch_label].emplace_back(size);
-      if (batch_total_size.find(batch_label) == batch_total_size.end()) {
-        batch_total_size[batch_label] = size;
-      } else {
-        batch_total_size[batch_label] += size;
-      }
-
-      if (!anchor_to_symbol_.empty()) {
-        auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString());
-        if (iter1 == anchor_to_symbol_.end()) {
-          continue;
+      bool reuse_input = false;
+      GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInput(output_desc, reuse_input) != SUCCESS,
+                      GELOGI("Get reuse_input failed"));
+
+      if (!reuse_input) {
+        int64_t size = 0;
+        GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed"));
+        batch_all_memory_size[batch_label].emplace_back(size);
+        if (batch_total_size.find(batch_label) == batch_total_size.end()) {
+          batch_total_size[batch_label] = size;
+        } else {
+          batch_total_size[batch_label] += size;
         }
-        const std::string &symbol = iter1->second;
-        auto iter2 = symbol_size_.find(symbol);
-        if (iter2 == symbol_size_.end()) {
-          symbol_size_[symbol] = size;
-        } else if (size > static_cast<int64_t>(iter2->second)) {
-          iter2->second = size;
+
+        if (!anchor_to_symbol_.empty()) {
+          auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString());
+          if (iter1 == anchor_to_symbol_.end()) {
+            continue;
+          }
+          const std::string &symbol = iter1->second;
+          auto iter2 = symbol_size_.find(symbol);
+          if (iter2 == symbol_size_.end()) {
+            symbol_size_[symbol] = size;
+          } else if (size > static_cast<int64_t>(iter2->second)) {
+            iter2->second = size;
+          }
         }
       }
     }
@@ -626,17 +637,35 @@ bool IsDirectOutputNode(const NodePtr &node, int idx) {
   return false;
 }
 
-bool CanReuseBlock(size_t continuous_life_begin, const MemoryBlock &reusable_block, size_t block_size) {
+void AddReusableBlockCount(const MemoryBlock &mem_block, map<string, uint32_t> &reusable_block_counts) {
+  string key = std::to_string(mem_block.Size());
+  key += "_" + std::to_string(mem_block.stream_id_);
+  key += "_" + std::to_string(mem_block.memory_type_);
+  auto it = reusable_block_counts.find(key);
+  if (it != reusable_block_counts.end()) {
+    it->second++;
+  } else {
+    reusable_block_counts[key] = 1;
+  }
+}
+
+void ReduceReusableBlockCount(const MemoryBlock &mem_block, map<string, uint32_t> &reusable_block_counts) {
+  string key = std::to_string(mem_block.Size());
+  key += "_" + std::to_string(mem_block.stream_id_);
+  key += "_" + std::to_string(mem_block.memory_type_);
+  auto it = reusable_block_counts.find(key);
+  if (it != reusable_block_counts.end()) {
+    if (it->second > 0) {
+      it->second--;
+    }
+  }
+}
+
+bool CanReuseBySize(const map<string, uint32_t> &reusable_block_counts, const MemoryBlock &reusable_block,
+                    size_t block_size, size_t real_size, bool continuous) {
   bool can_reuse = false;
   if (reusable_block.Size() == block_size) {
-    // in some continuous input cases, the continuous first input node is not the same as the topo-first node.
-    if (continuous_life_begin > 0) {
-      if (continuous_life_begin > reusable_block.GetLifeEnd()) {
-        can_reuse = true;
-      }
-    } else {
-      can_reuse = true;
-    }
+    can_reuse = true;
   }
   return can_reuse;
 }
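The two helpers reinstated above keep a coarse census of idle blocks keyed by size, stream and memory type, while `CanReuseBySize` itself only checks exact size equality. A toy illustration of the counting scheme (helper names invented for the sketch):

```cpp
#include <cstdint>
#include <map>
#include <string>

// Key format mirrors AddReusableBlockCount: "<size>_<stream>_<memtype>".
std::string BlockKey(size_t size, int64_t stream_id, int64_t mem_type) {
  return std::to_string(size) + "_" + std::to_string(stream_id) + "_" + std::to_string(mem_type);
}

void OnBlockReleased(std::map<std::string, uint32_t> &counts, size_t size, int64_t stream, int64_t type) {
  ++counts[BlockKey(size, stream, type)];  // operator[] default-initializes the count to 0
}

void OnBlockReused(std::map<std::string, uint32_t> &counts, size_t size, int64_t stream, int64_t type) {
  auto it = counts.find(BlockKey(size, stream, type));
  if (it != counts.end() && it->second > 0) {
    --it->second;
  }
}
```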
@@ -647,13 +676,6 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
   if (n == nullptr || n->GetAllOutDataAnchors().size() <= 0) {
     return false;
   }
-  auto node_desc = n->GetOpDesc();
-  GE_IF_BOOL_EXEC(node_desc == nullptr, GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str());
-                  return false;);
-  std::vector<int64_t> offsets_for_fusion = {};
-  bool has_lx_fusion_attr =
-      AttrUtils::GetListInt(node_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion);
-
   if (static_cast<size_t>(out_index) < n->GetAllOutDataAnchors().size()) {
     auto out_anchor = n->GetOutDataAnchor(out_index);
     GE_IF_BOOL_EXEC(out_anchor == nullptr,
@@ -676,17 +698,16 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
                       return false;);
 
       // If GetBool fail, is_input_continuous is false.
-      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_input_continuous);
-      if (is_input_continuous) {
+      bool is_input_continuous_no_padding = false;
+      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT,
+                                   is_input_continuous_no_padding);
+      if (is_input_continuous_no_padding) {
         reset_zero_copy_flag = true;
-        has_lx_fusion_attr = true;
-      } else {
-        (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
+        return false;
       }
+      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
 
-      // lx_fusion memory only assigns the first input; some of broadcast's inputs are variable and some are not,
-      // so they are reassigned later
-      GE_IF_BOOL_EXEC(is_input_continuous &&
-                      (CheckIsZeroMemNodeType(peer_node->GetType()) ||
-                       (has_lx_fusion_attr && (peer_in_anchor->GetIdx() != 0))),
+      GE_IF_BOOL_EXEC(is_input_continuous && CheckIsZeroMemNodeType(peer_node->GetType()),
                       GELOGI("Node[%s] output[%u] no_need_assign_memory.", n->GetName().c_str(), out_index);
                       no_need_assign_memory = true;
                       return false;);
@@ -700,10 +721,6 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
         // Only set the attr once.
         if (node_continuous_input_blocks_[peer_in_node_desc->GetName()].size() == 0) {
           (void)ge::AttrUtils::SetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true);
-          // lx fusion case assigns max size for the first block, so reuse it as non-continuous
-          GE_IF_BOOL_EXEC(has_lx_fusion_attr,
-                          is_op_reuse_mem_ = IsContinuousMemoryReuse(n, peer_node, out_index);
-                          return false;);
           node_continuous_input_counts_[peer_in_node_desc->GetName()] = peer_node->GetAllInDataAnchorsSize();
         }
         peer_input_index = peer_in_anchor->GetIdx();
@@ -716,95 +733,6 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
   return false;
 }
 
-bool IsContinuousInputNodeMaxLife(const NodePtr &n, uint32_t out_index) {
-  if (n == nullptr) {
-    return false;
-  }
-
-  int64_t max_node_life_time = 0;
-  int64_t continuous_input_node_life_time = 0;
-  if (static_cast<size_t>(out_index) < n->GetAllOutDataAnchors().size()) {
-    auto out_anchor = n->GetOutDataAnchor(out_index);
-    if (out_anchor == nullptr) {
-      return false;
-    }
-
-    // continuous input node's life time should be max
-    for (auto const &peer_in_anchor : out_anchor->GetPeerInDataAnchors()) {
-      if ((peer_in_anchor == nullptr) || (peer_in_anchor->GetOwnerNode() == nullptr)) {
-        return false;
-      }
-      auto peer_in_node_desc = peer_in_anchor->GetOwnerNode()->GetOpDesc();
-      GE_IF_BOOL_EXEC(peer_in_node_desc == nullptr,
-                      GELOGE(FAILED, "Node[%s] output[%u] peer in node desc is null.", n->GetName().c_str(), out_index);
-                      return false;);
-
-      if (peer_in_node_desc->GetId() > max_node_life_time) {
-        max_node_life_time = peer_in_node_desc->GetId();
-      }
-
-      // If GetBool fail, is_input_continuous is false.
-      bool is_input_continuous = false;
-      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_input_continuous);
-      if (!is_input_continuous) {
-        (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
-      }
-      if (is_input_continuous) {
-        continuous_input_node_life_time = peer_in_node_desc->GetId();
-      }
-    }
-  }
-  return ((max_node_life_time != 0) && (continuous_input_node_life_time == max_node_life_time));
-}
-
-///
-/// @ingroup GE
-/// @brief Check whether continuous memory is reusable
-/// @return bool
-///
-bool BlockMemAssigner::IsContinuousMemoryReuse(const NodePtr &n, const NodePtr &peer_node, uint32_t out_index) {
-  // n, peer_node_desc have been checked
-  auto node_desc = n->GetOpDesc();
-  auto peer_node_desc = peer_node->GetOpDesc();
-  continuous_life_begin_ = static_cast<size_t>(node_desc->GetId());
-  // lx fusion case: check all continuous input nodes; the first input node's life time should be min
-  for (const auto &in_anchor : peer_node->GetAllInDataAnchors()) {
-    if ((in_anchor == nullptr) || (in_anchor->GetPeerOutAnchor() == nullptr) ||
-        (in_anchor->GetPeerOutAnchor()->GetOwnerNode() == nullptr) ||
-        (in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc() == nullptr)) {
-      GELOGE(FAILED, "Node[%s] output[%u] peer input node desc is null.", n->GetName().c_str(), out_index);
-      return false;
-    }
-    auto peer_out_node_desc = in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc();
-    ///
-    ///  node2  node1  node3
-    ///    |     /      /  |
-    ///    node5       node6
-    /// first input node's life time is not min
-    /// when node5's first input node2's life time is not min (node2 > node1), use node1's life time to reuse
-    ///
-    if (static_cast<size_t>(peer_out_node_desc->GetId()) < continuous_life_begin_) {
-      continuous_life_begin_ = static_cast<size_t>(peer_out_node_desc->GetId());
-      GELOGI(
-          "Node[%s] life[%ld] output[%u] is not continuous input node[%s] life[%ld]'s min life time,"
-          " min is node[%s] life[%zu]",
-          n->GetName().c_str(), node_desc->GetId(), out_index, peer_node_desc->GetName().c_str(),
-          peer_node_desc->GetId(), peer_out_node_desc->GetName().c_str(), continuous_life_begin_);
-    }
-    // when node3's output node5's life time is not max (node6 > node5), do not reuse
-    if (!IsContinuousInputNodeMaxLife(in_anchor->GetPeerOutAnchor()->GetOwnerNode(),
-                                      in_anchor->GetPeerOutAnchor()->GetIdx())) {
-      GELOGI(
-          "Node[%s] life[%ld] output[%u]'s continuous input node[%s] life[%ld] is not node[%s] output[%d]'s "
-          "max life node",
-          n->GetName().c_str(), node_desc->GetId(), out_index, peer_node_desc->GetName().c_str(),
-          peer_node_desc->GetId(), peer_out_node_desc->GetName().c_str(), in_anchor->GetPeerOutAnchor()->GetIdx());
-      return false;
-    }
-  }
-  return true;
-}
-
 ///
 /// @ingroup GE
 /// @brief Check pre_reuse flag & post_reuse flag for each symbol
@@ -1090,9 +1018,8 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
       GE_IF_BOOL_EXEC(reusable_block->batch_label_ != batch_label, continue);
 
       // A node can reuse blocks of the same stream and preorder streams
-      if (CanReuseBlock(continuous_life_begin_, *reusable_block, block_size)) {
-        reusable_block->AddNodeTypeIndex({n, mem_type, out_index, false, continuous_life_begin_},
-                                         real_size, no_align_size);
+      if (CanReuseBySize(reusable_block_counts_, *reusable_block, block_size, real_size, continuous)) {
+        reusable_block->AddNodeTypeIndex({n, mem_type, out_index, false}, real_size, no_align_size);
         if (mem_type == kOutput) {
           auto iter = anchor_to_symbol_.find(NodeIndexIO(n, out_index, kOut).ToString());
           if (iter != anchor_to_symbol_.end()) {
@@ -1101,6 +1028,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
         }
         reusable_block->continuous_block_ = continuous;
         reusable_block->ref_count_++;
+        ReduceReusableBlockCount(*reusable_block, reusable_block_counts_);
        reusable_blocks_[memory_type][stream_id].erase((++it).base());
         return reusable_block;
       }
@@ -1113,7 +1041,8 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
 
   // Data and netoutput need zero copy block
   block->is_zero_copy_ = IsZeroCopyBlock(n, continuous);
-  block->AddNodeTypeIndex({n, mem_type, out_index, false, continuous_life_begin_}, real_size, no_align_size);
+
+  block->Init(real_size, mem_type, n, out_index, no_align_size, node_op_desc->GetStreamId());
   block->stream_id_ = node_op_desc->GetStreamId();
   block->ref_count_++;
   block->continuous_block_ = continuous;
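Side note on the reuse path above: the candidate list is walked with a reverse iterator, and `erase((++it).base())` is the standard idiom for erasing the element a reverse iterator designates, since `vector::erase` needs a forward iterator. A minimal sketch:

```cpp
#include <vector>

// For a reverse_iterator rit, the element it designates is *(rit.base() - 1),
// which is exactly (++rit).base(); erase() therefore removes *rit itself.
void EraseAtReverse(std::vector<int> &v, std::vector<int>::reverse_iterator rit) {
  v.erase((++rit).base());
}
```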
@@ -1213,23 +1142,8 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
   std::string symbol;
   if (IsSymbolExist(node_index_io, symbol)) {
     block = symbol_blocks_[symbol];
-    GE_IF_BOOL_EXEC(block == nullptr, GELOGE(FAILED, "Node %s ref block is nullptr.", node_op_desc->GetName().c_str());
-                    return nullptr);
-    // reduce old size
-    size_t align_size = block->Size();
-    AlignMemOffset(align_size);
-    theory_memory_size_ -= align_size;
-
-    auto block_size = GetBlockSize(size, ranges);
-    block->SetSize(block_size);
-    block->SetLifeTimeEnd(life_time_);
-    block->AddNodeTypeIndex({n, kOutput, index, true, continuous_life_begin_}, size, no_align_size);
+    block->AddNodeTypeIndex({n, kOutput, index, true}, size, no_align_size);
     block->ref_count_++;
-
-    // add new size
-    align_size = block_size;
-    AlignMemOffset(align_size);
-    theory_memory_size_ += align_size;
   } else {
     int64_t max_size = size;
     int64_t memory_type = RT_MEMORY_HBM;
@@ -1282,6 +1196,7 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
         GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInputIndex(*owner_node_op_desc, dst_reuse_input_index) != SUCCESS,
                         GELOGI("Get dst_reuse_input_index failed"));
         if (dst_reuse_input && (dst_reuse_input_index == static_cast<uint32_t>(in_anchor->GetIdx()))) {
+          block->AddNodeTypeIndex({owner_node, kOutput, i, true}, block->Size(), block->Size());
           out_count_reuse_input += 1;
           reuse_input = true;
         }
@@ -1322,7 +1237,7 @@ bool IsAtomicOutputMemory(const ge::NodePtr &node, uint32_t output_index, bool i
       if (static_cast<uint32_t>(index) == output_index) {
         if (node->GetOwnerComputeGraph() != nullptr) {
           string graph_name = node->GetOwnerComputeGraph()->GetName();
-          GELOGD("Atomic no assign %s name[%s] output[%ld] streamid[%ld].", graph_name.c_str(),
+          GELOGD("[IMAS]Atomic no assign %s name[%s] output[%ld] streamid[%ld].", graph_name.c_str(),
                  op_desc->GetName().c_str(), index, op_desc->GetStreamId());
         }
         return true;
@@ -1360,6 +1275,7 @@ void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock
     if (to_release->same_stream_) {
       to_release->SetLifeTimeEnd(life_time_);
       reusable_memory.emplace_back(to_release);
+      AddReusableBlockCount(*to_release, reusable_block_counts_);
     }
   }
 }
@@ -1459,7 +1375,6 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector
   }
 
   is_op_reuse_mem_ = true;
-  continuous_life_begin_ = 0;
   if (op_reuse_env_valid_ == true) {
     vector<string>::iterator it_name =
         std::find(op_no_reuse_mem_vec_.begin(), op_no_reuse_mem_vec_.end(), op_desc->GetName());
@@ -1511,7 +1426,7 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector
       continue;
     }
     // atomic can't be reused
-    bool need_change = is_op_reuse_mem_ && is_atomic;
+    bool need_change = is_op_reuse_mem_ && out_node_set_continuous_input && is_atomic;
     if (need_change) {
       is_op_reuse_mem_ = false;
     }
@@ -1904,12 +1819,11 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block,
     }
     op_desc->SetWorkspace(workspace_list);
   }
-  GELOGI("[IMAS]Set %s name[%s] optype[%s] %s[%u] offset to [%ld] streamid[%ld] memtype[%ld] size[%zu] realsize[%zu] "
-         "noalignsize[%zu] life time begin[%s] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]",
-         graph_name.c_str(), op_desc->GetName().c_str(), node_type.node->GetType().c_str(),
-         node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(), block->memory_type_,
-         block->Size(), real_size, no_align_size, node_type.GetLifeBeginDesc().c_str(), end, child_block_level,
-         block->reuse_mem_, block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input,
+  GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu] noalignsize[%zu] "
+         "life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]", graph_name.c_str(),
+         op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(),
+         block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block_level, block->reuse_mem_,
+         block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input,
         block->batch_label_.c_str());
 }
diff --git a/ge/graph/build/memory/block_mem_assigner.h b/ge/graph/build/memory/block_mem_assigner.h
index 78584078..58bcda75 100755
--- a/ge/graph/build/memory/block_mem_assigner.h
+++ b/ge/graph/build/memory/block_mem_assigner.h
@@ -39,15 +39,14 @@ using DependStreamLife = std::map<int64_t, std::map<int64_t, size_t>>;
 enum OpMemoryType { kOutput, kWorkspace };
 
 struct NodeTypeIndex {
-  NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false, size_t begin = 0)
-      : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input), life_time_begin(begin) {}
+  NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false)
+      : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input) {}
 
   ge::NodePtr node = nullptr;
   OpMemoryType mem_type = kOutput;
   uint32_t index = 0;
-  bool ref_input = false;
-  size_t life_time_begin = 0;
   size_t life_time_end = kMaxLifeTime;
+  bool ref_input = false;
 
   const string GetMemType() const {
     if (mem_type == kOutput) {
       return "output";
@@ -56,34 +55,6 @@ struct NodeTypeIndex {
     }
     return "unknown";
   }
-
-  size_t GetLifeBegin() const {
-    if ((node == nullptr) || (node->GetOpDesc() == nullptr)) {
-      return 0;
-    }
-
-    if ((life_time_begin > 0) && (life_time_begin < static_cast<size_t>(node->GetOpDesc()->GetId()))) {
-      return life_time_begin;
-    } else {
-      return node->GetOpDesc()->GetId();
-    }
-  }
-
-  std::string GetLifeBeginDesc() const {
-    if (node == nullptr) {
-      return "";
-    }
-    auto node_op_desc = node->GetOpDesc();
-    if (node_op_desc != nullptr) {
-      auto life_begin = GetLifeBegin();
-      if (life_begin != static_cast<size_t>(node_op_desc->GetId())) {
-        return std::to_string(life_begin) + "-" + std::to_string(node_op_desc->GetId());
-      } else {
-        return std::to_string(node_op_desc->GetId());
-      }
-    }
-    return "";
-  }
 };
 
 class MemoryBlock {
@@ -115,13 +86,16 @@ class MemoryBlock {
     symbol_list_.clear();
   }
 
-  size_t Size() const { return block_size_; }
-
-  void SetSize(size_t size) {
-    if (size > block_size_) {
-      block_size_ = size;
+  void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size,
+            int64_t stream_id) {
+    real_size_list_.emplace_back(real_size);
+    no_align_size_list_.emplace_back(no_align_size);
+    node_type_index_list_.emplace_back(node, type, out_index, false);
+    if (stream_id != stream_id_) {
+      same_stream_ = false;
     }
   }
+  size_t Size() const { return block_size_; }
 
   size_t AlignSize() const;
@@ -169,7 +143,7 @@ class MemoryBlock {
 
   size_t GetLifeBegin();
 
-  size_t GetLifeEnd() const;
+  size_t GetLifeEnd();
 
   void AddDependLifeBegin(DependStreamLife &node_depend_stream_life);
@@ -432,7 +406,6 @@ class BlockMemAssigner : public MemAssigner {
   bool IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t out_index, std::string &peer_name,
                                    uint32_t &peer_input_index, bool &no_need_assign_memory, bool &reset_zero_copy_flag);
-  bool IsContinuousMemoryReuse(const NodePtr &n, const NodePtr &peer_node, uint32_t out_index);
   ///
   /// @ingroup GE
   /// @|+++++++++block1++++++++|    |+++++++++block1++++++++|
@@ -452,6 +425,8 @@ class BlockMemAssigner : public MemAssigner {
   std::unordered_map<int64_t, std::map<int64_t, std::vector<MemoryBlock *>>> reusable_blocks_;
 
+  std::map<std::string, uint32_t> reusable_block_counts_;
+
   std::unordered_map<int64_t, std::map<int64_t, std::vector<MemoryBlock *>>> stream_workspace_blocks_;
 
   std::unordered_map<std::string, std::vector<MemoryBlock *>> node_out_blocks_;
@@ -481,7 +456,6 @@ class BlockMemAssigner : public MemAssigner {
 
   std::string max_batch_label_;
 
-  size_t continuous_life_begin_ = 0;
   ///
   /// @  [stream1][nodeid]
   /// @[nodeid]   [stream2][nodeid]
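The reinstated `MemoryBlock::Init` above records one hosted tensor per call and demotes the block from "same stream" as soon as any producer lives on a different stream. A toy mirror of that bookkeeping (a simplified stand-in, not the real class):

```cpp
#include <cstdint>
#include <vector>

struct ToyBlock {
  int64_t stream_id = 0;     // stream the block was created on
  bool same_stream = true;   // cleared once producers span streams
  std::vector<size_t> real_sizes;

  void Init(size_t real_size, int64_t producer_stream) {
    real_sizes.push_back(real_size);
    if (producer_stream != stream_id) {
      same_stream = false;   // cross-stream reuse needs extra life-time care
    }
  }
};
```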
diff --git a/ge/graph/build/memory/graph_mem_assigner.cc b/ge/graph/build/memory/graph_mem_assigner.cc
index f94eb275..98d073d4 100755
--- a/ge/graph/build/memory/graph_mem_assigner.cc
+++ b/ge/graph/build/memory/graph_mem_assigner.cc
@@ -35,9 +35,10 @@ namespace {
 const int kAllInputAddrIsAtomic = -1;
 const int kVirtualInputNodeMemoryReuse = 0;
 const int kVirtualOutputNodeMemoryReuse = 1;
-// One state per bit; states cannot repeat
-enum ContinuousType { kTypeInput = 1, kTypeInputNoPadding = 2, kTypeOutput = 4, kTypeOutputNoPadding = 8 };
-
+const size_t kVirtualInputNodeOutputSize = 1;
+const size_t kVirtualOutputNodeInputSize = 1;
+const size_t kVirtualNodeDataIndex = 0;
+const char *const kMbatchNodeNameFlag = "_ascend_mbatch_batch_";
 int64_t GetSymbolOutputOffset(const std::map<std::string, std::string> &anchor_to_symbol,
                               const std::map<std::string, std::list<NodeIndexIO>> &symbol_to_anchors,
                               const ge::NodePtr &node, const uint32_t i) {
@@ -135,7 +136,7 @@ ge::Status GraphMemoryAssigner::AssignVarAttr2Nodes() {
   return ge::SUCCESS;
 }
 
-ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc,
+ge::Status GraphMemoryAssigner::CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc,
                                              int64_t dim_index, int64_t &output_mem_size,
                                              int64_t &batch_dim_num, int64_t &out_size) {
   graphStatus graph_status = ge::TensorUtils::GetSize(*output_desc, out_size);
@@ -180,6 +181,68 @@ ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &out
   return SUCCESS;
 }
 
+Status GraphMemoryAssigner::GetMaxBatchLabel(const map<string, vector<NodePtr>> &mem_reuse_virtual_nodes_map,
+                                             int32_t mem_reuse_model, string &max_batch_label) {
+  for (auto &i_map : mem_reuse_virtual_nodes_map) {
+    vector<NodePtr> virtual_nodes_list = i_map.second;
+    vector<int64_t> max_shape_dims;
+    size_t max_batch_dim = 0;
+    bool max_batch_dim_find = false;
+    for (size_t i = 0; i < virtual_nodes_list.size(); ++i) {
+      GE_CHECK_NOTNULL(virtual_nodes_list[i]);
+      OpDescPtr op_desc = virtual_nodes_list[i]->GetOpDesc();
+      GE_CHECK_NOTNULL(op_desc);
+
+      ge::ConstGeTensorDescPtr input_output_desc;
+      if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
+        input_output_desc = op_desc->GetOutputDescPtr(kVirtualNodeDataIndex);
+      } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
+        input_output_desc = op_desc->GetInputDescPtr(kVirtualNodeDataIndex);
+      } else {
+        std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
+        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+        return FAILED;
+      }
+      GE_CHECK_NOTNULL(input_output_desc);
+
+      if (i == 0) {
+        // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
+        (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, max_batch_label);
+        max_shape_dims = input_output_desc->GetShape().GetDims();
+      } else {
+        vector<int64_t> current_shape_dims = input_output_desc->GetShape().GetDims();
+        if (current_shape_dims.size() != max_shape_dims.size()) {
+          std::string error = "The shape of several nodes between multiple batches does not match.";
+          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+          return FAILED;
+        }
+        for (size_t j = 0; j < current_shape_dims.size(); ++j) {
+          if (current_shape_dims[j] == max_shape_dims[j]) {
+            continue;
+          }
+          if (max_batch_dim_find && max_batch_dim != j) {
+            std::string error = "The shape of several nodes between multiple batches does not match.";
+            GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+            return FAILED;
+          }
+          max_batch_dim_find = true;
+          max_batch_dim = j;
+          if (current_shape_dims[j] > max_shape_dims[j]) {
+            max_shape_dims[j] = current_shape_dims[j];
+            // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
+            (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, max_batch_label);
+          }
+          // Only compare the first different dim in shape.
+          break;
+        }
+      }
+    }
+    // In every element of virtual_input_nodes_map, the label of the max batch node is the same.
+    break;
+  }
+  return SUCCESS;
+}
+
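The scan added in `GetMaxBatchLabel` relies on an invariant: copies of one virtual node across batches may differ in exactly one shape dimension. A self-contained sketch of that comparison (names invented for the sketch):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Returns false when shapes differ in rank or in more than one dimension;
// otherwise keeps the running per-dimension maximum in max_dims.
bool UpdateMaxShape(std::vector<int64_t> &max_dims, const std::vector<int64_t> &cur,
                    bool &dim_found, size_t &batch_dim) {
  if (cur.size() != max_dims.size()) return false;
  for (size_t j = 0; j < cur.size(); ++j) {
    if (cur[j] == max_dims[j]) continue;
    if (dim_found && batch_dim != j) return false;  // a second differing dim: mismatch
    dim_found = true;
    batch_dim = j;
    if (cur[j] > max_dims[j]) max_dims[j] = cur[j];
    break;  // only the first differing dim is compared
  }
  return true;
}
```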
 Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map<int64_t, size_t> &mem_type_to_offset) {
   if (memory_offset_.empty()) {
     GELOGE(FAILED, "memory_offset_ is empty.");
@@ -187,6 +250,13 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map<int64_t, size
             std::to_string(VarManager::Instance(session_id)->GetGraphMemoryMaxSize())});
-      GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memtype[%ld]", compute_graph_->GetName().c_str(),
-              iter.second, iter.first);
     }
     return ge::FAILED;
   }
@@ -245,137 +313,22 @@ Status GraphMemoryAssigner::AssignZeroCopyMemory(map<int64_t, size_t> &mem_offse
   return SUCCESS;
 }
 
-uint32_t GetContinuousMemoryType(const OpDescPtr &op_desc) {
-  if (op_desc == nullptr) {
-    return 0;
-  };
-
-  bool is_continuous = false;
-  uint32_t continuous_type = 0;
-  // If GetBool fail, is_continuous is false.
-  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_continuous);
-  if (is_continuous) {
-    continuous_type |= kTypeInput;
-  } else {
-    (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_continuous);
-    if (is_continuous) {
-      bool attr_reuse = false;
-      (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
-      if (attr_reuse) {
-        continuous_type |= kTypeInputNoPadding;
-      }
-    }
-  }
-
-  is_continuous = false;
-  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_continuous);
-  if (is_continuous) {
-    continuous_type |= kTypeOutput;
-  } else {
-    (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, is_continuous);
-    if (is_continuous) {
-      bool attr_reuse = false;
-      (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
-      if (attr_reuse) {
-        continuous_type |= kTypeOutputNoPadding;
-      }
-    }
-  }
-
-  if (continuous_type != 0) {
-    GELOGI("Current node %s continuous type %d.", op_desc->GetName().c_str(), continuous_type);
-  }
-  return continuous_type;
-}
-
-Status GetMemorySize(const OpDescPtr &op_desc, const ge::ConstGeTensorDescPtr &output_desc, uint32_t continuous_type,
-                     int64_t &tensor_size, int64_t &nopadding_size) {
-  if ((op_desc == nullptr) || (output_desc == nullptr)) {
-    GELOGE(FAILED, "Input para is nullptr.");
-    return FAILED;
-  }
-  tensor_size = 0;
-  nopadding_size = 0;
-  bool is_nopadding = ((continuous_type & kTypeInputNoPadding) != 0) || ((continuous_type & kTypeOutputNoPadding) != 0);
-  if (is_nopadding) {
-    int64_t attr_dim_index;
-    bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index);
-    if (!get_attr_dim_flag) {
-      GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed.");
-      return FAILED;
-    }
-
-    // Calculate tensor real size of each piece of data and out size of complete data
-    int64_t batch_dim_num = 1;
-    if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, nopadding_size, batch_dim_num, tensor_size) !=
-        SUCCESS) {
-      GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s.", op_desc->GetName().c_str());
-      return FAILED;
-    }
-  } else {
-    if (ge::TensorUtils::GetSize(*output_desc, tensor_size) != ge::SUCCESS) {
-      GELOGE(FAILED, "GetSize failed.");
-      return FAILED;
-    }
-  }
-  if ((tensor_size < 0) || (nopadding_size < 0)) {
-    GELOGE(FAILED, "GetMemorySize for node %s failed.", op_desc->GetName().c_str());
-    return FAILED;
-  }
-  return SUCCESS;
-}
-
-void AlignMemOffset(int64_t &mem_align_size) {
-  if (mem_align_size <= 0) {
-    return;
-  }
-  mem_align_size = (mem_align_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
-}
-
-bool IsContinuousInputConflict(const ge::NodePtr &node, const OpDescPtr &peer_op_desc) {
-  bool is_peer_output_continuous = false;
-  // If GetBool fail, is_peer_output_continuous is false.
-  (void) ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_peer_output_continuous);
-
-  // Get peer node output size. If size == 1 (peer node has only one output), continuous input of this node and
-  // continuous output of the previous node are the same, and we can support it. If size != 1, there may be
-  // a conflict between the two, which we cannot support now.
-  auto peer_output_size = peer_op_desc->GetOutputsSize();
-  GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1),
-                  std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
-                      " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
-                      " requires continuous output. There may be conflict between the two. " +
-                      "This node is not supported now.";
-                  GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-                  return true;);
-
-  bool is_peer_reference = false;
-  // If GetBool fail, is_peer_reference is false.
-  (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference);
-  GE_IF_BOOL_EXEC(is_peer_reference,
-                  std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
-                      " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
-                      " requires continuous output. There may be conflict between the two. " +
-                      "This node is not supported now.";
-                  GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-                  return true;);
-  return false;
-}
-
 Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
   Status ret;
   for (auto &node : compute_graph_->GetAllNodes()) {
-    GE_CHECK_NOTNULL(node);
-    auto continuous_type = GetContinuousMemoryType(node->GetOpDesc());
+    // Get the continuous input type of the node, default is false
+    bool is_input_continuous = false;
+    GE_CHECK_NOTNULL(node->GetOpDesc());
+    // If GetBool fail, is_input_continuous is false.
+    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
 
     // Assign continuous input memory
-    bool continuous_input = ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0);
-    int64_t memory_type = RT_MEMORY_HBM;
-    GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "input"), "Get node memory type failed.");
-    if (continuous_input) {
+    if (is_input_continuous) {
+      int64_t memory_type = RT_MEMORY_HBM;
+      GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "input"), "Get node memory type failed.");
       int64_t mem_clean_start = 0;
       int64_t mem_clean_size = 0;
-      ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type, continuous_type);
+      ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type);
       if (ret != ge::SUCCESS) {
         GELOGE(ret, "Assign continuous input memory failed!");
         return ret;
@@ -385,6 +338,7 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
       vector<int64_t> input_indexes;
       // If GetListInt fail, input_indexes is empty.
       (void) ge::AttrUtils::GetListInt(node->GetOpDesc(), ATOMIC_ATTR_INPUT_INDEX, input_indexes);
+
       if (!input_indexes.empty() && input_indexes[0] == kAllInputAddrIsAtomic) {
         // check whether there is an atomic conflict between the current node and the peer out node
         if (!CheckInputIsSupportAtomic(node)) {
@@ -396,10 +350,9 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
         const auto &in_control_anchor = node->GetInControlAnchor();
         GE_CHECK_NOTNULL(in_control_anchor);
         for (const auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) {
-          GE_CHECK_NOTNULL(peer_out_control_anchor);
           auto peer_out_node = peer_out_control_anchor->GetOwnerNode();
           if (peer_out_node->GetType() == ATOMICADDRCLEAN) {
-            ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size}, memory_type);
+            ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size});
             if (ret != SUCCESS) {
               GELOGE(ret, "Failed to set attr for atomic addr clean node %s.", peer_out_node->GetName().c_str());
               return ret;
@@ -409,12 +362,23 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
       }
     }
 
-    // Assign continuous output memory
-    bool continuous_output = ((continuous_type & kTypeOutput) != 0) || ((continuous_type & kTypeOutputNoPadding) != 0);
-    if (continuous_output) {
-      ret = AssignContinuousOutputMemory(node, memory_type, continuous_type);
+    // Get the reference type of the node, default is false
+    bool is_ref = false;
+    // If GetBool fail, is_ref is false.
+    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_REFERENCE, is_ref);
+
+    // Get the continuous output type of the node, default is false
+    bool is_output_continuous = false;
+    // If GetBool fail, is_output_continuous is false.
+    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous);
+
+    // If the output is ref type and refers to the ref of an input, the names of the output
+    // and the input are the same. GE encounters the ref type, finds the matching relationship according
+    // to the names of input and output, and allocates the same memory address, e.g. HCOMBroadcast.
+    if (!is_ref && is_output_continuous) {  // Assign continuous output memory
+      ret = AssignContinuousOutputMemory(node);
       if (ret != ge::SUCCESS) {
-        GELOGE(ret, "Assign continuous output memory failed!");
+        GELOGE(ret, "Assign reference memory failed!");
         return ret;
       }
     }
@@ -427,181 +391,520 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
 }
 
 Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start,
-                                                        int64_t &continuous_mem_size, int64_t memory_type,
-                                                        uint32_t continuous_type) {
+                                                        int64_t &continuous_mem_size, int64_t memory_type) {
   GELOGI("Current node %s needs continuous input.", node->GetName().c_str());
+  bool continuous_input_alloc = false;
+  (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, continuous_input_alloc);
   auto iter = memory_offset_.find(memory_type);
   if (iter == memory_offset_.end()) {
     std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
     GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
     return FAILED;
   }
-  // The head and tail of hcom continuous input should be added 512
-  iter->second.mem_offset_ += MEM_ALIGN_SIZE;
   continuous_mem_start = iter->second.mem_offset_;
-  int64_t mem_offset = iter->second.mem_offset_;
-  int64_t extra_memory_size = 0;
-  bool is_continuous_input_allocated = false;
-  (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, is_continuous_input_allocated);
   for (auto &in_data_anchor : node->GetAllInDataAnchors()) {
-    GE_IF_BOOL_EXEC(in_data_anchor == nullptr, continue);
     auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
     GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, continue);
+
     auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
     GE_IF_BOOL_EXEC(peer_op_desc == nullptr, continue);
-    GE_IF_BOOL_EXEC(IsContinuousInputConflict(node, peer_op_desc), return PARAM_INVALID;);
+    bool is_peer_output_continuous = false;
+    // If GetBool fail, is_peer_output_continuous is false.
+    (void) ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_peer_output_continuous);
+
+    // Get peer node output size. If size == 1 (peer node has only one output), continuous input of this node and
+    // continuous output of the previous node are the same, and we can support it. If size != 1, there may be
+    // a conflict between the two, which we cannot support now.
+    auto peer_output_size = peer_op_desc->GetOutputsSize();
+    GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1),
+                    std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
+                        " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
+                        " requires continuous output. There may be conflict between the two. " +
+                        "This node is not supported now.";
+                    GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+                    return PARAM_INVALID;);
+
+    bool is_peer_reference = false;
+    // If GetBool fail, is_peer_reference is false.
+    (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference);
+    GE_IF_BOOL_EXEC(is_peer_reference,
+                    std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
+                        " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
+                        " requires continuous output. There may be conflict between the two. " +
+                        "This node is not supported now.";
+                    GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+                    return PARAM_INVALID;);
+
+    vector<int64_t> output_list = peer_op_desc->GetOutputOffset();
+    std::vector<int64_t> offsets_for_fusion = {};
+    bool has_offset_attr =
+        AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion);
+    if (peer_out_data_anchor->GetIdx() < static_cast<int>(output_list.size())) {
+      if (continuous_input_alloc && !has_offset_attr) {
+        if (in_data_anchor->GetIdx() == 0) {
+          continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx());
+        }
+        // can not use "else if" here, in case there is only one input
+        if (in_data_anchor->GetIdx() == static_cast<int>(node->GetAllInDataAnchors().size()) - 1) {
+          int64_t tensor_desc_size = 0;
+          Status ret = ge::TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())),
+                                                tensor_desc_size);
+          GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;);
+
+          tensor_desc_size = (tensor_desc_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
+          continuous_mem_size =
+              output_list.at(peer_out_data_anchor->GetIdx()) - continuous_mem_start + tensor_desc_size + MEM_ALIGN_SIZE;
+        }
+        GELOGI(
+            "[IMAS]Check Continuous input : Set %s name[%s] output[%d] offset to [%ld] stream_id[%ld] size[%u] "
+            "real_size[%u].",
+            node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
+            peer_out_data_anchor->GetIdx(), output_list.at(peer_out_data_anchor->GetIdx()),
+            peer_op_desc->GetStreamId(), 0, 0);
+        continue;
+      }
+
+      output_list.at(peer_out_data_anchor->GetIdx()) = iter->second.mem_offset_;
+    } else {
+      std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
+      GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+      GELOGE(FAILED, "index : %d is out of range.", peer_out_data_anchor->GetIdx());
+      return FAILED;
+    }
+    peer_op_desc->SetOutputOffset(output_list);
+    size_t pre_mem_offset = iter->second.mem_offset_;
 
     int64_t tensor_desc_size = 0;
-    int64_t nopadding_size = 0;
-    int64_t real_size = 0;
-    std::vector<int64_t> offsets_of_fusion = {};
-    bool lx_fusion = AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_of_fusion);
-    lx_fusion = lx_fusion && !offsets_of_fusion.empty();
-    if (lx_fusion) {
-      if (peer_out_data_anchor->GetIdx() >= static_cast<int>(offsets_of_fusion.size())) {
+    if (has_offset_attr) {
+      if (peer_out_data_anchor->GetIdx() < static_cast<int>(offsets_for_fusion.size())) {
+        auto offset_for_fusion = offsets_for_fusion[peer_out_data_anchor->GetIdx()];
+        iter->second.mem_offset_ += offset_for_fusion;
+      } else {
         std::string error = "fusion: peer node" + FmtToStr(peer_op_desc->GetName()) +
             " index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
         GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
         return FAILED;
       }
-      nopadding_size = offsets_of_fusion[peer_out_data_anchor->GetIdx()];
-      tensor_desc_size = nopadding_size;
     } else {
-      if (GetMemorySize(node->GetOpDesc(), peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx()),
-                        continuous_type, tensor_desc_size, nopadding_size) != ge::SUCCESS) {
-        return FAILED;
-      }
+      Status ret =
+          TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), tensor_desc_size);
+      GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;);
+
+      iter->second.mem_offset_ += tensor_desc_size;
     }
-    bool is_nopadding = ((continuous_type & kTypeInputNoPadding) != 0) || lx_fusion;
-    vector<int64_t> output_list = peer_op_desc->GetOutputOffset();
-    if (peer_out_data_anchor->GetIdx() >= static_cast<int>(output_list.size())) {
-      std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
-      GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-      return FAILED;
-    }
+
+    // If tensor_actual_size is set, memory alignment is not required.
+    int32_t is_tensor_actual_size = 0;
+    ge::AttrUtils::GetInt(peer_op_desc, ATTR_NAME_GET_TENSOR_ACTUAL_SIZE, is_tensor_actual_size);
+    if (is_tensor_actual_size == 0) {
+      AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
+    }
+    GELOGI(
+        "[IMAS]Continuous input : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] "
+        "real_size[%ld].", node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
+        peer_out_data_anchor->GetIdx(), pre_mem_offset, peer_op_desc->GetStreamId(),
+        (iter->second.mem_offset_ - pre_mem_offset), tensor_desc_size);
+  }
+
+  iter->second.mem_offset_ += MEM_ALIGN_SIZE;
+  if (!continuous_input_alloc) {
+    continuous_mem_size = iter->second.mem_offset_ - continuous_mem_start;
+  }
+  return SUCCESS;
+}
+
+Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node) {
+  GELOGI("Current node %s needs continuous output.", node->GetName().c_str());
+  auto out_op_desc = node->GetOpDesc();
+  GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED);
+  vector<int64_t> output_list = out_op_desc->GetOutputOffset();
+
+  if ((out_op_desc->GetOutputsSize() > output_list.size()) || (output_list.size() == 0)) {
+    GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.",
+           out_op_desc->GetOutputsSize(), output_list.size());
+    return ge::FAILED;
+  }
+
+  size_t mem_offset = output_list[0];
+  for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
+    output_list[out_data_anchor->GetIdx()] = mem_offset;
+    int64_t tensor_desc_size = 0;
+    if (ge::TensorUtils::GetSize(*(out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx())), tensor_desc_size) !=
+        ge::SUCCESS) {
+      GELOGE(FAILED, "GetSize failed.");
+      return FAILED;
+    }
+    mem_offset += tensor_desc_size;
+    if (mem_offset <= 0) {
+      return FAILED;
+    }
+    mem_offset = (mem_offset + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
+    GELOGI(
+        "[IMAS]Continuous output : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] "
+        "real_size[%ld].",
+        node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
+        output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId(), tensor_desc_size, tensor_desc_size);
+  }
+  out_op_desc->SetOutputOffset(output_list);
+  return ge::SUCCESS;
+}
 
-    // when continuous input has been allocated, the first input is the beginning offset
-    bool is_allocated_first_input = is_continuous_input_allocated && (in_data_anchor->GetIdx() == 0);
-    if (is_allocated_first_input) {
-      mem_offset = output_list.at(peer_out_data_anchor->GetIdx());
-      continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx());
-    } else {
-      // set offset for input
-      output_list.at(peer_out_data_anchor->GetIdx()) = mem_offset;
-      peer_op_desc->SetOutputOffset(output_list);
-    }
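Both branches above rely on the same round-up alignment, `(size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE`. A minimal sketch of that formula (MEM_ALIGN_SIZE is 512 in this code base):

```cpp
#include <cstdint>

// Rounds size up to the next multiple of align; assumes align > 0.
int64_t AlignUp(int64_t size, int64_t align) {
  if (size <= 0 || align <= 0) return size;
  return (size + align - 1) / align * align;
}
// AlignUp(1, 512) == 512, AlignUp(512, 512) == 512, AlignUp(513, 512) == 1024.
```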
+Status GraphMemoryAssigner::ReAssignVirtualInputNodeMemory(NodePtr node, size_t &mem_offset_reuse) {
+  OpDescPtr op_desc = node->GetOpDesc();
+  vector<int64_t> output_list = op_desc->GetOutputOffset();
+  if (output_list.empty()) {
+    GELOGE(FAILED, "Output offset is empty, node name: %s", node->GetName().c_str());
+    return FAILED;
+  }
+  output_list.at(0) = mem_offset_reuse;
+  op_desc->SetOutputOffset(output_list);
+  GELOGI("Set virtual input node %s output offset to %zu.", op_desc->GetName().c_str(), mem_offset_reuse);
+
+  int64_t attr_dim_index;
+  bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index);
+  if (!get_attr_dim_flag) {
+    GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed.");
+    return FAILED;
+  }
+
+  size_t extra_memory_size = 0;
+  for (const auto &in_data_anchor : node->GetAllInDataAnchors()) {
+    auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
+    GE_CHECK_NOTNULL(peer_out_data_anchor);
+    auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
+    GE_CHECK_NOTNULL(peer_op_desc);
+    vector<int64_t> output_offsets = peer_op_desc->GetOutputOffset();
+    if (peer_out_data_anchor->GetIdx() >= static_cast<int>(output_offsets.size())) {
+      GELOGE(ge::FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx());
+      return ge::FAILED;
+    }
+    output_offsets.at(peer_out_data_anchor->GetIdx()) = mem_offset_reuse;
+    peer_op_desc->SetOutputOffset(output_offsets);
+    size_t pre_mem_offset = mem_offset_reuse;
-    int64_t align_size = tensor_desc_size;
-    if (is_nopadding) {
-      mem_offset += nopadding_size;
-      extra_memory_size += (tensor_desc_size - nopadding_size);
-      real_size = nopadding_size;
-    } else {
-      ge::AlignMemOffset(align_size);
-      mem_offset += align_size;
-      // The head and tail of hcom continuous input should be added 512
-      extra_memory_size = MEM_ALIGN_SIZE;
-      real_size = tensor_desc_size;
-    }
+
+    // Calculate tensor real size of each piece of data and out size of complete data
+    ge::ConstGeTensorDescPtr output_desc = peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx());
+    GE_CHECK_NOTNULL(output_desc);
+    int64_t output_mem_size;
+    int64_t batch_dim_num = 1;
+    int64_t out_size;
+    if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) !=
+        SUCCESS) {
+      GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].",
+             peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx());
+      return FAILED;
+    }
-    GELOGI("[IMAS]Continuous input : Set %s name[%s] optype[%s] output[%d] offset to [%zu] stream_id[%ld] memtype[%ld] "
-           "size[%zu] realsize[%ld] nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(),
-           node->GetType().c_str(), peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx(),
-           output_list.at(peer_out_data_anchor->GetIdx()), peer_op_desc->GetStreamId(), memory_type,
-           is_continuous_input_allocated ? 0UL : align_size, real_size, is_nopadding);
-  }
-
-  mem_offset += extra_memory_size;
-  ge::AlignMemOffset(mem_offset);
-  continuous_mem_size = mem_offset - continuous_mem_start;
-  if (is_continuous_input_allocated) {
-    // memory is not allocated here, so there is no need to add 512 in the header
-    iter->second.mem_offset_ -= MEM_ALIGN_SIZE;
-  } else {
-    iter->second.mem_offset_ = mem_offset;
-  }
+
+    mem_offset_reuse += output_mem_size;
+    extra_memory_size = extra_memory_size + out_size - output_mem_size;
+
+    GELOGI("[IMAS]Virtual node optimize: set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] "
+           "real_size[%ld].",
+           node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
+           peer_out_data_anchor->GetIdx(), pre_mem_offset, peer_op_desc->GetStreamId(), out_size,
+           output_mem_size);
+  }
+  mem_offset_reuse += extra_memory_size;
+  size_t after_mem_offset = mem_offset_reuse;
+  GELOGI("After reassign virtual input node[name: %s, type: %s] memory, memory offset = %zu.",
+         op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset);
   return SUCCESS;
 }
-Status GetFirstInputPeerOutOutputOffset(const ge::NodePtr &node, int64_t &mem_offset) {
-  auto in_data_anchor_list = node->GetAllInDataAnchors();
-  if (in_data_anchor_list.empty()) {
-    GELOGE(FAILED, "Node %s's in data anchor is empty.", node->GetName().c_str());
-    return FAILED;
-  }
+
+Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() {
+  map<string, vector<NodePtr>> mem_reuse_virtual_input_nodes_map;
+  int64_t memory_type = RT_MEMORY_HBM;
+  for (const auto &n : compute_graph_->GetAllNodes()) {
+    OpDescPtr op_desc = n->GetOpDesc();
+    GE_CHECK_NOTNULL(op_desc);
+    bool attr_continuous = false;
+    bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, attr_continuous);
+    GE_IF_BOOL_EXEC(!get_continuous_flag, continue);
+    bool attr_reuse = false;
+    bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
+    GE_IF_BOOL_EXEC(!get_reuse_flag, continue);
+    if (attr_reuse && attr_continuous) {
+      if (op_desc->GetOutputsSize() != kVirtualInputNodeOutputSize) {
+        // When the current virtual node has several outputs, we can't directly determine which input is the
+        // tensor for reuse.
+        std::string error = "Only one output is supported, current virtual node" + FmtToStr(n->GetName()) +
+            " has " + FmtToStr(op_desc->GetOutputsSize()) + " outputs.";
+        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+        return FAILED;
+      }
+      GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "input"), "Get node memory type failed.");
+      auto iter = memory_offset_.find(memory_type);
+      if (iter == memory_offset_.end()) {
+        std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
+        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+        return FAILED;
+      }
+      GELOGD("Start to reassign memory for virtual input node, memory offset = %zu, memory type = %ld.",
+             iter->second.mem_offset_, memory_type);
+      string batch_label_string;
+      // Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter
+      (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
+      if (batch_label_string.empty()) {
+        size_t node_mem_offset = iter->second.mem_offset_;
+        // No ATTR_NAME_BATCH_LABEL, no need to reuse memory.
+        Status status = ReAssignVirtualInputNodeMemory(n, node_mem_offset);
+        if (status != SUCCESS) {
+          GELOGE(FAILED, "Reassign memory of virtual input node failed, node name: %s.", n->GetName().c_str());
+          return FAILED;
+        }
+
+        iter->second.mem_offset_ = node_mem_offset;
+        AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
+        GELOGD("After reassign memory for virtual input node, align memory = %zu, memory type = %ld.",
+               iter->second.mem_offset_, memory_type);
+      } else {
+        // Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory.
+        string current_node_full_name = op_desc->GetName();
+        size_t pos = current_node_full_name.find(kMbatchNodeNameFlag);
+        if (pos == string::npos) {
+          GELOGE(FAILED, "Cannot find key string [%s] of multi-batch in name of virtual input node, node name: %s.",
+                 kMbatchNodeNameFlag, n->GetName().c_str());
+          return FAILED;
+        }
+        string fixed_name = current_node_full_name.substr(0, pos);
+        vector<NodePtr> parallel_virtual_input_nodes;
+        if (mem_reuse_virtual_input_nodes_map.count(fixed_name) != 0) {
+          parallel_virtual_input_nodes = mem_reuse_virtual_input_nodes_map[fixed_name];
+        }
+        parallel_virtual_input_nodes.emplace_back(n);
+        mem_reuse_virtual_input_nodes_map[fixed_name] = parallel_virtual_input_nodes;
+      }
+    }
+  }
+
+  int32_t mem_reuse_model = 0;
+  if (ReAssignVirtualNodesMemory(mem_reuse_virtual_input_nodes_map, mem_reuse_model) != SUCCESS) {
+    GELOGE(FAILED, "Reassign memory of virtual input nodes failed.");
+    return FAILED;
+  }
+  return SUCCESS;
+}
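Grouping in the multi-batch branch above hinges on the node name: copies of one virtual node made per batch share the prefix before the `_ascend_mbatch_batch_` marker. A small sketch of that parsing step:

```cpp
#include <string>

// Extracts the batch-independent prefix that groups per-batch copies of a node.
bool GetFixedName(const std::string &full_name, std::string &fixed) {
  const std::string flag = "_ascend_mbatch_batch_";  // kMbatchNodeNameFlag in the patch
  size_t pos = full_name.find(flag);
  if (pos == std::string::npos) return false;
  fixed = full_name.substr(0, pos);
  return true;
}
// GetFixedName("conv1_ascend_mbatch_batch_2", f) -> f == "conv1"
```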
set memory of output tensor + vector output_list = op_desc->GetOutputOffset(); + if (output_list.empty()) { + GELOGE(FAILED, "Outputoffset is empty, node name: %s", node->GetName().c_str()); + return FAILED; + } + if (op_desc->GetOutputsSize() > output_list.size()) { + GELOGE(FAILED, "The size %zu of op_desc is more than output_list's size %zu.", op_desc->GetOutputsSize(), + output_list.size()); + return FAILED; + } + int64_t attr_dim_index; + bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index); + if (!get_attr_dim_flag) { + GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed."); + return FAILED; } - int64_t mem_offset = 0; - bool is_nopadding = ((continuous_type & kTypeOutputNoPadding) != 0); - if (is_nopadding) { - // out tensor memory must be reused input tensor memory - if (GetFirstInputPeerOutOutputOffset(node, mem_offset) != SUCCESS) { - return ge::FAILED; + size_t extra_memory_size = 0; + for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { + output_list[out_data_anchor->GetIdx()] = mem_offset_reuse; + size_t pre_mem_offset = mem_offset_reuse; + + // calculate tensor real size of each piece of data and out size of complete data + ge::ConstGeTensorDescPtr output_desc = op_desc->GetOutputDescPtr(out_data_anchor->GetIdx()); + GE_CHECK_NOTNULL(output_desc); + int64_t output_mem_size; + int64_t batch_dim_num = 1; + int64_t out_size; + if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) != + SUCCESS) { + GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].", + op_desc->GetName().c_str(), out_data_anchor->GetIdx()); + return FAILED; } - } else { - // Get the reference type of the node, default is false - bool is_ref = false; - // If GetBool fail, is_ref is false. - (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_REFERENCE, is_ref); - // If the output is ref type and refers to the ref of an input, the name of the output - // and the input are the same. 
+
+Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() {
+  map<string, vector<NodePtr>> mem_reuse_virtual_output_nodes_map;
+  int64_t memory_type = RT_MEMORY_HBM;
+  for (const auto &n : compute_graph_->GetAllNodes()) {
+    OpDescPtr op_desc = n->GetOpDesc();
+    GE_CHECK_NOTNULL(op_desc);
+    bool attr_continuous = false;
+    bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, attr_continuous);
+    GE_IF_BOOL_EXEC(!get_continuous_flag, continue);
+    bool attr_reuse = false;
+    bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
+    GE_IF_BOOL_EXEC(!get_reuse_flag, continue);
+
+    if (attr_reuse && attr_continuous) {
+      auto in_data_anchor_list = n->GetAllInDataAnchors();
+      if (in_data_anchor_list.size() != kVirtualOutputNodeInputSize) {
+        // When current virtual node has several inputs, can't directly determine which input is the tensor for reuse.
+        std::string error = "Only one input is supported, current virtual node" + FmtToStr(n->GetName()) +
+                            " has " + FmtToStr(in_data_anchor_list.size()) + " inputs.";
+        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+        return FAILED;
+      }
+      GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "output"), "Get node memory type failed.");
+      auto iter = memory_offset_.find(memory_type);
+      if (iter == memory_offset_.end()) {
+        std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
+        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+        return FAILED;
+      }
+      GELOGD("Start to reassign memory for virtual output node, memory offset = %zu, memory type = %ld.",
+             iter->second.mem_offset_, memory_type);
+      string batch_label_string;
+      // Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter.
+      (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
+      if (batch_label_string.empty()) {
+        size_t node_mem_offset = iter->second.mem_offset_;
+        // No ATTR_NAME_BATCH_LABEL, no need to reuse memory.
+        Status status = ReAssignVirtualOutputNodeMemory(n, node_mem_offset);
+        if (status != SUCCESS) {
+          GELOGE(FAILED, "Reassign memory of virtual output node failed, node name: %s.", n->GetName().c_str());
+          return FAILED;
+        }
+        iter->second.mem_offset_ = node_mem_offset;
+        AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
+        GELOGD("After reassign memory for virtual output node, align memory = %zu, memory type = %ld.",
+               iter->second.mem_offset_, memory_type);
+      } else {
+        // Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory.
+        string current_node_full_name = op_desc->GetName();
+        size_t pos = current_node_full_name.find(kMbatchNodeNameFlag);
+        if (pos == string::npos) {
+          std::string error = "Cannot find key string" + FmtToStr(kMbatchNodeNameFlag) +
+                              " of multi-batch in name of virtual output node, the node name is " + FmtToStr(n->GetName());
+          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+          return FAILED;
+        }
+        string fixed_name = current_node_full_name.substr(0, pos);
+        vector<NodePtr> parallel_virtual_output_nodes;
+        if (mem_reuse_virtual_output_nodes_map.count(fixed_name) != 0) {
+          parallel_virtual_output_nodes = mem_reuse_virtual_output_nodes_map[fixed_name];
+        }
+        parallel_virtual_output_nodes.emplace_back(n);
+        mem_reuse_virtual_output_nodes_map[fixed_name] = parallel_virtual_output_nodes;
+      }
     }
-    mem_offset = output_list[0];
   }
-  for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
-    output_list[out_data_anchor->GetIdx()] = mem_offset;
-    int64_t tensor_desc_size = 0;
-    int64_t nopadding_size = 0;
-    if (GetMemorySize(out_op_desc, out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx()), continuous_type,
-                      tensor_desc_size, nopadding_size) != ge::SUCCESS) {
+  int32_t mem_reuse_model = 1;
+  if (ReAssignVirtualNodesMemory(mem_reuse_virtual_output_nodes_map, mem_reuse_model) != SUCCESS) {
+    GELOGE(FAILED, "Reassign memory of virtual output nodes failed.");
+    return FAILED;
+  }
+  return SUCCESS;
+}
+
+Status GraphMemoryAssigner::ReAssignVirtualNodesMemory(map<string, vector<NodePtr>> &mem_reuse_nodes_map,
+                                                       int32_t mem_reuse_model) {
+  // Find max batch label value
+  string max_batch_label;
+  GE_CHK_STATUS_RET(GetMaxBatchLabel(mem_reuse_nodes_map, mem_reuse_model, max_batch_label),
+                    "Get max batch label failed.");
+  PrintMemoryOffset();
+  vector<size_t> nodes_mem_offset_list;
+  for (auto &i_map : mem_reuse_nodes_map) {
+    vector<NodePtr> virtual_nodes_list = i_map.second;
+    int64_t memory_type = RT_MEMORY_HBM;
+    GE_CHK_STATUS_RET(GetNodeListMemoryType(virtual_nodes_list, mem_reuse_model, memory_type),
+                      "Get node list memory type failed.");
+    auto iter = memory_offset_.find(memory_type);
+    if (iter == memory_offset_.end()) {
+      std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
+      GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
       return FAILED;
     }
+    size_t max_batch_node_mem_offset = iter->second.mem_offset_;
+    nodes_mem_offset_list.emplace_back(max_batch_node_mem_offset);
+    for (auto &i_node : virtual_nodes_list) {
+      // Op_desc is not nullptr, it has been checked.
+      OpDescPtr op_desc = i_node->GetOpDesc();
+      string batch_label_string;
+      // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
+      (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
+      if (batch_label_string == max_batch_label) {
+        Status status = SUCCESS;
+        if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
+          status = ReAssignVirtualInputNodeMemory(i_node, max_batch_node_mem_offset);
+        } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
+          status = ReAssignVirtualOutputNodeMemory(i_node, max_batch_node_mem_offset);
+        } else {
+          std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
+          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+          return FAILED;
+        }
-    if (is_nopadding) {
-      mem_offset += nopadding_size;
-    } else {
-      mem_offset += tensor_desc_size;
-      ge::AlignMemOffset(mem_offset);
+        if (status != SUCCESS) {
+          GELOGE(FAILED, "Reassign memory of virtual node failed, node name: %s.", i_node->GetName().c_str());
+          return FAILED;
+        }
+        iter->second.mem_offset_ = max_batch_node_mem_offset;
+        AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
+        GELOGD("After reassign memory for virtual node, align memory = %zu, memory type = %ld.",
+               iter->second.mem_offset_, memory_type);
+        // Only assign memory of max batch nodes.
+        break;
+      }
     }
-    GELOGI("[IMAS]Continuous output : Set %s name[%s] optype[%s] output[%d] offset to [%zu] stream_id[%ld] memtype[%ld]"
-           " size[%zu] realsize[%ld] nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(),
-           node->GetType().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
-           output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId(), memory_type, 0UL,
-           is_nopadding ? nopadding_size : tensor_desc_size, is_nopadding);
   }
-  out_op_desc->SetOutputOffset(output_list);
-  return ge::SUCCESS;
+  PrintMemoryOffset();
+  size_t memory_reuse_index = 0;
+  for (auto &i_map : mem_reuse_nodes_map) {
+    vector<NodePtr> virtual_nodes_list = i_map.second;
+    for (auto &i_node : virtual_nodes_list) {
+      size_t remaining_batch_node_mem_offset = nodes_mem_offset_list[memory_reuse_index];
+      Status status = SUCCESS;
+      if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
+        status = ReAssignVirtualInputNodeMemory(i_node, remaining_batch_node_mem_offset);
+      } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
+        status = ReAssignVirtualOutputNodeMemory(i_node, remaining_batch_node_mem_offset);
+      } else {
+        std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
+        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+        return FAILED;
+      }
+
+      if (status != SUCCESS) {
+        GELOGE(FAILED, "Reassign memory of virtual node failed, node name: %s.", i_node->GetName().c_str());
+        return FAILED;
+      }
+    }
+    memory_reuse_index++;
+  }
+  return SUCCESS;
 }
 
 Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
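ReAssignVirtualNodesMemory works in two passes: pass one advances the real memory cursor only for the max-batch-label node of each prefix group and records each group's base offset; pass two rebinds the remaining batches of the group to that recorded base, so all batches of one group alias a single region sized for the largest batch. A standalone sketch of that scheme (not GE code; group names, labels, and sizes are invented):

// Standalone sketch (not GE code) of the two-pass reuse described above:
// pass 1 grows the memory cursor only for the max-batch node of each group
// and records the group's base offset; pass 2 rebinds every other batch of
// the same group to that base.
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct FakeNode {
  std::string batch_label;
  std::size_t size;
  std::size_t offset;
};

int main() {
  // Invented groups: key is the fixed name prefix, values are batch copies.
  std::map<std::string, std::vector<FakeNode>> groups = {
      {"conv1", {{"Batch_0", 1024, 0}, {"Batch_1", 4096, 0}}},
      {"relu2", {{"Batch_0", 512, 0}, {"Batch_1", 2048, 0}}}};
  const std::string max_batch_label = "Batch_1";  // assumed largest batch
  std::size_t mem_offset = 0;
  std::vector<std::size_t> group_bases;

  for (auto &g : groups) {  // pass 1: place only the max-batch node
    group_bases.push_back(mem_offset);
    for (auto &n : g.second) {
      if (n.batch_label == max_batch_label) {
        n.offset = mem_offset;
        mem_offset += n.size;  // only the max batch consumes memory
        break;
      }
    }
  }

  std::size_t idx = 0;
  for (auto &g : groups) {  // pass 2: alias the remaining batches
    for (auto &n : g.second) {
      n.offset = group_bases[idx];
    }
    ++idx;
  }

  for (const auto &g : groups) {
    for (const auto &n : g.second) {
      std::cout << g.first << "/" << n.batch_label << " @ " << n.offset << "\n";
    }
  }
  return 0;
}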
@@ -643,7 +946,7 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
     GE_CHECK_NOTNULL(mem_assigner_);
     GE_CHECK_NOTNULL(mem_assigner_->GetPriorityAssinger());
     if ((atomic_mem_size != 0) && (iter_batch.first == mem_assigner_->GetPriorityAssinger()->GetMaxBatchLabel())) {
-      GE_CHK_STATUS_RET(SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}, RT_MEMORY_HBM),
+      GE_CHK_STATUS_RET(SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}),
                         "Failed to set attr for atomic addr clean node %s.", iter.first->GetName().c_str());
     }
   }
@@ -781,7 +1084,7 @@ Status GraphMemoryAssigner::AssignConnectNetOutputAtomicMemory(vector<NodePtr> &
   }
 
   // All atomic nodes use atomic_addr_clean op independently, so we need to set the attr separately.
-  if (SetIndependentAtomicAttr(node, original_atomic_mem_start, mem_offset_end, RT_MEMORY_HBM) != SUCCESS) {
+  if (SetIndependentAtomicAttr(node, original_atomic_mem_start, mem_offset_end) != SUCCESS) {
     GELOGE(FAILED, "Failed to set atomic attr separately.");
     return FAILED;
   }
@@ -928,10 +1231,9 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node, ve
     output_list[output_index] = iter->second.mem_offset_;
     std::string batch_label;
     (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-    GELOGI("[IMAS]Atomic output : Set %s name[%s] optype[%s] output[%ld] offset to [%zu] stream_id[%ld] memtype[%ld] "
-           "size[%ld] real_size[%ld] batch[%s].", compute_graph_->GetName().c_str(), op_desc->GetName().c_str(),
-           node->GetType().c_str(), output_index, iter->second.mem_offset_, op_desc->GetStreamId(), RT_MEMORY_HBM,
-           size, size, batch_label.c_str());
+    GELOGI("[IMAS]Atomic output : Set %s name[%s] output[%ld] offset to [%zu] stream_id[%ld] size[%ld] real_size[%ld]"
+           " batch[%s].", compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), output_index,
+           iter->second.mem_offset_, op_desc->GetStreamId(), size, size, batch_label.c_str());
 
     iter->second.mem_offset_ += size;
     AlignMemOffset(MEM_ALIGN_SIZE, RT_MEMORY_HBM);
@@ -1007,10 +1309,10 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc
       std::string batch_label;
       (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
       GELOGI(
-          "[IMAS]Atomic ordinary workspace : Set %s name[%s] optype[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
-          "memtype[%ld] size[%ld] real_size[%ld] batch[%s].",
-          compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), workspace_index,
-          mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), RT_MEMORY_HBM, workspace_size, workspace_size,
+          "[IMAS]Atomic ordinary workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
+          "size[%ld] real_size[%ld] batch[%s].",
+          compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index,
+          mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), workspace_size, workspace_size,
           batch_label.c_str());
 
       mem_type_iter->second.mem_offset_ += workspace_size;
@@ -1048,10 +1350,10 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt
     std::string batch_label;
     (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
     GELOGI(
-        "[IMAS]Atomic fusion workspace : Set %s name[%s] optype[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
-        "memtype[%ld] ssize[%ld] real_size[%ld] batch[%s].", compute_graph_->GetName().c_str(),
-        op_desc->GetName().c_str(), op_desc->GetType().c_str(), workspace_index, mem_type_iter->second.mem_offset_,
-        op_desc->GetStreamId(), RT_MEMORY_HBM, workspace_size, workspace_size, batch_label.c_str());
+        "[IMAS]Atomic fusion workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] size[%ld] "
+        "real_size[%ld] batch[%s].", compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index,
+        mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), workspace_size, workspace_size,
+        batch_label.c_str());
 
     mem_type_iter->second.mem_offset_ += workspace_size;
     mem_offset_end.emplace_back(mem_type_iter->second.mem_offset_);
@@ -1127,7 +1429,7 @@ ge::Status GraphMemoryAssigner::SetInputOffset() {
     return FAILED;
   }
   for (auto pair : memory_offset_) {
-    GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memtype[%ld]", compute_graph_->GetName().c_str(),
+    GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memory type[%ld]", compute_graph_->GetName().c_str(),
             pair.second.mem_offset_, pair.first);
   }
@@ -1296,7 +1598,7 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const {
 }
 
 Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start,
-                                                     const vector<int64_t> &mem_offset_end, int64_t memory_type) {
+                                                     const vector<int64_t> &mem_offset_end) {
   GELOGD("Start to set independent atomic attr, atomic_addr_clean memory offset start is %ld", atomic_mem_start);
 
   // Parsing offset and size vectors
@@ -1325,7 +1627,7 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in
       GELOGD("Current node memory_offset vector size is %zu, node name %s, node type is %s.", memory_offset_size.size(),
              peer_out_node_desc->GetName().c_str(), peer_out_node_desc->GetType().c_str());
       if (peer_out_node_desc->GetType() == ATOMICADDRCLEAN) {
-        if (SetAtomicCleanAttr(peer_out_node, memory_offset_start, memory_offset_size, memory_type) != SUCCESS) {
+        if (SetAtomicCleanAttr(peer_out_node, memory_offset_start, memory_offset_size) != SUCCESS) {
          GELOGE(FAILED, "Set atomic clean attr failed.");
          return FAILED;
        }
@@ -1336,7 +1638,7 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in
 }
 
 ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const vector<int64_t> &atomic_mem_start,
-                                                   const vector<int64_t> &atomic_mem_size, int64_t memory_type) {
+                                                   const vector<int64_t> &atomic_mem_size) {
   auto node_op_desc = node->GetOpDesc();
   if (node_op_desc != nullptr) {
     GELOGD("Node %s, set atomic clean attr start.", node->GetName().c_str());
@@ -1375,10 +1677,9 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const ve
     }
     string atomic_mem_size_str = ss.str();
 
-    GELOGI("[IMAS]SetAtomicCleanAttr : Set %s atomic_node name[%s] optype[%s] output[0] offset to [%s] streamid[%ld]"
-           " memtype[%ld] size[%s]",node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
-           node->GetType().c_str(), atomic_mem_start_str.c_str(), node->GetOpDesc()->GetStreamId(), memory_type,
-           atomic_mem_size_str.c_str());
+    GELOGI("[IMAS]SetAtomicCleanAttr : Set %s atomic_node name[%s] output[0] offset to [%s] streamid[%ld] size[%s]",
+           node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
+           atomic_mem_start_str.c_str(), node->GetOpDesc()->GetStreamId(), atomic_mem_size_str.c_str());
   }
   return SUCCESS;
 }
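For the log line above, the offset and size vectors are folded into single strings via a stringstream; only the tail of that loop is visible in this hunk. A standalone sketch of the idea (not GE code; the separator is an assumption):

// Standalone sketch (not GE code): folding a vector of offsets into one string
// for a log line like "[IMAS]SetAtomicCleanAttr ... offset to [%s]".
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  const std::vector<int64_t> atomic_mem_start = {1024, 4096, 81920};  // invented offsets
  std::stringstream ss;
  for (auto offset : atomic_mem_start) {
    ss << offset << " ";  // assumed separator
  }
  const std::string atomic_mem_start_str = ss.str();
  std::cout << "offset to [" << atomic_mem_start_str << "]\n";
  return 0;
}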
diff --git a/ge/graph/build/memory/graph_mem_assigner.h b/ge/graph/build/memory/graph_mem_assigner.h
index a380e594..def24287 100755
--- a/ge/graph/build/memory/graph_mem_assigner.h
+++ b/ge/graph/build/memory/graph_mem_assigner.h
@@ -119,15 +119,31 @@ class GraphMemoryAssigner {
   ///
   ge::Status ReAssignContinuousMemory(bool is_loop_graph);
 
+  ge::Status ReAssignReuseAndNoPaddingContinuousInputMemory();
+
+  ge::Status ReAssignReuseAndNoPaddingContinuousOutputMemory();
+
+  ge::Status ReAssignVirtualInputNodeMemory(NodePtr node, size_t &mem_offset_reuse);
+
+  ge::Status ReAssignVirtualOutputNodeMemory(NodePtr node, size_t &mem_offset_reuse);
+
+  ge::Status ReAssignVirtualNodesMemory(map<string, vector<NodePtr>> &mem_reuse_nodes_map, int32_t mem_reuse_model);
+
+  ge::Status GetMaxBatchLabel(const map<string, vector<NodePtr>> &mem_reuse_virtual_nodes_map,
+                              int32_t mem_reuse_model, string &max_batch_label);
+
+  ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc, int64_t dim_index,
+                                               int64_t &output_mem_size, int64_t &batch_dim_num, int64_t &out_size);
+
   ge::Status ReAssignAtomicMemory(bool is_loop_graph);
 
   ge::Status FilterAtomicNodesForMemoryAssign(map<string, map<NodePtr, vector<NodePtr>>> &normal_atomic_nodes_map,
                                               map<string, vector<NodePtr>> &connecting_output_atomic_nodes);
 
   ge::Status AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start,
-                                         int64_t &continuous_mem_size, int64_t memory_type, uint32_t continuous_type);
+                                         int64_t &continuous_mem_size, int64_t memory_type);
 
-  ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node, int64_t memory_type, uint32_t continuous_type);
+  ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node);
 
   ///
   /// @brief check the input of node whether support atomic attr
@@ -153,10 +169,10 @@ class GraphMemoryAssigner {
   ge::Status AssignConnectNetOutputAtomicMemory(vector<NodePtr> &connect_netoutput_nodes);
 
   ge::Status SetIndependentAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start,
-                                      const std::vector<int64_t> &mem_offset_end, int64_t memory_type);
+                                      const std::vector<int64_t> &mem_offset_end);
 
   ge::Status SetAtomicCleanAttr(const ge::NodePtr &node, const std::vector<int64_t> &atomic_mem_start,
-                                const std::vector<int64_t> &atomic_mem_size, int64_t memory_type);
+                                const std::vector<int64_t> &atomic_mem_size);
 
   ge::Status IsIndependentAtomicClean(const ge::NodePtr &node, bool &is_independent_atomic_clean_node);
 
diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc
index 2afbdf30..35844b2d 100755
--- a/ge/graph/load/new_model_manager/davinci_model.cc
+++ b/ge/graph/load/new_model_manager/davinci_model.cc
@@ -1809,7 +1809,7 @@ void DavinciModel::GetUserDesignateShapeOrder(std::vector<std::string> &user_inp
 ///
 Status DavinciModel::InitAippInfo(uint32_t index, const OpDescPtr &op_desc) {
   if (!op_desc->HasAttr(ATTR_NAME_AIPP)) {
-    GELOGW("There is not AIPP related with index %u.", index);
+    GELOGW("there is not AIPP related with index %u.", index);
     return SUCCESS;
   }
 
@@ -1818,7 +1818,7 @@ Status DavinciModel::InitAippInfo(uint32_t index, const OpDescPtr &op_desc) {
   GE_CHK_BOOL_RET_STATUS(AttrUtils::GetNamedAttrs(op_desc, ATTR_NAME_AIPP, aipp_attr), GE_AIPP_NOT_EXIST,
                          "Data node do not contain param aipp!");
   GE_CHK_STATUS_RET(OpUtils::ConvertAippParams(aipp_attr, &aipp_params), "get aipp params failed");
-  GELOGI("Node data: %s, type: %s, current index: %u, current node related input rank: %u",
+  GELOGI("node data: %s, type: %s, current index: %u, current node related input rank: %u",
          op_desc->GetName().c_str(), op_desc->GetType().c_str(), index, aipp_params.related_input_rank());
 
   AippConfigInfo aipp_info;
@@ -2481,7 +2481,7 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r
     uint64_t buffer_length = buffer.length;
     void *buffer_addr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(buffer.data));
 
-    GELOGI("CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%lu] datasize[%lu]",
+    GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%lu] datasize[%lu]",
            runtime_param_.graph_id, output.first, output.second.GetBasicAddr(), data_size, buffer_length);
     GE_CHK_RT_RET(rtMemcpy(buffer_addr, buffer_length, output.second.GetBasicAddr(), data_size, kind));
     idx++;
diff --git a/metadef b/metadef
index fcd0833c..dc6cceb6 160000
--- a/metadef
+++ b/metadef
@@ -1 +1 @@
-Subproject commit fcd0833cffcd201701f71d17db0c696c1bb01715
+Subproject commit dc6cceb67bc82b567bcbd6f415776644253e1467
diff --git a/parser b/parser
index 1601d66b..4e72aae4 160000
--- a/parser
+++ b/parser
@@ -1 +1 @@
-Subproject commit 1601d66b6187c83cbf38e762beb5538ce2c7c573
+Subproject commit 4e72aae41e78af1a19cd965da4a45cbd988b9a75