From 56b950a09ddfb6a2cfb0efbe3da9f2e024a554ba Mon Sep 17 00:00:00 2001
From: TangQunzhang
Date: Wed, 9 Dec 2020 10:50:30 +0800
Subject: [PATCH] Dynamic multi batch memory optimization

---
 .../build/memory/binary_block_mem_assigner.cc |  10 +-
 ge/graph/build/memory/block_mem_assigner.cc   | 511 +++++++++++-------
 ge/graph/build/memory/block_mem_assigner.h    |  36 +-
 ge/graph/build/memory/graph_mem_assigner.cc   |   4 +-
 .../load/new_model_manager/davinci_model.cc   |   2 +-
 .../load/new_model_manager/model_utils.cc     |   4 +-
 6 files changed, 363 insertions(+), 204 deletions(-)

diff --git a/ge/graph/build/memory/binary_block_mem_assigner.cc b/ge/graph/build/memory/binary_block_mem_assigner.cc
index ecd2488c..16420123 100644
--- a/ge/graph/build/memory/binary_block_mem_assigner.cc
+++ b/ge/graph/build/memory/binary_block_mem_assigner.cc
@@ -22,7 +22,7 @@ namespace {
 const uint32_t kRangeCeilInterval = 2;
 const uint32_t kLogBase = 2;
 const int64_t kLargeBlockSize = 8 * 1024 * 1024;
-const int64_t kLargeBlockRangeSize = 10;
+const int64_t kLargeBlockRangeSize = 2;
 }  // namespace

 namespace ge {
@@ -73,15 +73,17 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector<int64_t> &range_ceils) {
     GELOGE(FAILED, "dividend is 0!");
     return FAILED;
   }
+  // Memory sizes are 512-byte aligned, so no range ceiling smaller than 512 is needed
+  int64_t min_memory_size = (all_memory_size.back() > MEM_ALIGN_SIZE) ? MEM_ALIGN_SIZE : all_memory_size.front();
   auto range_number = static_cast<size_t>(
-      ceil(log(all_memory_size.back() / static_cast<double>(all_memory_size.front())) / log(kLogBase)));
+      ceil(log(all_memory_size.back() / static_cast<double>(min_memory_size)) / log(kLogBase)));
   range_number = (range_number == 0) ? 1 : range_number;
   GELOGD("Range number: %zu", range_number);

   vector<vector<int64_t>> ranges(range_number);
   GE_CHK_BOOL_EXEC((range_number != 0), return PARAM_INVALID, "range_number can't be 0.");
   size_t range_number_limit = all_memory_size.size() / range_number;
-  int64_t range_ceil = all_memory_size[0];
+  int64_t range_ceil = min_memory_size;
   for (size_t i = 1; i <= range_number; i++) {
     GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(static_cast<uint64_t>(range_ceil), kRangeCeilInterval),
                     GELOGE(FAILED, "Multiply result is out of range.");
@@ -114,7 +116,7 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector<int64_t> &range_ceils) {
       range_ceils.push_back(range.back());
     }
   }
-  GELOGD("Range ceils: %s", ToString(range_ceils).c_str());
+  GELOGI("Range ceils: %s", ToString(range_ceils).c_str());
   return SUCCESS;
 }
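The range derivation above is compact, so here is a minimal standalone sketch of the idea (not part of this patch; the function name is illustrative and MEM_ALIGN_SIZE is assumed to be 512): sorted block sizes are mapped to power-of-two range ceilings, starting no lower than the 512-byte alignment floor introduced by this change.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> RangeCeils(std::vector<int64_t> sizes) {
  const int64_t kAlign = 512;  // assumed value of MEM_ALIGN_SIZE
  std::sort(sizes.begin(), sizes.end());
  // Blocks are 512-byte aligned, so the smallest useful ceiling is 512.
  int64_t min_size = (sizes.back() > kAlign) ? kAlign : sizes.front();
  auto range_number = static_cast<size_t>(
      std::ceil(std::log(sizes.back() / static_cast<double>(min_size)) / std::log(2.0)));
  range_number = (range_number == 0) ? 1 : range_number;
  std::vector<int64_t> ceils;
  int64_t ceil_val = min_size;
  for (size_t i = 0; i < range_number; ++i) {
    ceil_val *= 2;  // kRangeCeilInterval
    ceils.push_back(ceil_val);
  }
  return ceils;  // e.g. sizes up to 4096 with a 512 floor yield {1024, 2048, 4096}
}

With the old code the smallest size in the graph set the floor, which could produce many tiny ranges; anchoring the floor at 512 keeps the range count small.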
diff --git a/ge/graph/build/memory/block_mem_assigner.cc b/ge/graph/build/memory/block_mem_assigner.cc
index cdf768d8..bd2a9912 100755
--- a/ge/graph/build/memory/block_mem_assigner.cc
+++ b/ge/graph/build/memory/block_mem_assigner.cc
@@ -65,6 +65,98 @@ void AlignMemOffset(size_t &mem_align_size) {
   mem_align_size = (mem_align_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
 }

+static bool CompareLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) {
+  auto left_node_op_desc = left.node->GetOpDesc();
+  auto right_node_op_desc = right.node->GetOpDesc();
+  if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)
+      && (left_node_op_desc->GetId() < right_node_op_desc->GetId())) {
+    return true;
+  }
+  return false;
+}
+
+void GetLifeList(const MemoryBlock &block, std::vector<NodeTypeIndex> &life_list, bool child) {
+  for (auto &node : block.NodeTypeIndexList()) {
+    life_list.emplace_back(node);
+  }
+
+  if (child) {
+    for (auto child_block : block.ChildBlockList()) {
+      if (child_block == nullptr) {
+        continue;
+      }
+      if (block.stream_id_ != child_block->stream_id_ || !block.same_stream_ || !child_block->same_stream_) {
+        life_list.clear();
+        return;
+      }
+      GetLifeList(*child_block, life_list, child);
+    }
+  }
+}
+
+bool CrossLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) {
+  if ((left.node == nullptr) || (right.node == nullptr)) {
+    return true;
+  }
+  auto left_node_op_desc = left.node->GetOpDesc();
+  auto right_node_op_desc = right.node->GetOpDesc();
+  if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)) {
+    if (left_node_op_desc->GetId() < right_node_op_desc->GetId()) {
+      if (left.life_time_end >= static_cast<size_t>(right_node_op_desc->GetId())) {
+        return true;
+      }
+    } else if (left_node_op_desc->GetId() == right_node_op_desc->GetId()) {
+      return true;
+    } else {
+      if (right.life_time_end >= static_cast<size_t>(left_node_op_desc->GetId())) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+///
+/// When a child block's life time does not cross the parent block's, they can be reused (same stream only).
+/// |-----------------------------parent block---------------------|
+///     |------child block1--------------||------child block2------|
+///     |--child block1-1-|
+///
+bool CanIntervalLifeReuse(MemoryBlock &parent_block, MemoryBlock &child_block) {
+  // Judge by interval life time; only blocks on the same stream can be judged this way
+  if (parent_block.stream_id_ != child_block.stream_id_ || !parent_block.same_stream_ || !child_block.same_stream_
+      || parent_block.NodeTypeIndexList().empty() || child_block.NodeTypeIndexList().empty()) {
+    return false;
+  }
+
+  // quick judgement using the front and back nodes
+  if (CrossLifeTime(parent_block.NodeTypeIndexList().front(), child_block.NodeTypeIndexList().front())) {
+    return false;
+  }
+  if (CrossLifeTime(parent_block.NodeTypeIndexList().back(), child_block.NodeTypeIndexList().back())) {
+    return false;
+  }
+
+  std::vector<NodeTypeIndex> life_list;
+  GetLifeList(parent_block, life_list, false);
+  GetLifeList(child_block, life_list, true);
+  if (life_list.empty()) {
+    return false;
+  }
+  std::sort(life_list.begin(), life_list.end(), CompareLifeTime);
+  size_t pre_life_end = 0;
+  for (auto &node : life_list) {
+    auto node_op_desc = node.node->GetOpDesc();
+    if (node_op_desc != nullptr && pre_life_end >= static_cast<size_t>(node_op_desc->GetId())) {
+      // life times cross
+      return false;
+    }
+    pre_life_end = node.life_time_end;
+  }
+  GELOGI("Block size[%zu, %zu] life times do not cross.", parent_block.Size(), child_block.Size());
+  return true;
+}
+
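The sort-and-scan test in CanIntervalLifeReuse is the core of the new interval reuse. A minimal standalone illustration (not GE code; types and names here are invented for the example): each user is a (start id, life end) pair, and after sorting by start id the lifetimes are disjoint iff every previous life end lies strictly below the next start id.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Life { size_t start_id; size_t end_id; };

bool LivesDisjoint(std::vector<Life> lives) {
  std::sort(lives.begin(), lives.end(),
            [](const Life &l, const Life &r) { return l.start_id < r.start_id; });
  size_t pre_end = 0;
  for (const auto &life : lives) {
    if (pre_end >= life.start_id) {
      return false;  // previous user still alive when this one starts
    }
    pre_end = life.end_id;
  }
  return true;
}

int main() {
  // parent used by ops [2,5], child used by ops [7,9]: reusable
  std::printf("%d\n", LivesDisjoint({{2, 5}, {7, 9}}));  // 1
  // overlap at op 5: not reusable
  std::printf("%d\n", LivesDisjoint({{2, 5}, {5, 9}}));  // 0
  return 0;
}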
 void MemoryBlock::SetHeadOffset(size_t offset) {
   head_offset_ = offset;
   size_t child_offset = head_offset_;
@@ -125,20 +217,12 @@ size_t MemoryBlock::AlignSize() const {
   return align_block_size;
 }

-bool MemoryBlock::IsSameLabel(std::string &first_batch_label) {
-  if (node_type_index_list_.empty()) {
+bool MemoryBlock::IsSameBatchLabel() {
+  // blocks can only be reused within the same batch label
+  if (batch_label_.empty() || node_type_index_list_.empty()) {
     return false;
   }
-  auto node_op_desc = node_type_index_list_[0].node->GetOpDesc();
-  if (node_op_desc == nullptr) {
-    return false;
-  }
-  // not all op has ATTR_NAME_BATCH_LABEL, no need check return value, only check out parameter
-  (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, first_batch_label);
-  if (first_batch_label.empty()) {
-    return false;
-  }
   bool all_same_label = true;
   for (size_t index = 1; index < node_type_index_list_.size(); ++index) {
     if (node_type_index_list_[index].node == nullptr) {
@@ -147,8 +231,9 @@
     std::string batch_label;
     auto index_op_desc = node_type_index_list_[index].node->GetOpDesc();
     GE_IF_BOOL_EXEC(index_op_desc == nullptr, continue);
+    // not every op has ATTR_NAME_BATCH_LABEL; the return value need not be checked, only the output parameter
     (void)ge::AttrUtils::GetStr(index_op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-    if (first_batch_label != batch_label) {
+    if (batch_label_ != batch_label) {
       all_same_label = false;
       break;
     }
@@ -197,7 +282,7 @@ void MemoryBlock::AddContinuousLifeReuseBlock(MemoryBlock *block, DependStreamLi
 }

 void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_node_depend_stream_life) {
-  if (CanNotLifeReuse(this) || CanNotLifeReuse(block)) {
+  if (CanNotLifeReuse(this) || CanNotLifeReuse(block) || (batch_label_ != block->batch_label_)) {
     return;
   }
   if (block->continuous_block_) {
@@ -207,16 +292,27 @@
   MemoryBlock *parent = nullptr;
   MemoryBlock *child = nullptr;
   // merge small block to large block
-  if (block->GetDependLifeBegin(stream_id_, total_node_depend_stream_life) > GetLifeEnd()) {
-    if ((child_offset_ + block->AlignSize()) <= AlignSize()) {
-      parent = this;
-      child = block;
-    } else if ((block->child_offset_ + AlignSize()) <= block->AlignSize()) {
-      parent = block;
-      child = this;
+  // no-align sizes: 802816 + 802816 = 1605632, can reuse
+  // after 32-byte alignment: 802848 + 802848 > 1605664, can't reuse
+  // after 512-byte alignment: 803328 + 803328 > 1606144, can't reuse
+  // so allow one MEM_ALIGN_SIZE of slack: 803328 + 803328 = 1606144 + 512, can reuse
+  if ((child_offset_ + block->AlignSize()) <= (AlignSize() + MEM_ALIGN_SIZE)) {
+    parent = this;
+    child = block;
+  } else if ((block->child_offset_ + AlignSize()) <= (block->AlignSize() + MEM_ALIGN_SIZE)) {
+    parent = block;
+    child = this;
+  }
+
+  if ((parent != nullptr) && (child != nullptr)) {
+    // Different streams must use stream dependencies to judge the life cycle.
+    // On the same stream, if there are child blocks, all their life times can be judged in CanIntervalLifeReuse.
+    bool can_block_life_reuse = (child->child_blocks_.empty()
+      && (block->GetDependLifeBegin(stream_id_, total_node_depend_stream_life) > GetLifeEnd()));
+    if (!can_block_life_reuse && !CanIntervalLifeReuse(*parent, *child)) {
+      return;
     }
-  }
-  if ((parent != nullptr) && (child != nullptr) && child->child_blocks_.empty()) {
+
     parent->child_blocks_.emplace_back(child);
     parent->child_offset_ += child->AlignSize();
     child->deleted_block_ = true;
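The numeric comment above can be made concrete. A standalone sketch (not part of this patch; constants and names are illustrative) of the slack rule: two child sizes that fit the parent exactly before alignment can fail the fit test once both sides are 512-byte aligned, so one extra MEM_ALIGN_SIZE of tolerance is accepted.

#include <cstddef>
#include <cstdio>

constexpr size_t kMemAlignSize = 512;  // assumed MEM_ALIGN_SIZE

size_t Align(size_t s) { return (s + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize; }

bool FitsWithSlack(size_t child_offset, size_t child_size, size_t parent_align_size) {
  return child_offset + Align(child_size) <= parent_align_size + kMemAlignSize;
}

int main() {
  const size_t parent_align_size = 1606144;  // parent block's aligned size, as in the comment
  const size_t child_offset = 803328;        // first 512-aligned child already placed
  const size_t second_child = 803328;        // second child's aligned size
  // strict test: 803328 + 803328 = 1606656 > 1606144 -> rejected
  std::printf("strict: %d\n", child_offset + second_child <= parent_align_size);
  // with one 512-byte slack: 1606656 <= 1606144 + 512 -> accepted
  std::printf("slack:  %d\n", FitsWithSlack(child_offset, second_child, parent_align_size));
  return 0;
}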
@@ -261,6 +357,7 @@ size_t MemoryBlock::GetDependLifeBegin(int64_t stream_id, DependStreamLife &tota
 void AddDependLife(const ge::NodePtr &org_node, const ge::NodePtr &node, int64_t stream_id,
                    std::map<int64_t, size_t> &depend_stream_life, DependStreamLife &total_node_depend_stream_life) {
   GE_CHECK_NOTNULL_EXEC(node, return);
+  GE_CHECK_NOTNULL_EXEC(org_node, return);
   auto node_desc = node->GetOpDesc();
   GE_CHECK_NOTNULL_EXEC(node_desc, return);
   auto node_id = node_desc->GetId();
@@ -415,12 +512,60 @@ BlockMemAssigner::~BlockMemAssigner() {
   }
 }

+void GetMaxBatchAllMemorySize(std::map<std::string, vector<int64_t>> &batch_all_memory_size,
+                              std::map<std::string, int64_t> batch_total_size, vector<int64_t> &all_memory_size,
+                              std::string &max_batch_label) {
+  // use the max batch's memory sizes to build the reuse ranges
+  int64_t max_batch_size = 0;
+  for (const auto &it : batch_total_size) {
+    GELOGI("Batch[%s] total memory size[%ld]", it.first.c_str(), it.second);
+    // no batch label
+    if (it.first.empty()) {
+      continue;
+    }
+    if (it.second > max_batch_size) {
+      max_batch_size = it.second;
+      max_batch_label = it.first;
+    }
+  }
+  GELOGI("Max batch[%s] total memory size[%ld]", max_batch_label.c_str(), max_batch_size);
+
+  for (const auto &it : batch_all_memory_size) {
+    if (it.first.empty() || (it.first == max_batch_label)) {
+      all_memory_size.insert(all_memory_size.end(), it.second.begin(), it.second.end());
+    }
+  }
+  // all_memory_size can't be empty
+  if (all_memory_size.empty()) {
+    all_memory_size.emplace_back(MEM_ALIGN_SIZE);
+  }
+  sort(all_memory_size.begin(), all_memory_size.end());
+  GELOGD("All memory size: %s", ToString(all_memory_size).c_str());
+
+  for (auto iter = all_memory_size.begin(); iter != all_memory_size.end();) {
+    if (*iter == 0) {
+      iter = all_memory_size.erase(iter);
+    } else {
+      ++iter;
+    }
+  }
+}
+
 void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) {
   vector<int64_t> temp;
+  std::map<std::string, vector<int64_t>> batch_all_memory_size;
+  std::map<std::string, int64_t> batch_total_size;
   for (const NodePtr &n : compute_graph_->GetAllNodes()) {
     auto node_op_desc = n->GetOpDesc();
     GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);

+    if (CheckIsZeroMemNodeType(node_op_desc->GetType())) {
+      continue;
+    }
+
+    std::string batch_label;
+    (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
+
     if (node_op_desc->GetType() == ATOMICADDRCLEAN) {
       atomic_addr_clean_id_ = node_op_desc->GetId();
     }
@@ -434,9 +579,14 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) {
       if (!reuse_input) {
         int64_t size = 0;
         GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed"));
-        if (anchor_to_symbol_.empty()) {
-          all_memory_size.emplace_back(size);
+        batch_all_memory_size[batch_label].emplace_back(size);
+        if (batch_total_size.find(batch_label) == batch_total_size.end()) {
+          batch_total_size[batch_label] = size;
         } else {
+          batch_total_size[batch_label] += size;
+        }
+
+        if (!anchor_to_symbol_.empty()) {
           auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString());
           if (iter1 == anchor_to_symbol_.end()) {
             continue;
       }
     }
     temp.clear();
-    GetNodeWorkSpaceSize(n, temp);
-    all_memory_size.insert(all_memory_size.end(), temp.begin(), temp.end());
-  }
-  for (const auto &pair : symbol_size_) {
-    all_memory_size.emplace_back(pair.second);
-  }
-  sort(all_memory_size.begin(), all_memory_size.end());
-  GELOGD("All memory size: %s", ToString(all_memory_size).c_str());
-
-  for (auto iter = all_memory_size.begin(); iter != all_memory_size.end();) {
-    if (*iter == 0) {
-      iter = all_memory_size.erase(iter);
-    } else {
-      ++iter;
-    }
-  }
+    GetNodeWorkSpaceSize(n, temp, batch_total_size[batch_label]);
+    batch_all_memory_size[batch_label].insert(batch_all_memory_size[batch_label].end(), temp.begin(), temp.end());
   }
-
+  GELOGI("The last atomic_addr_clean node id: %ld", atomic_addr_clean_id_);
+  GetMaxBatchAllMemorySize(batch_all_memory_size, batch_total_size, all_memory_size, max_batch_label_);
   InitReuseFlag();
   PrintSymbolMap();
 }
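An illustrative usage of the selection rule above (not part of the patch; labels and sizes are made up): only the largest labeled batch, plus the unlabeled sizes, feeds the range builder.

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

int main() {
  std::map<std::string, int64_t> batch_total_size = {
      {"", 1024},          // unlabeled ops, always kept
      {"Batch_0", 4096},   // small batch branch
      {"Batch_1", 8192}};  // largest branch -> becomes max_batch_label
  std::string max_batch_label;
  int64_t max_batch_size = 0;
  for (const auto &it : batch_total_size) {
    if (!it.first.empty() && it.second > max_batch_size) {
      max_batch_size = it.second;
      max_batch_label = it.first;
    }
  }
  // Blocks of Batch_0 are later overlaid on Batch_1's offsets instead of
  // contributing their own memory ranges.
  std::printf("max batch: %s (%ld bytes)\n", max_batch_label.c_str(), static_cast<long>(max_batch_size));
  return 0;
}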
@@ -529,16 +667,6 @@ bool CanReuseBySize(const map<string, uint64_t> &reusable_block_counts, const Me
   bool can_reuse = false;
   if (reusable_block.Size() == block_size) {
     can_reuse = true;
-  } else {
-    string key = std::to_string(reusable_block.Size());
-    key += "_" + std::to_string(reusable_block.stream_id_);
-    key += "_" + std::to_string(reusable_block.memory_type_);
-    auto it = reusable_block_counts.find(key);
-    GE_IF_BOOL_EXEC((it != reusable_block_counts.end() && (it->second > kReuseMaxCount)) &&
-                    (reusable_block.Size() > block_size),
-                    can_reuse = true;
-                    GELOGD("Less size mem reuse, reuse block size:%zu, current block size:%zu",
-                           reusable_block.Size(), block_size););
   }
   return can_reuse;
 }
@@ -860,17 +988,26 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "Input parameter n is null.");
   auto node_op_desc = n->GetOpDesc();
   GE_IF_BOOL_EXEC(node_op_desc == nullptr, return nullptr);
+  std::string batch_label;
+  (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
+  if (batch_label.empty() || (batch_label == max_batch_label_)) {
+    size_t align_size = real_size;
+    AlignMemOffset(align_size);
+    theory_memory_size_ += align_size;
+    if (theory_memory_size_ > theory_min_memory_size_) {
+      theory_min_memory_size_ = theory_memory_size_;
+    }
+  }

   bool is_reuse_memory = false;
-  string ge_disable_reuse_mem_env = "0";
-  (void)ge::GetContext().GetOption(OPTION_EXEC_DISABLE_REUSED_MEMORY, ge_disable_reuse_mem_env);
-  if (ge_disable_reuse_mem_env != "1") {
+  if (ge_disable_reuse_mem_env_ != "1") {
     bool reuse_mem_flag = (mem_type == kOutput) ? IsPreReuse(n, out_index)
                           : !((workspace_reuse_flag.size() > out_index) && !workspace_reuse_flag[out_index]);
     is_reuse_memory = !node_op_desc->HasAttr(kL2FusionDynamicConvergeOp) &&
                       !node_op_desc->HasAttr(kOpNoReuseMem) && reuse_mem_flag && is_op_reuse_mem;
-    auto stream_id = node_op_desc->GetStreamId();
-    if (is_reuse_memory && !continuous && !reusable_blocks_[memory_type].empty()) {
+    bool do_reuse = is_reuse_memory && !continuous && !reusable_blocks_[memory_type].empty();
+    if (do_reuse) {
+      auto stream_id = node_op_desc->GetStreamId();
       for (auto it = reusable_blocks_[memory_type][stream_id].rbegin();
            it != reusable_blocks_[memory_type][stream_id].rend(); ++it) {
         MemoryBlock *reusable_block = *it;
@@ -879,15 +1016,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
           GELOGI("Unreusable block.");
           continue;
         }
-        std::string batch_label;
-        if (reusable_block->IsSameLabel(batch_label)) {
-          std::string op_label;
-          (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, op_label);
-          if (batch_label != op_label) {
-            GELOGI("label diff, op name %s", node_op_desc->GetName().c_str());
-            continue;
-          }
-        }
+        GE_IF_BOOL_EXEC(reusable_block->batch_label_ != batch_label, continue);

         // A node can reuse blocks of the same stream and preorder streams
         if (CanReuseBySize(reusable_block_counts_, *reusable_block, block_size, real_size, continuous)) {
@@ -914,10 +1043,11 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
   // Data and netoutput need zero copy block
   block->is_zero_copy_ = IsZeroCopyBlock(n, continuous);

-  block->Init(real_size, mem_type, n, out_index, no_align_size);
+  block->Init(real_size, mem_type, n, out_index, no_align_size, node_op_desc->GetStreamId());
   block->stream_id_ = node_op_desc->GetStreamId();
   block->ref_count_++;
   block->continuous_block_ = continuous;
+  block->batch_label_ = batch_label;
   if (mem_type == kOutput) {
     auto iter = anchor_to_symbol_.find(NodeIndexIO(n, out_index, kOut).ToString());
     if (iter != anchor_to_symbol_.end()) {
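The theory_memory_size_ bookkeeping added above is a simple high-water mark. A standalone sketch (not GE code; the 512-byte alignment is an assumption carried over from MEM_ALIGN_SIZE): every allocation adds its aligned size to a running total, every release subtracts it, and the peak is the theoretical minimum memory the graph would need with perfect reuse.

#include <cstddef>
#include <cstdio>

struct TheoryMem {
  size_t current = 0;
  size_t high_water = 0;
  static size_t Align(size_t s) { return (s + 511) / 512 * 512; }
  void Alloc(size_t real_size) {
    current += Align(real_size);
    if (current > high_water) high_water = current;
  }
  void Release(size_t real_size) { current -= Align(real_size); }
};

int main() {
  TheoryMem m;
  m.Alloc(1000);    // aligned to 1024
  m.Alloc(300);     // aligned to 512 -> watermark 1536
  m.Release(1000);  // current back to 512
  m.Alloc(2000);    // aligned to 2048 -> watermark 2560
  std::printf("theory min size: %zu\n", m.high_water);  // 2560
  return 0;
}

Comparing this watermark against the final mem_offset_ in the log line added at the end of ResizeMemoryBlocks gives a quick measure of how close the assigner gets to the ideal.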
@@ -945,6 +1075,11 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec
       return nullptr;
     }

+    if (CheckIsZeroMemNodeType(n->GetType())) {
+      zero_memory_list_.emplace_back(n, kOutput, index);
+      continue;
+    }
+
     int64_t size = 0;
     if (ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS) {
       GELOGI("Get size failed");
     }
@@ -957,9 +1092,7 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec
     // only apply total size in first block
     if (index != 0) {
       zero_memory_list_.emplace_back(n, kOutput, index);
-    }
-
-    if (index == 0) {
+    } else {
       NodeIndexIO node_index_io(n, index, kOut);
       auto iter = anchor_to_symbol_.find(node_index_io.ToString());
       if (iter != anchor_to_symbol_.end()) {
@@ -972,6 +1105,10 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec
     }
   }

+  if (total_size == 0) {
+    return nullptr;
+  }
+
   auto block_size = GetBlockSize(total_size, ranges);
   GELOGI("Node[%s] continuous out memory size[%ld] block size[%zu]",
          node_op_desc->GetName().c_str(), total_size, block_size);
@@ -1119,15 +1256,28 @@ bool IsKnownSubgraphData(const NodePtr &node) {
   return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX);
 }

-void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory) {
+void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory,
+                                     bool same_stream) {
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null.");
   GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory");
   GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory");
   --to_release->ref_count_;
+  if (!same_stream) {
+    to_release->same_stream_ = false;
+  }
   if (to_release->ref_count_ == 0) {
-    to_release->SetLifeTimeEnd(life_time_);
-    reusable_memory.emplace_back(to_release);
-    AddReusableBlockCount(*to_release, reusable_block_counts_);
+    if (to_release->reuse_mem_ && !to_release->RealSizeList().empty()) {
+      if (to_release->batch_label_.empty() || (to_release->batch_label_ == max_batch_label_)) {
+        size_t align_size = to_release->RealSizeList().back();
+        AlignMemOffset(align_size);
+        theory_memory_size_ -= align_size;
+      }
+    }
+    if (to_release->same_stream_) {
+      to_release->SetLifeTimeEnd(life_time_);
+      reusable_memory.emplace_back(to_release);
+      AddReusableBlockCount(*to_release, reusable_block_counts_);
+    }
   }
 }
@@ -1167,10 +1317,9 @@ void BlockMemAssigner::ReleaseInputNodeOutMemory(const unordered_map
               GetName().c_str());
       if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) &&
-          (node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx())) &&
-          (node->GetOpDesc()->GetStreamId() == block->stream_id_)) {
-        ReleaseMemory(block, reusable_memory);
-        if (block->ref_count_ == 0) {
+          (node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx()))) {
+        ReleaseMemory(block, reusable_memory, (node->GetOpDesc()->GetStreamId() == block->stream_id_));
+        if (block->ref_count_ == 0 && block->same_stream_) {
          SetLastUsedInputMemAttr(node, in_anchor->GetIdx());
         }
       }
@@ -1328,7 +1477,8 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
       iter->second[stream_id].clear();
     }
     vector<int64_t> temp;
-    GetNodeWorkSpaceSize(n, temp);
+    int64_t total_size = 0;
+    GetNodeWorkSpaceSize(n, temp, total_size);
     vector<int64_t> workspace_bytes;
     vector<int64_t> tvm_workspace_memory_type;
     bool has_tvm_workspace_mem_type_attr =
@@ -1380,9 +1530,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
     (void)mem_block;  // Fix warning
   }

-  bool merge_dynamic_batch = false;
-  GE_IF_BOOL_EXEC(!(ge_disable_reuse_mem_env_ == "1"), merge_dynamic_batch = MergeDynamicBatchBlocks());
-  GE_IF_BOOL_EXEC((!(ge_disable_reuse_mem_env_ == "1") && !merge_dynamic_batch), ReuseBlocksByLifeTime(ranges.size()));
+  GE_IF_BOOL_EXEC(!(ge_disable_reuse_mem_env_ == "1"), ReuseBlocksByLifeTime(ranges.size()));
   AssignContinuousBlocks();
   ResizeMemoryBlocks();
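The release rule introduced above is worth restating. A standalone sketch (not GE code; the struct and names are invented for illustration): a block's reference count drops on every release, but it only re-enters the reusable pool if all of its users were on the block's own stream; a cross-stream release just latches same_stream to false so later same-stream reuse is skipped.

#include <cstdint>
#include <vector>

struct Block {
  int ref_count = 0;
  bool same_stream = true;
  int64_t stream_id = 0;
};

void Release(Block *b, std::vector<Block *> &reusable, int64_t releasing_stream) {
  if (b == nullptr || b->ref_count <= 0) return;
  if (releasing_stream != b->stream_id) b->same_stream = false;
  if (--b->ref_count == 0 && b->same_stream) {
    reusable.push_back(b);  // safe to hand to the next same-stream consumer
  }
}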
@@ -1402,92 +1550,19 @@ void BlockMemAssigner::CheckWorkspaceReuse(const vector<bool> &workspace_reuse_f
   }
 }

-void BlockMemAssigner::GetNodeWorkSpaceSize(const NodePtr &node, vector<int64_t> &workspace_memory) {
+void BlockMemAssigner::GetNodeWorkSpaceSize(const NodePtr &node, vector<int64_t> &workspace_memory,
+                                            int64_t &total_size) {
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node->GetOpDesc() == nullptr, return, "Op desc is null.");
   vector<int64_t> workspace_byte_nums = node->GetOpDesc()->GetWorkspaceBytes();
   GELOGD("node[%s] size:%zu", node->GetOpDesc()->GetName().c_str(), workspace_byte_nums.size());
   for (int64_t byte_size : workspace_byte_nums) {
     workspace_memory.emplace_back(byte_size);
+    total_size += byte_size;
     GELOGD("push back size:%ld", byte_size);
   }
 }

-// descending order
-static bool CompareBlockMaxSize(MemoryBlock *left, MemoryBlock *right) {
-  if (left == nullptr || right == nullptr) {
-    return false;
-  }
-  auto left_max_size = std::max_element(left->RealSizeList().begin(), left->RealSizeList().end());
-  if (left_max_size != left->RealSizeList().end()) {
-    auto right_max_size = std::max_element(right->RealSizeList().begin(), right->RealSizeList().end());
-    if (right_max_size == right->RealSizeList().end() || (*left_max_size > *right_max_size)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-void MergeBlocks(std::vector<MemoryBlock *> &dest, std::vector<MemoryBlock *> &src) {
-  for (size_t i = 0; i < dest.size(); ++i) {
-    if (i >= src.size()) {
-      return;
-    }
-    if (dest[i] != nullptr && src[i] != nullptr) {
-      if (!dest[i]->reuse_mem_ || !src[i]->reuse_mem_) {
-        GELOGD("Diff batch's workspace can't be reused, i: %zu, dest[i]: %s, stream: %ld, src[i]: %s, stream: %ld.",
-               i, dest[i]->String().c_str(), dest[i]->stream_id_, src[i]->String().c_str(), src[i]->stream_id_);
-        continue;
-      }
-      for (auto &symbol : src[i]->SymbolList()) {
-        dest[i]->AddSymbol(symbol);
-      }
-      for (size_t j = 0; j < src[i]->NodeTypeIndexList().size(); ++j) {
-        dest[i]->AddNodeTypeIndex(src[i]->NodeTypeIndexList()[j],
-                                  src[i]->RealSizeList()[j],
-                                  src[i]->NoAlignSizeList()[j]);
-        src[i]->deleted_block_ = true;
-      }
-    }
-  }
-}
-
-bool BlockMemAssigner::MergeDynamicBatchBlocks() {
-  bool merged = false;
-  std::map<std::string, std::vector<MemoryBlock *>> dynamic_batch_blocks;
-  for (auto block : memory_blocks_) {
-    if (block == nullptr) {
-      continue;
-    }
-    std::string batch_label;
-    if (block->IsSameLabel(batch_label)) {
-      dynamic_batch_blocks[batch_label].emplace_back(block);
-    }
-  }
-
-  auto it = dynamic_batch_blocks.begin();
-  auto it_max = it;
-
-  // find max block counts
-  for (; it != dynamic_batch_blocks.end(); ++it) {
-    if (it->second.size() > it_max->second.size()) {
-      it_max = it;
-    }
-    std::sort(it->second.begin(), it->second.end(), CompareBlockMaxSize);
-  }
-  if (it_max != dynamic_batch_blocks.end()) {
-    GELOGD("MergeDynamicBatch %s block counts %zu", it_max->first.c_str(), it_max->second.size());
-  }
-  for (it = dynamic_batch_blocks.begin(); it != dynamic_batch_blocks.end(); ++it) {
-    if (it != it_max) {
-      GELOGD("MergeDynamicBatch from %s to %s", it->first.c_str(), it_max->first.c_str());
-      MergeBlocks(it_max->second, it->second);
-      merged = true;
-    }
-  }
-  return merged;
-}
-
 // ascending order
 static bool CompareBlockIndex(MemoryBlock *left, MemoryBlock *right) {
   if (left == nullptr || right == nullptr) {
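The AddBlockMemOffset helper added below lays blocks end to end in their pool. A standalone sketch (not GE code; names are illustrative, and the 512-byte guard gap mirrors MEM_ALIGN_SIZE) of the head/tail placement it performs:

#include <cstddef>

struct SimpleBlock {
  size_t size;
  bool first_continuous;
  size_t head = 0;
  size_t tail = 0;
};

void PlaceBlock(size_t &pool_offset, SimpleBlock &b) {
  if (b.first_continuous) pool_offset += 512;  // guard gap before a continuous group
  b.head = pool_offset;
  pool_offset += b.size;
  b.tail = pool_offset - 1;  // inclusive tail offset
}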
@@ -1597,38 +1672,93 @@ void BlockMemAssigner::ReuseBlocksByLifeTime(size_t range_size) {
   }
 }

+void AddBlockMemOffset(size_t &mem_offset, size_t &p2p_mem_offset, MemoryBlock &block) {
+  if (block.memory_type_ == RT_MEMORY_HBM) {
+    if (block.first_continuous_block_) {
+      mem_offset += MEM_ALIGN_SIZE;
+    }
+    block.Resize();
+    block.SetHeadOffset(mem_offset);
+    mem_offset += block.Size();
+    block.SetTailOffset(mem_offset - 1);
+  } else if (block.memory_type_ == RT_MEMORY_P2P_DDR) {
+    if (block.first_continuous_block_) {
+      p2p_mem_offset += MEM_ALIGN_SIZE;
+    }
+    block.Resize();
+    block.SetHeadOffset(p2p_mem_offset);
+    p2p_mem_offset += block.Size();
+    block.SetTailOffset(p2p_mem_offset - 1);
+  }
+}
+
+bool DynamicBatchBlockReuse(MemoryBlock &block) {
+  return (block.IsSameBatchLabel() && block.reuse_mem_);
+}
+
 ///
 /// @ingroup domi_omg
-/// @brief traverse memory size, resize, calculate offset
+/// @brief lay out the max batch's blocks; blocks of the other batches reuse this memory
 /// @param [in&out] memory_blocks_ memory block, after calculating offset
+/// |-dynamic batch block batch1|
+/// |-dynamic batch block batch2----|
+/// |-dynamic batch block batch3--|
 ///
-void BlockMemAssigner::ResizeMemoryBlocks() {
-  for (auto &memory_block : memory_blocks_) {
+void BlockMemAssigner::ResizeDynamicBatchBlocks() {
+  std::map<std::string, std::vector<MemoryBlock *>> dynamic_batch_blocks;
+  for (auto block : memory_blocks_) {
+    if (block == nullptr) {
       continue;
     }
-    if (memory_block->memory_type_ == RT_MEMORY_HBM) {
-      if (memory_block->first_continuous_block_) {
-        mem_offset_ += MEM_ALIGN_SIZE;
-      }
+    // memory that is not reusable can't be shared across different batch branches
+    if (DynamicBatchBlockReuse(*block)) {
+      dynamic_batch_blocks[block->batch_label_].emplace_back(block);
+    }
+  }

-      memory_block->Resize();
-      memory_block->SetHeadOffset(mem_offset_);
-      mem_offset_ += memory_block->Size();
-      memory_block->SetTailOffset(mem_offset_ - 1);
-    } else if (memory_block->memory_type_ == RT_MEMORY_P2P_DDR) {
-      if (memory_block->first_continuous_block_) {
-        p2p_mem_offset_ += MEM_ALIGN_SIZE;
+  size_t max_mem_offset = mem_offset_;
+  size_t max_p2p_mem_offset = p2p_mem_offset_;
+  for (auto &batch_blocks : dynamic_batch_blocks) {
+    size_t mem_offset = mem_offset_;
+    size_t p2p_mem_offset = p2p_mem_offset_;
+    for (auto block : batch_blocks.second) {
+      if (block == nullptr || block->deleted_block_ || block->is_zero_copy_) {
+        continue;
       }
+      AddBlockMemOffset(mem_offset, p2p_mem_offset, *block);
+    }
+    if (mem_offset > max_mem_offset) {
+      max_mem_offset = mem_offset;
+    }
+    if (p2p_mem_offset > max_p2p_mem_offset) {
+      max_p2p_mem_offset = p2p_mem_offset;
+    }
+    GELOGI("Batch[%s] offset[%zu] p2p_offset[%zu]", batch_blocks.first.c_str(), mem_offset, p2p_mem_offset);
+  }
+  mem_offset_ = max_mem_offset;
+  p2p_mem_offset_ = max_p2p_mem_offset;
+}

-      memory_block->Resize();
-      memory_block->SetHeadOffset(p2p_mem_offset_);
-      p2p_mem_offset_ += memory_block->Size();
-      memory_block->SetTailOffset(p2p_mem_offset_ - 1);
+///
+/// @ingroup domi_omg
+/// @brief traverse memory size, resize, calculate offset
+/// @param [in&out] memory_blocks_ memory block, after calculating offset
+/// |-not dynamic batch block-||-dynamic batch block batch1|    |-zero copy block-|
+/// |-not dynamic batch block-||-dynamic batch block batch2----||-zero copy block-|
+/// |-not dynamic batch block-||-dynamic batch block batch3--|  |-zero copy block-|
+///
+void BlockMemAssigner::ResizeMemoryBlocks() {
+  for (auto &memory_block : memory_blocks_) {
+    if (memory_block == nullptr || memory_block->deleted_block_ || memory_block->is_zero_copy_
+        || DynamicBatchBlockReuse(*memory_block)) {
+      continue;
     }
+
+    AddBlockMemOffset(mem_offset_, p2p_mem_offset_, *memory_block);
   }
-  GELOGD("mem_offset_ exclude zero_copy_memory is %zu, p2p_mem_offset_ exclude zero_copy_memory is %zu.",
-         mem_offset_, p2p_mem_offset_);
+  ResizeDynamicBatchBlocks();
+  GELOGI("mem_offset_ exclude zero_copy_memory is %zu, p2p_mem_offset_ exclude zero_copy_memory is %zu, "
+         "theory_min_memory_size %zu", mem_offset_, p2p_mem_offset_, theory_min_memory_size_);
 }

 ///
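The overlay in ResizeDynamicBatchBlocks is the heart of this optimization. A standalone sketch (not GE code; names and types are illustrative): every batch branch is laid out starting from the same base offset, and the pool size becomes the maximum end offset over all branches.

#include <algorithm>
#include <cstddef>
#include <map>
#include <string>
#include <vector>

size_t OverlayBatches(size_t base_offset,
                      const std::map<std::string, std::vector<size_t>> &batch_block_sizes) {
  size_t max_end = base_offset;
  for (const auto &batch : batch_block_sizes) {
    size_t offset = base_offset;          // every batch starts at the same base
    for (size_t s : batch.second) offset += s;
    max_end = std::max(max_end, offset);  // pool must hold the largest branch
  }
  return max_end;
}

Overlapping is safe because only one batch branch of a dynamic multi-batch model executes per run, so blocks of different batch labels can never be live at the same time.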
@@ -1641,7 +1771,7 @@ void BlockMemAssigner::ResizeMemoryBlocks() {
 /// @return Status result
 ///
 void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block,
-                   size_t real_size, size_t no_align_size, bool child_block) {
+                   size_t real_size, size_t no_align_size, int32_t child_block_level) {
   ge::OpDescPtr op_desc = node_type.node->GetOpDesc();
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(op_desc == nullptr, return, "op_desc is null.");
   string graph_name = node_type.node->GetOwnerComputeGraph()->GetName();
@@ -1689,14 +1819,15 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block,
     }
     op_desc->SetWorkspace(workspace_list);
   }
-  GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]"
-         " noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d] isref[%d].", graph_name.c_str(),
+  GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu] noalignsize[%zu] "
+         "life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]", graph_name.c_str(),
          op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(),
-         block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block, block->reuse_mem_,
-         block->continuous_block_, block->deleted_block_, node_type.ref_input);
+         block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block_level, block->reuse_mem_,
+         block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input,
+         block->batch_label_.c_str());
 }

-void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) {
+void SetBlockOpMemOffset(MemoryBlock *block, int32_t child_block_level) {
   if (block == nullptr) {
     return;
   }
@@ -1709,9 +1840,14 @@ void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) {
       real_size = block->RealSizeList()[index];
       no_align_size = block->NoAlignSizeList()[index];
     }
-    SetOffsetSize(node_type_index, block, real_size, no_align_size, child_block);
+    SetOffsetSize(node_type_index, block, real_size, no_align_size, child_block_level);
     index++;
   }
+
+  child_block_level++;
+  for (MemoryBlock *child_block : block->ChildBlockList()) {
+    SetBlockOpMemOffset(child_block, child_block_level);
+  }
 }

 void BlockMemAssigner::SetOpMemOffset(bool is_zero_copy) {
       continue;
     }

-    SetBlockOpMemOffset(memory_block, false);
-    for (MemoryBlock *child_block : memory_block->ChildBlockList()) {
-      SetBlockOpMemOffset(child_block, true);
-    }
+    SetBlockOpMemOffset(memory_block, 0);
   }

   if (!is_zero_copy) {
     for (const NodeTypeIndex &node_type_index : zero_memory_list_) {
       MemoryBlock block(0, 0);
-      SetOffsetSize(node_type_index, &block, 0, 0, false);
+      SetOffsetSize(node_type_index, &block, 0, 0, 0);
     }
   }
 }
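The bool child_block flag becomes an int32_t nesting depth above because child blocks can now hold children of their own. A standalone sketch (not GE code; the struct is invented) of how the recursion assigns levels, which only feed the child[...] field of the IMAS log line:

#include <cstdint>
#include <cstdio>
#include <vector>

struct Blk {
  const char *name;
  std::vector<Blk *> children;
};

void LogOffsets(const Blk *b, int32_t level) {
  if (b == nullptr) return;
  std::printf("block %s child level %d\n", b->name, level);
  for (const Blk *c : b->children) {
    LogOffsets(c, level + 1);  // grandchildren get level 2, and so on
  }
}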
diff --git a/ge/graph/build/memory/block_mem_assigner.h b/ge/graph/build/memory/block_mem_assigner.h
index f3d26c1d..d514ca34 100755
--- a/ge/graph/build/memory/block_mem_assigner.h
+++ b/ge/graph/build/memory/block_mem_assigner.h
@@ -65,6 +65,7 @@ class MemoryBlock {
         stream_id_(stream_id),
         deleted_block_(false),
         reuse_mem_(reuse_mem),
+        same_stream_(true),
         input_index_(0),
         continuous_block_(false),
         first_continuous_block_(false),
@@ -85,10 +86,14 @@ class MemoryBlock {
     symbol_list_.clear();
   }

-  void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size) {
+  void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size,
+            int64_t stream_id) {
     real_size_list_.emplace_back(real_size);
     no_align_size_list_.emplace_back(no_align_size);
     node_type_index_list_.emplace_back(node, type, out_index, false);
+    if (stream_id != stream_id_) {
+      same_stream_ = false;
+    }
   }

   size_t Size() const { return block_size_; }
@@ -106,6 +111,12 @@ class MemoryBlock {
     node_type_index_list_.emplace_back(node_type_index);
     real_size_list_.emplace_back(real_size);
     no_align_size_list_.emplace_back(no_align_size);
+    if ((node_type_index.node != nullptr) && (node_type_index.node->GetOpDesc() != nullptr)) {
+      auto stream_id = node_type_index.node->GetOpDesc()->GetStreamId();
+      if (stream_id != stream_id_) {
+        same_stream_ = false;
+      }
+    }
   }

   void AddSymbol(const std::string &symbol) {
@@ -122,7 +133,7 @@ class MemoryBlock {

   std::string String();

-  bool IsSameLabel(std::string &first_batch_label);
+  bool IsSameBatchLabel();

   void AddContinuousLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_node_depend_stream_life);

@@ -142,6 +153,7 @@ class MemoryBlock {
   int64_t stream_id_;
   bool deleted_block_;
   bool reuse_mem_;
+  bool same_stream_;
   uint32_t input_index_;
   bool continuous_block_;
   bool first_continuous_block_;
@@ -149,6 +161,7 @@ class MemoryBlock {
   bool is_zero_copy_;
   std::map<int64_t, size_t> depend_stream_life_;
   int64_t memory_type_;
+  std::string batch_label_;
  private:
   size_t block_size_;
   std::vector<size_t> real_size_list_;
@@ -209,7 +222,7 @@ class BlockMemAssigner : public MemAssigner {

   void GetOutAndWorkSpaceMem(std::vector<int64_t> &all_memory_size);

-  void GetNodeWorkSpaceSize(const ge::NodePtr &node, std::vector<int64_t> &workspace_memory);
+  void GetNodeWorkSpaceSize(const ge::NodePtr &node, std::vector<int64_t> &workspace_memory, int64_t &total_size);

   ///
   /// @ingroup GE
@@ -353,7 +366,7 @@ class BlockMemAssigner : public MemAssigner {
   /// @return void
   /// @author
   ///
-  void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory);
+  void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory, bool same_stream = true);

   ///
   /// @ingroup GE
@@ -379,11 +392,11 @@ class BlockMemAssigner : public MemAssigner {

   ///
   /// @ingroup GE
-  /// @brief Merge memory blocks between different batchs
+  /// @brief Resize memory blocks for each batch
   /// @return merge or not
   /// @author
   ///
-  bool MergeDynamicBatchBlocks();
+  void ResizeDynamicBatchBlocks();

   void AssignContinuousBlocks();

@@ -436,6 +449,17 @@ class BlockMemAssigner : public MemAssigner {

   int64_t atomic_addr_clean_id_ = 0;

+  size_t theory_min_memory_size_ = 0;
+
+  size_t theory_memory_size_ = 0;
+
+  std::string max_batch_label_;
+
+  ///
+  ///  @          [stream1][nodeid]
+  ///  @[nodeid]  [stream2][nodeid]
+  ///  @          [stream2][nodeid]
+  ///
   DependStreamLife total_node_depend_stream_life_;
 };
 }  // namespace ge
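The same_stream_ member added to MemoryBlock is a latch. A standalone sketch (not GE code; the struct is invented) of the bookkeeping Init and AddNodeTypeIndex now perform: the flag starts true and flips false permanently the first time a user from another stream is recorded, which disables same-stream pool reuse for that block.

#include <cstdint>

struct StreamTracker {
  int64_t stream_id;
  bool same_stream = true;
  void AddUser(int64_t user_stream) {
    if (user_stream != stream_id) {
      same_stream = false;  // block now spans streams; lifetime must be judged via stream dependencies
    }
  }
};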
[%s] size[%s] streamid[%ld]", + GELOGI("[IMAS]SetAtomicCleanAttr : Set %s atomic_node name[%s] output[0] offset to [%s] streamid[%ld] size[%s]", node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(), - atomic_mem_start_str.c_str(), atomic_mem_size_str.c_str(), node->GetOpDesc()->GetStreamId()); + atomic_mem_start_str.c_str(), node->GetOpDesc()->GetStreamId(), atomic_mem_size_str.c_str()); } return SUCCESS; } diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc index 93cb8d89..a97f8fdb 100755 --- a/ge/graph/load/new_model_manager/davinci_model.cc +++ b/ge/graph/load/new_model_manager/davinci_model.cc @@ -2178,7 +2178,7 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data void *mem_addr = data.second.GetBasicAddr(); void *data_buf_addr = reinterpret_cast(reinterpret_cast(data_buf.data)); uint64_t data_buf_length = data_buf.length; - GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] input[%u] dst[%p] src[%p] mem_size[%lu] datasize[%lu]", + GELOGI("CopyPlainData memcpy graph_%u type[F] input[%u] dst[%p] src[%p] mem_size[%lu] datasize[%lu]", runtime_param_.graph_id, data.first, mem_addr, data_buf_addr, data_size, data_buf_length); GE_CHK_RT_RET(rtMemcpy(mem_addr, data_size, data_buf_addr, data_buf_length, kind)); } diff --git a/ge/graph/load/new_model_manager/model_utils.cc b/ge/graph/load/new_model_manager/model_utils.cc index 34fb7ff3..22a657ad 100755 --- a/ge/graph/load/new_model_manager/model_utils.cc +++ b/ge/graph/load/new_model_manager/model_utils.cc @@ -61,7 +61,7 @@ vector ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { GELOGI("Get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); continue); - GELOGI("[IMAS]GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); + GELOGI("GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); v_input_size.push_back(tensor_size); } @@ -96,7 +96,7 @@ vector ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { GELOGI("Get size from TensorDesc failed, op : %s, output index : %zu", op_desc->GetName().c_str(), i); continue); - GELOGI("[IMAS]GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); + GELOGI("GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size); v_output_size.push_back(tensor_size); }