From d23b490946ef326da4748b6188602d9a598189dc Mon Sep 17 00:00:00 2001 From: weiyang Date: Mon, 30 Nov 2020 19:44:33 +0800 Subject: [PATCH] support known aicpu --- .../load/new_model_manager/davinci_model.cc | 31 +++-- .../load/new_model_manager/davinci_model.h | 12 +- .../load/new_model_manager/model_manager.cc | 2 +- .../task_info/kernel_task_info.cc | 118 ++++++++---------- .../task_info/kernel_task_info.h | 2 + 5 files changed, 85 insertions(+), 80 deletions(-) diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc index bc755e07..720c3c28 100755 --- a/ge/graph/load/new_model_manager/davinci_model.cc +++ b/ge/graph/load/new_model_manager/davinci_model.cc @@ -2991,19 +2991,19 @@ Status DavinciModel::CreateKnownZeroCopyMap(const vector &inputs, const return SUCCESS; } -Status DavinciModel::UpdateKnownZeroCopyAddr() { - for (size_t i = 0; i < total_io_addrs_.size(); ++i) { - auto it_in = knonw_input_data_info_.find(total_io_addrs_[i]); +Status DavinciModel::UpdateKnownZeroCopyAddr(vector &total_io_addrs) { + for (size_t i = 0; i < total_io_addrs.size(); ++i) { + auto it_in = knonw_input_data_info_.find(total_io_addrs[i]); if (it_in != knonw_input_data_info_.end()) { - GELOGI("DavinciModel::UpdateKnownZeroCopyAddr input %zu,v addr %p,p addr %p .", i, total_io_addrs_[i], - knonw_input_data_info_.at(total_io_addrs_[i])); - total_io_addrs_[i] = knonw_input_data_info_.at(total_io_addrs_[i]); + GELOGI("DavinciModel::UpdateKnownZeroCopyAddr input %zu,v addr %p,p addr %p .", i, total_io_addrs[i], + knonw_input_data_info_.at(total_io_addrs[i])); + total_io_addrs[i] = knonw_input_data_info_.at(total_io_addrs[i]); } - auto it_out = knonw_output_data_info_.find(total_io_addrs_[i]); + auto it_out = knonw_output_data_info_.find(total_io_addrs[i]); if (it_out != knonw_output_data_info_.end()) { - GELOGI("DavinciModel::UpdateKnownZeroCopyAddr output %zu,v addr %p,p addr %p .", i, total_io_addrs_[i], - knonw_output_data_info_.at(total_io_addrs_[i])); - total_io_addrs_[i] = knonw_output_data_info_.at(total_io_addrs_[i]); + GELOGI("DavinciModel::UpdateKnownZeroCopyAddr output %zu,v addr %p,p addr %p .", i, total_io_addrs[i], + knonw_output_data_info_.at(total_io_addrs[i])); + total_io_addrs[i] = knonw_output_data_info_.at(total_io_addrs[i]); } } GELOGI("DavinciModel::UpdateKnownZeroCopyAddr success."); @@ -3032,7 +3032,7 @@ Status DavinciModel::UpdateKnownNodeArgs(const vector &inputs, const vec } else { total_io_addrs_ = orig_total_io_addrs_; } - GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(), "DavinciModel::UpdateKnownZeroCopyAddr failed."); + GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(total_io_addrs_), "DavinciModel::UpdateKnownZeroCopyAddr failed."); if (total_args_size_ == 0) { GELOGW("DavinciModel::UpdateKnownNodeArgs device args %p, dst size %u, pass rtMemcpy.", args_, total_args_size_); @@ -3099,7 +3099,14 @@ Status DavinciModel::MallocKnownArgs() { GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } - + // malloc dynamic and static hybrid memory + if (total_hybrid_args_size_ != 0) { + rt_ret = rtMalloc(&hybrid_addrs_, total_hybrid_args_size_, RT_MEMORY_HBM); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + } // malloc fixed addr memory, eg: rts op if (total_fixed_addr_size_ != 0) { GELOGI("Begin to allocate fixed addr."); diff --git a/ge/graph/load/new_model_manager/davinci_model.h b/ge/graph/load/new_model_manager/davinci_model.h index 19888e1f..27bd4de5 100755 --- a/ge/graph/load/new_model_manager/davinci_model.h +++ b/ge/graph/load/new_model_manager/davinci_model.h @@ -476,6 +476,14 @@ class DavinciModel { void SetTotalIOAddrs(vector &io_addrs) { total_io_addrs_.insert(total_io_addrs_.end(), io_addrs.begin(), io_addrs.end()); } + void SetHybridArgsSize(uint32_t args_size) { total_hybrid_args_size_ += args_size; } + uint32_t GetHybridArgsSize() { + return total_hybrid_args_size_; + } + void *GetCurrentHybridArgsAddr(uint32_t offset) { + void *cur_args = static_cast(hybrid_addrs_) + offset; + return cur_args; + } void SetTotalFixedAddrsSize(string tensor_name, int64_t fix_addr_size); int64_t GetFixedAddrsSize(string tensor_name); void *GetCurrentFixedAddr(int64_t offset) const { @@ -494,7 +502,7 @@ class DavinciModel { Status MallocKnownArgs(); Status UpdateKnownNodeArgs(const vector &inputs, const vector &outputs); Status CreateKnownZeroCopyMap(const vector &inputs, const vector &outputs); - Status UpdateKnownZeroCopyAddr(); + Status UpdateKnownZeroCopyAddr(vector &total_io_addrs); void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; } Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info); @@ -977,6 +985,8 @@ class DavinciModel { void *args_ = nullptr; void *args_host_ = nullptr; void *fixed_addrs_ = nullptr; + void *hybrid_addrs_ = nullptr; + uint32_t total_hybrid_args_size_ = 0; int64_t total_fixed_addr_size_ = 0; std::map knonw_input_data_info_; std::map knonw_output_data_info_; diff --git a/ge/graph/load/new_model_manager/model_manager.cc b/ge/graph/load/new_model_manager/model_manager.cc index b595ac39..da4856d3 100755 --- a/ge/graph/load/new_model_manager/model_manager.cc +++ b/ge/graph/load/new_model_manager/model_manager.cc @@ -1214,7 +1214,7 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "Invalid model id %u, check weather model has been loaded or not.", model_id); + "Invalid model id %u, check whether model has been loaded or not.", model_id); if (davinci_model->NeedDestroyAicpuKernel()) { GELOGI("Start to destroy specified aicpu kernel."); diff --git a/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 74faeb24..364c7ac2 100755 --- a/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -372,7 +372,11 @@ Status KernelTaskInfo::SuperKernelDistribute() { Status KernelTaskInfo::Distribute() { GELOGD("KernelTaskInfo Distribute Start."); if (davinci_model_->IsKnownNode()) { - args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); + if (kernel_type_ == ccKernelType::TE) { + args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); + } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { + args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_); + } GELOGI("Known node %s args addr %p, offset %u.", op_desc_->GetName().c_str(), args_, args_offset_); } rtError_t rt_ret = RT_ERROR_NONE; @@ -428,36 +432,31 @@ Status KernelTaskInfo::UpdateArgs() { const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); vector input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc_); vector output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_); - vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_); vector io_addrs; - if (!op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) { - io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); - io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); + io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); + io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); + if (kernel_type_ == ccKernelType::TE) { + vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_); io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); - } else { - string peer_input_name; - if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) { - uint32_t output_index = davinci_model_->GetFixedAddrOutputIndex(peer_input_name); - if (output_index > output_data_addrs.size()) { - GELOGE(FAILED, "The output data addr size[%zu] and output index[%u] are inconsistent.", - output_data_addrs.size(), output_index); - return FAILED; - } - io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); - for (size_t i = 0; i < output_data_addrs.size(); ++i) { - if (i == output_index) { - void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_); - io_addrs.emplace_back(fixed_addr); - continue; - } - io_addrs.emplace_back(output_data_addrs[i]); - } - io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); + davinci_model_->SetTotalIOAddrs(io_addrs); + } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { + davinci_model_->UpdateKnownZeroCopyAddr(io_addrs); + uintptr_t io_addr = reinterpret_cast(args_addr.get()) + sizeof(aicpu::AicpuParamHead); + auto addrs_size = sizeof(uint64_t) * io_addrs.size(); + errno_t sec_ret = memcpy_s(reinterpret_cast(io_addr), addrs_size, io_addrs.data(), addrs_size); + if (sec_ret != EOK) { + GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); + return FAILED; + } + // copy args to device + rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); } } - davinci_model_->SetTotalIOAddrs(io_addrs); GELOGI("KernelTaskInfo::UpdateArgs success."); return SUCCESS; } @@ -533,33 +532,18 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) { } Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { - domi::KernelDef kernel_def = task_def.kernel(); - uint32_t args_size = kernel_def.args_size(); - args_offset_ = davinci_model->GetTotalArgsSize(); - davinci_model->SetTotalArgsSize(args_size); - GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_); - - // get opcontext stored in model + const domi::KernelDef &kernel_def = task_def.kernel(); const domi::KernelContext &context = kernel_def.context(); - // get opdesc - op_desc_ = davinci_model->GetOpByIndex(context.op_index()); - GE_CHECK_NOTNULL(op_desc_); - // alloc fixed addr - string peer_input_name; - if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name) && !peer_input_name.empty()) { - uint32_t output_index = davinci_model->GetFixedAddrOutputIndex(peer_input_name); - if (output_index > op_desc_->GetOutputsSize()) { - GELOGE(FAILED, "The output size[%zu] and output index[%u] are inconsistent.", op_desc_->GetOutputsSize(), - output_index); - return FAILED; - } - fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(peer_input_name); - auto tensor_desc = op_desc_->GetOutputDesc(output_index); - int64_t tensor_size = 0; - GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); - davinci_model->SetTotalFixedAddrsSize(peer_input_name, tensor_size); - GELOGI("Calculate stream switch task args , tensor size is %ld, fixed addr offset %ld", tensor_size, - fixed_addr_offset_); + kernel_type_ = static_cast(context.kernel_type()); + if (kernel_type_ == ccKernelType::TE) { + uint32_t args_size = kernel_def.args_size(); + args_offset_ = davinci_model->GetTotalArgsSize(); + davinci_model->SetTotalArgsSize(args_size); + GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_); + } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { + hybrid_args_offset_ = davinci_model->GetHybridArgsSize(); + davinci_model->SetHybridArgsSize(kernel_def.args_size()); + GELOGI("aicpu kernel task name , args_size %u, args_offset %u", kernel_def.args_size(), hybrid_args_offset_); } return SUCCESS; } @@ -888,7 +872,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k } // copy args to new host memory - std::unique_ptr args_addr(new (std::nothrow) uint8_t[args_size_]); + args_addr = std::unique_ptr(new (std::nothrow) uint8_t[args_size_]); GE_PRINT_DYNAMIC_MEMORY(new, "cce task physical memory.", sizeof(uint8_t) * args_size_) errno_t sec_ret = memcpy_s(args_addr.get(), args_size_, kernel_def.args().data(), args_size_); if (sec_ret != EOK) { @@ -896,8 +880,23 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k return FAILED; } - const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); + auto aicpu_param_head = reinterpret_cast(args_addr.get()); + const auto &ext_info = kernel_def.kernel_ext_info(); + auto init_ret = InitAicpuTaskExtInfo(ext_info); + if (init_ret != SUCCESS) { + GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size()); + return init_ret; + } + GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc_->GetName().c_str(), + op_desc_->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_); + aicpu_param_head->extInfoAddr = reinterpret_cast(aicpu_ext_info_addr_); + aicpu_param_head->extInfoLength = static_cast(ext_info.size()); + + if (davinci_model_->IsKnownNode()) { + return SUCCESS; + } + const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); vector input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); vector output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); vector io_addrs; @@ -914,19 +913,6 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k } } - auto aicpu_param_head = reinterpret_cast(args_addr.get()); - const auto &ext_info = kernel_def.kernel_ext_info(); - auto init_ret = InitAicpuTaskExtInfo(ext_info); - if (init_ret != SUCCESS) { - GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size()); - return init_ret; - } - GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc_->GetName().c_str(), - op_desc_->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_); - - aicpu_param_head->extInfoAddr = reinterpret_cast(aicpu_ext_info_addr_); - aicpu_param_head->extInfoLength = static_cast(ext_info.size()); - // malloc device memory for args rtError_t rt_ret = rtMalloc(static_cast(&args_), args_size_, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { diff --git a/ge/graph/load/new_model_manager/task_info/kernel_task_info.h b/ge/graph/load/new_model_manager/task_info/kernel_task_info.h index 1f90ede1..7717edd3 100644 --- a/ge/graph/load/new_model_manager/task_info/kernel_task_info.h +++ b/ge/graph/load/new_model_manager/task_info/kernel_task_info.h @@ -159,7 +159,9 @@ class KernelTaskInfo : public TaskInfo { OpDescPtr op_desc_; DavinciModel *davinci_model_; uint32_t args_offset_ = 0; + uint32_t hybrid_args_offset_ = 0; int64_t fixed_addr_offset_ = 0; + std::unique_ptr args_addr = nullptr; bool call_save_dump_ = false; // aicpu ext_info device mem