diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc index c83a76d1..0cd50196 100755 --- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc +++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc @@ -230,7 +230,7 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::functionnum_outputs == 0)) { GELOGD("Node[%s] type[%s] unknown_type is %d, output num is %d.", node_name_.c_str(), node_item_->node_type.c_str(), unknown_type_, node_item_->num_outputs); @@ -263,9 +263,15 @@ Status AicpuTfNodeTask::InitForDependComputeTask() { node_name_.c_str(), copy_input_buf_len); // copy task args buf - GE_CHK_STATUS_RET(AllocTensorBuffer(sizeof(STR_FWK_OP_KERNEL), copy_task_args_buf_), - "[Alloc][TensorBuffer] failed for Node[%s] to copy task args, size=%zu", - node_name_.c_str(), sizeof(STR_FWK_OP_KERNEL)); + if (is_aicpu_kernel) { + GE_CHK_STATUS_RET(AllocTensorBuffer(sizeof(aicpu::AicpuParamHead), copy_task_args_buf_), + "[Alloc][TensorBuffer] failed for Node[%s] to copy task args, size=%zu", + node_name_.c_str(), sizeof(aicpu::AicpuParamHead)); + } else { + GE_CHK_STATUS_RET(AllocTensorBuffer(sizeof(STR_FWK_OP_KERNEL), copy_task_args_buf_), + "[Alloc][TensorBuffer] failed for Node[%s] to copy task args, size=%zu", + node_name_.c_str(), sizeof(STR_FWK_OP_KERNEL)); + } std::vector copy_io_addr; copy_io_addr.emplace_back(reinterpret_cast(copy_input_release_flag_dev_->GetData())); @@ -327,7 +333,7 @@ Status AicpuTfNodeTask::Init(const HybridModel &model) { uint64_t ext_session_id = model.GetSessionId(); GE_CHK_STATUS_RET(InitExtInfo(kernel_ext_info, ext_session_id), "[Init][ExtInfo] failed for Node[%s].", node_name_.c_str()); - GE_CHK_STATUS_RET(InitForDependComputeTask(), "[Init][DependComputeTask] failed for Node[%s].", node_name_.c_str()); + GE_CHK_STATUS_RET(InitForDependComputeTask(false), "[Init][DependComputeTask] failed for Node[%s].", node_name_.c_str()); // build fwk_op_kernel. 
GE_IF_BOOL_EXEC(sizeof(STR_FWK_OP_KERNEL) < kernel_ex_def.args_size(), @@ -585,7 +591,8 @@ Status AicpuTfNodeTask::UpdateShapeAndDataByResultSummary(TaskContext &context) return SUCCESS; } -Status AicpuTfNodeTask::UpdateIoAddr(TaskContext &context) { +Status AicpuTfNodeTask::UpdateIoAddr( + TaskContext &context) { vector<uint64_t> io_addrs; io_addrs.reserve(node_item_->num_inputs + node_item_->num_outputs); for (auto i = 0; i < node_item_->num_inputs; ++i) { @@ -648,31 +655,10 @@ Status AicpuTfNodeTask::LaunchTask(TaskContext &context) { return SUCCESS; } -Status AicpuTfNodeTask::TaskCallback(TaskContext &context) { - GELOGD("Node[%s] task callback start. is_dynamic=%s, unknown_type=%d.", - node_name_.c_str(), node_item_->is_dynamic ? "true" : "false", unknown_type_); - Status callback_ret = SUCCESS; - if (node_item_->is_dynamic) { - // check need update shape, call update shape. - if (unknown_type_ == DEPEND_SHAPE_RANGE) { - // check result - callback_ret = UpdateOutputShapeFromExtInfo(context); - } else if (unknown_type_ == DEPEND_COMPUTE) { - callback_ret = UpdateShapeAndDataByResultSummary(context); - } - } - GELOGD("Node[%s] task callback end.", node_name_.c_str()); - return callback_ret; -} - Status AicpuNodeTask::Init(const HybridModel &model) { auto node_name = node_name_; GELOGD("Node[%s] init start.", node_name.c_str()); - GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED, - "[Check][Type]Node[%s] unknown type[%d] is depend compute, it's not supported now.", - node_name.c_str(), unknown_type_); - GE_CHK_BOOL_RET_STATUS(task_def_.has_kernel(), FAILED, "[Check][task_def_]Node[%s] task def does not has kernel.", node_name.c_str()); auto &kernel_def = task_def_.kernel(); @@ -761,6 +747,8 @@ Status AicpuNodeTask::Init(const HybridModel &model) { uint64_t ext_session_id = model.GetSessionId(); GE_CHK_STATUS_RET(InitExtInfo(kernel_ext_info, ext_session_id), "[Init][ExtInfo] failed for Node[%s].", node_name.c_str()); + GE_CHK_STATUS_RET(InitForDependComputeTask(true), + 
"[Init][DependComputeTask] failed for Node[%s].", node_name_.c_str()); if (ext_info_addr_dev_ == nullptr) { aicpu_param_head->extInfoLength = 0; @@ -769,11 +757,65 @@ Status AicpuNodeTask::Init(const HybridModel &model) { aicpu_param_head->extInfoLength = ext_info_addr_dev_->GetSize(); aicpu_param_head->extInfoAddr = reinterpret_cast(ext_info_addr_dev_->GetData()); } - + auto task_defs = model.GetTaskDefs(node_item_->node); + GE_CHECK_NOTNULL(task_defs); + if (unknown_type_ == DEPEND_COMPUTE) { + GE_CHK_STATUS_RET_NOLOG(SetMemCopyTask((*task_defs)[1])); + } GELOGD("Node[%s] init end.", node_name.c_str()); return SUCCESS; } +Status AicpuNodeTask::SetMemCopyTask(const domi::TaskDef &task_def) { + if (node_item_->num_outputs == 0) { + GELOGD("Node[%s] type[%s] has no output, no need set mem_copy task.", + node_name_.c_str(), node_item_->node_type.c_str()); + return SUCCESS; + } + + GELOGD("Start to set memcpy task for node[%s].", node_name_.c_str()); + const domi::KernelDef &kernel_def = task_def.kernel(); + auto &memcpy_args = kernel_def.args(); + uint32_t memcpy_args_size = kernel_def.args_size(); + GE_IF_BOOL_EXEC(memcpy_args.size() != memcpy_args_size, + REPORT_INNER_ERROR("E19999", "MemCopy task def args.size=%zu, but args_size=%u not equal.", + memcpy_args.size(), memcpy_args_size); + GELOGE(FAILED, "[Check][Size]MemCopy task def args.size=%zu, but args_size=%u not equal.", + memcpy_args.size(), memcpy_args_size); + return FAILED;); + GE_IF_BOOL_EXEC(memcpy_args_size < sizeof(aicpu::AicpuParamHead), + REPORT_INNER_ERROR("E19999", + "Task def args_size=%u is less than aicpu param head len=%zu.", + memcpy_args_size, sizeof(aicpu::AicpuParamHead)); + GELOGE(FAILED, + "[Check][Size] Task def args_size=%u is less than aicpu param head len=%zu.", + memcpy_args_size, sizeof(aicpu::AicpuParamHead)); + return FAILED;); + + std::unique_ptr memcpy_task; + memcpy_task.reset(new(std::nothrow) uint8_t[memcpy_args_size]()); + GE_IF_BOOL_EXEC(memcpy_task == nullptr, + 
REPORT_INNER_ERROR("E19999", "new memory failed for Node[MemCopy], task_size[%u].", + memcpy_args_size); + GELOGE(FAILED, "[Malloc][Memory] failed for Node[MemCopy], task_size[%u].", + memcpy_args_size); + return FAILED;); + + errno_t sec_ret = memcpy_s(memcpy_task.get(), memcpy_args_size, memcpy_args.c_str(), memcpy_args.size()); + GE_IF_BOOL_EXEC(sec_ret != EOK, + REPORT_INNER_ERROR("E19999", + "memcpy_s argc_ failed for Node[MemCopy], ret: %d", sec_ret); + GELOGE(INTERNAL_ERROR, + "[Update][args] failed for Node[MemCopy], ret: %d", sec_ret); + return sec_ret;); + + + GE_CHK_RT_RET(rtMemcpy(copy_task_args_buf_->GetData(), sizeof(aicpu::AicpuParamHead), + memcpy_task.get(), sizeof(aicpu::AicpuParamHead), RT_MEMCPY_HOST_TO_DEVICE)); + GELOGD("Set memcpy task for node[%s] successfully.", node_name_.c_str()); + return SUCCESS; +} + Status AicpuNodeTask::UpdateIoAddr(TaskContext &context) { vector<uint64_t> io_addrs; io_addrs.reserve(node_item_->num_inputs + node_item_->num_outputs); @@ -829,15 +871,19 @@ Status AicpuNodeTask::LaunchTask(TaskContext &context) { return SUCCESS; } -Status AicpuNodeTask::TaskCallback(TaskContext &context) { +Status AicpuNodeTaskBase::TaskCallback(TaskContext &context) { GELOGD("Node[%s] task callback start, is_dynamic = %s, unknown_type=%d.", node_name_.c_str(), node_item_->is_dynamic ? "true" : "false", unknown_type_); Status callback_ret = SUCCESS; // check need update shape, call update shape. 
- if (node_item_->is_dynamic && unknown_type_ == DEPEND_SHAPE_RANGE) { - // check result - callback_ret = UpdateOutputShapeFromExtInfo(context); + if (node_item_->is_dynamic) { + if (unknown_type_ == DEPEND_SHAPE_RANGE) { + // check result + callback_ret = UpdateOutputShapeFromExtInfo(context); + } else if (unknown_type_ == DEPEND_COMPUTE) { + callback_ret = UpdateShapeAndDataByResultSummary(context); + } } else { GELOGD("Node[%s] unknown shape type is %d no need update output shape.", node_name_.c_str(), unknown_type_); @@ -854,6 +900,31 @@ Status AiCpuNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) cons return status; } +Status AicpuNodeTask::UpdateShapeAndDataByResultSummary(TaskContext &context) { + GELOGD("Node[%s] update shape and data by result summary begin.", node_name_.c_str()); + + std::vector<std::unique_ptr<TensorBuffer>> out_shape_hbm; + GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(context, out_shape_hbm), + "[Invoke][ReadResultSummaryAndPrepareMemory] failed for Node[%s].", + node_name_.c_str()); + + RECORD_CALLBACK_EVENT(context.GetExecutionContext(), node_name_.c_str(), + "[ReadResultSummaryAndPrepareMemory] End"); + + GE_CHK_STATUS_RET(CopyDataToHbm(context, out_shape_hbm), + "[Invoke][CopyDataToHbm] failed for Node[%s] copy data to output.", + node_name_.c_str()); + + RECORD_CALLBACK_EVENT(context.GetExecutionContext(), node_name_.c_str(), "[CopyDataToHbm] End"); + + GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(context, out_shape_hbm), + "[Update][ShapeByHbmBuffer] failed for Node[%s].", + node_name_.c_str()); + + GELOGD("Node[%s] update shape and data by result summary end.", node_name_.c_str()); + return SUCCESS; +} + Status AiCpuNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, std::shared_ptr<NodeTask> &task) const { diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h index 14bc8fcc..25a6e8c3 100644 --- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h +++ 
b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h @@ -51,11 +51,17 @@ class AicpuNodeTaskBase : public NodeTask { virtual Status UpdateOutputShapeFromExtInfo(TaskContext &task_context); + virtual Status UpdateShapeAndDataByResultSummary(TaskContext &context) = 0; + Status UpdateShapeToOutputDesc(TaskContext &task_context, const GeShape &shape_new, int32_t output_index); virtual Status LaunchTask(TaskContext &context) = 0; + + Status InitForDependComputeTask(bool is_aicpu_kernel); - virtual Status TaskCallback(TaskContext &context) = 0; + Status TaskCallback(TaskContext &context); + + virtual Status SetMemCopyTask(const domi::TaskDef &task_def) = 0; virtual Status UpdateIoAddr(TaskContext &context) = 0; @@ -78,6 +84,20 @@ class AicpuNodeTaskBase : public NodeTask { // ext info addr, device mem std::unique_ptr ext_info_addr_dev_; + + // just used for depend DEPEND_COMPUTE op + std::unique_ptr copy_task_args_buf_; + + std::vector> output_summary_; + std::vector output_summary_host_; + + std::unique_ptr copy_ioaddr_dev_; + + std::unique_ptr copy_input_release_flag_dev_; + std::unique_ptr copy_input_data_size_dev_; + std::unique_ptr copy_input_src_dev_; + std::unique_ptr copy_input_dst_dev_; + bool need_sync_ = false; }; class AicpuTfNodeTask : public AicpuNodeTaskBase { @@ -93,17 +113,12 @@ class AicpuTfNodeTask : public AicpuNodeTaskBase { Status LaunchTask(TaskContext &context) override; - Status TaskCallback(TaskContext &context) override; - Status UpdateIoAddr(TaskContext &context) override; + + Status UpdateShapeAndDataByResultSummary(TaskContext &context) override; + Status SetMemCopyTask(const domi::TaskDef &task_def) override; private: - Status SetMemCopyTask(const domi::TaskDef &task_def); - - Status InitForDependComputeTask(); - - Status UpdateShapeAndDataByResultSummary(TaskContext &context); - /// /// read result summary and prepare copy task memory. 
/// @param context task context @@ -132,20 +147,6 @@ class AicpuTfNodeTask : public AicpuNodeTaskBase { // input and output addr, device mem std::unique_ptr input_output_addr_; - // just used for depend DEPEND_COMPUTE op - std::unique_ptr copy_task_args_buf_; - - std::vector> output_summary_; - std::vector output_summary_host_; - - std::unique_ptr copy_ioaddr_dev_; - - std::unique_ptr copy_input_release_flag_dev_; - std::unique_ptr copy_input_data_size_dev_; - std::unique_ptr copy_input_src_dev_; - std::unique_ptr copy_input_dst_dev_; - bool need_sync_ = false; - std::unique_ptr copy_workspace_buf_; }; @@ -162,10 +163,11 @@ class AicpuNodeTask : public AicpuNodeTaskBase { Status LaunchTask(TaskContext &context) override; - Status TaskCallback(TaskContext &context) override; - Status UpdateIoAddr(TaskContext &context) override; + + Status UpdateShapeAndDataByResultSummary(TaskContext &context) override; + Status SetMemCopyTask(const domi::TaskDef &task_def) override; protected: // host mem std::unique_ptr args_;