diff --git a/ge/single_op/single_op_model.cc b/ge/single_op/single_op_model.cc index 9a52a83d..426d3233 100755 --- a/ge/single_op/single_op_model.cc +++ b/ge/single_op/single_op_model.cc @@ -46,7 +46,12 @@ namespace { const size_t kDataOutputNum = 1; const uint32_t kInputIndexOfData = 0; const uint32_t kOutputIndexOfData = 0; +const size_t kNumTaskWithAtomicAddrCleanTask = 2; +const size_t kNumTaskWithMemCpyTask = 2; constexpr char const *kAttrSupportDynamicShape = "support_dynamicshape"; +const char *const kEngineNameAiCore = "AIcoreEngine"; +const char *const kEngineNameAiCpu = "aicpu_ascend_kernel"; +const char *const kEngineNameAiCpuTf = "aicpu_tf_kernel"; Status CheckHostMem(const std::vector &dependencies, const NodePtr &node, bool &is_host_mem) { auto op_desc = node->GetOpDesc(); @@ -395,7 +400,7 @@ void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) { } } } - + Status SingleOpModel::BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask **task) { GE_CHECK_NOTNULL(task); auto task_type = static_cast(task_def.type()); @@ -408,7 +413,7 @@ Status SingleOpModel::BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask * return ACL_ERROR_GE_INTERNAL_ERROR; } - auto *tbe_task = new (std::nothrow) TbeOpTask(); + std::unique_ptr tbe_task(new (std::nothrow) TbeOpTask()); if (tbe_task == nullptr) { GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][TbeOpTask]failed."); REPORT_INNER_ERROR("E19999", "BuildKernelTask fail for new TbeOpTask."); @@ -418,12 +423,41 @@ Status SingleOpModel::BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask * auto builder = TbeTaskBuilder(model_name_, iter->second, task_def); auto ret = builder.BuildTask(*tbe_task, model_params_); if (ret != SUCCESS) { - delete tbe_task; - tbe_task = nullptr; + GELOGE(ret, "[Build][TbeOpTask]failed."); + REPORT_INNER_ERROR("E19999", "[Build][TbeOpTask]failed."); + return ret; + } + + *task = tbe_task.release(); + return SUCCESS; +} + +Status SingleOpModel::BuildAtomicTask(const domi::TaskDef 
&task_def, AtomicAddrCleanOpTask **task) { + GE_CHECK_NOTNULL(task); + const auto &context = task_def.kernel().context(); + auto iter = op_list_.find(context.op_index()); + if (iter == op_list_.end()) { + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][Param:TaskDef]op desc not found. op index = %u", context.op_index()); + REPORT_INNER_ERROR("E19999", "BuildAtomicTask fail for op desc not found. op index = %u", context.op_index()); + return ACL_ERROR_GE_INTERNAL_ERROR; + } + + std::unique_ptr atomic_task(new (std::nothrow) AtomicAddrCleanOpTask()); + if (atomic_task == nullptr) { + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][AtomicAddrCleanOpTask]failed."); + REPORT_INNER_ERROR("E19999", "BuildAtomicTask fail for new AtomicAddrCleanOpTask."); + return ACL_ERROR_GE_MEMORY_ALLOCATION; + } + + auto builder = AtomicAddrCleanTaskBuilder(model_name_, iter->second, task_def); + auto ret = builder.BuildTask(*atomic_task, model_params_); + if (ret != SUCCESS) { + GELOGE(ret, "[Build][AtomicAddrCleanOpTask]failed."); + REPORT_INNER_ERROR("E19999", "[Build][AtomicAddrCleanOpTask]failed."); return ret; } - *task = tbe_task; + *task = atomic_task.release(); return SUCCESS; } @@ -536,9 +570,29 @@ Status SingleOpModel::BuildTaskListForDynamicOp(StreamResource *stream_resource, auto compute_graph = GraphUtils::GetComputeGraph(ge_model->GetGraph()); GE_CHECK_NOTNULL(compute_graph); single_op.compute_graph_ = compute_graph; - if (tbe_tasks_.size() > 0) { - const auto &task_def = tbe_tasks_[0]; + + if (node_tasks_.size() != 1) { + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]Node size must be 1, but get %zu.", node_tasks_.size()); + REPORT_INNER_ERROR("E19999", "[Check][Size]Node size must be 1, but get %zu.", node_tasks_.size()); + return ACL_ERROR_GE_PARAM_INVALID; + } + + auto iter = node_tasks_.begin(); + auto node = iter->first; + const auto &task_defs = iter->second; + if (task_defs.size() <= 0 || task_defs.size() > kNumTaskWithAtomicAddrCleanTask) { + 
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]task_defs size must be 1 or 2, but get %zu.", task_defs.size()); + REPORT_INNER_ERROR("E19999", "[Check][Size]task_defs size must be 1 or 2, but get %zu.", task_defs.size()); + return ACL_ERROR_GE_PARAM_INVALID; + } + + GE_CHECK_NOTNULL(node); + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + const auto &lib_name = op_desc->GetOpKernelLibName(); + if (lib_name == kEngineNameAiCore) { GELOGD("Building TBE task."); + const auto &task_def = task_defs.back(); TbeOpTask *tbe_task = nullptr; GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def, &tbe_task)); tbe_task->SetModelArgs(model_name_, model_id_); @@ -546,37 +600,43 @@ Status SingleOpModel::BuildTaskListForDynamicOp(StreamResource *stream_resource, GELOGD("tiling buffer is not nullptr."); tbe_task->stream_resource_ = stream_resource; } + if (task_defs.size() == kNumTaskWithAtomicAddrCleanTask) { + const auto &atomic_task_def = task_defs.front(); + AtomicAddrCleanOpTask *atomic_task = nullptr; + GE_CHK_STATUS_RET_NOLOG(BuildAtomicTask(atomic_task_def, &atomic_task)); + GE_CHK_STATUS_RET_NOLOG(atomic_task->InitAtomicAddrCleanIndices()); + tbe_task->SetAtomicAddrCleanTask(atomic_task); + } single_op.op_task_.reset(tbe_task); - } else if (aicpu_tasks_.size() > 0) { - const auto &task_def = aicpu_tasks_[0]; - auto task_type = static_cast(task_def.type()); - if (task_type == RT_MODEL_TASK_KERNEL) { - GELOGD("Building AICPU_CC task"); - OpTask *task = nullptr; - uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++; - GELOGI("Build dynamic singleOp CCTask, kernel_id = %lu", dynamic_singleop_kernel_id); - GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task, dynamic_singleop_kernel_id)); - task->SetModelArgs(model_name_, model_id_); - single_op.op_task_.reset(task); - } else if (task_type == RT_MODEL_TASK_KERNEL_EX) { - GELOGD("Building AICPU_TF task"); - AiCpuTask *aicpu_task = nullptr; - uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++; - 
GELOGI("Build dynamic singleOp TfTask, kernel_id = %lu", dynamic_singleop_kernel_id); - GE_CHK_STATUS_RET_NOLOG(BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, dynamic_singleop_kernel_id)); - if (aicpu_task->GetUnknownType() == DEPEND_COMPUTE) { - if (aicpu_tasks_.size() < 2) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Task]The copy task of the fourth operator was not found."); - REPORT_INNER_ERROR("E19999", "The copy task of the fourth operator was not found."); - return ACL_ERROR_GE_PARAM_INVALID; - } - const TaskDef ©_task_def = aicpu_tasks_[1]; - GE_CHK_STATUS_RET_NOLOG(aicpu_task->SetMemCopyTask(copy_task_def.kernel_ex())); + } else if (lib_name == kEngineNameAiCpu) { + const auto &task_def = task_defs[0]; + GELOGD("Building AICPU_CC task"); + OpTask *task = nullptr; + uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++; + GELOGI("Build dynamic singleOp CCTask, kernel_id = %lu", dynamic_singleop_kernel_id); + GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task, dynamic_singleop_kernel_id)); + task->SetModelArgs(model_name_, model_id_); + single_op.op_task_.reset(task); + } else if (lib_name == kEngineNameAiCpuTf) { + const auto &task_def = task_defs[0]; + GELOGD("Building AICPU_TF task"); + AiCpuTask *aicpu_task = nullptr; + uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++; + GELOGI("Build dynamic singleOp TfTask, kernel_id = %lu", dynamic_singleop_kernel_id); + GE_CHK_STATUS_RET_NOLOG(BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, dynamic_singleop_kernel_id)); + if (aicpu_task->GetUnknownType() == DEPEND_COMPUTE) { + if (task_defs.size() < kNumTaskWithMemCpyTask) { + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Task]The copy task of the fourth operator was not found."); + REPORT_INNER_ERROR("E19999", "The copy task of the fourth operator was not found."); + return ACL_ERROR_GE_PARAM_INVALID; } - aicpu_task->SetModelArgs(model_name_, model_id_); - single_op.op_task_.reset(aicpu_task); + const TaskDef ©_task_def = 
task_defs[1]; + GE_CHK_STATUS_RET_NOLOG(aicpu_task->SetMemCopyTask(copy_task_def.kernel_ex())); } + aicpu_task->SetModelArgs(model_name_, model_id_); + single_op.op_task_.reset(aicpu_task); } + return SUCCESS; } @@ -585,9 +645,7 @@ Status SingleOpModel::NeedHybridModel(GeModelPtr &ge_model, bool &need_hybrid_mo bool is_host_mem = false; GE_CHK_STATUS_RET(CheckInferDepend(ge_model, is_infer_depend, is_host_mem), "[Check][InferDepend] failed."); bool need_d2h_cpy = is_infer_depend && !is_host_mem; - bool aicpu_multi_task = tbe_tasks_.size() >= 1 && aicpu_tasks_.size() >= 1; - bool aicore_multi_task = tbe_tasks_.size() > 1; - need_hybrid_model = need_d2h_cpy || aicore_multi_task || aicpu_multi_task; + need_hybrid_model = need_d2h_cpy || node_tasks_.size() > 1; return SUCCESS; } @@ -601,31 +659,27 @@ Status SingleOpModel::ParseTasks() { GELOGI("[%s] Task[%d], type = [%u], DebugString = [%s]", model_name_.c_str(), i, task_def.type(), task_def.DebugString().c_str()); auto task_type = static_cast(task_def.type()); + uint32_t op_index = 0; if (task_type == RT_MODEL_TASK_KERNEL) { - const auto &kernel_def = task_def.kernel(); - const auto &context = kernel_def.context(); - auto kernel_type = static_cast(context.kernel_type()); - if (kernel_type == ccKernelType::TE) { - tbe_tasks_.emplace_back(task_def); - } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) { - aicpu_tasks_.emplace_back(task_def); - } else { - GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, - "[Check][Param:TaskDef]Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", - context.kernel_type()); - REPORT_INNER_ERROR("E19999", - "BuildModelTaskKernel fail for got:%u not supported, Only TBE, AI_CPU, CUST_AI_CPU kernel are supported.", - context.kernel_type()); - return ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID; - } - } else if (task_type == RT_MODEL_TASK_ALL_KERNEL) { - tbe_tasks_.emplace_back(task_def); + op_index = task_def.kernel().context().op_index(); } else if 
(task_type == RT_MODEL_TASK_KERNEL_EX) { - aicpu_tasks_.emplace_back(task_def); + op_index = task_def.kernel_ex().op_index(); + } else if (task_type == RT_MODEL_TASK_ALL_KERNEL) { + op_index = task_def.kernel_with_handle().context().op_index(); } else { - // skip GELOGD("Skip task type: %d", static_cast(task_type)); + continue; + } + GELOGD("op_index = %u, task_type = %d", op_index, task_type); + + auto iter = op_list_.find(op_index); + if (iter == op_list_.end()) { + GELOGE(INTERNAL_ERROR, "[Find][Node]Failed to get node by op_index = %u", op_index); + REPORT_INNER_ERROR("E19999", "Failed to get node by op_index = %u.", op_index); + return INTERNAL_ERROR; } + auto &node = iter->second; + node_tasks_[node].emplace_back(task_def); } return SUCCESS; } diff --git a/ge/single_op/single_op_model.h b/ge/single_op/single_op_model.h index 45616d9a..b1cd161c 100755 --- a/ge/single_op/single_op_model.h +++ b/ge/single_op/single_op_model.h @@ -69,6 +69,7 @@ class SingleOpModel { Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op); Status BuildTaskListForDynamicOp(StreamResource *stream_resource, DynamicSingleOp &dynamic_single_op); Status BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask **task); + Status BuildAtomicTask(const domi::TaskDef &task_def, AtomicAddrCleanOpTask **task); Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, uint64_t kernel_id); Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task, uint64_t kernel_id); @@ -79,9 +80,7 @@ class SingleOpModel { Status NeedHybridModel(GeModelPtr &ge_model, bool &flag); Status ParseTasks(); - std::vector tbe_tasks_; - std::vector aicpu_tasks_; - + std::map> node_tasks_; std::string model_name_; uint32_t model_id_ = 0; const void *ori_model_data_; diff --git a/ge/single_op/task/op_task.cc b/ge/single_op/task/op_task.cc index 9b8ef739..c6c99ab0 100755 --- a/ge/single_op/task/op_task.cc +++ b/ge/single_op/task/op_task.cc @@ -27,7 +27,6 @@ #include 
"common/formats/formats.h" #include "common/math/math_util.h" #include "framework/common/debug/log.h" -#include "register/op_tiling.h" #include "runtime/rt.h" #include "single_op/task/build_task_utils.h" @@ -222,19 +221,26 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) { return SUCCESS; } -Status TbeOpTask::UpdateRunInfo() { - // invoke OpParaCalculate - GELOGD("Start to invoke OpParaCalculate."); - optiling::utils::OpRunInfo run_info(0, true, 0); +Status TbeOpTask::CalcTilingInfo(optiling::utils::OpRunInfo &run_info) { auto ret = optiling::OpParaCalculateV2(*node_, run_info); if (ret != GRAPH_SUCCESS) { GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Invoke][OpParaCalculate] failed, ret = %u.", ret); REPORT_INNER_ERROR("E19999", "invoke OpParaCalculate failed, ret = %u.", ret); return ACL_ERROR_GE_INTERNAL_ERROR; } + return SUCCESS; +} + +Status TbeOpTask::UpdateRunInfo() { + // invoke OpParaCalculate + GELOGD("Start to invoke OpParaCalculate."); + optiling::utils::OpRunInfo run_info(0, true, 0); + GE_CHK_STATUS_RET(CalcTilingInfo(run_info), "[Calc][TilingInfo]failed."); + block_dim_ = run_info.GetBlockDim(); tiling_data_ = run_info.GetAllTilingData().str(); tiling_key_ = run_info.GetTilingKey(); + clear_atomic_ = run_info.GetClearAtomic(); run_info.GetAllWorkspaces(run_info_workspaces_); GELOGD("Done invoking OpParaCalculate successfully. 
block_dim = %u, tiling size = %zu, tiling_key = %u", block_dim_, tiling_data_.size(), tiling_key_); @@ -262,7 +268,6 @@ Status TbeOpTask::UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc dst_tensor.SetShape(GeShape(std::move(storage_shape))); dst_tensor.SetOriginShape(src_tensor.GetShape()); } - return SUCCESS; } @@ -346,6 +351,17 @@ Status TbeOpTask::AllocateWorkspaces(const vector &workspace_sizes) { return SUCCESS; } +Status TbeOpTask::CheckAndExecuteAtomic(const vector &input_desc, + const vector &input_buffers, + vector &output_desc, + vector &output_buffers, + rtStream_t stream) { + if (clear_atomic_ && atomic_task_ != nullptr) { + return atomic_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream); + } + return SUCCESS; +} + Status TbeOpTask::UpdateTilingArgs(rtStream_t stream) { size_t args_size = input_num_ + output_num_ + workspaces_.size(); if (tiling_buffer_ != nullptr) { @@ -433,6 +449,8 @@ Status TbeOpTask::LaunchKernel(const vector &input_desc, GE_CHK_STATUS_RET_NOLOG(UpdateNodeByShape(input_desc, output_desc)); GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo()); GE_CHK_STATUS_RET(AllocateWorkspaces(run_info_workspaces_), "[Allocate][Workspaces] failed."); + GE_CHK_STATUS_RET(CheckAndExecuteAtomic(input_desc, input_buffers, output_desc, output_buffers, stream), + "[Execute][AtomicTask] failed."); GE_CHK_STATUS_RET(UpdateTilingArgs(stream), "[Update][TilingArgs] failed."); GELOGD("[%s] Start to invoke rtKernelLaunch", node_->GetName().c_str()); @@ -463,6 +481,85 @@ void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { } } +Status AtomicAddrCleanOpTask::UpdateNodeByShape(const vector &input_desc, + const vector &output_desc) { + return SUCCESS; +} + +Status AtomicAddrCleanOpTask::UpdateIoAddr(const vector &inputs, const vector &outputs) { + uintptr_t *arg_base = reinterpret_cast(args_.get()); + for (auto atomic_output_index : atomic_output_indices_) { + if (atomic_output_index >= 
static_cast(outputs.size())) { + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Update][Args] failed, atomic index must be smaller than data size."); + REPORT_INNER_ERROR("E19999", "[Update][Args] failed, atomic index must be smaller than data size."); + return ACL_ERROR_GE_PARAM_INVALID; + } + auto &output_buffer = outputs[atomic_output_index]; + *arg_base++ = reinterpret_cast(output_buffer.data); + + auto tensor_desc = op_desc_->MutableOutputDesc(atomic_output_index); + int64_t size = 0; + graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, size); + if (graph_status != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get tensor size in bytes failed!"); + GELOGE(graph_status, "[Get][TensorMemorySize] In Bytes failed!"); + return FAILED; + } + TensorUtils::SetSize(*tensor_desc, size); + } + return SUCCESS; +} + +Status AtomicAddrCleanOpTask::UpdateTilingArgs(rtStream_t stream) { + if (tiling_buffer_ != nullptr) { + GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size()); + GE_CHK_RT_RET(rtMemcpyAsync(tiling_buffer_, max_tiling_size_, tiling_data_.data(), tiling_data_.size(), + RT_MEMCPY_HOST_TO_DEVICE_EX, stream)); + uintptr_t *arg_base = reinterpret_cast(args_.get()); + size_t idx = atomic_output_indices_.size(); + arg_base[idx] = reinterpret_cast(tiling_buffer_); + } + return SUCCESS; +} + +Status AtomicAddrCleanOpTask::CalcTilingInfo(optiling::utils::OpRunInfo &run_info) { + auto ret = optiling::OpAtomicCalculateV2(*node_, run_info); + if (ret != GRAPH_SUCCESS) { + GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Invoke][OpAtomicCalculate] failed, ret = %u.", ret); + REPORT_INNER_ERROR("E19999", "invoke OpAtomicCalculate failed, ret = %u.", ret); + return ACL_ERROR_GE_INTERNAL_ERROR; + } + return SUCCESS; +} + +Status AtomicAddrCleanOpTask::InitAtomicAddrCleanIndices() { + GELOGD("[%s] Start to setup AtomicAddrClean task.", op_desc_->GetName().c_str()); + std::vector atomic_output_indices; + (void) 
ge::AttrUtils::GetListInt(op_desc_, ATOMIC_ATTR_OUTPUT_INDEX, atomic_output_indices); + if (atomic_output_indices.empty()) { + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] atomic_output_indices must not be empty.", op_desc_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] atomic_output_indices must not be empty.", op_desc_->GetName().c_str()); + return INTERNAL_ERROR; + } + + size_t max_arg_size = tiling_buffer_ == nullptr ? arg_size_ : arg_size_ - 1; + if (atomic_output_indices.size() > max_arg_size) { + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] atomic_output_indices invalid. atomic_output_indices size is %zu," + "arg size is %zu.", op_desc_->GetName().c_str(), atomic_output_indices.size(), arg_size_); + REPORT_INNER_ERROR("E19999", "[%s] atomic_output_indices invalid. atomic_output_indices size is %zu," + "arg size is %zu.", op_desc_->GetName().c_str(), atomic_output_indices.size(), arg_size_); + return INTERNAL_ERROR; + } + + for (auto output_index : atomic_output_indices) { + GELOGD("[%s] Adding output index [%ld]", op_desc_->GetName().c_str(), output_index); + GE_CHECK_GE(output_index, 0); + GE_CHECK_LE(output_index, INT32_MAX); + atomic_output_indices_.emplace_back(static_cast(output_index)); + } + return SUCCESS; +} + AiCpuBaseTask::~AiCpuBaseTask() { if (ext_info_addr_dev_ != nullptr) { (void)rtFree(ext_info_addr_dev_); diff --git a/ge/single_op/task/op_task.h b/ge/single_op/task/op_task.h index a73bcfda..132672b0 100644 --- a/ge/single_op/task/op_task.h +++ b/ge/single_op/task/op_task.h @@ -89,6 +89,7 @@ class TbeOpTask : public OpTask { void SetKernelArgs(std::unique_ptr &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc); void SetKernelWithHandleArgs(std::unique_ptr &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc, const domi::KernelDefWithHandle& kernel_def_with_handle); + void SetAtomicAddrCleanTask(OpTask *task) { atomic_task_.reset(task); } Status UpdateRunInfo() override; Status SetArgIndex(); @@ -100,38 
+101,63 @@ class TbeOpTask : public OpTask { const std::string &GetTaskType() const override; void SetHandle(void *handle); + protected: + NodePtr node_; + std::unique_ptr args_; + size_t arg_size_ = 0; + void *tiling_buffer_ = nullptr; + uint32_t max_tiling_size_ = 0; + std::string tiling_data_; + size_t input_num_; // include const input + size_t output_num_; + private: friend class SingleOpModel; friend class TbeTaskBuilder; static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor); - Status UpdateNodeByShape(const vector &input_desc, - const vector &output_desc); Status AllocateWorkspaces(const std::vector &workspace_sizes); - Status UpdateTilingArgs(rtStream_t stream); Status DoLaunchKernel(rtStream_t stream); - Status UpdateIoAddr(const vector &inputs, const vector &outputs); + Status CheckAndExecuteAtomic(const vector &input_desc, + const vector &input_buffers, + vector &output_desc, + vector &output_buffers, + rtStream_t stream); + virtual Status UpdateNodeByShape(const vector &input_desc, + const vector &output_desc); + virtual Status UpdateTilingArgs(rtStream_t stream); + virtual Status UpdateIoAddr(const vector &inputs, const vector &outputs); + virtual Status CalcTilingInfo(optiling::utils::OpRunInfo &run_info); const void *stub_func_ = nullptr; - std::unique_ptr args_; - size_t arg_size_ = 0; void *sm_desc_ = nullptr; std::string stub_name_; - StreamResource *stream_resource_ = nullptr; - void *tiling_buffer_ = nullptr; - uint32_t max_tiling_size_ = 0; - std::string tiling_data_; + std::vector run_info_workspaces_; std::vector workspaces_; - NodePtr node_; uint32_t tiling_key_ = 0; + bool clear_atomic_ = false; void* handle_ = nullptr; std::string original_kernel_key_; std::string node_info_; std::vector arg_index_; // data index in args - size_t input_num_; // include const input - size_t output_num_; + + std::unique_ptr atomic_task_; +}; + +class AtomicAddrCleanOpTask : public TbeOpTask { + public: + Status 
InitAtomicAddrCleanIndices(); + + private: + Status UpdateNodeByShape(const vector &input_desc, + const vector &output_desc) override; + Status UpdateIoAddr(const vector &inputs, const vector &outputs) override; + Status UpdateTilingArgs(rtStream_t stream) override; + Status CalcTilingInfo(optiling::utils::OpRunInfo &run_info) override; + std::vector atomic_output_indices_; + }; class AiCpuBaseTask : public OpTask { diff --git a/ge/single_op/task/tbe_task_builder.cc b/ge/single_op/task/tbe_task_builder.cc index c1bafed8..017dac25 100644 --- a/ge/single_op/task/tbe_task_builder.cc +++ b/ge/single_op/task/tbe_task_builder.cc @@ -29,15 +29,8 @@ namespace ge { namespace { constexpr char const *kAttrSupportDynamicShape = "support_dynamicshape"; constexpr char const *kAttrOpParamSize = "op_para_size"; +constexpr char const *kAttrAtomicOpParamSize = "atomic_op_para_size"; std::mutex g_reg_mutex; - -inline void GetKernelName(const OpDescPtr &op_desc, std::string &kernel_name) { - (void)AttrUtils::GetStr(op_desc, op_desc->GetName() + "_kernelname", kernel_name); -} - -inline TBEKernelPtr GetTbeKernel(const OpDescPtr &op_desc) { - return op_desc->TryGetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); -} } // namespace KernelHolder::KernelHolder(const char *stub_func, std::shared_ptr kernel_bin) @@ -96,7 +89,15 @@ TbeTaskBuilder::TbeTaskBuilder(const std::string &model_name, const NodePtr &nod task_def_(task_def), kernel_def_(task_def.kernel()), kernel_def_with_handle_(task_def.kernel_with_handle()), - stub_name_(model_name + "/" + node->GetName() + "_tvmbin") {} + model_name_(model_name) {} + +TBEKernelPtr TbeTaskBuilder::GetTbeKernel(const OpDescPtr &op_desc) const { + return op_desc->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); +} + +void TbeTaskBuilder::GetKernelName(const OpDescPtr &op_desc, std::string &kernel_name) const { + (void)AttrUtils::GetStr(op_desc, op_desc->GetName() + "_kernelname", kernel_name); +} Status 
TbeTaskBuilder::DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle, const SingleOpModelParam ¶m) const { @@ -124,7 +125,7 @@ Status TbeTaskBuilder::DoRegisterBinary(const OpKernelBin &kernel_bin, void **bi Status TbeTaskBuilder::DoRegisterMeta(void *bin_handle) { std::string meta_data; - (void)AttrUtils::GetStr(op_desc_, TVM_ATTR_NAME_METADATA, meta_data); + (void)AttrUtils::GetStr(op_desc_, GetKeyForTvmMetaData(), meta_data); GELOGI("TBE: meta data: %s", meta_data.empty() ? "null" : meta_data.c_str()); if (!meta_data.empty()) { auto rt_ret = rtMetadataRegister(bin_handle, meta_data.c_str()); @@ -307,6 +308,15 @@ Status TbeTaskBuilder::GetSmDesc(void **sm_desc, const SingleOpModelParam ¶m return SUCCESS; } +Status TbeTaskBuilder::InitKernelArgs(void *arg_addr, size_t arg_size, const SingleOpModelParam ¶m) { + // copy args + std::vector tensor_device_addr_vec = BuildTaskUtils::GetKernelArgs(op_desc_, param); + void *src_addr = reinterpret_cast(tensor_device_addr_vec.data()); + uint64_t src_len = sizeof(void *) * tensor_device_addr_vec.size(); + GE_CHK_RT_RET(rtMemcpy(arg_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_HOST)); + return SUCCESS; +} + Status TbeTaskBuilder::SetKernelArgs(TbeOpTask &task, const SingleOpModelParam ¶m, const OpDescPtr &op_desc) { auto task_type = static_cast(task_def_.type()); bool is_task_all_kernel = (task_type == RT_MODEL_TASK_ALL_KERNEL); @@ -331,12 +341,7 @@ Status TbeTaskBuilder::SetKernelArgs(TbeOpTask &task, const SingleOpModelParam & kernel_def_with_handle_.context() : kernel_def_.context(); const auto *args_offset_tmp = reinterpret_cast(context.args_offset().data()); uint16_t offset = *args_offset_tmp; - - // copy args - std::vector tensor_device_addr_vec = BuildTaskUtils::GetKernelArgs(op_desc_, param); - void *src_addr = reinterpret_cast(tensor_device_addr_vec.data()); - uint64_t src_len = sizeof(void *) * tensor_device_addr_vec.size(); - GE_CHK_RT_RET(rtMemcpy(args.get() + offset, arg_size - offset, 
src_addr, src_len, RT_MEMCPY_HOST_TO_HOST)); + GE_CHK_STATUS_RET_NOLOG(InitKernelArgs(args.get() + offset, arg_size - offset, param)); if (is_task_all_kernel) { task.SetKernelWithHandleArgs(std::move(args), arg_size, kernel_def_with_handle_.block_dim(), op_desc, @@ -367,8 +372,15 @@ Status TbeTaskBuilder::BuildTask(TbeOpTask &task, const SingleOpModelParam ¶ } auto task_type = static_cast(task_def_.type()); - ret = task_type == RT_MODEL_TASK_ALL_KERNEL ? RegisterKernelWithHandle(task, param) : - RegisterKernel(task, param); + if (task_type == RT_MODEL_TASK_ALL_KERNEL) { + stub_name_ = model_name_ + "/" + node_->GetName() + "_tvmbin"; + ret = RegisterKernelWithHandle(task, param); + } else { + const domi::KernelDef &kernel_def = task_def_.kernel(); + stub_name_ = model_name_ + "/" + kernel_def.stub_func() + "_tvmbin"; + ret = RegisterKernel(task, param); + } + task.SetHandle(handle_); if (ret != SUCCESS) { return ret; @@ -397,8 +409,8 @@ Status TbeTaskBuilder::BuildTask(TbeOpTask &task, const SingleOpModelParam ¶ Status TbeTaskBuilder::InitTilingInfo(TbeOpTask &task) { GELOGD("Start alloc tiling data of node %s.", op_desc_->GetName().c_str()); int64_t max_size = -1; - (void)AttrUtils::GetInt(op_desc_, kAttrOpParamSize, max_size); - GELOGD("Got op param size by key: %s, ret = %ld", kAttrOpParamSize, max_size); + (void)AttrUtils::GetInt(op_desc_, GetKeyForOpParamSize(), max_size); + GELOGD("Got op param size by key: %s, ret = %ld", GetKeyForOpParamSize().c_str(), max_size); if (max_size < 0) { GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Get][Int] %s Invalid op_param_size: %ld.", op_desc_->GetName().c_str(), max_size); @@ -439,4 +451,32 @@ Status TbeTaskBuilder::GetMagic(uint32_t &magic) const { return SUCCESS; } +std::string TbeTaskBuilder::GetKeyForOpParamSize() const { + return kAttrOpParamSize; +} + +std::string TbeTaskBuilder::GetKeyForTvmMetaData() const { + return TVM_ATTR_NAME_METADATA; +} + +Status AtomicAddrCleanTaskBuilder::InitKernelArgs(void *args_addr, size_t 
arg_size, const SingleOpModelParam ¶m) { + return SUCCESS; +} + +std::string AtomicAddrCleanTaskBuilder::GetKeyForOpParamSize() const { + return kAttrAtomicOpParamSize; +} + +std::string AtomicAddrCleanTaskBuilder::GetKeyForTvmMetaData() const { + return ATOMIC_ATTR_TVM_METADATA; +} + +void AtomicAddrCleanTaskBuilder::GetKernelName(const OpDescPtr &op_desc, std::string &kernel_name) const { + (void)AttrUtils::GetStr(op_desc, op_desc->GetName() + "_atomic_kernelname", kernel_name); +} + +TBEKernelPtr AtomicAddrCleanTaskBuilder::GetTbeKernel(const OpDescPtr &op_desc) const { + return op_desc->TryGetExtAttr(EXT_ATTR_ATOMIC_TBE_KERNEL, TBEKernelPtr()); +} + } // namespace ge diff --git a/ge/single_op/task/tbe_task_builder.h b/ge/single_op/task/tbe_task_builder.h index 6252feea..06d17901 100755 --- a/ge/single_op/task/tbe_task_builder.h +++ b/ge/single_op/task/tbe_task_builder.h @@ -90,10 +90,17 @@ class HandleRegistry { class TbeTaskBuilder { public: TbeTaskBuilder(const std::string &model_name, const NodePtr &node, const domi::TaskDef &task_def); - ~TbeTaskBuilder() = default; + virtual ~TbeTaskBuilder() = default; Status BuildTask(TbeOpTask &task, const SingleOpModelParam ¶m); + protected: + virtual std::string GetKeyForOpParamSize() const; + virtual std::string GetKeyForTvmMetaData() const; + virtual TBEKernelPtr GetTbeKernel(const OpDescPtr &op_desc) const; + virtual void GetKernelName(const OpDescPtr &op_desc, std::string &kernel_name) const; + virtual Status InitKernelArgs(void *args_addr, size_t arg_size, const SingleOpModelParam ¶m); + private: Status InitTilingInfo(TbeOpTask &task); Status SetKernelArgs(TbeOpTask &task, const SingleOpModelParam ¶m, const OpDescPtr &op_desc); @@ -114,9 +121,24 @@ class TbeTaskBuilder { const domi::TaskDef &task_def_; const domi::KernelDef &kernel_def_; const domi::KernelDefWithHandle &kernel_def_with_handle_; - const std::string stub_name_; + const std::string model_name_; + std::string stub_name_; void *handle_ = nullptr; }; + 
+class AtomicAddrCleanTaskBuilder : public TbeTaskBuilder { + public: + AtomicAddrCleanTaskBuilder(const std::string &model_name, const NodePtr &node, const domi::TaskDef &task_def) + : TbeTaskBuilder(model_name, node, task_def) {} + ~AtomicAddrCleanTaskBuilder() override = default; + + protected: + std::string GetKeyForOpParamSize() const override; + std::string GetKeyForTvmMetaData() const override; + TBEKernelPtr GetTbeKernel(const OpDescPtr &op_desc) const override; + void GetKernelName(const OpDescPtr &op_desc, std::string &kernel_name) const override; + Status InitKernelArgs(void *args_addr, size_t arg_size, const SingleOpModelParam ¶m) override; +}; } // namespace ge #endif // GE_SINGLE_OP_TASK_TBE_TASK_BUILDER_H_ diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index d1c51c67..1d1c4fa9 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -153,7 +153,6 @@ TEST_F(UtestGeHybrid, task_update_tiling_info) { ge::AttrUtils::SetStr(op_desc, "compile_info_json", "json"); ge::AttrUtils::SetBool(op_desc, "support_dynamicshape", true); ge::AttrUtils::SetInt(op_desc, "op_para_size", 1); - ge::AttrUtils::SetStr(op_desc, TVM_ATTR_NAME_MAGIC, "RT_DEV_BINARY_MAGIC_ELF"); auto node = graph->AddNode(op_desc); std::unique_ptr node_item; diff --git a/tests/ut/ge/single_op/single_op_model_unittest.cc b/tests/ut/ge/single_op/single_op_model_unittest.cc index 23269814..7b7a05d8 100644 --- a/tests/ut/ge/single_op/single_op_model_unittest.cc +++ b/tests/ut/ge/single_op/single_op_model_unittest.cc @@ -40,6 +40,9 @@ using namespace ge; namespace { constexpr char const *kAttrSupportDynamicShape = "support_dynamicshape"; +const char *const kEngineNameAiCore = "AIcoreEngine"; +const char *const kEngineNameAiCpu = "aicpu_ascend_kernel"; +const char *const kEngineNameAiCpuTf = "aicpu_tf_kernel"; } // namespace class UtestSingleOpModel : public testing::Test { @@ -222,6 +225,7 @@ 
TEST_F(UtestSingleOpModel, test_build_dynamic_op) { auto graph = GraphUtils::CreateGraphFromComputeGraph(compute_graph); model.model_helper_.model_->SetGraph(graph); + model.op_list_[0] = transdata; auto op_desc = transdata->GetOpDesc(); const vector depend_names = { "Data" }; @@ -330,7 +334,10 @@ TEST_F(UtestSingleOpModel, build_dynamic_task) { domi::TaskDef *task_def3 = model_task_def->add_task(); task_def3->set_type(RT_MODEL_TASK_ALL_KERNEL); - string model_data_str = "123456789"; + domi::TaskDef *task_def4 = model_task_def->add_task(); + task_def4->set_type(RT_MODEL_TASK_KERNEL); + + string model_data_str = "dynamic_model"; SingleOpModel model("model", model_data_str.c_str(), model_data_str.size()); std::mutex stream_mu; rtStream_t stream = nullptr; @@ -347,9 +354,15 @@ TEST_F(UtestSingleOpModel, build_dynamic_task) { StreamResource *res = new (std::nothrow) StreamResource(1); ASSERT_EQ(model.ParseTasks(), SUCCESS); + model.node_tasks_[node] = { *task_def3, *task_def4 }; + op_desc->SetOpKernelLibName(kEngineNameAiCore); + model.BuildTaskListForDynamicOp(res, single_op); + + model.node_tasks_[node] = { *task_def }; + op_desc->SetOpKernelLibName(kEngineNameAiCpuTf); ASSERT_EQ(model.BuildTaskListForDynamicOp(res, single_op), SUCCESS); - model.tbe_tasks_.clear(); - ASSERT_EQ(model.BuildTaskListForDynamicOp(res, single_op), SUCCESS); - model.aicpu_tasks_[0] = *task_def2; + + model.node_tasks_[node] = { *task_def2 }; + op_desc->SetOpKernelLibName(kEngineNameAiCpu); model.BuildTaskListForDynamicOp(res, single_op); } diff --git a/tests/ut/ge/single_op/single_op_task_unittest.cc b/tests/ut/ge/single_op/single_op_task_unittest.cc index 2424d209..3e3160c2 100644 --- a/tests/ut/ge/single_op/single_op_task_unittest.cc +++ b/tests/ut/ge/single_op/single_op_task_unittest.cc @@ -154,3 +154,38 @@ TEST_F(UtestSingleOpTask, test_update_ioaddr) { task.tiling_buffer_ = nullptr; } +TEST_F(UtestSingleOpTask, test_atomic_exec) { + auto graph = make_shared("graph"); + auto op_desc = 
make_shared("Add", "Add"); + GeTensorDesc desc; + op_desc->AddInputDesc(desc); + op_desc->AddOutputDesc(desc); + auto node = graph->AddNode(op_desc); + AtomicAddrCleanOpTask task; + task.op_desc_ = op_desc; + task.node_ = node; + + vector inputs; + vector outputs; + std::vector atomic_output_indices; + ge::AttrUtils::SetListInt(op_desc, ATOMIC_ATTR_OUTPUT_INDEX, atomic_output_indices); + ASSERT_EQ(task.InitAtomicAddrCleanIndices(), INTERNAL_ERROR); + atomic_output_indices = { 0 }; + ge::AttrUtils::SetListInt(op_desc, ATOMIC_ATTR_OUTPUT_INDEX, atomic_output_indices); + ASSERT_EQ(task.InitAtomicAddrCleanIndices(), INTERNAL_ERROR); + task.arg_size_ = sizeof(void *) * 2; + task.args_.reset(new (std::nothrow) uint8_t[task.arg_size_]); + ASSERT_EQ(task.InitAtomicAddrCleanIndices(), SUCCESS); + ASSERT_EQ(task.UpdateIoAddr(inputs, outputs), ACL_ERROR_GE_PARAM_INVALID); + + ge::DataBuffer data_buffer; + outputs = { data_buffer }; + ASSERT_EQ(task.UpdateIoAddr(inputs, outputs), SUCCESS); + + task.tiling_buffer_ = (void *)0x0001; + ASSERT_EQ(task.UpdateTilingArgs(nullptr), SUCCESS); + task.tiling_buffer_ = nullptr; + + optiling::utils::OpRunInfo run_info(0, true, 0); + task.CalcTilingInfo(run_info); +}