@@ -432,7 +432,7 @@ Status SingleOpModel::BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask *
   return SUCCESS;
 }
 
-Status SingleOpModel::BuildAtomicTask(const domi::TaskDef &task_def, AtomicOpTask **task) {
+Status SingleOpModel::BuildAtomicTask(const domi::TaskDef &task_def, AtomicAddrCleanOpTask **task) {
   GE_CHECK_NOTNULL(task);
   const auto &context = task_def.kernel().context();
   auto iter = op_list_.find(context.op_index());
@@ -442,18 +442,18 @@ Status SingleOpModel::BuildAtomicTask(const domi::TaskDef &task_def, AtomicOpTas
     return ACL_ERROR_GE_INTERNAL_ERROR;
   }
 
-  std::unique_ptr<AtomicOpTask> atomic_task(new (std::nothrow) AtomicOpTask());
+  std::unique_ptr<AtomicAddrCleanOpTask> atomic_task(new (std::nothrow) AtomicAddrCleanOpTask());
   if (atomic_task == nullptr) {
-    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][AtomicOpTask]failed.");
-    REPORT_INNER_ERROR("E19999", "BuildKernelTask fail for new AtomicOpTask.");
+    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Create][AtomicAddrCleanOpTask]failed.");
+    REPORT_INNER_ERROR("E19999", "BuildKernelTask fail for new AtomicAddrCleanOpTask.");
     return ACL_ERROR_GE_MEMORY_ALLOCATION;
   }
 
-  auto builder = AtomicTaskBuilder(model_name_, iter->second, task_def);
+  auto builder = AtomicAddrCleanTaskBuilder(model_name_, iter->second, task_def);
   auto ret = builder.BuildTask(*atomic_task, model_params_);
   if (ret != SUCCESS) {
-    GELOGE(ret, "[Build][AtomicOpTask]failed.");
-    REPORT_INNER_ERROR("E19999", "[Build][AtomicOpTask]failed.");
+    GELOGE(ret, "[Build][AtomicAddrCleanOpTask]failed.");
+    REPORT_INNER_ERROR("E19999", "[Build][AtomicAddrCleanOpTask]failed.");
     return ret;
   }
@@ -571,13 +571,21 @@ Status SingleOpModel::BuildTaskListForDynamicOp(StreamResource *stream_resource,
   GE_CHECK_NOTNULL(compute_graph);
   single_op.compute_graph_ = compute_graph;
 
-  GE_CHK_BOOL_RET_STATUS(node_tasks_.size() == 1, ACL_ERROR_GE_PARAM_INVALID,
-                         "[Check][Size]Node size must be 1, but get %zu.", node_tasks_.size());
+  if (node_tasks_.size() != 1) {
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]Node size must be 1, but get %zu.", node_tasks_.size());
+    REPORT_INNER_ERROR("E19999", "[Check][Size]Node size must be 1, but get %zu.", node_tasks_.size());
+    return ACL_ERROR_GE_PARAM_INVALID;
+  }
   auto iter = node_tasks_.begin();
   auto node = iter->first;
-  auto task_defs = iter->second;
-  GE_CHK_BOOL_RET_STATUS(task_defs.size() > 0 && task_defs.size() <= kNumTaskWithAtomicAddrCleanTask,
-      ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]task_defs size must be 1 or 2, but get %zu.", task_defs.size());
+  const auto &task_defs = iter->second;
+  if (task_defs.size() <= 0 || task_defs.size() > kNumTaskWithAtomicAddrCleanTask) {
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Size]task_defs size must be 1 or 2, but get %zu.", task_defs.size());
+    REPORT_INNER_ERROR("E19999", "[Check][Size]task_defs size must be 1 or 2, but get %zu.", task_defs.size());
+    return ACL_ERROR_GE_PARAM_INVALID;
+  }
   GE_CHECK_NOTNULL(node);
   auto op_desc = node->GetOpDesc();
   GE_CHECK_NOTNULL(op_desc);
@@ -594,10 +602,10 @@ Status SingleOpModel::BuildTaskListForDynamicOp(StreamResource *stream_resource,
     }
     if (task_defs.size() == kNumTaskWithAtomicAddrCleanTask) {
       const auto &atomic_task_def = task_defs.front();
-      AtomicOpTask *atomic_task = nullptr;
+      AtomicAddrCleanOpTask *atomic_task = nullptr;
       GE_CHK_STATUS_RET_NOLOG(BuildAtomicTask(atomic_task_def, &atomic_task));
       GE_CHK_STATUS_RET_NOLOG(atomic_task->InitAtomicAddrCleanIndices());
-      tbe_task->SetAtomicTask(atomic_task);
+      tbe_task->SetAtomicAddrCleanTask(atomic_task);
     }
     single_op.op_task_.reset(tbe_task);
   } else if (lib_name == kEngineNameAiCpu) {
@@ -69,7 +69,7 @@ class SingleOpModel {
   Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op);
   Status BuildTaskListForDynamicOp(StreamResource *stream_resource, DynamicSingleOp &dynamic_single_op);
   Status BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask **task);
-  Status BuildAtomicTask(const domi::TaskDef &task_def, AtomicOpTask **task);
+  Status BuildAtomicTask(const domi::TaskDef &task_def, AtomicAddrCleanOpTask **task);
   Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, uint64_t kernel_id);
   Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task, uint64_t kernel_id);
@@ -268,15 +268,6 @@ Status TbeOpTask::UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc 
     dst_tensor.SetShape(GeShape(std::move(storage_shape)));
     dst_tensor.SetOriginShape(src_tensor.GetShape());
   }
-
-  int64_t size = 0;
-  graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(dst_tensor, size);
-  if (graph_status != GRAPH_SUCCESS) {
-    REPORT_CALL_ERROR("E19999", "Get tensor size in bytes failed!");
-    GELOGE(graph_status, "[Get][TensorMemorySize] In Bytes failed!");
-    return FAILED;
-  }
-  TensorUtils::SetSize(dst_tensor, size);
 
   return SUCCESS;
 }
@@ -490,7 +481,12 @@ void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
   }
 }
 
-Status AtomicOpTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
+Status AtomicAddrCleanOpTask::UpdateNodeByShape(const vector<GeTensorDesc> &input_desc,
+                                                const vector<GeTensorDesc> &output_desc) {
+  return SUCCESS;
+}
+
+Status AtomicAddrCleanOpTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
   uintptr_t *arg_base = reinterpret_cast<uintptr_t *>(args_.get());
   for (auto atomic_output_index : atomic_output_indices_) {
     if (atomic_output_index >= static_cast<int>(outputs.size())) {
@@ -500,11 +496,21 @@ Status AtomicOpTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector
     }
     auto &output_buffer = outputs[atomic_output_index];
     *arg_base++ = reinterpret_cast<uintptr_t>(output_buffer.data);
+    auto tensor_desc = op_desc_->MutableOutputDesc(atomic_output_index);
+    int64_t size = 0;
+    graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, size);
+    if (graph_status != GRAPH_SUCCESS) {
+      REPORT_CALL_ERROR("E19999", "Get tensor size in bytes failed!");
+      GELOGE(graph_status, "[Get][TensorMemorySize] In Bytes failed!");
+      return FAILED;
+    }
+    TensorUtils::SetSize(*tensor_desc, size);
   }
 
   return SUCCESS;
 }
 
-Status AtomicOpTask::UpdateTilingArgs(rtStream_t stream) {
+Status AtomicAddrCleanOpTask::UpdateTilingArgs(rtStream_t stream) {
   if (tiling_buffer_ != nullptr) {
     GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
     GE_CHK_RT_RET(rtMemcpyAsync(tiling_buffer_, max_tiling_size_, tiling_data_.data(), tiling_data_.size(),
@@ -516,7 +522,7 @@ Status AtomicOpTask::UpdateTilingArgs(rtStream_t stream) {
   return SUCCESS;
 }
 
-Status AtomicOpTask::CalcTilingInfo(optiling::utils::OpRunInfo &run_info) {
+Status AtomicAddrCleanOpTask::CalcTilingInfo(optiling::utils::OpRunInfo &run_info) {
   auto ret = optiling::OpAtomicCalculateV2(*node_, run_info);
   if (ret != GRAPH_SUCCESS) {
     GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Invoke][OpAtomicCalculate] failed, ret = %u.", ret);
@@ -526,7 +532,7 @@ Status AtomicOpTask::CalcTilingInfo(optiling::utils::OpRunInfo &run_info) {
   return SUCCESS;
 }
 
-Status AtomicOpTask::InitAtomicAddrCleanIndices() {
+Status AtomicAddrCleanOpTask::InitAtomicAddrCleanIndices() {
   GELOGD("[%s] Start to setup AtomicAddrClean task.", op_desc_->GetName().c_str());
   std::vector<int64_t> atomic_output_indices;
   (void) ge::AttrUtils::GetListInt(op_desc_, ATOMIC_ATTR_OUTPUT_INDEX, atomic_output_indices);
@@ -89,7 +89,7 @@ class TbeOpTask : public OpTask {
   void SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc);
   void SetKernelWithHandleArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
                                const OpDescPtr &op_desc, const domi::KernelDefWithHandle& kernel_def_with_handle);
-  void SetAtomicTask(OpTask *task) { atomic_task_.reset(task); }
+  void SetAtomicAddrCleanTask(OpTask *task) { atomic_task_.reset(task); }
 
   Status UpdateRunInfo() override;
   Status SetArgIndex();
@@ -108,13 +108,13 @@ class TbeOpTask : public OpTask {
   void *tiling_buffer_ = nullptr;
   uint32_t max_tiling_size_ = 0;
   std::string tiling_data_;
-  size_t input_num_; // include const input
-  size_t output_num_;
 
  private:
   friend class SingleOpModel;
   friend class TbeTaskBuilder;
   static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor);
-  Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc,
-                           const vector<GeTensorDesc> &output_desc);
   Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes);
   Status DoLaunchKernel(rtStream_t stream);
   Status CheckAndExecuteAtomic(const vector<GeTensorDesc> &input_desc,
@@ -122,6 +122,8 @@ class TbeOpTask : public OpTask {
                                vector<GeTensorDesc> &output_desc,
                                vector<DataBuffer> &output_buffers,
                                rtStream_t stream);
+  virtual Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc,
+                                   const vector<GeTensorDesc> &output_desc);
   virtual Status UpdateTilingArgs(rtStream_t stream);
   virtual Status UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs);
   virtual Status CalcTilingInfo(optiling::utils::OpRunInfo &run_info);
@@ -140,17 +142,17 @@ class TbeOpTask : public OpTask {
   std::string original_kernel_key_;
   std::string node_info_;
   std::vector<size_t> arg_index_; // data index in args
+  size_t input_num_; // include const input
+  size_t output_num_;
 
   std::unique_ptr<OpTask> atomic_task_;
 };
 
-class AtomicOpTask : public TbeOpTask {
+class AtomicAddrCleanOpTask : public TbeOpTask {
  public:
   Status InitAtomicAddrCleanIndices();
 
 private:
+  Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc,
+                           const vector<GeTensorDesc> &output_desc) override;
   Status UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) override;
   Status UpdateTilingArgs(rtStream_t stream) override;
   Status CalcTilingInfo(optiling::utils::OpRunInfo &run_info) override;
@@ -459,23 +459,23 @@ std::string TbeTaskBuilder::GetKeyForTvmMetaData() const {
   return TVM_ATTR_NAME_METADATA;
 }
 
-Status AtomicTaskBuilder::InitKernelArgs(void *args_addr, size_t arg_size, const SingleOpModelParam &param) {
+Status AtomicAddrCleanTaskBuilder::InitKernelArgs(void *args_addr, size_t arg_size, const SingleOpModelParam &param) {
   return SUCCESS;
 }
 
-std::string AtomicTaskBuilder::GetKeyForOpParamSize() const {
+std::string AtomicAddrCleanTaskBuilder::GetKeyForOpParamSize() const {
   return kAttrAtomicOpParamSize;
 }
 
-std::string AtomicTaskBuilder::GetKeyForTvmMetaData() const {
+std::string AtomicAddrCleanTaskBuilder::GetKeyForTvmMetaData() const {
   return ATOMIC_ATTR_TVM_METADATA;
 }
 
-void AtomicTaskBuilder::GetKernelName(const OpDescPtr &op_desc, std::string &kernel_name) const {
+void AtomicAddrCleanTaskBuilder::GetKernelName(const OpDescPtr &op_desc, std::string &kernel_name) const {
   (void)AttrUtils::GetStr(op_desc, op_desc->GetName() + "_atomic_kernelname", kernel_name);
 }
 
-TBEKernelPtr AtomicTaskBuilder::GetTbeKernel(const OpDescPtr &op_desc) const {
+TBEKernelPtr AtomicAddrCleanTaskBuilder::GetTbeKernel(const OpDescPtr &op_desc) const {
   return op_desc->TryGetExtAttr(EXT_ATTR_ATOMIC_TBE_KERNEL, TBEKernelPtr());
 }
@@ -126,11 +126,11 @@ class TbeTaskBuilder {
   void *handle_ = nullptr;
 };
 
-class AtomicTaskBuilder : public TbeTaskBuilder {
+class AtomicAddrCleanTaskBuilder : public TbeTaskBuilder {
  public:
-  AtomicTaskBuilder(const std::string &model_name, const NodePtr &node, const domi::TaskDef &task_def)
+  AtomicAddrCleanTaskBuilder(const std::string &model_name, const NodePtr &node, const domi::TaskDef &task_def)
       : TbeTaskBuilder(model_name, node, task_def) {}
-  ~AtomicTaskBuilder() override = default;
+  ~AtomicAddrCleanTaskBuilder() override = default;
 
  protected:
   std::string GetKeyForOpParamSize() const override;
@@ -157,8 +157,11 @@ TEST_F(UtestSingleOpTask, test_update_ioaddr) {
 TEST_F(UtestSingleOpTask, test_atomic_exec) {
   auto graph = make_shared<ComputeGraph>("graph");
   auto op_desc = make_shared<OpDesc>("Add", "Add");
+  GeTensorDesc desc;
+  op_desc->AddInputDesc(desc);
+  op_desc->AddOutputDesc(desc);
   auto node = graph->AddNode(op_desc);
-  AtomicOpTask task;
+  AtomicAddrCleanOpTask task;
   task.op_desc_ = op_desc;
   task.node_ = node;