@@ -25,6 +25,7 @@
 #include "graph/load/new_model_manager/model_utils.h"
 #include "runtime/mem.h"
 #include "single_op/single_op_manager.h"
+#include "single_op/task/build_task_utils.h"
 #include "graph/load/new_model_manager/model_manager.h"
 namespace ge {
@@ -77,7 +78,8 @@ Status ProfilingTaskInfo(OpTask *op_task) {
 }
 }  // namespace
-SingleOp::SingleOp(std::mutex *stream_mutex, rtStream_t stream) : stream_mutex_(stream_mutex), stream_(stream) {
+SingleOp::SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream)
+    : stream_resource_(stream_resource), stream_mutex_(stream_mutex), stream_(stream) {
 }
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() {
@@ -159,37 +161,6 @@ Status SingleOp::UpdateArgs(const std::vector<DataBuffer> &inputs, const std::ve
       *arg_addr = args_[i];
     }
   }
-  // update aicpu_TF or aicpu_CC args
-  for (auto &task : tasks_) {
-    size_t io_addr_num = args_.size();
-    if (task->GetOpTaskType() == OP_TASK_AICPU) {
-      GELOGD("Update aicpu_TF task args");
-      task->SetIoAddrsForDump(args_);
-      auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetIOAddr()));
-      GE_CHECK_NOTNULL(dst_io_addr);
-      auto rt_ret = rtMemcpyAsync(dst_io_addr,
-                                  sizeof(uint64_t) * args_.size(),
-                                  &args_[0],
-                                  sizeof(uint64_t) * args_.size(),
-                                  RT_MEMCPY_HOST_TO_DEVICE_EX,
-                                  stream_);
-      if (rt_ret != RT_ERROR_NONE) {
-        GELOGE(rt_ret, "rtMemcpyAsync addresses failed, ret = %d", rt_ret);
-        return rt_ret;
-      }
-    } else if (task->GetOpTaskType() == OP_TASK_AICPUCC) {
-      GELOGD("Update aicpu_CC task args");
-      const uintptr_t *task_io_addr = reinterpret_cast<const uintptr_t *>(task->GetIOAddr());
-      GE_CHECK_NOTNULL(task_io_addr);
-      auto io_addr = reinterpret_cast<uint64_t *>(const_cast<uintptr_t *>(task_io_addr));
-      for (size_t i = 0; i < io_addr_num; ++i) {
-        io_addr[i] = static_cast<uintptr_t>(args_[i]);
-      }
-    } else {
-      GELOGW("Only TF_kernel aicpu and aicpu_CC are supported, but got %u", task->GetOpTaskType());
-      continue;
-    }
-  }
   return SUCCESS;
 }
@@ -200,7 +171,19 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c
     return ret;
   }
+  GE_CHECK_NOTNULL(stream_resource_);
   std::lock_guard<std::mutex> lk(*stream_mutex_);
+  auto current_mem_base = stream_resource_->GetMemoryBase();
+  if (running_param_->mem_base != current_mem_base) {
+    running_param_->mem_base = const_cast<uint8_t *>(current_mem_base);
+    GELOGD("Memory base changed, new memory base = %p", current_mem_base);
+    for (auto &task : tasks_) {
+      auto new_address = BuildTaskUtils::GetAddresses(task->GetOpdesc(), *running_param_);
+      GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_),
+                        "[%s] Failed to update arg table",
+                        task->GetOpdesc()->GetName().c_str());
+    }
+  }
   ret = UpdateArgs(inputs, outputs);
   if (ret != SUCCESS) {
     return ret;
@@ -225,9 +208,6 @@ DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex
     : resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) {
 }
-DynamicSingleOp::~DynamicSingleOp() {
-}
 Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc,
                                        const std::vector<DataBuffer> &inputs,
                                        std::vector<GeTensorDesc> &output_desc,
@@ -249,65 +229,24 @@ Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc,
   }
   if (input_desc.size() != num_inputs_) {
-    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input number mismatches. expect %zu, but given %zu",
-           num_inputs_, input_desc.size());
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID,
+           "Input number mismatches. expect %zu, but given %zu",
+           num_inputs_,
+           input_desc.size());
     return ACL_ERROR_GE_PARAM_INVALID;
   }
   if (output_desc.size() != num_outputs_) {
-    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Output number mismatches. expect %zu, but given %zu",
-           num_outputs_, output_desc.size());
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID,
+           "Output number mismatches. expect %zu, but given %zu",
+           num_outputs_,
+           output_desc.size());
     return ACL_ERROR_GE_PARAM_INVALID;
   }
   return SUCCESS;
 }
-Status DynamicSingleOp::AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes,
-                                           std::vector<void *> &workspaces) {
-  static const std::string kPurpose("malloc workspace memory for dynamic op.");
-  if (workspace_sizes.empty()) {
-    GELOGD("No need to allocate workspace.");
-    return SUCCESS;
-  }
-  int64_t total_size = 0;
-  std::vector<int64_t> ws_offsets;
-  for (auto ws_size : workspace_sizes) {
-    // alignment and padding should be done in OpParaCalculate
-    GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size));
-    ws_offsets.emplace_back(total_size);
-    total_size += ws_size;
-  }
-  GELOGD("Total workspace size is %ld", total_size);
-  StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_);
-  GE_CHECK_NOTNULL(stream_resource);
-  auto ws_base = stream_resource->MallocMemory(kPurpose, static_cast<size_t>(total_size));
-  if (ws_base == nullptr) {
-    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
-    return ACL_ERROR_GE_MEMORY_ALLOCATION;
-  }
-  GELOGD("Done allocating workspace memory successfully.");
-  for (auto ws_offset : ws_offsets) {
-    workspaces.emplace_back(ws_base + ws_offset);
-  }
-  return SUCCESS;
-}
-Status DynamicSingleOp::ExecuteTbeTask(const vector<GeTensorDesc> &input_desc,
-                                       const vector<void *> &inputs,
-                                       vector<GeTensorDesc> &output_desc,
-                                       vector<void *> &outputs) {
-  GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc));
-  std::vector<void *> workspace_buffers;
-  GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers));
-  return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_);
-}
 Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
                                      const vector<DataBuffer> &input_buffers,
                                      vector<GeTensorDesc> &output_desc,
@@ -316,32 +255,8 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
   GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers));
   std::lock_guard<std::mutex> lk(*stream_mutex_);
-  std::vector<void *> inputs;
-  std::vector<void *> outputs;
-  for (auto &buffer : input_buffers) {
-    inputs.emplace_back(buffer.data);
-  }
-  for (auto &buffer : output_buffers) {
-    outputs.emplace_back(buffer.data);
-  }
-  if (op_task_->GetOpTaskType() == OP_TASK_TBE) {
-    auto ret = ExecuteTbeTask(input_desc, inputs, output_desc, outputs);
-    if (ret == SUCCESS) {
-      GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
-    }
-    return ret;
-  } else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) {
-    auto aicpu_ret = op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_);
-    if (aicpu_ret == SUCCESS) {
-      GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
-    }
-    return aicpu_ret;
-  } else {
-    GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID,
-           "Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u",
-           op_task_->GetOpTaskType());
-    return ACL_ERROR_GE_OP_TASK_TYPE_INVALID;
-  }
+  GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_));
+  GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
+  return SUCCESS;
 }
 }  // namespace ge
@@ -30,9 +30,11 @@
 #include "cce/aicpu_engine_struct.h"
 namespace ge {
+class StreamResource;
+struct SingleOpModelParam;
 class SingleOp {
  public:
-  SingleOp(std::mutex *stream_mutex, rtStream_t stream);
+  SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream);
   ~SingleOp();
   Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
@@ -44,6 +46,7 @@ class SingleOp {
   Status GetArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
   friend class SingleOpModel;
+  StreamResource *stream_resource_;
   std::mutex *stream_mutex_;
   rtStream_t stream_ = nullptr;
   std::vector<void *> input_addr_list_;
@@ -54,12 +57,13 @@ class SingleOp {
   std::vector<OpTask *> tasks_;
   std::vector<std::vector<uintptr_t *>> arg_table_;
+  std::unique_ptr<SingleOpModelParam> running_param_;
 };
 class DynamicSingleOp {
  public:
   DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream);
-  ~DynamicSingleOp();
+  ~DynamicSingleOp() = default;
   Status ExecuteAsync(const vector<GeTensorDesc> &input_desc,
                       const std::vector<DataBuffer> &inputs,
                       std::vector<GeTensorDesc> &output_desc,
@@ -72,14 +76,6 @@ class DynamicSingleOp {
                         std::vector<GeTensorDesc> &output_desc,
                         std::vector<DataBuffer> &outputs) const;
-  Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes,
-                            std::vector<void *> &workspaces);
-  Status ExecuteTbeTask(const vector<GeTensorDesc> &input_desc,
-                        const vector<void *> &inputs,
-                        vector<GeTensorDesc> &output_desc,
-                        vector<void *> &outputs);
   std::unique_ptr<OpTask> op_task_;
   uintptr_t resource_id_ = 0;
   std::mutex *stream_mutex_;
@@ -92,7 +92,8 @@ Status SingleOpModel::InitModelMem(StreamResource &res) {
   if (model_params_.memory_size > model_params_.zero_copy_mem_size) {
     const string purpose("malloc feature map memory on model execute.");
     GELOGI("total memory: %lu, zero_copy_mem: %lu", model_params_.memory_size, model_params_.zero_copy_mem_size);
-    model_params_.mem_base = res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size);
+    model_params_.mem_base =
+        res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size, false);
     if (model_params_.mem_base == nullptr) {
       return ACL_ERROR_GE_MEMORY_ALLOCATION;
     }
@@ -226,9 +227,10 @@ Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) {
   return SUCCESS;
 }
-Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
+Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &single_op) {
   auto ge_model = model_helper_.GetGeModel();
   GE_CHECK_NOTNULL(ge_model);
+  single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size());
   auto tasks = ge_model->GetModelTaskDefPtr()->task();
   for (int i = 0; i < tasks.size(); ++i) {
     const TaskDef &task_def = tasks[i];
@@ -247,9 +249,11 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
           return ret;
         }
-        single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size());
        ParseArgTable(tbe_task, single_op);
        tbe_task->SetModelArgs(model_name_, model_id_);
+        if (tbe_task->tiling_buffer_ != nullptr) {
+          tbe_task->stream_resource_ = stream_resource;
+        }
        single_op.tasks_.emplace_back(tbe_task);
      } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
        GELOGD("Building AICPU_CC task");
@@ -261,6 +265,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
          return ret;
        }
        task->SetModelArgs(model_name_, model_id_);
+        ParseArgTable(task, single_op);
        single_op.tasks_.emplace_back(task);
      } else {
        GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID,
@@ -278,6 +283,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
        return ret;
      }
      aicpu_task->SetModelArgs(model_name_, model_id_);
+      ParseArgTable(aicpu_task, single_op);
      single_op.tasks_.emplace_back(aicpu_task);
    } else {
      // skip
@@ -287,21 +293,23 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
   return SUCCESS;
 }
-void SingleOpModel::ParseArgTable(TbeOpTask *task, SingleOp &op) {
+void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) {
   if (task == nullptr) {
     GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "tbe op task is nullptr");
     return;
   }
   // args: addr1, addr2, addr3 ...
-  auto *args = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetArgs()));
-  size_t arg_size = task->GetArgSize();
-  for (size_t i = 0; i < arg_size / sizeof(void *); ++i) {
-    uintptr_t *ptr_to_addr = args + i;
+  uintptr_t *arg_base = nullptr;
+  size_t arg_num = 0;
+  task->GetIoAddr(arg_base, arg_num);
+  for (size_t i = 0; i < arg_num; ++i) {
+    uintptr_t *ptr_to_addr = arg_base + i;
     uintptr_t addr = *ptr_to_addr;
     auto iter = model_params_.addr_mapping_.find(addr);
     if (iter != model_params_.addr_mapping_.end()) {
       int arg_index = iter->second;
-      GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetStubName().c_str(), i, arg_index);
+      GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetOpdesc()->GetName().c_str(), i, arg_index);
       op.arg_table_[iter->second].emplace_back(ptr_to_addr);
     }
   }
@@ -386,8 +394,10 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa
 Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) {
   GE_CHK_STATUS_RET_NOLOG(ParseInputsAndOutputs());
   GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource));
+  single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params_));
+  GE_CHECK_NOTNULL(single_op.running_param_);
   GE_CHK_STATUS_RET_NOLOG(SetInputsAndOutputs(single_op));
-  return BuildTaskList(single_op);
+  return BuildTaskList(&resource, single_op);
 }
 Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) {
@@ -65,7 +65,7 @@ class SingleOpModel {
   Status ParseInputNode(const OpDescPtr &op_desc);
   void ParseOutputNode(const OpDescPtr &op_desc);
-  Status BuildTaskList(SingleOp &single_op);
+  Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op);
   Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op);
   Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task);
   Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task,
@@ -74,7 +74,7 @@ class SingleOpModel {
   Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op);
   static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam &param);
-  void ParseArgTable(TbeOpTask *task, SingleOp &op);
+  void ParseArgTable(OpTask *task, SingleOp &op);
   std::string model_name_;
   uint32_t model_id_ = 0;
@@ -69,11 +69,25 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose,
                                         size_t size,
                                         size_t &max_allocated,
                                         std::vector<uint8_t *> &allocated) {
+  if (size == 0) {
+    GELOGD("Mem size == 0");
+    return nullptr;
+  }
   if (size <= max_allocated && !allocated.empty()) {
     GELOGD("reuse last memory");
     return allocated.back();
   }
+  if (!allocated.empty()) {
+    uint8_t *current_buffer = allocated.back();
+    allocated.pop_back();
+    if (rtStreamSynchronize(stream_) != RT_ERROR_NONE) {
+      GELOGW("Failed to invoke rtStreamSynchronize");
+    }
+    (void) rtFree(current_buffer);
+  }
   uint8_t *buffer = nullptr;
   auto ret = rtMalloc(reinterpret_cast<void **>(&buffer), size, RT_MEMORY_HBM);
   if (ret != RT_ERROR_NONE) {
@@ -96,10 +110,14 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose,
   return buffer;
 }
-uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size) {
+uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size, bool holding_lock) {
   GELOGD("To Malloc memory, size = %zu", size);
-  uint8_t *buffer = DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
-  return buffer;
+  if (holding_lock) {
+    return DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
+  } else {
+    std::lock_guard<std::mutex> lk(stream_mu_);
+    return DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
+  }
 }
 uint8_t *StreamResource::MallocWeight(const std::string &purpose, size_t size) {
@@ -158,7 +176,7 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData &
     return ret;
   }
-  auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(&stream_mu_, stream_));
+  auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(this, &stream_mu_, stream_));
   if (new_op == nullptr) {
     GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "new SingleOp failed");
     return ACL_ERROR_GE_MEMORY_ALLOCATION;
@@ -171,4 +189,12 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData &
   op_map_[model_data.model_data] = std::move(new_op);
   return SUCCESS;
 }
+const uint8_t *StreamResource::GetMemoryBase() const {
+  if (memory_list_.empty()) {
+    return nullptr;
+  }
+  return memory_list_.back();
+}
 }  // namespace ge
@@ -45,8 +45,9 @@ class StreamResource {
   Status BuildOperator(const std::string &model_name, const ModelData &model_data, SingleOp **single_op);
   Status BuildDynamicOperator(const std::string &model_name, const ModelData &model_data, DynamicSingleOp **single_op);
-  uint8_t *MallocMemory(const std::string &purpose, size_t size);
+  uint8_t *MallocMemory(const std::string &purpose, size_t size, bool holding_lock = true);
   uint8_t *MallocWeight(const std::string &purpose, size_t size);
+  const uint8_t *GetMemoryBase() const;
  private:
   uint8_t *DoMallocMemory(const std::string &purpose,
@@ -17,17 +17,22 @@
 #include "single_op/task/aicpu_kernel_task_builder.h"
 #include "framework/common/taskdown_common.h"
 #include "graph/load/new_model_manager/model_manager.h"
+#include "build_task_utils.h"
 namespace ge {
 AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def)
     : op_desc_(op_desc), kernel_def_(kernel_def) {}
-Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) {
+Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam &param) {
   size_t aicpu_arg_size = kernel_def_.args_size();
-  if (aicpu_arg_size <= 0) {
+  if (aicpu_arg_size <= sizeof(aicpu::AicpuParamHead)) {
     GELOGE(ACL_ERROR_GE_PARAM_INVALID, "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size);
     return ACL_ERROR_GE_PARAM_INVALID;
   }
+  task.io_addr_num_ = op_desc_->GetInputsSize() + op_desc_->GetOutputsSize();
+  GE_CHECK_GE(aicpu_arg_size - sizeof(aicpu::AicpuParamHead), task.io_addr_num_ * sizeof(void *));
   std::unique_ptr<uint8_t[]> aicpu_args;
   aicpu_args.reset(new(std::nothrow) uint8_t[aicpu_arg_size]());
   if (aicpu_args == nullptr) {
@@ -41,13 +46,19 @@ Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) {
     return ACL_ERROR_GE_INTERNAL_ERROR;
   }
-  task.SetIoAddr(aicpu_args.get() + sizeof(aicpu::AicpuParamHead));
+  task.SetIoAddr(reinterpret_cast<uintptr_t *>(aicpu_args.get() + sizeof(aicpu::AicpuParamHead)));
   task.SetKernelArgs(std::move(aicpu_args), aicpu_arg_size);
+  auto addresses = BuildTaskUtils::GetKernelArgs(op_desc_, param);
+  GE_CHECK_GE(addresses.size(), task.io_addr_num_);
+  for (size_t i = 0; i < task.io_addr_num_; ++i) {
+    task.io_addr_[i] = reinterpret_cast<uintptr_t>(addresses[i]);
+  }
   return SUCCESS;
 }
-Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) {
-  auto ret = SetKernelArgs(task);
+Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam &param) {
+  auto ret = SetKernelArgs(task, param);
   if (ret != SUCCESS) {
     return ret;
   }
@@ -30,10 +30,10 @@ class AiCpuCCTaskBuilder {
   explicit AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def);
   ~AiCpuCCTaskBuilder() = default;
-  Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id);
+  Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam &param);
  private:
-  Status SetKernelArgs(AiCpuCCTask &task);
+  Status SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam &param);
   const OpDescPtr op_desc_;
   const domi::KernelDef &kernel_def_;
 };
@@ -26,26 +26,6 @@ namespace ge {
 AiCpuTaskBuilder::AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def)
     : op_desc_(op_desc), kernel_def_(kernel_def) {}
-Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses) {
-  size_t arg_size = kernel_def_.args_size();
-  auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM);
-  if (rt_ret != RT_ERROR_NONE) {
-    GELOGE(rt_ret, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret);
-    return rt_ret;
-  }
-  const void *src_addr = reinterpret_cast<const void *>(addresses.data());
-  uint64_t src_len = sizeof(void *) * addresses.size();
-  rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE);
-  if (rt_ret != RT_ERROR_NONE) {
-    (void)rtFree(*io_addr);
-    GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", rt_ret);
-    return rt_ret;
-  }
-  return SUCCESS;
-}
 Status AiCpuTaskBuilder::SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &fwk_op_kernel) {
   auto sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL),
                           kernel_def_.args().data(), kernel_def_.args().size());
@@ -80,39 +60,27 @@ namespace ge {
   return SUCCESS;
 }
-Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace,
-                                            const SingleOpModelParam &param, bool dynamic_flag) {
+Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag) {
   if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
     GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
            sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size());
     return ACL_ERROR_GE_PARAM_INVALID;
   }
-  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
-  auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace);
-  if (dynamic_flag) {
-    GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM));
-  } else {
-    if (ws_addr_vec.empty()) {
-      GELOGE(ACL_ERROR_GE_PARAM_INVALID, "workspace Data Address is empty.");
-      return ACL_ERROR_GE_PARAM_INVALID;
-    }
-    *kernel_workspace = ws_addr_vec[0];
-  }
-  GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(),
+  GE_CHK_RT_RET(rtMalloc(&task.workspace_addr_, kernel_def_.task_info_size(), RT_MEMORY_HBM));
+  GE_CHK_RT_RET(rtMemcpy(task.workspace_addr_, kernel_def_.task_info_size(),
                          kernel_def_.task_info().data(), kernel_def_.task_info_size(),
                          RT_MEMCPY_HOST_TO_DEVICE));
-  auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses));
-  if (ret != SUCCESS) {
-    return ret;
-  }
+  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false);
+  task.io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses);
+  task.io_addr_size_ = task.io_addr_host_.size() * sizeof(void *);
+  GE_CHK_RT_RET(rtMalloc(&task.io_addr_, task.io_addr_size_, RT_MEMORY_HBM));
   return SUCCESS;
 }
 Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param,
                                    bool dynamic_flag, uint64_t kernel_id) {
-  GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&task.io_addr_, &task.workspace_addr_, param, dynamic_flag));
+  GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param, dynamic_flag));
   STR_FWK_OP_KERNEL fwk_op_kernel = {0};
   auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel);
@@ -33,10 +33,8 @@ namespace ge {
  private:
   static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel);
-  Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses);
   Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel);
-  Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace,
-                            const SingleOpModelParam &param, bool dynamic_flag);
+  Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag);
   const OpDescPtr op_desc_;
   const domi::KernelExDef &kernel_def_;
@@ -32,7 +32,8 @@ const uint64_t kVarSize = 0;
 }
 std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &op_desc,
-                                                              const SingleOpModelParam &param) {
+                                                              const SingleOpModelParam &param,
+                                                              bool keep_workspace) {
   std::vector<std::vector<void *>> ret;
   RuntimeParam runtime_para;
   runtime_para.mem_size = param.memory_size;
@@ -49,7 +50,9 @@ std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &o
   ret.emplace_back(ModelUtils::GetInputDataAddrs(runtime_para, op_desc));
   ret.emplace_back(ModelUtils::GetOutputDataAddrs(runtime_para, op_desc));
-  ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc));
+  if (keep_workspace) {
+    ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc));
+  }
   return ret;
 }
@@ -27,15 +27,17 @@
 namespace ge {
 class BuildTaskUtils {
  public:
+  static constexpr int kAddressIndexOutput = 1;
   static constexpr int kAddressIndexWorkspace = 2;
-  static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc, const SingleOpModelParam &param);
+  static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc,
+                                                       const SingleOpModelParam &param,
+                                                       bool keep_workspace = true);
   static std::vector<void *> JoinAddresses(const std::vector<std::vector<void *>> &addresses);
   static std::vector<void *> GetKernelArgs(const OpDescPtr &op_desc, const SingleOpModelParam &param);
   static std::string GetTaskInfo(const OpDescPtr &op_desc);
   template<typename T>
-  static std::string VectorToString(const std::vector<T> &values)
-  {
+  static std::string VectorToString(const std::vector<T> &values) {
     std::stringstream ss;
     ss << '[';
     auto size = values.size();
@@ -24,9 +24,11 @@
 #include "common/dump/dump_manager.h"
 #include "common/dump/dump_op.h"
 #include "common/formats/formats.h"
+#include "common/math/math_util.h"
 #include "framework/common/debug/log.h"
 #include "register/op_tiling.h"
 #include "runtime/rt.h"
+#include "build_task_utils.h"
 namespace ge {
 namespace {
@@ -48,18 +50,22 @@ Status OpTask::OpenDump(rtStream_t stream) {
     std::vector<uint64_t> output_adds;
     auto input_size = op_desc_->GetInputsSize();
     auto output_size = op_desc_->GetOutputsSize();
-    auto all_size = io_addrs_for_dump_.size();
-    if (input_size + output_size != all_size) {
-      GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", all_size,
+    uintptr_t *arg_base = nullptr;
+    size_t arg_num = 0;
+    GetIoAddr(arg_base, arg_num);
+    if (arg_num < input_size + output_size) {
+      GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu",
+             arg_num,
              input_size + output_size);
       return FAILED;
     }
     for (size_t i = 0; i < input_size; i++) {
-      uint64_t input_addr = io_addrs_for_dump_[i];
+      uint64_t input_addr = arg_base[i];
       input_addrs.emplace_back(input_addr);
     }
     for (size_t j = 0; j < output_size; j++) {
-      uint64_t output_addr = io_addrs_for_dump_[input_size + j];
+      uint64_t output_addr = arg_base[input_size + j];
       output_adds.emplace_back(output_addr);
     }
     dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream);
@@ -89,10 +95,6 @@ void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size
 void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; }
-const vector<int64_t> &OpTask::GetWorkspaceSizes() const { return workspace_sizes_; }
-void OpTask::SetWorkspaceSizes(const vector<int64_t> &workspace_sizes) { workspace_sizes_ = workspace_sizes; }
 void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) {
   model_name_ = model_name;
   model_id_ = model_id;
@@ -107,6 +109,36 @@ Status OpTask::GetProfilingArgs(std::string &model_name, std::string &op_name, u
   op_name = op_desc_->GetName();
   return SUCCESS;
 }
+Status OpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
+  return UNSUPPORTED;
+}
+Status OpTask::UpdateArgTable(const SingleOpModelParam &param) {
+  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
+  auto all_addresses = BuildTaskUtils::JoinAddresses(addresses);
+  uintptr_t *arg_base = nullptr;
+  size_t arg_num = 0;
+  GetIoAddr(arg_base, arg_num);
+  if (arg_num != all_addresses.size()) {
+    GELOGE(INTERNAL_ERROR, "[%s] arg number mismatches, expect = %zu, but got = %zu",
+           op_desc_->GetName().c_str(),
+           arg_num,
+           all_addresses.size());
+    return INTERNAL_ERROR;
+  }
+  for (void *addr : all_addresses) {
+    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
+  }
+  return SUCCESS;
+}
+Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
+                            const vector<DataBuffer> &input_buffers,
+                            vector<GeTensorDesc> &output_desc,
+                            vector<DataBuffer> &output_buffers,
+                            rtStream_t stream) {
+  return UNSUPPORTED;
+}
 TbeOpTask::~TbeOpTask() {
   if (sm_desc_ != nullptr) {
@@ -141,12 +173,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) {
     return RT_FAILED;
   }
   GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());
-  size_t input_size = op_desc_->GetInputsSize();
-  size_t output_size = op_desc_->GetOutputsSize();
-  uint64_t *io_addr = reinterpret_cast<uint64_t *>(args_.get());
-  std::vector<uint64_t> io_addrs(io_addr, io_addr + input_size + output_size);
-  SetIoAddrsForDump(io_addrs);
   auto status = OpenDump(stream);
   if (status != SUCCESS) {
     GELOGE(status, "Open dump failed in the tbe single op %s", this->stub_name_.c_str());
@@ -167,11 +193,12 @@ Status TbeOpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const ve
     GELOGE(FAILED, "Failed to invoke OpParaCalculate. ret = %u", ret);
     return FAILED;
   }
-  SetWorkspaceSizes(run_info.workspaces);
   block_dim_ = run_info.block_dim;
   tiling_data_ = run_info.tiling_data.str();
   GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_,
          tiling_data_.size());
+  GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces");
   return SUCCESS;
 }
@@ -227,13 +254,54 @@ void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, s
   max_tiling_size_ = max_tiling_size;
 }
-Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *> &outputs,
-                               const vector<void *> &workspaces, rtStream_t stream) {
+Status TbeOpTask::AllocateWorkspaces(const vector<int64_t> &workspace_sizes) {
+  static const std::string kPurpose("malloc workspace memory for dynamic op.");
+  if (workspace_sizes.empty()) {
+    GELOGD("No need to allocate workspace.");
+    return SUCCESS;
+  }
+  int64_t total_size = 0;
+  std::vector<int64_t> ws_offsets;
+  for (auto ws_size : workspace_sizes) {
+    // alignment and padding should be done in OpParaCalculate
+    GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size));
+    ws_offsets.emplace_back(total_size);
+    total_size += ws_size;
+  }
+  GELOGD("Total workspace size is %ld", total_size);
+  GE_CHECK_NOTNULL(stream_resource_);
+  auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast<size_t>(total_size));
+  if (ws_base == nullptr) {
+    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
+    return ACL_ERROR_GE_MEMORY_ALLOCATION;
+  }
+  GELOGD("Done allocating workspace memory successfully.");
+  for (auto ws_offset : ws_offsets) {
+    workspaces_.emplace_back(ws_base + ws_offset);
+  }
+  return SUCCESS;
+}
+Status TbeOpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
+                               const vector<DataBuffer> &input_buffers,
+                               vector<GeTensorDesc> &output_desc,
+                               vector<DataBuffer> &output_buffers,
+                               rtStream_t stream) {
+  GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo(input_desc, output_desc));
   GELOGD("[%s] Start to launch kernel", node_->GetName().c_str());
   std::vector<void *> args;
-  args.insert(args.end(), inputs.begin(), inputs.end());
-  args.insert(args.end(), outputs.begin(), outputs.end());
-  args.insert(args.end(), workspaces.begin(), workspaces.end());
+  for (auto &buffer : input_buffers) {
+    args.emplace_back(buffer.data);
+  }
+  for (auto &buffer : output_buffers) {
+    args.emplace_back(buffer.data);
+  }
+  for (auto &buffer : workspaces_) {
+    args.emplace_back(buffer);
+  }
   if (tiling_buffer_ != nullptr) {
     GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
@@ -254,6 +322,14 @@ Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *
   return SUCCESS;
 }
+void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
+  arg_base = reinterpret_cast<uintptr_t *>(args_.get());
+  arg_count = arg_size_ / sizeof(void *);
+  if (tiling_buffer_ != nullptr) {
+    --arg_count;
+  }
+}
 AiCpuBaseTask::~AiCpuBaseTask() {
   if (ext_info_addr_dev_ != nullptr) {
     (void)rtFree(ext_info_addr_dev_);
@@ -399,12 +475,14 @@ AiCpuTask::~AiCpuTask() {
   }
 }
-const void *AiCpuTask::GetIOAddr() const { return io_addr_; }
 Status AiCpuTask::LaunchKernel(rtStream_t stream) {
   GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str());
-  auto ret = rtMemcpyAsync(workspace_addr_, task_info_.size(), task_info_.data(), task_info_.size(),
-                           RT_MEMCPY_HOST_TO_DEVICE_EX, stream);
+  auto ret = rtMemcpyAsync(io_addr_,
+                           io_addr_size_,
+                           io_addr_host_.data(),
+                           io_addr_host_.size() * sizeof(void *),
+                           RT_MEMCPY_HOST_TO_DEVICE_EX,
+                           stream);
   if (ret != RT_ERROR_NONE) {
     GELOGE(RT_FAILED, "rtMemcpyAsync workspace data failed. ret = %d, task = %s", ret, this->op_type_.c_str());
     return RT_FAILED;
@@ -680,6 +758,17 @@ Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
   return SUCCESS;
 }
+Status AiCpuTask::UpdateArgTable(const SingleOpModelParam &param) {
+  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false);
+  io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses);
+  return SUCCESS;
+}
+void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
+  arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data());
+  arg_count = io_addr_host_.size();
+}
 void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
   args_ = std::move(args);
   arg_size_ = arg_size;
@@ -691,9 +780,7 @@ void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; }
 void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; }
-void AiCpuCCTask::SetIoAddr(void *io_addr) { io_addr_ = io_addr; }
-const void *AiCpuCCTask::GetIOAddr() const { return io_addr_; }
+void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; }
 const void *AiCpuCCTask::GetArgs() const { return args_.get(); }
@@ -716,12 +803,6 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
     return ret;
   }
   GELOGD("Invoke rtCpuKernelLaunch succeeded");
-  size_t input_size = op_desc_->GetInputsSize();
-  size_t output_size = op_desc_->GetOutputsSize();
-  uint64_t *io_addr = reinterpret_cast<uint64_t *>(io_addr_);
-  std::vector<uint64_t> io_addrs (io_addr, io_addr + input_size + output_size);
-  SetIoAddrsForDump(io_addrs);
   auto status = OpenDump(stream);
   if (status != SUCCESS) {
     GELOGE(status, "Open dump failed in the aicpucc single op %s", this->kernel_name_.c_str());
@@ -761,4 +842,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
   return SUCCESS;
 }
+void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
+  arg_base = io_addr_;
+  arg_count = io_addr_num_;
+}
 }  // namespace ge
@@ -32,49 +32,27 @@
 #include "init/gelib.h"
 namespace ge {
-enum OpTaskType {
-  OP_TASK_TBE = 0,
-  OP_TASK_AICPU,
-  OP_TASK_AICPUCC,
-  OP_TASK_INVALID,
-};
+class StreamResource;
+struct SingleOpModelParam;
 class OpTask {
  public:
   OpTask() = default;
   virtual ~OpTask() = default;
   virtual Status LaunchKernel(rtStream_t stream) = 0;
   virtual Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc,
-                               const vector<GeTensorDesc> &output_desc) {
-    return UNSUPPORTED;
-  }
-  virtual Status LaunchKernel(const std::vector<void *> &inputs,
-                              const std::vector<void *> &outputs,
-                              const std::vector<void *> &workspaces,
-                              rtStream_t stream) {
-    return UNSUPPORTED;
-  }
-  virtual OpTaskType GetOpTaskType() = 0;
-  virtual const void *GetIOAddr() const = 0;
-  const vector<int64_t> &GetWorkspaceSizes() const;
-  void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
+                               const vector<GeTensorDesc> &output_desc);
+  virtual Status UpdateArgTable(const SingleOpModelParam &param);
   void SetModelArgs(std::string model_name, uint32_t model_id);
   Status GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id, uint32_t &block_dim);
   const OpDescPtr &GetOpdesc() const {return op_desc_;}
   Status OpenDump(rtStream_t stream);
-  void SetIoAddrsForDump(const vector<uint64_t> &io_addrs_for_dump) {
-    io_addrs_for_dump_ = io_addrs_for_dump;
-  }
+  virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0;
   virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                               const std::vector<DataBuffer> &input_buffers,
                               std::vector<GeTensorDesc> &output_desc,
                               std::vector<DataBuffer> &output_buffers,
-                              rtStream_t stream) {
-    return UNSUPPORTED;
-  }
+                              rtStream_t stream);
- private:
-  std::vector<int64_t> workspace_sizes_;
  protected:
   DumpProperties dump_properties_;
   DumpOp dump_op_;
@@ -82,19 +60,18 @@ class OpTask {
   std::string model_name_;
   uint32_t model_id_ = 0;
   uint32_t block_dim_ = 1;
-  std::vector<uint64_t> io_addrs_for_dump_;
 };
 class TbeOpTask : public OpTask {
  public:
   ~TbeOpTask() override;
   Status LaunchKernel(rtStream_t stream) override;
-  OpTaskType GetOpTaskType() override {
-    return OP_TASK_TBE;
-  }
-  const void *GetIOAddr() const override {
-    return nullptr;
-  }
+  Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
+                      const std::vector<DataBuffer> &input_buffers,
+                      std::vector<GeTensorDesc> &output_desc,
+                      std::vector<DataBuffer> &output_buffers,
+                      rtStream_t stream) override;
+  void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
   void SetSmDesc(void *sm_desc);
   void SetStubFunc(const std::string &name, const void *stub_func);
   void SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc);
| @@ -102,20 +79,17 @@ class TbeOpTask : public OpTask { | |||||
| Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc, | Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc, | ||||
| const vector<GeTensorDesc> &output_desc) override; | const vector<GeTensorDesc> &output_desc) override; | ||||
| Status LaunchKernel(const vector<void *> &inputs, | |||||
| const vector<void *> &outputs, | |||||
| const vector<void *> &workspaces, | |||||
| rtStream_t stream) override; | |||||
| const void *GetArgs() const; | const void *GetArgs() const; | ||||
| size_t GetArgSize() const; | size_t GetArgSize() const; | ||||
| const std::string &GetStubName() const; | const std::string &GetStubName() const; | ||||
| void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size); | void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size); | ||||
| private: | private: | ||||
| friend class SingleOpModel; | |||||
| static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor); | static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor); | ||||
| Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc, | Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc, | ||||
| const vector<GeTensorDesc> &output_desc); | const vector<GeTensorDesc> &output_desc); | ||||
| Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes); | |||||
| const void *stub_func_ = nullptr; | const void *stub_func_ = nullptr; | ||||
| std::unique_ptr<uint8_t[]> args_; | std::unique_ptr<uint8_t[]> args_; | ||||
| @@ -123,9 +97,11 @@ class TbeOpTask : public OpTask { | |||||
| void *sm_desc_ = nullptr; | void *sm_desc_ = nullptr; | ||||
| std::string stub_name_; | std::string stub_name_; | ||||
| StreamResource *stream_resource_ = nullptr; | |||||
| void *tiling_buffer_ = nullptr; | void *tiling_buffer_ = nullptr; | ||||
| uint32_t max_tiling_size_ = 0; | uint32_t max_tiling_size_ = 0; | ||||
| std::string tiling_data_; | std::string tiling_data_; | ||||
| std::vector<void *> workspaces_; | |||||
| NodePtr node_; | NodePtr node_; | ||||
| }; | }; | ||||
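Here TbeOpTask gains the tensor-desc/DataBuffer LaunchKernel overload and a GetIoAddr override, while AllocateWorkspaces and the inputs/outputs/workspaces overload are removed and a StreamResource pointer is added. A rough sketch, under the assumption that a dynamic-shape execution first refreshes the run info and then launches with the per-call buffers; all types below are simplified stand-ins invented for this example, and the workspace handling through StreamResource is omitted:

#include <cstdint>
#include <vector>

struct TensorDescSketch { std::vector<int64_t> dims; };
struct DataBufferSketch { void *data = nullptr; uint64_t length = 0; };

class DynamicTbeTaskSketch {
 public:
  // Step 1: recompute run info from the current shapes. This toy version just
  // records the input dims and derives a block dim from their count.
  bool UpdateRunInfo(const std::vector<TensorDescSketch> &input_desc,
                     const std::vector<TensorDescSketch> &output_desc) {
    (void)output_desc;
    tiling_data_.clear();
    for (const auto &desc : input_desc) {
      tiling_data_.insert(tiling_data_.end(), desc.dims.begin(), desc.dims.end());
    }
    block_dim_ = tiling_data_.empty() ? 1u : static_cast<uint32_t>(tiling_data_.size());
    return true;
  }

  // Step 2: launch with the per-call buffers, relying on the run info from step 1.
  bool LaunchKernel(const std::vector<TensorDescSketch> &input_desc,
                    const std::vector<DataBufferSketch> &input_buffers,
                    std::vector<TensorDescSketch> &output_desc,
                    std::vector<DataBufferSketch> &output_buffers,
                    void *stream) {
    (void)input_desc; (void)output_desc; (void)stream;
    return block_dim_ > 0 && !input_buffers.empty() && !output_buffers.empty();
  }

 private:
  std::vector<int64_t> tiling_data_;
  uint32_t block_dim_ = 0;
};

int main() {
  DynamicTbeTaskSketch task;
  std::vector<TensorDescSketch> in_desc = {{{8, 16}}};
  std::vector<TensorDescSketch> out_desc = {{{8, 16}}};
  std::vector<DataBufferSketch> in_bufs(1), out_bufs(1);
  if (!task.UpdateRunInfo(in_desc, out_desc)) return 1;
  return task.LaunchKernel(in_desc, in_bufs, out_desc, out_bufs, nullptr) ? 0 : 1;
}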
| @@ -133,7 +109,7 @@ class AiCpuBaseTask : public OpTask { | |||||
| public: | public: | ||||
| AiCpuBaseTask() = default; | AiCpuBaseTask() = default; | ||||
| ~AiCpuBaseTask() override; | ~AiCpuBaseTask() override; | ||||
| const UnknowShapeOpType GetUnknownType() const { return unknown_type_; } | |||||
| UnknowShapeOpType GetUnknownType() const { return unknown_type_; } | |||||
| protected: | protected: | ||||
| Status SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id); | Status SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id); | ||||
| @@ -158,10 +134,8 @@ class AiCpuTask : public AiCpuBaseTask { | |||||
| ~AiCpuTask() override; | ~AiCpuTask() override; | ||||
| Status LaunchKernel(rtStream_t stream) override; | Status LaunchKernel(rtStream_t stream) override; | ||||
| OpTaskType GetOpTaskType() override { | |||||
| return OP_TASK_AICPU; | |||||
| } | |||||
| const void *GetIOAddr() const override; | |||||
| Status UpdateArgTable(const SingleOpModelParam ¶m) override; | |||||
| void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; | |||||
| Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | ||||
| const std::vector<DataBuffer> &input_buffers, | const std::vector<DataBuffer> &input_buffers, | ||||
| @@ -188,27 +162,31 @@ class AiCpuTask : public AiCpuBaseTask { | |||||
| friend class AiCpuTaskBuilder; | friend class AiCpuTaskBuilder; | ||||
| void *workspace_addr_ = nullptr; | void *workspace_addr_ = nullptr; | ||||
| std::string task_info_; | std::string task_info_; | ||||
| // device addr | |||||
| // device addr | |||||
| void *args_ = nullptr; | void *args_ = nullptr; | ||||
| size_t arg_size_ = 0; | size_t arg_size_ = 0; | ||||
| std::string op_type_; | std::string op_type_; | ||||
| // device addr | // device addr | ||||
| void *io_addr_ = nullptr; | void *io_addr_ = nullptr; | ||||
| size_t io_addr_size_ = 0; | |||||
| // host addr | |||||
| std::vector<void *> io_addr_host_; | |||||
| bool dynamic_flag_ = false; | bool dynamic_flag_ = false; | ||||
| // for copy task | // for copy task | ||||
| void *copy_task_args_buf_; | |||||
| void *copy_workspace_buf_; | |||||
| void *copy_task_args_buf_ = nullptr; | |||||
| void *copy_workspace_buf_ = nullptr; | |||||
| std::vector<void *> output_summary_; | std::vector<void *> output_summary_; | ||||
| std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_; | std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_; | ||||
| void *copy_ioaddr_dev_; | |||||
| void *copy_ioaddr_dev_ = nullptr; | |||||
| void *copy_input_release_flag_dev_; | |||||
| void *copy_input_data_size_dev_; | |||||
| void *copy_input_src_dev_; | |||||
| void *copy_input_dst_dev_; | |||||
| void *copy_input_release_flag_dev_ = nullptr; | |||||
| void *copy_input_data_size_dev_ = nullptr; | |||||
| void *copy_input_src_dev_ = nullptr; | |||||
| void *copy_input_dst_dev_ = nullptr; | |||||
| vector<void *> out_shape_hbm_; | vector<void *> out_shape_hbm_; | ||||
| uint64_t kernel_id_ = 0; | uint64_t kernel_id_ = 0; | ||||
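The AiCpuTask members above mostly pick up in-class = nullptr initializers (copy_task_args_buf_, copy_workspace_buf_, copy_ioaddr_dev_ and the copy_input_* pointers), alongside the new io_addr_size_ and io_addr_host_ fields. A small illustration of why the default initializers help; this is a hypothetical simplified struct, not GE code:

#include <cstddef>

struct CopyTaskBuffersSketch {
  // With in-class initializers, a task that never takes the copy path still holds
  // well-defined null pointers, so cleanup can be a plain null check instead of a
  // read of indeterminate values.
  void *copy_task_args_buf_ = nullptr;
  void *copy_workspace_buf_ = nullptr;
  void *copy_ioaddr_dev_ = nullptr;

  ~CopyTaskBuffersSketch() {
    // Hypothetical release helper, shown only to illustrate the null-check pattern.
    auto release = [](void *p) {
      if (p != nullptr) {
        // free the device buffer here (e.g. an rtFree-style call in the real code)
      }
    };
    release(copy_task_args_buf_);
    release(copy_workspace_buf_);
    release(copy_ioaddr_dev_);
  }
};

int main() {
  CopyTaskBuffersSketch buffers;  // destructor runs safely even though nothing was allocated
  (void)buffers;
  return 0;
}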
| @@ -222,13 +200,12 @@ class AiCpuCCTask : public AiCpuBaseTask { | |||||
| AiCpuCCTask &operator=(const AiCpuCCTask &) = delete; | AiCpuCCTask &operator=(const AiCpuCCTask &) = delete; | ||||
| Status LaunchKernel(rtStream_t stream) override; | Status LaunchKernel(rtStream_t stream) override; | ||||
| OpTaskType GetOpTaskType() override { return OP_TASK_AICPUCC; } | |||||
| const void *GetIOAddr() const override; | |||||
| void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; | |||||
| const void *GetArgs() const; | const void *GetArgs() const; | ||||
| void SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size); | void SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size); | ||||
| void SetSoName(const std::string &so_name); | void SetSoName(const std::string &so_name); | ||||
| void SetkernelName(const std::string &kernel_Name); | void SetkernelName(const std::string &kernel_Name); | ||||
| void SetIoAddr(void *io_addr); | |||||
| void SetIoAddr(uintptr_t *io_addr); | |||||
| size_t GetArgSize() const; | size_t GetArgSize() const; | ||||
| Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | ||||
| @@ -244,7 +221,8 @@ private: | |||||
| std::unique_ptr<uint8_t[]> args_; | std::unique_ptr<uint8_t[]> args_; | ||||
| size_t arg_size_ = 0; | size_t arg_size_ = 0; | ||||
| void *sm_desc_ = nullptr; | void *sm_desc_ = nullptr; | ||||
| void *io_addr_ = nullptr; | |||||
| uintptr_t *io_addr_ = nullptr; | |||||
| size_t io_addr_num_ = 0; | |||||
| bool is_custom_ = false; | bool is_custom_ = false; | ||||
| uint32_t dump_flag_ = RT_KERNEL_DEFAULT; | uint32_t dump_flag_ = RT_KERNEL_DEFAULT; | ||||
| }; | }; | ||||
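AiCpuCCTask now stores its I/O table as uintptr_t * (io_addr_ plus the new io_addr_num_), takes SetIoAddr(uintptr_t *), and implements GetIoAddr in place of GetIOAddr/GetOpTaskType. A minimal implementer-side sketch; the class and the SetIoAddrNum helper are invented for illustration, since this diff does not show where io_addr_num_ is populated:

#include <cstddef>
#include <cstdint>

class AiCpuCCTaskSketch {
 public:
  void SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; }
  void SetIoAddrNum(size_t num) { io_addr_num_ = num; }  // hypothetical helper

  // Hand the caller the arg table so it can patch device addresses in place.
  void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
    arg_base = io_addr_;
    arg_count = io_addr_num_;
  }

 private:
  uintptr_t *io_addr_ = nullptr;  // points into the host-side kernel arg buffer
  size_t io_addr_num_ = 0;
};

int main() {
  uintptr_t arg_table[2] = {0, 0};
  AiCpuCCTaskSketch task;
  task.SetIoAddr(arg_table);
  task.SetIoAddrNum(2);

  uintptr_t *base = nullptr;
  size_t count = 0;
  task.GetIoAddr(base, count);
  base[0] = 0x1000;  // the caller writes input/output addresses directly
  base[1] = 0x2000;
  return (count == 2 && base == arg_table) ? 0 : 1;
}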