| @@ -380,6 +380,7 @@ set(TRAIN_SRC_LIST | |||||
| "single_op/task/tbe_task_builder.cc" | "single_op/task/tbe_task_builder.cc" | ||||
| "single_op/task/aicpu_task_builder.cc" | "single_op/task/aicpu_task_builder.cc" | ||||
| "single_op/task/aicpu_kernel_task_builder.cc" | "single_op/task/aicpu_kernel_task_builder.cc" | ||||
| "single_op/task/rts_kernel_task_builder.cc" | |||||
| "hybrid/common/tensor_value.cc" | "hybrid/common/tensor_value.cc" | ||||
| "hybrid/common/npu_memory_allocator.cc" | "hybrid/common/npu_memory_allocator.cc" | ||||
| "hybrid/executor/rt_callback_manager.cc" | "hybrid/executor/rt_callback_manager.cc" | ||||
| @@ -65,6 +65,7 @@ set(SRC_LIST | |||||
| "../single_op/task/tbe_task_builder.cc" | "../single_op/task/tbe_task_builder.cc" | ||||
| "../single_op/task/aicpu_task_builder.cc" | "../single_op/task/aicpu_task_builder.cc" | ||||
| "../single_op/task/aicpu_kernel_task_builder.cc" | "../single_op/task/aicpu_kernel_task_builder.cc" | ||||
| "../single_op/task/rts_kernel_task_builder.cc" | |||||
| "../hybrid/common/tensor_value.cc" | "../hybrid/common/tensor_value.cc" | ||||
| "../hybrid/common/npu_memory_allocator.cc" | "../hybrid/common/npu_memory_allocator.cc" | ||||
| "../hybrid/executor/rt_callback_manager.cc" | "../hybrid/executor/rt_callback_manager.cc" | ||||
| @@ -30,6 +30,7 @@ | |||||
| #include "runtime/rt.h" | #include "runtime/rt.h" | ||||
| #include "task/aicpu_task_builder.h" | #include "task/aicpu_task_builder.h" | ||||
| #include "task/aicpu_kernel_task_builder.h" | #include "task/aicpu_kernel_task_builder.h" | ||||
| #include "task/rts_kernel_task_builder.h" | |||||
| #include "task/tbe_task_builder.h" | #include "task/tbe_task_builder.h" | ||||
| #include "hybrid/executor/hybrid_model_executor.h" | #include "hybrid/executor/hybrid_model_executor.h" | ||||
| #include "hybrid/node_executor/node_executor.h" | #include "hybrid/node_executor/node_executor.h" | ||||
| @@ -266,7 +267,9 @@ Status SingleOpModel::ParseInputsAndOutputs() { | |||||
| for (auto &op_desc : data_ops_) { | for (auto &op_desc : data_ops_) { | ||||
| GE_CHK_STATUS_RET_NOLOG(ParseInputNode(op_desc)); | GE_CHK_STATUS_RET_NOLOG(ParseInputNode(op_desc)); | ||||
| } | } | ||||
| ParseOutputNode(netoutput_op_); | |||||
| if (netoutput_op_ != nullptr) { | |||||
| ParseOutputNode(netoutput_op_); | |||||
| } | |||||
| return SUCCESS; | return SUCCESS; | ||||
| } | } | ||||
| @@ -323,10 +326,7 @@ Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &s | |||||
| OpTask *task = nullptr; | OpTask *task = nullptr; | ||||
| uint64_t singleop_kernel_id = aicpu_kernel_id++; | uint64_t singleop_kernel_id = aicpu_kernel_id++; | ||||
| GELOGI("Build singleOp CCTask, kernel_id = %lu", singleop_kernel_id); | GELOGI("Build singleOp CCTask, kernel_id = %lu", singleop_kernel_id); | ||||
| auto ret = BuildCpuKernelTask(task_def.kernel(), &task, singleop_kernel_id); | |||||
| if (ret != SUCCESS) { | |||||
| return ret; | |||||
| } | |||||
| GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task, singleop_kernel_id)); | |||||
| task->SetModelArgs(model_name_, model_id_); | task->SetModelArgs(model_name_, model_id_); | ||||
| ParseArgTable(task, single_op); | ParseArgTable(task, single_op); | ||||
| single_op.tasks_.emplace_back(task); | single_op.tasks_.emplace_back(task); | ||||
| @@ -345,13 +345,22 @@ Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &s | |||||
| bool depend_compute_flag = false; | bool depend_compute_flag = false; | ||||
| uint64_t singleop_kernel_id = aicpu_kernel_id++; | uint64_t singleop_kernel_id = aicpu_kernel_id++; | ||||
| GELOGI("Build singleOp TfTask, kernel_id = %lu", singleop_kernel_id); | GELOGI("Build singleOp TfTask, kernel_id = %lu", singleop_kernel_id); | ||||
| auto ret = BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, false, depend_compute_flag, singleop_kernel_id); | |||||
| if (ret != SUCCESS) { | |||||
| return ret; | |||||
| } | |||||
| GE_CHK_STATUS_RET_NOLOG( | |||||
| BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, false, depend_compute_flag, singleop_kernel_id)); | |||||
| aicpu_task->SetModelArgs(model_name_, model_id_); | aicpu_task->SetModelArgs(model_name_, model_id_); | ||||
| ParseArgTable(aicpu_task, single_op); | ParseArgTable(aicpu_task, single_op); | ||||
| single_op.tasks_.emplace_back(aicpu_task); | single_op.tasks_.emplace_back(aicpu_task); | ||||
| } else if ((task_type == RT_MODEL_TASK_MEMCPY_ASYNC) || (task_type == RT_MODEL_TASK_MEMCPY_ADDR_ASYNC)) { | |||||
| auto kernel_def = task_def.memcpy_async(); | |||||
| auto node = op_list_[kernel_def.op_index()]; | |||||
| GE_CHECK_NOTNULL(node); | |||||
| auto op_desc = node->GetOpDesc(); | |||||
| GE_CHECK_NOTNULL(op_desc); | |||||
| std::unique_ptr<MemcpyAsyncTask> task; | |||||
| GE_CHK_STATUS_RET_NOLOG(RtsKernelTaskBuilder::BuildMemcpyAsyncTask(op_desc, kernel_def, model_params_, task)); | |||||
| task->SetModelArgs(model_name_, model_id_); | |||||
| ParseArgTable(task.get(), single_op); | |||||
| single_op.tasks_.emplace_back(task.release()); | |||||
| } else { | } else { | ||||
| // skip | // skip | ||||
| GELOGD("Skip task type: %d", static_cast<int>(task_type)); | GELOGD("Skip task type: %d", static_cast<int>(task_type)); | ||||
| @@ -26,6 +26,7 @@ | |||||
| #include "common/helper/model_helper.h" | #include "common/helper/model_helper.h" | ||||
| #include "single_op/single_op.h" | #include "single_op/single_op.h" | ||||
| #include "single_op/stream_resource.h" | #include "single_op/stream_resource.h" | ||||
| #include "single_op/task/op_task.h" | |||||
| namespace ge { | namespace ge { | ||||
| struct SingleOpModelParam { | struct SingleOpModelParam { | ||||
| @@ -34,6 +34,7 @@ | |||||
| namespace ge { | namespace ge { | ||||
| namespace { | namespace { | ||||
| constexpr int kLaunchRetryTimes = 1000; | constexpr int kLaunchRetryTimes = 1000; | ||||
| constexpr size_t kMemcpyArgCount = 2; | |||||
| constexpr int kSleepTime = 10; | constexpr int kSleepTime = 10; | ||||
| constexpr uint64_t kReleaseFlag = 1; | constexpr uint64_t kReleaseFlag = 1; | ||||
| constexpr int kCopyNum = 2; | constexpr int kCopyNum = 2; | ||||
| @@ -963,4 +964,17 @@ void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { | |||||
| arg_base = io_addr_; | arg_base = io_addr_; | ||||
| arg_count = io_addr_num_; | arg_count = io_addr_num_; | ||||
| } | } | ||||
| // Issues an asynchronous copy from addresses_[0] (source) to addresses_[1] (destination) on the given stream. | |||||
| Status MemcpyAsyncTask::LaunchKernel(rtStream_t stream) { | |||||
| // An address-based device-to-device kind is launched as a plain device-to-device copy. | |||||
| if (kind_ == RT_MEMCPY_ADDR_DEVICE_TO_DEVICE) { | |||||
| kind_ = RT_MEMCPY_DEVICE_TO_DEVICE; | |||||
| } | |||||
| void *const source = reinterpret_cast<void *>(addresses_[0]); | |||||
| void *const destination = reinterpret_cast<void *>(addresses_[1]); | |||||
| GE_CHK_RT_RET(rtMemcpyAsync(destination, dst_max_, source, count_, kind_, stream)); | |||||
| return SUCCESS; | |||||
| } | |||||
| // Reports the task's address array and its fixed entry count (kMemcpyArgCount) to the caller. | |||||
| void MemcpyAsyncTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { | |||||
| arg_count = kMemcpyArgCount; | |||||
| arg_base = addresses_; | |||||
| } | |||||
| } // namespace ge | } // namespace ge | ||||
| @@ -44,6 +44,9 @@ class OpTask { | |||||
| // NOTE(review): presumably refreshes the task's argument table from the model parameters - confirm against the definition. | // NOTE(review): presumably refreshes the task's argument table from the model parameters - confirm against the definition. | ||||
| virtual Status UpdateArgTable(const SingleOpModelParam &param); | virtual Status UpdateArgTable(const SingleOpModelParam &param); | ||||
| void SetModelArgs(std::string model_name, uint32_t model_id); | void SetModelArgs(std::string model_name, uint32_t model_id); | ||||
| Status GetProfilingArgs(TaskDescInfo &task_desc_info, uint32_t &model_id); | Status GetProfilingArgs(TaskDescInfo &task_desc_info, uint32_t &model_id); | ||||
| // Stores the operator description that GetOpdesc() returns. | |||||
| void SetOpDesc(const OpDescPtr &op_desc) { op_desc_ = op_desc; } | |||||
| const OpDescPtr &GetOpdesc() const {return op_desc_;} | const OpDescPtr &GetOpdesc() const {return op_desc_;} | ||||
| Status OpenDump(rtStream_t stream); | Status OpenDump(rtStream_t stream); | ||||
| virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0; | virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0; | ||||
| @@ -244,6 +247,22 @@ private: | |||||
| std::string op_type_; | std::string op_type_; | ||||
| uint64_t kernel_id_ = 0; | uint64_t kernel_id_ = 0; | ||||
| }; | }; | ||||
| // Single-op task that performs an asynchronous memory copy through rtMemcpyAsync. | |||||
| class MemcpyAsyncTask : public OpTask { | |||||
| public: | |||||
| // Launches the copy on the given stream. | |||||
| Status LaunchKernel(rtStream_t stream) override; | |||||
| // Exposes addresses_ as the task's I/O address table. | |||||
| void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; | |||||
| private: | |||||
| friend class SingleOpModel; | |||||
| friend class RtsKernelTaskBuilder; | |||||
| // Every field carries a default so a default-constructed task never reads indeterminate values. | |||||
| uintptr_t addresses_[2] = {0U, 0U};  // [0] = source address, [1] = destination address | |||||
| size_t dst_max_ = 0U; | |||||
| size_t count_ = 0U; | |||||
| rtMemcpyKind_t kind_ = RT_MEMCPY_RESERVED; | |||||
| NodePtr node_; | |||||
| }; | |||||
| } // namespace ge | } // namespace ge | ||||
| #endif // GE_SINGLE_OP_TASK_OP_TASK_H_ | #endif // GE_SINGLE_OP_TASK_OP_TASK_H_ | ||||
| @@ -0,0 +1,45 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "single_op/task/rts_kernel_task_builder.h" | |||||
| #include "build_task_utils.h" | |||||
| namespace ge { | |||||
| namespace { | |||||
| // Number of addresses a memcpy task needs: one source plus one destination. | |||||
| constexpr size_t kNumAddresses = 2U; | |||||
| } // namespace | |||||
| // Builds a MemcpyAsyncTask for op_desc from the memcpy task definition. | |||||
| // On success `task` owns the new task and SUCCESS is returned. An error status is returned when the | |||||
| // allocation fails or when the operator does not resolve to exactly kNumAddresses addresses. | |||||
| Status RtsKernelTaskBuilder::BuildMemcpyAsyncTask(const OpDescPtr &op_desc, | |||||
| const domi::MemcpyAsyncDef &kernel_def, | |||||
| const SingleOpModelParam &param, | |||||
| std::unique_ptr<MemcpyAsyncTask> &task) { | |||||
| task.reset(new(std::nothrow)MemcpyAsyncTask()); | |||||
| GE_CHECK_NOTNULL(task); | |||||
| task->SetOpDesc(op_desc); | |||||
| // Copy the transfer description from the task definition. | |||||
| task->dst_max_ = kernel_def.dst_max(); | |||||
| task->count_ = kernel_def.count(); | |||||
| task->kind_ = static_cast<rtMemcpyKind_t>(kernel_def.kind()); | |||||
| // The joined address list must hold exactly two entries; they are stored in order as [0] and [1]. | |||||
| auto addresses = BuildTaskUtils::JoinAddresses(BuildTaskUtils::GetAddresses(op_desc, param, false)); | |||||
| if (addresses.size() != kNumAddresses) { | |||||
| GELOGE(INTERNAL_ERROR, "[Build][MemcpyAsyncTask] Invalid address count: %zu", addresses.size()); | |||||
| return INTERNAL_ERROR; | |||||
| } | |||||
| task->addresses_[0] = reinterpret_cast<uintptr_t>(addresses[0]); | |||||
| task->addresses_[1] = reinterpret_cast<uintptr_t>(addresses[1]); | |||||
| return SUCCESS; | |||||
| } | |||||
| } // namespace ge | |||||
| @@ -0,0 +1,34 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef GE_SINGLE_OP_TASK_RTS_KERNEL_TASK_BUILDER_H_ | |||||
| #define GE_SINGLE_OP_TASK_RTS_KERNEL_TASK_BUILDER_H_ | |||||
| #include <vector> | |||||
| #include "graph/op_desc.h" | |||||
| #include "single_op/single_op.h" | |||||
| #include "single_op/single_op_model.h" | |||||
| namespace ge { | |||||
| // Static factory for single-op tasks built from RTS task definitions. | |||||
| class RtsKernelTaskBuilder { | |||||
| public: | |||||
| // Creates a MemcpyAsyncTask for op_desc from kernel_def and hands ownership to `task`. | |||||
| // Returns SUCCESS, or an error status when the task cannot be built. | |||||
| static Status BuildMemcpyAsyncTask(const OpDescPtr &op_desc, | |||||
| const domi::MemcpyAsyncDef &kernel_def, | |||||
| const SingleOpModelParam &param, | |||||
| std::unique_ptr<MemcpyAsyncTask> &task); | |||||
| }; | |||||
| } // namespace ge | |||||
| #endif // GE_SINGLE_OP_TASK_RTS_KERNEL_TASK_BUILDER_H_ | |||||
| @@ -585,6 +585,7 @@ set(SINGLE_OP_SRC_FILES | |||||
| "${GE_CODE_DIR}/ge/single_op/single_op_manager.cc" | "${GE_CODE_DIR}/ge/single_op/single_op_manager.cc" | ||||
| "${GE_CODE_DIR}/ge/single_op/task/aicpu_task_builder.cc" | "${GE_CODE_DIR}/ge/single_op/task/aicpu_task_builder.cc" | ||||
| "${GE_CODE_DIR}/ge/single_op/task/aicpu_kernel_task_builder.cc" | "${GE_CODE_DIR}/ge/single_op/task/aicpu_kernel_task_builder.cc" | ||||
| "${GE_CODE_DIR}/ge/single_op/task/rts_kernel_task_builder.cc" | |||||
| "${GE_CODE_DIR}/ge/hybrid/common/tensor_value.cc" | "${GE_CODE_DIR}/ge/hybrid/common/tensor_value.cc" | ||||
| "${GE_CODE_DIR}/ge/hybrid/common/npu_memory_allocator.cc" | "${GE_CODE_DIR}/ge/hybrid/common/npu_memory_allocator.cc" | ||||
| "${GE_CODE_DIR}/ge/hybrid/executor/rt_callback_manager.cc" | "${GE_CODE_DIR}/ge/hybrid/executor/rt_callback_manager.cc" | ||||
| @@ -25,6 +25,11 @@ | |||||
| #define private public | #define private public | ||||
| #include "single_op/single_op_model.h" | #include "single_op/single_op_model.h" | ||||
| #include "single_op/task/tbe_task_builder.h" | #include "single_op/task/tbe_task_builder.h" | ||||
| #include "single_op/task/rts_kernel_task_builder.h" | |||||
| #include "single_op/task/op_task.h" | |||||
| #include "framework/common/helper/model_helper.h" | |||||
| #include "single_op/single_op.h" | |||||
| #include "single_op/stream_resource.h" | |||||
| #undef private | #undef private | ||||
| #undef protected | #undef protected | ||||
| #include "graph/passes/graph_builder_utils.h" | #include "graph/passes/graph_builder_utils.h" | ||||
| @@ -240,3 +245,45 @@ TEST_F(UtestSingleOpModel, test_host_mem) { | |||||
| DynamicSingleOp single_op(0, &stream_mu_, nullptr); | DynamicSingleOp single_op(0, &stream_mu_, nullptr); | ||||
| ASSERT_EQ(model.SetHostMemTensor(single_op), SUCCESS); | ASSERT_EQ(model.SetHostMemTensor(single_op), SUCCESS); | ||||
| } | } | ||||
| // Checks that BuildTaskList accepts a MEMCPY_ASYNC task definition and that a MemcpyAsyncTask can be launched. | |||||
| TEST_F(UtestSingleOpModel, BuildTaskList) { | |||||
| ComputeGraphPtr graph = make_shared<ComputeGraph>("single_op"); | |||||
| GeModelPtr ge_model = make_shared<GeModel>(); | |||||
| ge_model->SetGraph(GraphUtils::CreateGraphFromComputeGraph(graph)); | |||||
| shared_ptr<domi::ModelTaskDef> model_task_def = make_shared<domi::ModelTaskDef>(); | |||||
| ge_model->SetModelTaskDef(model_task_def); | |||||
| NodePtr node = nullptr; | |||||
| { | |||||
| // One memcpy node with a single input and output, both at offset 0. | |||||
| auto op_desc = std::make_shared<ge::OpDesc>("memcpy", MEMCPYASYNC); | |||||
| GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT); | |||||
| op_desc->AddInputDesc(tensor); | |||||
| op_desc->AddOutputDesc(tensor); | |||||
| op_desc->SetInputOffset({0}); | |||||
| op_desc->SetOutputOffset({0}); | |||||
| node = graph->AddNode(op_desc); | |||||
| // Matching task definition that refers to the node through op_index 0. | |||||
| domi::TaskDef *task_def = model_task_def->add_task(); | |||||
| task_def->set_stream_id(0); | |||||
| task_def->set_type(RT_MODEL_TASK_MEMCPY_ASYNC); | |||||
| domi::MemcpyAsyncDef *memcpy_async = task_def->mutable_memcpy_async(); | |||||
| memcpy_async->set_src(0); | |||||
| memcpy_async->set_dst(0); | |||||
| memcpy_async->set_dst_max(512); | |||||
| memcpy_async->set_count(1); | |||||
| memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE); | |||||
| memcpy_async->set_op_index(0); | |||||
| } | |||||
| string model_data_str = "123456789"; | |||||
| SingleOpModel model("model", model_data_str.c_str(), model_data_str.size()); | |||||
| // Owned by a smart pointer so the resource is released even when an assertion returns early. | |||||
| std::unique_ptr<StreamResource> res(new (std::nothrow) StreamResource(1)); | |||||
| ASSERT_TRUE(res != nullptr); | |||||
| std::mutex stream_mu; | |||||
| rtStream_t stream = nullptr; | |||||
| ASSERT_EQ(rtStreamCreate(&stream, 0), RT_ERROR_NONE); | |||||
| SingleOp single_op(res.get(), &stream_mu, stream); | |||||
| model.model_helper_.model_ = ge_model; | |||||
| model.op_list_.emplace(0, node); | |||||
| ASSERT_EQ(model.BuildTaskList(res.get(), single_op), SUCCESS); | |||||
| // Give the standalone task a fully defined state before launching it. | |||||
| MemcpyAsyncTask mem_task; | |||||
| mem_task.addresses_[0] = 0U; | |||||
| mem_task.addresses_[1] = 0U; | |||||
| mem_task.dst_max_ = 512U; | |||||
| mem_task.count_ = 1U; | |||||
| mem_task.kind_ = RT_MEMCPY_DEVICE_TO_DEVICE; | |||||
| ASSERT_EQ(mem_task.LaunchKernel(0), SUCCESS); | |||||
| } | |||||