Browse Source

!1905 Fix bug of multi_task.

Merge pull request !1905 from 赵之轩/my_dev3
tags/v1.5.1
i-robot Gitee 3 years ago
parent
commit
e16175c5fc
8 changed files with 121 additions and 89 deletions
  1. +68
    -85
      ge/single_op/single_op_model.cc
  2. +5
    -2
      ge/single_op/single_op_model.h
  3. +24
    -1
      ge/single_op/task/tbe_task_builder.cc
  4. +1
    -0
      ge/single_op/task/tbe_task_builder.h
  5. +1
    -0
      tests/ut/ge/hybrid/ge_hybrid_unittest.cc
  6. +1
    -0
      tests/ut/ge/hybrid/node_executor/node_executor_unittest.cc
  7. +20
    -1
      tests/ut/ge/single_op/single_op_model_unittest.cc
  8. +1
    -0
      tests/ut/ge/single_op/single_op_task_unittest.cc

+ 68
- 85
ge/single_op/single_op_model.cc View File

@@ -95,35 +95,6 @@ Status CheckInferDepend(GeModelPtr &ge_model, bool &is_infer_depend, bool &is_ho
}
return SUCCESS;
}

// Decides whether a dynamic single-op model has to be run through the hybrid model path.
//
// The hybrid path is requested when the model contains a TE kernel task and either
//   - the op is infer-depend without host memory input (a device-to-host copy is needed), or
//   - more than one TE kernel task is present.
//
// ge_model: model whose task definitions are inspected.
// flag:     set to true when the hybrid path is needed. It is left untouched otherwise,
//           so the caller must initialise it before the call.
// Returns SUCCESS; a failing CheckInferDepend is handled by GE_CHK_STATUS_RET.
Status NeedHybridModel(GeModelPtr &ge_model, bool &flag) {
  bool is_infer_depend = false;
  bool is_host_mem = false;
  GE_CHK_STATUS_RET(CheckInferDepend(ge_model, is_infer_depend, is_host_mem), "[Check][InferDepend] failed.");
  const bool need_d2h_cpy = is_infer_depend && !is_host_mem;

  // Hold the task-def pointer for the whole loop and walk the tasks by reference.
  // The previous `auto tasks = ...->task()` deep-copied every TaskDef on each call.
  const auto model_task_def = ge_model->GetModelTaskDefPtr();
  const auto &tasks = model_task_def->task();

  int32_t kernel_task_num = 0;
  for (const auto &task : tasks) {
    const auto task_type = static_cast<rtModelTaskType_t>(task.type());
    if (task_type != RT_MODEL_TASK_KERNEL && task_type != RT_MODEL_TASK_ALL_KERNEL) {
      continue;
    }
    // The kernel context lives in a different sub-message depending on the task type.
    const auto &context = task_type == RT_MODEL_TASK_KERNEL ? task.kernel().context() :
                                                              task.kernel_with_handle().context();
    if (static_cast<ccKernelType>(context.kernel_type()) != ccKernelType::TE) {
      continue;
    }
    if (need_d2h_cpy) {
      flag = true;
      return SUCCESS;
    }
    ++kernel_task_num;
    if (kernel_task_num > 1) {
      flag = true;
      return SUCCESS;
    }
  }
  return SUCCESS;
}
} // namespace

SingleOpModel::SingleOpModel(const std::string &model_name, const void *model_data, uint32_t model_size)
@@ -558,14 +529,15 @@ Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) {
return BuildTaskList(&resource, single_op);
}

Status SingleOpModel::BuildModelTaskKernel(StreamResource *stream_resource, const TaskDef &task_def,
DynamicSingleOp &single_op) {
auto task_type = static_cast<rtModelTaskType_t>(task_def.type());
const auto &context = task_type == RT_MODEL_TASK_KERNEL ? task_def.kernel().context() :
task_def.kernel_with_handle().context();
Status SingleOpModel::BuildTaskListForDynamicOp(StreamResource *stream_resource, DynamicSingleOp &single_op) {
auto ge_model = model_helper_.GetGeModel();
GE_CHECK_NOTNULL(ge_model);

auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
if (kernel_type == ccKernelType::TE) {
auto compute_graph = GraphUtils::GetComputeGraph(ge_model->GetGraph());
GE_CHECK_NOTNULL(compute_graph);
single_op.compute_graph_ = compute_graph;
if (tbe_tasks_.size() > 0) {
const auto &task_def = tbe_tasks_[0];
GELOGD("Building TBE task.");
TbeOpTask *tbe_task = nullptr;
GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def, &tbe_task));
@@ -575,71 +547,81 @@ Status SingleOpModel::BuildModelTaskKernel(StreamResource *stream_resource, cons
tbe_task->stream_resource_ = stream_resource;
}
single_op.op_task_.reset(tbe_task);
} else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
GELOGD("Building AICPU_CC task");
OpTask *task = nullptr;
uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++;
GELOGI("Build dynamic singleOp CCTask, kernel_id = %lu", dynamic_singleop_kernel_id);
GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task, dynamic_singleop_kernel_id));
task->SetModelArgs(model_name_, model_id_);
single_op.op_task_.reset(task);
} else {
GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID,
"[Check][Param:TaskDef]Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u",
context.kernel_type());
REPORT_INNER_ERROR("E19999",
"BuildModelTaskKernel fail for got:%u not supported, Only TBE, AI_CPU, CUST_AI_CPU kernel are supported.",
context.kernel_type());
return ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID;
}
return SUCCESS;
}

Status SingleOpModel::BuildTaskListForDynamicOp(StreamResource *stream_resource, DynamicSingleOp &single_op) {
auto ge_model = model_helper_.GetGeModel();
GE_CHECK_NOTNULL(ge_model);

auto compute_graph = GraphUtils::GetComputeGraph(ge_model->GetGraph());
GE_CHECK_NOTNULL(compute_graph);
single_op.compute_graph_ = compute_graph;
auto tasks = ge_model->GetModelTaskDefPtr()->task();
for (int i = 0; i < tasks.size(); ++i) {
const TaskDef &task_def = tasks[i];
GELOGI("[%s] Task[%d], type = [%u], DebugString = [%s]", model_name_.c_str(), i, task_def.type(),
task_def.DebugString().c_str());
} else if (aicpu_tasks_.size() > 0) {
const auto &task_def = aicpu_tasks_[0];
auto task_type = static_cast<rtModelTaskType_t>(task_def.type());
if (task_type == RT_MODEL_TASK_KERNEL || task_type == RT_MODEL_TASK_ALL_KERNEL) {
if (single_op.op_task_ != nullptr) {
GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, "[Check][TaskType]Do not support dynamic op with multiple tasks.");
REPORT_INNER_ERROR("E19999",
"BuildTaskListForDynamicOp fail for Do not support dynamic op with multiple tasks.");
return ACL_ERROR_GE_OP_TASK_TYPE_INVALID;
}
GE_CHK_STATUS_RET_NOLOG(BuildModelTaskKernel(stream_resource, task_def, single_op));
if (task_type == RT_MODEL_TASK_KERNEL) {
GELOGD("Building AICPU_CC task");
OpTask *task = nullptr;
uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++;
GELOGI("Build dynamic singleOp CCTask, kernel_id = %lu", dynamic_singleop_kernel_id);
GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task, dynamic_singleop_kernel_id));
task->SetModelArgs(model_name_, model_id_);
single_op.op_task_.reset(task);
} else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
if (single_op.op_task_ != nullptr) {
GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, "[Check][TaskType]Do not support dynamic op with multiple tasks.");
REPORT_INNER_ERROR("E19999",
"BuildTaskListForDynamicOp fail for Do not support dynamic op with multiple tasks.");
return ACL_ERROR_GE_OP_TASK_TYPE_INVALID;
}
GELOGD("Building AICPU_TF task");
AiCpuTask *aicpu_task = nullptr;
uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++;
GELOGI("Build dynamic singleOp TfTask, kernel_id = %lu", dynamic_singleop_kernel_id);
GE_CHK_STATUS_RET_NOLOG(BuildKernelExTask(task_def.kernel_ex(), &aicpu_task, dynamic_singleop_kernel_id));
if (aicpu_task->GetUnknownType() == DEPEND_COMPUTE) {
if (i >= tasks.size() - 1) {
if (aicpu_tasks_.size() < 2) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Task]The copy task of the fourth operator was not found.");
REPORT_INNER_ERROR("E19999", "The copy task of the fourth operator was not found.");
return ACL_ERROR_GE_PARAM_INVALID;
}
++i;
const TaskDef &copy_task_def = tasks[i];
const TaskDef &copy_task_def = aicpu_tasks_[1];
GE_CHK_STATUS_RET_NOLOG(aicpu_task->SetMemCopyTask(copy_task_def.kernel_ex()));
}
aicpu_task->SetModelArgs(model_name_, model_id_);
single_op.op_task_.reset(aicpu_task);
}
}
return SUCCESS;
}

// Reports whether this single-op model must be executed through the hybrid model path.
//
// Relies on tbe_tasks_ / aicpu_tasks_ as they are at the time of the call.
// need_hybrid_model is always overwritten and becomes true when any of these holds:
//   - the op is infer-depend and has no host memory input (device-to-host copy needed),
//   - more than one TBE task is present,
//   - at least one TBE task and at least one AICPU task are both present.
// Returns SUCCESS; a failing CheckInferDepend is handled by GE_CHK_STATUS_RET.
Status SingleOpModel::NeedHybridModel(GeModelPtr &ge_model, bool &need_hybrid_model) {
  bool infer_depend = false;
  bool host_mem = false;
  GE_CHK_STATUS_RET(CheckInferDepend(ge_model, infer_depend, host_mem), "[Check][InferDepend] failed.");

  const bool d2h_copy_required = infer_depend && !host_mem;
  const bool several_aicore_tasks = tbe_tasks_.size() > 1;
  const bool aicore_mixed_with_aicpu = !tbe_tasks_.empty() && !aicpu_tasks_.empty();

  need_hybrid_model = d2h_copy_required || several_aicore_tasks || aicore_mixed_with_aicpu;
  return SUCCESS;
}

Status SingleOpModel::ParseTasks() {
auto ge_model = model_helper_.GetGeModel();
GE_CHECK_NOTNULL(ge_model);

auto tasks = ge_model->GetModelTaskDefPtr()->task();
for (int i = 0; i < tasks.size(); ++i) {
TaskDef &task_def = tasks[i];
GELOGI("[%s] Task[%d], type = [%u], DebugString = [%s]", model_name_.c_str(), i, task_def.type(),
task_def.DebugString().c_str());
auto task_type = static_cast<rtModelTaskType_t>(task_def.type());
if (task_type == RT_MODEL_TASK_KERNEL) {
const auto &kernel_def = task_def.kernel();
const auto &context = kernel_def.context();
auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
if (kernel_type == ccKernelType::TE) {
tbe_tasks_.emplace_back(task_def);
} else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
aicpu_tasks_.emplace_back(task_def);
} else {
GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID,
"[Check][Param:TaskDef]Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u",
context.kernel_type());
REPORT_INNER_ERROR("E19999",
"BuildModelTaskKernel fail for got:%u not supported, Only TBE, AI_CPU, CUST_AI_CPU kernel are supported.",
context.kernel_type());
return ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID;
}
} else if (task_type == RT_MODEL_TASK_ALL_KERNEL) {
tbe_tasks_.emplace_back(task_def);
} else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
aicpu_tasks_.emplace_back(task_def);
} else {
// skip
GELOGD("Skip task type: %d", static_cast<int>(task_type));
@@ -654,6 +636,7 @@ Status SingleOpModel::BuildDynamicOp(StreamResource &resource, DynamicSingleOp &
GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource));
model_params_.memory_size = UINT64_MAX;
model_params_.graph_is_dynamic = true;
GE_CHK_STATUS_RET(ParseTasks(), "[Parse][Tasks] failed.");

auto ge_model = model_helper_.GetGeModel();
GE_CHECK_NOTNULL(ge_model);


+ 5
- 2
ge/single_op/single_op_model.h View File

@@ -71,13 +71,16 @@ class SingleOpModel {
Status BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask **task);
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, uint64_t kernel_id);
Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task, uint64_t kernel_id);
Status BuildModelTaskKernel(StreamResource *stream_resource, const domi::TaskDef &task_def,
DynamicSingleOp &single_op);

static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam &param);
void ParseArgTable(OpTask *task, SingleOp &op);
Status InitHybridModelExecutor(const StreamResource &resource, const GeModelPtr &ge_model, SingleOp &single_op);
Status SetHostMemTensor(DynamicSingleOp &single_op);
Status NeedHybridModel(GeModelPtr &ge_model, bool &flag);
Status ParseTasks();

std::vector<domi::TaskDef> tbe_tasks_;
std::vector<domi::TaskDef> aicpu_tasks_;

std::string model_name_;
uint32_t model_id_ = 0;


+ 24
- 1
ge/single_op/task/tbe_task_builder.cc View File

@@ -104,7 +104,7 @@ Status TbeTaskBuilder::DoRegisterBinary(const OpKernelBin &kernel_bin, void **bi
binary.version = 0;
binary.data = kernel_bin.GetBinData();
binary.length = kernel_bin.GetBinDataSize();
binary.magic = param.core_type == 0 ? RT_DEV_BINARY_MAGIC_ELF : RT_DEV_BINARY_MAGIC_ELF_AIVEC;
GE_CHK_STATUS_RET_NOLOG(GetMagic(binary.magic));
Status ret = 0;
if (task_def_.type() == RT_MODEL_TASK_ALL_KERNEL) {
ret = rtRegisterAllKernel(&binary, bin_handle);
@@ -416,4 +416,27 @@ Status TbeTaskBuilder::InitTilingInfo(TbeOpTask &task) {
task.EnableDynamicSupport(node_, tiling_buffer, static_cast<uint32_t>(max_size));
return SUCCESS;
}

// Translates the op's TVM_ATTR_NAME_MAGIC string attribute into the matching binary magic value.
//
// magic: receives RT_DEV_BINARY_MAGIC_ELF, RT_DEV_BINARY_MAGIC_ELF_AIVEC or
//        RT_DEV_BINARY_MAGIC_ELF_AICUBE on success; left untouched on failure.
// Returns SUCCESS when the attribute holds one of the three recognised names. Returns
// PARAM_INVALID otherwise, which includes a missing attribute (the value is then empty).
Status TbeTaskBuilder::GetMagic(uint32_t &magic) const {
  std::string magic_name;
  // Debug trace only: a failed lookup leaves magic_name empty and is rejected below.
  GE_IF_BOOL_EXEC(AttrUtils::GetStr(op_desc_, TVM_ATTR_NAME_MAGIC, magic_name),
                  GELOGD("Get attr:%s success, value:%s", TVM_ATTR_NAME_MAGIC.c_str(), magic_name.c_str()));
  if (magic_name == "RT_DEV_BINARY_MAGIC_ELF") {
    magic = RT_DEV_BINARY_MAGIC_ELF;
  } else if (magic_name == "RT_DEV_BINARY_MAGIC_ELF_AIVEC") {
    magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC;
  } else if (magic_name == "RT_DEV_BINARY_MAGIC_ELF_AICUBE") {
    magic = RT_DEV_BINARY_MAGIC_ELF_AICUBE;
  } else {
    REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value:%s check invalid",
                       TVM_ATTR_NAME_MAGIC.c_str(), op_desc_->GetName().c_str(),
                       op_desc_->GetType().c_str(), magic_name.c_str());
    GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in op:%s(%s), value:%s check invalid",
           TVM_ATTR_NAME_MAGIC.c_str(), op_desc_->GetName().c_str(),
           op_desc_->GetType().c_str(), magic_name.c_str());
    return PARAM_INVALID;
  }
  return SUCCESS;
}

} // namespace ge

+ 1
- 0
ge/single_op/task/tbe_task_builder.h View File

@@ -105,6 +105,7 @@ class TbeTaskBuilder {
const SingleOpModelParam &param);
Status DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle, const SingleOpModelParam &param) const;
Status DoRegisterMeta(void *bin_handle);
Status GetMagic(uint32_t &magic) const;

static Status DoRegisterFunction(void *bin_handle, const char *stub_name, const char *kernel_name);



+ 1
- 0
tests/ut/ge/hybrid/ge_hybrid_unittest.cc View File

@@ -153,6 +153,7 @@ TEST_F(UtestGeHybrid, task_update_tiling_info) {
ge::AttrUtils::SetStr(op_desc, "compile_info_json", "json");
ge::AttrUtils::SetBool(op_desc, "support_dynamicshape", true);
ge::AttrUtils::SetInt(op_desc, "op_para_size", 1);
ge::AttrUtils::SetStr(op_desc, TVM_ATTR_NAME_MAGIC, "RT_DEV_BINARY_MAGIC_ELF");
auto node = graph->AddNode(op_desc);

std::unique_ptr<NodeItem> node_item;


+ 1
- 0
tests/ut/ge/hybrid/node_executor/node_executor_unittest.cc View File

@@ -87,6 +87,7 @@ TEST_F(NodeExecutorTest, TestGetOrCreateExecutor) {
TEST_F(NodeExecutorTest, TestInitAndFinalize) {
auto &manager = NodeExecutorManager::GetInstance();
manager.FinalizeExecutors();
manager.FinalizeExecutors();
manager.EnsureInitialized();
manager.EnsureInitialized();
const NodeExecutor *executor = nullptr;


+ 20
- 1
tests/ut/ge/single_op/single_op_model_unittest.cc View File

@@ -311,7 +311,7 @@ TEST_F(UtestSingleOpModel, BuildTaskList) {
ASSERT_EQ(mem_task.LaunchKernel(0), SUCCESS);
}

TEST_F(UtestSingleOpModel, build_aicpu_task) {
TEST_F(UtestSingleOpModel, build_dynamic_task) {
ComputeGraphPtr graph = make_shared<ComputeGraph>("single_op");
GeModelPtr ge_model = make_shared<GeModel>();
ge_model->SetGraph(GraphUtils::CreateGraphFromComputeGraph(graph));
@@ -321,6 +321,15 @@ TEST_F(UtestSingleOpModel, build_aicpu_task) {
domi::TaskDef *task_def = model_task_def->add_task();
task_def->set_type(RT_MODEL_TASK_KERNEL_EX);

domi::TaskDef *task_def2 = model_task_def->add_task();
task_def2->set_type(RT_MODEL_TASK_KERNEL);
domi::KernelDef *kernel_def = task_def2->mutable_kernel();
domi::KernelContext *context = kernel_def->mutable_context();
context->set_kernel_type(6); // ccKernelType::AI_CPU

domi::TaskDef *task_def3 = model_task_def->add_task();
task_def3->set_type(RT_MODEL_TASK_ALL_KERNEL);

string model_data_str = "123456789";
SingleOpModel model("model", model_data_str.c_str(), model_data_str.size());
std::mutex stream_mu;
@@ -329,8 +338,18 @@ TEST_F(UtestSingleOpModel, build_aicpu_task) {
DynamicSingleOp single_op(0, &stream_mu, stream);
model.model_helper_.model_ = ge_model;
auto op_desc = std::make_shared<ge::OpDesc>("add", "Add");
AttrUtils::SetStr(op_desc, TVM_ATTR_NAME_MAGIC, "RT_DEV_BINARY_MAGIC_ELF");
std::vector<char> kernelBin;
TBEKernelPtr tbe_kernel = std::make_shared<ge::OpKernelBin>("name/Add", std::move(kernelBin));
op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, tbe_kernel);
NodePtr node = graph->AddNode(op_desc);
model.op_list_[0] = node;
StreamResource *res = new (std::nothrow) StreamResource(1);

ASSERT_EQ(model.ParseTasks(), SUCCESS);
ASSERT_EQ(model.BuildTaskListForDynamicOp(res, single_op), SUCCESS);
model.tbe_tasks_.clear();
ASSERT_EQ(model.BuildTaskListForDynamicOp(res, single_op), SUCCESS);
model.aicpu_tasks_[0] = *task_def2;
model.BuildTaskListForDynamicOp(res, single_op);
}

+ 1
- 0
tests/ut/ge/single_op/single_op_task_unittest.cc View File

@@ -54,6 +54,7 @@ TEST_F(UtestSingleOpTask, test_build_kernel_task) {

auto graph = make_shared<ComputeGraph>("graph");
auto op_desc = make_shared<OpDesc>("Add", "Add");
AttrUtils::SetStr(op_desc, TVM_ATTR_NAME_MAGIC, "RT_DEV_BINARY_MAGIC_ELF");
std::vector<char> kernelBin;
TBEKernelPtr tbe_kernel = std::make_shared<ge::OpKernelBin>("name/Add", std::move(kernelBin));
op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, tbe_kernel);


Loading…
Cancel
Save