Browse Source

add report error message

tags/v1.3.0
liudingyan 3 years ago
parent
commit
e1ee76614c
14 changed files with 290 additions and 144 deletions
  1. +4
    -3
      ge/hybrid/hybrid_davinci_model.cc
  2. +23
    -15
      ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
  3. +30
    -33
      ge/hybrid/node_executor/controlop/control_op_executor.cc
  4. +33
    -13
      ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc
  5. +67
    -25
      ge/hybrid/node_executor/hccl/hccl_node_executor.cc
  6. +13
    -5
      ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc
  7. +4
    -2
      ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc
  8. +2
    -1
      ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc
  9. +20
    -10
      ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc
  10. +3
    -2
      ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc
  11. +4
    -1
      ge/hybrid/node_executor/host_cpu/kernel_factory.cc
  12. +17
    -9
      ge/hybrid/node_executor/node_executor.cc
  13. +3
    -4
      ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc
  14. +67
    -21
      ge/hybrid/node_executor/task_context.cc

+ 4
- 3
ge/hybrid/hybrid_davinci_model.cc View File

@@ -32,9 +32,10 @@ class HybridDavinciModel::Impl {
}

Status Init() {
GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().EnsureInitialized(), "Failed to initialize executors");
GE_CHK_STATUS_RET(model_.Init(), "Failed to init model.")
GE_CHK_STATUS_RET(executor_.Init(), "Failed to init model executor.")
GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().EnsureInitialized(),
"[Initialize][NodeExecutorManager] failed");
GE_CHK_STATUS_RET(model_.Init(), "[Init][HybridModel] failed.")
GE_CHK_STATUS_RET(executor_.Init(), "[Init][HybridModelAsyncExecutor] failed.")
return SUCCESS;
}



+ 23
- 15
ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc View File

@@ -30,7 +30,7 @@ namespace ge {
namespace hybrid {
REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::COMPILED_SUBGRAPH, KnownNodeExecutor);

Status KnownNodeTask:: ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
Status KnownNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTaskExecuteAsync] Start");
GELOGD("[%s] KnownNodeTask::ExecuteAsync in.", context.GetNodeName());
if (davinci_model_->GetTaskList().empty()) {
@@ -56,7 +56,9 @@ Status KnownNodeTask:: ExecuteAsync(TaskContext &context, std::function<void()
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodertModelExecute] Start");
rt_ret = rtModelExecute(davinci_model_->GetRtModelHandle(), context.GetStream(), 0);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE,
GELOGE(rt_ret, "rtModelExecute error, ret: 0x%X", rt_ret); return FAILED;);
REPORT_CALL_ERROR("E19999", "rtModelExecute error, ret:0x%X", rt_ret);
GELOGE(rt_ret, "[Invoke][rtModelExecute] error, ret:0x%X", rt_ret);
return FAILED;);
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodertModelExecute] End");

GE_CHK_STATUS_RET_NOLOG(context.RegisterCallback(done_callback));
@@ -87,7 +89,7 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) {
}

GE_CHK_STATUS_RET(davinci_model_->UpdateKnownNodeArgs(inputs, outputs),
"known node task update known node args failed.");
"[Update][KnownNodeArgs] failed for %s.", context.GetNodeName());
GELOGD("[%s] KnownNodeExecutor::UpdateArgs success, task_size = %zu", context.GetNodeName(),
davinci_model_->GetTaskList().size());
return SUCCESS;
@@ -95,15 +97,15 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) {

Status KnownNodeTask::Init(TaskContext &context) {
// allocate output mem
GE_CHK_STATUS_RET(context.AllocateOutputs(), "known node task allocate output failed.");
GE_CHK_STATUS_RET(context.AllocateOutputs(), "[Allocate][Outputs] failed for %s.", context.GetNodeName());
// allocate mem base
void *buffer = nullptr;
if (davinci_model_->TotalMemSize() != 0) {
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
"[KnownNodeTask_AllocateWorkspace] Start");
GE_CHK_STATUS_RET(
context.AllocateWorkspace(davinci_model_->TotalMemSize(), &buffer, davinci_model_->GetRuntimeParam().mem_base),
"known node task allocate workspace failed.");
GE_CHK_STATUS_RET(context.AllocateWorkspace(davinci_model_->TotalMemSize(), &buffer,
davinci_model_->GetRuntimeParam().mem_base),
"[Allocate][Workspace] failed for %s.", context.GetNodeName());
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
"[KnownNodeTask_AllocateWorkspace] End, size %zu", davinci_model_->TotalMemSize());
// update mem base
@@ -112,8 +114,10 @@ Status KnownNodeTask::Init(TaskContext &context) {
davinci_model_->GetRuntimeParam().mem_base, davinci_model_->GetRuntimeParam().mem_size);
}
GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(),
davinci_model_->Id(), davinci_model_->SubModelId()),
"KnownNodeTask::Init destroy aicpu kernel failed.");
davinci_model_->Id(),
davinci_model_->SubModelId()),
"[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u",
davinci_model_->GetSessionId(), davinci_model_->Id(), davinci_model_->SubModelId());
GELOGI("[%s] KnownNodeExecutor::Init success.", context.GetNodeName());
return SUCCESS;
}
@@ -121,7 +125,8 @@ Status KnownNodeTask::Init(TaskContext &context) {
Status KnownNodeTask::InitDavinciModel(const HybridModel &model, TensorBuffer *weight_buffer) {
GELOGD("[Init][DavinciModel] start");
davinci_model_->InitRuntimeParams();
GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), "init variable mem failed");
GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(),
"[Init][VariableMem] failed");
int32_t device_id = 0;
GE_CHK_RT_RET(rtGetDevice(&device_id));
davinci_model_->SetDeviceId(static_cast<uint32_t>(device_id));
@@ -153,11 +158,13 @@ Status KnownNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) cons
GELOGD("[%s] KnownNodeExecutor::PrepareTask in.", context.GetNodeName());
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] Start");
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorTaskInit] Start");
GE_CHK_STATUS_RET(task.Init(context), "known node init davinci model failed.");
GE_CHK_STATUS_RET(task.Init(context), "[Invoke][Init] %s known node init davinci model failed.",
context.GetNodeName());
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorTaskInit] End");

RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorUpdateArgs] Start");
GE_CHK_STATUS_RET(task.UpdateArgs(context), "known node task update args failed.");
GE_CHK_STATUS_RET(task.UpdateArgs(context), "[Invoke][UpdateArgs] %s known node task update args failed.",
context.GetNodeName());
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorUpdateArgs] End");
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] End");
GELOGD("[%s] KnownNodeExecutor::PrepareTask success.", context.GetNodeName());
@@ -188,7 +195,9 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node
davinci_model->SetSubModelId(node->GetOpDesc()->GetId());
GELOGD("KnownNodeExecutor::LoadTask node id %ld.", node->GetOpDesc()->GetId());

GE_CHK_STATUS_RET(davinci_model->Assign(ge_model), "KnownNodeExecutor::LoadTask davincimodel assign failed.");
GE_CHK_STATUS_RET(davinci_model->Assign(ge_model),
"[Invoke][Assign]KnownNodeExecutor::LoadTask davincimodel assign failed for node:%s.",
node->GetName().c_str());

auto known_node_task = MakeShared<KnownNodeTask>(davinci_model);
GE_CHECK_NOTNULL(known_node_task);
@@ -201,8 +210,7 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node
Status KnownNodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context,
const std::function<void()> &callback) const {
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorExecuteTask] Start");
GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
"Failed to execute task. node = %s",
GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), "[Invoke][ExecuteAsync]Failed to execute task. node = %s",
context.GetNodeItem().NodeName().c_str());
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorExecuteTask] End");
return SUCCESS;


+ 30
- 33
ge/hybrid/node_executor/controlop/control_op_executor.cc View File

@@ -43,8 +43,7 @@ Status ControlOpNodeTask::ExecuteSubgraph(const GraphItem *subgraph,
auto executor = MakeShared<SubgraphExecutor>(subgraph, execution_context);
GE_CHECK_NOTNULL(executor);
GE_CHK_STATUS_RET(executor->ExecuteAsync(task_context),
"[%s] Failed to execute partitioned call.",
subgraph->GetName().c_str());
"[Invoke][ExecuteAsync][%s] Failed to execute partitioned call.", subgraph->GetName().c_str());

auto callback = [executor, done_callback]() mutable {
if (done_callback != nullptr) {
@@ -127,7 +126,7 @@ Status IfOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::functi
auto cond_tensor = task_context.GetInput(kIfCondIndex);
GE_CHECK_NOTNULL(cond_tensor);
GE_CHK_STATUS_RET(ToBool(*cond_tensor, data_type, cond_val),
"[%s] Failed to get cond value.",
"[Invoke][ToBool][%s] Failed to get cond value.",
task_context.GetNodeName());
} else {
// true if num elements is non-zero
@@ -141,9 +140,7 @@ Status IfOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::functi
auto subgraph = cond_val ? then_ : else_;
GELOGD("[%s] Taking subgraph [%s] by cond = [%d]", task_context.GetNodeName(), subgraph->GetName().c_str(), cond_val);
GE_CHK_STATUS_RET(ExecuteSubgraph(subgraph, task_context, done_callback),
"[%s] Failed to execute subgraph. cond = %d",
task_context.GetNodeName(),
cond_val);
"[Execute][Subgraph] failed for [%s]. cond = %d", task_context.GetNodeName(), cond_val);

GELOGD("[%s] Done executing with cond = %d successfully.", task_context.GetNodeName(), cond_val);
return SUCCESS;
@@ -201,8 +198,7 @@ Status CaseOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::func
}

GE_CHK_STATUS_RET(ExecuteSubgraph(subgraph, task_context, done_callback),
"[%s] Failed to execute else-subgraph.",
task_context.GetNodeName());
"[Execute][Subgraph] failed for [%s].", task_context.GetNodeName());

GELOGD("[%s] Done executing subgraph[%d] successfully.", task_context.GetNodeName(), branch_index);
return SUCCESS;
@@ -228,18 +224,18 @@ Status WhileOpNodeTask::Init(const NodePtr &node, const HybridModel &model) {

Status WhileOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::function<void()> &done_callback) const {
if (task_context.NumInputs() != task_context.NumOutputs()) {
REPORT_INNER_ERROR("E19999",
"[%s] Invalid while args. num_inputs = %d not equal num_outputs = %d",
task_context.GetNodeName(), task_context.NumInputs(), task_context.NumOutputs());
GELOGE(INTERNAL_ERROR,
"[%s] Invalid while args. num_inputs = %d, num_outputs = %d",
task_context.GetNodeName(),
task_context.NumInputs(),
task_context.NumOutputs());
"[Check][Param:task_context][%s] Invalid while args. num_inputs = %d, num_outputs = %d",
task_context.GetNodeName(), task_context.NumInputs(), task_context.NumOutputs());
return INTERNAL_ERROR;
}

bool is_continue = false;
GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue),
"[%s] Failed to execute cond-subgraph",
task_context.GetNodeName());
"[Execute][Cond] failed for [%s]", task_context.GetNodeName());
if (!is_continue) {
for (int i = 0; i < task_context.NumInputs(); ++i) {
auto input_tensor = task_context.GetInput(i);
@@ -269,9 +265,8 @@ Status WhileOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::fun
++iteration;
GELOGD("[%s] Start to execute, iteration = %d", task_context.GetNodeName(), iteration);
GE_CHK_STATUS_RET(ExecuteOneLoop(task_context, is_continue),
"[%s] Failed to execute iteration %d.",
task_context.GetNodeName(),
iteration);
"[Invoke][ExecuteOneLoop][%s] Failed to execute iteration %d.",
task_context.GetNodeName(), iteration);
}
GELOGD("[%s] Quit from loop. current iteration = %d", task_context.GetNodeName(), iteration);
if (done_callback) {
@@ -299,24 +294,27 @@ Status WhileOpNodeTask::ExecuteCond(TaskContext &task_context, bool &is_continue
auto executor = MakeShared<SubgraphExecutor>(cond_, execution_context, task_context.IsForceInferShape());
GE_CHECK_NOTNULL(executor);
GELOGD("[%s] Start to execute cond-subgraph.", task_context.GetNodeName());
GE_CHK_STATUS_RET(executor->ExecuteAsync(inputs, input_desc), "Failed to execute partitioned call.");
GE_CHK_STATUS_RET(executor->ExecuteAsync(inputs, input_desc),
"[Invoke][ExecuteAsync] %s Failed to execute partitioned call.", task_context.GetNodeName());
GELOGD("[%s] Done executing cond-subgraph successfully.", cond_->GetName().c_str());
GE_CHK_STATUS_RET_NOLOG(task_context.RegisterCallback([executor]() mutable {
executor.reset();
}));

// get cond output
GE_CHK_STATUS_RET(executor->Synchronize(), "[%s] Failed to sync cond-subgraph result.", cond_->GetName().c_str());
GE_CHK_STATUS_RET(executor->Synchronize(),
"[Invoke][Synchronize][%s] Failed to sync cond-subgraph result.", cond_->GetName().c_str());
std::vector<TensorValue> cond_outputs;
std::vector<ConstGeTensorDescPtr> cond_output_desc_list;
GE_CHK_STATUS_RET(executor->GetOutputs(cond_outputs, cond_output_desc_list),
"[%s] Failed to get cond-output.",
cond_->GetName().c_str());
"[Invoke][GetOutputs][%s] Failed to get cond-output.", cond_->GetName().c_str());
if (cond_outputs.size() != kCondOutputSize || cond_output_desc_list.size() != kCondOutputSize) {
REPORT_INNER_ERROR("E19999", "[%s] Number of cond outputs(%zu) or size of cond output desc(%zu)"
"not equal %zu, check invalid", task_context.GetNodeName(), cond_outputs.size(),
cond_output_desc_list.size(), kCondOutputSize);
GELOGE(INTERNAL_ERROR,
"[%s] Number of cond outputs is invalid. number = %zu",
task_context.GetNodeName(),
cond_outputs.size());
"[Check][Size][%s] Number of cond outputs(%zu) or Number of cond output desc(%zu) not equal %zu",
task_context.GetNodeName(), cond_outputs.size(), cond_output_desc_list.size(), kCondOutputSize);
return INTERNAL_ERROR;
}

@@ -325,8 +323,7 @@ Status WhileOpNodeTask::ExecuteCond(TaskContext &task_context, bool &is_continue
if (shape.IsScalar()) {
auto data_type = cond_tensor_desc->GetDataType();
GE_CHK_STATUS_RET(ToBool(cond_outputs[0], data_type, is_continue),
"[%s] Failed to get cond value.",
task_context.GetNodeName());
"[Invoke][ToBool][%s] Failed to get cond value.", task_context.GetNodeName());
} else {
// true if num elements is non-zero
is_continue = shape.GetShapeSize() > 0;
@@ -367,17 +364,15 @@ Status WhileOpNodeTask::MoveOutputs2Inputs(TaskContext &task_context) {
Status WhileOpNodeTask::ExecuteOneLoop(TaskContext &task_context, bool &is_continue) const {
GELOGD("[%s] Start to execute body-subgraph.", task_context.GetNodeName());
GE_CHK_STATUS_RET(ExecuteSubgraph(body_, task_context, nullptr),
"[%s] Failed to execute cond-subgraph", task_context.GetNodeName());
"[Execute][Subgraph] failed for [%s]", task_context.GetNodeName());
GELOGD("[%s] Done executing body-subgraph successfully.", task_context.GetNodeName());

// set outputs to inputs for next iteration
GE_CHK_STATUS_RET(MoveOutputs2Inputs(task_context),
"[%s] Failed to move outputs to inputs",
task_context.GetNodeName());
"[Move][Outputs2Inputs] failed for [%s]", task_context.GetNodeName());

GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue),
"[%s] Failed to execute cond-subgraph",
task_context.GetNodeName());
"[Invoke][ExecuteCond][%s] Failed to execute cond-subgraph", task_context.GetNodeName());

if (!is_continue) {
for (int i = 0; i < task_context.NumInputs(); ++i) {
@@ -404,12 +399,14 @@ Status ControlOpNodeExecutor::LoadTask(const HybridModel &model,
} else if (node_type == WHILE || node_type == STATELESSWHILE) {
node_task.reset(new(std::nothrow) WhileOpNodeTask());
} else {
GELOGE(PARAM_INVALID, "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str());
REPORT_INNER_ERROR("E19999", "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str());
GELOGE(PARAM_INVALID, "[Check][NodeType][%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str());
return PARAM_INVALID;
}

GE_CHECK_NOTNULL(node_task);
GE_CHK_STATUS_RET(node_task->Init(node, model), "[%s] Failed to init ControlOpNodeTask.", node->GetName().c_str());
GE_CHK_STATUS_RET(node_task->Init(node, model),
"[Invoke][Init][%s] Failed to init ControlOpNodeTask.", node->GetName().c_str());

task = std::move(node_task);
return SUCCESS;


+ 33
- 13
ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc View File

@@ -47,7 +47,9 @@ Status RefInputTask::UpdateArgs(TaskContext &) {
Status RefInputTask::Execute(TaskContext &context) {
auto iter = out_ref_input_index_.find(node_type_);
if (iter == out_ref_input_index_.end()) {
GELOGE(UNSUPPORTED, "node %s type %s can not use RefInputTask.",
REPORT_INNER_ERROR("E19999", "node %s type %s can not use RefInputTask.",
node_name_.c_str(), node_type_.c_str());
GELOGE(UNSUPPORTED, "[Find][Node]node %s type %s can not use RefInputTask.",
node_name_.c_str(), node_type_.c_str());
return UNSUPPORTED;
}
@@ -65,7 +67,9 @@ Status RefInputTask::RefOneByOne(TaskContext &context) {
int input_num = context.NumInputs();
int output_num = context.NumOutputs();
if (output_num > input_num) {
GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only %d inputs, can't ref one by one.",
REPORT_INNER_ERROR("E19999", "node %s type %s has %d outputs but only %d inputs, can't ref one by one.",
node_name_.c_str(), node_type_.c_str(), output_num, input_num);
GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d outputs but only %d inputs, can't ref one by one.",
node_name_.c_str(), node_type_.c_str(), output_num, input_num);
return INTERNAL_ERROR;
}
@@ -84,7 +88,9 @@ Status RefInputTask::RefByOrder(const std::vector<uint32_t> &ref_order, TaskCont
GELOGI("node %s type %s ref input by order begin.", node_name_.c_str(), node_type_.c_str());
int32_t output_num = context.NumOutputs();
if (ref_order.size() != static_cast<size_t>(output_num)) {
GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only has %zu out ref index.",
REPORT_INNER_ERROR("E19999", "node %s type %s has %d outputs but only has %zu out ref index.",
node_name_.c_str(), node_type_.c_str(), output_num, ref_order.size());
GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d outputs but only has %zu out ref index.",
node_name_.c_str(), node_type_.c_str(), output_num, ref_order.size());
return INTERNAL_ERROR;
}
@@ -102,7 +108,7 @@ Status RefInputTask::RefByOrder(const std::vector<uint32_t> &ref_order, TaskCont

Status RefInputTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[RefInputTaskExecuteAsync] Start");
GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s ref input task execute failed",
GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute]node:%s type:%s ref input task execute failed",
node_name_.c_str(), node_type_.c_str());
if (done_callback != nullptr) {
// host cpu no need register callback, call it directly.
@@ -126,20 +132,26 @@ Status DependInputShapeTask::Execute(TaskContext &context) {
std::string node_type = node_->GetType();
auto kernel = factory.Create(node_type);
if (kernel == nullptr) {
GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.",
REPORT_CALL_ERROR("E19999", "create failed for node %s type %s is not supported by host kernel.",
node_->GetName().c_str(), node_type.c_str());
GELOGE(UNSUPPORTED, "[Invoke][Create]node %s type %s is not supported by host kernel.",
node_->GetName().c_str(), node_type.c_str());
return UNSUPPORTED;
}
std::vector<GeTensorPtr> outputs;
Status compute_ret = kernel->Compute(node_, outputs);
if (compute_ret != SUCCESS) {
GELOGE(compute_ret, "node %s type %s compute failed or not imply.",
REPORT_CALL_ERROR("E19999", "node %s type %s compute failed.", node_->GetName().c_str(), node_type.c_str());
GELOGE(compute_ret, "[Invoke][Compute]node %s type %s compute failed or not imply.",
node_->GetName().c_str(), node_type.c_str());
return compute_ret;
}
int32_t output_num = context.NumOutputs();
if (static_cast<size_t>(output_num) != outputs.size()) {
GELOGE(INTERNAL_ERROR, "node %s type %s has %d output, but kernel compute only has %zu output.",
REPORT_INNER_ERROR("E19999", "node %s type %s has %d output,"
"but kernel compute only has %zu output. check invalid",
node_->GetName().c_str(), node_type.c_str(), output_num, outputs.size());
GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d output, but kernel compute only has %zu output.",
node_->GetName().c_str(), node_type.c_str(), output_num, outputs.size());
return INTERNAL_ERROR;
}
@@ -155,7 +167,11 @@ Status DependInputShapeTask::Execute(TaskContext &context) {
auto tensor_value = context.MutableOutput(i);
GE_CHECK_NOTNULL(tensor_value);
if (tensor_data.GetSize() > tensor_value->GetSize()) {
GELOGE(INTERNAL_ERROR, "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.",
REPORT_INNER_ERROR("E19999", "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu."
"check invalid",
node_->GetName().c_str(), node_type.c_str(), i,
tensor_data.GetSize(), tensor_value->GetSize());
GELOGE(INTERNAL_ERROR, "[Check][Size]node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.",
node_->GetName().c_str(), node_type.c_str(), i, tensor_data.GetSize(), tensor_value->GetSize());
return INTERNAL_ERROR;
}
@@ -180,7 +196,7 @@ Status DependInputShapeTask::Execute(TaskContext &context) {
Status DependInputShapeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
"[DependInputShapeTaskExecuteAsync] Start");
GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s depend input shape task execute failed",
GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute]node:%s type:%s depend input shape task execute failed",
node_->GetName().c_str(), node_->GetType().c_str());
if (done_callback != nullptr) {
// host cpu no need register callback, call it directly.
@@ -213,7 +229,8 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model,
node->GetName().c_str(), node_type.c_str());
task = MakeShared<RefInputTask>(node);
if (task == nullptr) {
GELOGE(MEMALLOC_FAILED, "create RefInputTask for node %s failed.", node->GetName().c_str());
REPORT_CALL_ERROR("E19999", "Create RefInputTask failed for node %s.", node->GetName().c_str());
GELOGE(MEMALLOC_FAILED, "[Create][RefInputTask] failed for node %s.", node->GetName().c_str());
return MEMALLOC_FAILED;
}
} else if (DependInputShapeTask::IsBelong(node_type)) {
@@ -221,7 +238,9 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model,
node->GetName().c_str(), node_type.c_str());
task = MakeShared<DependInputShapeTask>(node);
if (task == nullptr) {
GELOGE(MEMALLOC_FAILED, "create DependInputShapeTask for node %s type %s failed.",
REPORT_CALL_ERROR("E19999", "Create DependInputShapeTask failed for node %s type %s.",
node->GetName().c_str(), node_type.c_str());
GELOGE(MEMALLOC_FAILED, "[Create][DependInputShapeTask]failed for node %s type %s.",
node->GetName().c_str(), node_type.c_str());
return MEMALLOC_FAILED;
}
@@ -229,7 +248,8 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model,
GELOGI("node %s type %s, use ConstantNodeTask.", node->GetName().c_str(), node_type.c_str());
auto tensor = model.GetTensor(node);
if (tensor == nullptr) {
GELOGE(INTERNAL_ERROR, "Failed to get tensor by name: %s", node->GetName().c_str());
REPORT_CALL_ERROR("E19999", "GetTensor failed for name: %s", node->GetName().c_str());
GELOGE(INTERNAL_ERROR, "[Get][Tensor] failed for name: %s", node->GetName().c_str());
return INTERNAL_ERROR;
}

@@ -251,7 +271,7 @@ Status ConstantNodeTask::UpdateArgs(TaskContext &context) {

Status ConstantNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
GELOGD("[%s] Start execute.", context.GetNodeName());
GE_CHK_STATUS_RET(context.SetOutput(0, *tensor_), "[%s] Failed to set output.", context.GetNodeName());
GE_CHK_STATUS_RET(context.SetOutput(0, *tensor_), "[Set][Output] failed for [%s].", context.GetNodeName());
if (done_callback) {
GELOGD("[%s] Start invoke callback.", context.GetNodeName());
done_callback();


+ 67
- 25
ge/hybrid/node_executor/hccl/hccl_node_executor.cc View File

@@ -43,13 +43,15 @@ REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::HCCL, HcclNode
Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
GELOGI("[%s] HcclNodeTask::ExecuteAsync in.", context.GetNodeName());
if (context.handle_ == nullptr) {
GELOGE(FAILED, "hccl handle is nullptr! ");
REPORT_INNER_ERROR("E19999", " %s invalid, hccl handle is nullptr!", context.GetNodeName());
GELOGE(FAILED, "[Check][Param:context] %s hccl handle is nullptr!", context.GetNodeName());
return FAILED;
}
auto HcomExecEnqueueOperation = (HcclResult(*)(HcomOperation, std::function<void(HcclResult status)>))dlsym(
context.handle_, "HcomExecEnqueueOperation");
if (HcomExecEnqueueOperation == nullptr) {
GELOGE(FAILED, "Failed to invoke HcomExecEnqueueOperation hcom unknown node function.");
GELOGE(FAILED, "[Invoke][HcomExecEnqueueOperation] failed for %s hcom unknown node function.",
context.GetNodeName());
if (dlclose(context.handle_) != 0) {
GELOGW("Failed to close handle %s", dlerror());
}
@@ -83,24 +85,35 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
ge::DataType src_data_type = input_desc->GetDataType();
auto iter = kConstOpHcclDataType.find(static_cast<int64_t>(src_data_type));
if (iter == kConstOpHcclDataType.end()) {
GELOGE(PARAM_INVALID, "kConstOpHcclDataType find failed.");
REPORT_INNER_ERROR("E19999", "%s inputdesc0 datatype:%s not support.",
op_desc->GetName().c_str(),
TypeUtils::DataTypeToSerialString(src_data_type).c_str());
GELOGE(PARAM_INVALID, "[Find][DataType]%s inputdesc0 datatype:%s not support.",
op_desc->GetName().c_str(),
TypeUtils::DataTypeToSerialString(src_data_type).c_str());
return PARAM_INVALID;
}
op_info.dataType = iter->second;
HcclReduceOp op_type = HCCL_REDUCE_SUM;
if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HCOMREDUCESCATTER ||
op_desc->GetType() == HVDCALLBACKALLREDUCE || op_desc->GetType() == HCOMREDUCE) {
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), "GetHcclOperationType failed");
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type),
"[Get][HcclOperationType] failed for %s type:%s", op_desc->GetName().c_str(),
op_desc->GetType().c_str());
op_info.opType = op_type;
}
int64_t root_id = 0;
if (op_desc->GetType() == HCOMBROADCAST) {
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id), "GetHcclRootId failed");
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id),
"[Get][HcclRootId] failed for %s type:%s", op_desc->GetName().c_str(),
op_desc->GetType().c_str());
}
op_info.root = root_id;
auto callback = [op_desc, done_callback](HcclResult status) {
if (status != HCCL_SUCCESS) {
GELOGE(HCCL_E_INTERNAL, "node %s call HcomExecEnqueueOperation failed, ret: 0x%X",
REPORT_CALL_ERROR("E19999", "call HcomExecEnqueueOperation failed for node %s, ret: 0x%X",
op_desc->GetName().c_str(), status);
GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueOperation] failed for node %s, ret: 0x%X",
op_desc->GetName().c_str(), status);
}

@@ -110,14 +123,18 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
int32_t count = 0;
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomCount(op_desc, static_cast<HcclDataType>(op_info.dataType),
op_desc->GetType() == HCOMALLGATHER, count),
"GetHcomCount failed");
"[Get][HcomCount] failed for %s type:%s", op_desc->GetName().c_str(),
op_desc->GetType().c_str());
GELOGI("[%s] HcclNodeTask::ExecuteAsync hccl_type %s, count %d, data_type %d, op_type %d, root %d.",
context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root);
op_info.count = count;

HcclResult hccl_ret = HcomExecEnqueueOperation(op_info, callback);
if (hccl_ret != HCCL_SUCCESS) {
GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret);
REPORT_CALL_ERROR("E19999", "Call HcomExecEnqueueOperation failed for node:%s(%s), ret: 0x%X",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), hccl_ret);
GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueOperation] failed for node:%s(%s), ret: 0x%X",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), hccl_ret);
return HCCL_E_INTERNAL;
}

@@ -173,13 +190,23 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector<HcomRemoteAccess
GELOGD("data is null, no need to do rdma read/write, node=%s", context.GetNodeName());
return SUCCESS;
} else {
GELOGE(FAILED, "Tensor data is nullptr.");
REPORT_INNER_ERROR("E19999", "Tensor data is nullptr. and kRdmaScatterTypes not contain %s",
context.GetNodeItem().NodeType().c_str());
GELOGE(FAILED, "[Find][NodeType]Tensor data is nullptr. and kRdmaScatterTypes not contain %s",
context.GetNodeItem().NodeType().c_str());
return FAILED;
}
}
auto dims = remote_tensor.GetTensorDesc().GetShape().GetDims();
if (dims.size() != kVarTableDims && dims.back() != kVarTableRowCnt) {
GELOGE(PARAM_INVALID, "Variable table shape check failed");
REPORT_INNER_ERROR("E19999", "Variable table shape check failed, number of shape dims:%zu not equal expect:%zu"
"and shape dims back:%zu not equal expect:%zu, node:%s(%s)",
dims.size(), kVarTableDims, dims.back(), kVarTableRowCnt,
context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
GELOGE(PARAM_INVALID, "[Check][Param]Variable table shape check failed,"
"number of shape dims:%zu not equal expect:%zu and shape dims back:%zu not equal expect:%zu, node:%s(%s)",
dims.size(), kVarTableDims, dims.back(), kVarTableRowCnt,
context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
return PARAM_INVALID;
}

@@ -222,7 +249,11 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector<HcomRemoteAccess
Tensor offset_tensor;
GE_CHK_STATUS_RET(ctx->GetTensor(offset_index_.first, offset_index_.second, offset_tensor))
if (static_cast<int64_t>(offset_tensor.GetSize() / GetSizeByDataType(data_type)) != row_num) {
GELOGE(PARAM_INVALID, "num of offset and remote addr mismatch, offset size=%zu, remote_addr size=%ld, dtype=%s",
REPORT_INNER_ERROR("E19999", "num of offset and remote addr mismatch, check invalid"
"offset size=%zu, remote_addr size=%ld, dtype=%s", offset_tensor.GetSize(), row_num,
TypeUtils::DataTypeToSerialString(data_type).c_str());
GELOGE(PARAM_INVALID, "[Check][Size]num of offset and remote addr mismatch,"
"offset size=%zu, remote_addr size=%ld, dtype=%s",
offset_tensor.GetSize(), row_num, TypeUtils::DataTypeToSerialString(data_type).c_str());
return PARAM_INVALID;
}
@@ -244,7 +275,9 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector<HcomRemoteAccess
auto local_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(tv->MutableData()));
auto device_len = tv->GetSize() / row_num;
if (device_len <= 0 || device_len > data[kVarTableIdxLen]) {
GELOGE(FAILED, "Local embedding length is out of range, expect %ld, but %ld exactly.",
REPORT_INNER_ERROR("E19999", "Local embedding length is out of range, expect %ld, but %ld exactly.",
data[kVarTableIdxLen], device_len);
GELOGE(FAILED, "[Check][Size]Local embedding length is out of range, expect %ld, but %ld exactly.",
data[kVarTableIdxLen], device_len);
return FAILED;
}
@@ -267,7 +300,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
(HcclResult(*)(const string &, const vector<HcomRemoteAccessAddrInfo> &,
std::function<void(HcclResult status)>))dlsym(context.handle_, "HcomExecEnqueueRemoteAccess");
if (HcomExecEnqueueRemoteAccess == nullptr) {
GELOGE(FAILED, "Failed to invoke HcomExecEnqueueRemoteAccess hcom unknown node function.");
GELOGE(FAILED, "[Invoke][HcomExecEnqueueRemoteAccess] failed for node:%s(%s) hcom unknown node function.",
context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
if (dlclose(context.handle_) != 0) {
GELOGW("Failed to close handle %s", dlerror());
}
@@ -283,7 +317,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
TaskContext *p_ctx = &context;
auto callback = [p_ctx, done_callback](HcclResult status) {
if (status != HCCL_SUCCESS) {
GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", status);
GELOGE(HCCL_E_INTERNAL, "[Call][HcomExcutorInitialize] failed for node:%s(%s), ret: 0x%X",
p_ctx->GetNodeName(), p_ctx->GetNodeItem().NodeType().c_str(), status);
p_ctx->SetStatus(FAILED);
}
done_callback();
@@ -296,7 +331,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
}
HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback);
if (hccl_ret != HCCL_SUCCESS) {
GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret);
GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueRemoteAccess] failed for node:%s(%s), ret: 0x%X",
context.GetNodeName(), context.GetNodeItem().NodeType().c_str(), hccl_ret);
return HCCL_E_INTERNAL;
}

@@ -314,13 +350,17 @@ Status HcclNodeTask::Init(TaskContext &context) {
Status HcclNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
GELOGI("[%s] HcclNodeExecutor::PrepareTask in.", context.GetNodeName());

GE_CHK_STATUS_RET(task.Init(context), "hccl node load hccl so failed.");
GE_CHK_STATUS_RET(task.Init(context), "[Invoke][Init]hccl node %s(%s) load hccl so failed.",
context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
// allocate output mem, output mem or remote read will be calculated when node execute.
if (kRdmaReadTypes.count(context.GetNodeItem().NodeType()) == 0) {
GE_CHK_STATUS_RET(context.AllocateOutputs(), "hccl node task allocate output failed.");
GE_CHK_STATUS_RET(context.AllocateOutputs(),
"[Invoke][AllocateOutputs]hccl node %s(%s) task allocate output failed.",
context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
}

GE_CHK_STATUS_RET(task.UpdateArgs(context), "hccl node task update args failed.");
GE_CHK_STATUS_RET(task.UpdateArgs(context), "[Update][Args] failed for hccl node %s(%s).",
context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
GELOGI("[%s] HcclNodeExecutor::PrepareTask success.", context.GetNodeName());
return SUCCESS;
}
@@ -341,8 +381,9 @@ Status HcclNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node,
Status HcclNodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context,
const std::function<void()> &callback) const {
context.handle_ = handle_;
GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), "Failed to execute task. node = %s",
context.GetNodeItem().NodeName().c_str());
GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
"[Invoke][ExecuteAsync] failed to execute task. node:%s(%s)",
context.GetNodeItem().NodeName().c_str(), context.GetNodeItem().NodeType().c_str());
return SUCCESS;
}

@@ -359,12 +400,13 @@ Status HcclNodeExecutor::Initialize() {
GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str());
handle_ = dlopen(canonical_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
if (handle_ == nullptr) {
GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror());
REPORT_CALL_ERROR("E19999", "Open SoFile %s failed, error:%s! ", canonical_path.c_str(), dlerror());
GELOGE(GE_PLGMGR_SO_NOT_EXIST, "[Open][SoFile] %s failed, error:%s! ", canonical_path.c_str(), dlerror());
return FAILED;
}
auto HcomExecInitialize = (HcclResult(*)())dlsym(handle_, "HcomExecInitialize");
if (HcomExecInitialize == nullptr) {
GELOGE(FAILED, "Failed to invoke HcomExecInitialize hcom unknown node function.");
GELOGE(FAILED, "[Invoke][HcomExecInitialize] Failed for hcom unknown node function.");
return FAILED;
}
HcclResult hccl_ret = HcomExecInitialize();
@@ -373,7 +415,7 @@ Status HcclNodeExecutor::Initialize() {
} else if (hccl_ret == HCCL_SUCCESS) {
GELOGI("Hcom executor initialize success.");
} else {
GELOGE(FAILED, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret);
GELOGE(FAILED, "[Call][HcomExecInitialize] failed, ret: 0x%X", hccl_ret);
return FAILED;
}
return SUCCESS;
@@ -382,12 +424,12 @@ Status HcclNodeExecutor::Initialize() {
Status HcclNodeExecutor::Finalize() {
auto HcomExecFinalize = (HcclResult(*)())dlsym(handle_, "HcomExecFinalize");
if (HcomExecFinalize == nullptr) {
GELOGE(FAILED, "Failed to invoke HcomExecFinalize hcom unknown node function.");
GELOGE(FAILED, "[Invoke][HcomExecFinalize] failed for hcom unknown node function.");
return FAILED;
}
HcclResult hccl_ret = HcomExecFinalize();
if (hccl_ret != HCCL_SUCCESS) {
GELOGE(FAILED, "Call HcomExecFinalize failed, ret: 0x%X", hccl_ret);
GELOGE(FAILED, "[Call][HcomExecFinalize] failed, ret: 0x%X", hccl_ret);
return FAILED;
}
// dlclose file handle


+ 13
- 5
ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc View File

@@ -33,7 +33,7 @@ Status HostNodeTaskBase::UpdateArgs(TaskContext &) {

Status HostNodeTaskBase::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
GELOGD("[%s] Start execute.", context.GetNodeName());
GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s, task execute failed.",
GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute] failed for node:%s type:%s.",
node_->GetName().c_str(), node_->GetType().c_str())
if (done_callback) {
GELOGD("[%s] Start invoke callback.", context.GetNodeName());
@@ -70,7 +70,8 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) {
AllocationAttr attr;
attr.SetMemType(HOST_DDR);
if (context.AllocateOutput(i, output_desc, nullptr, &attr) != SUCCESS) {
GELOGE(FAILED, "node:%s Failed to allocate output %d", context.GetNodeName(), i);
REPORT_CALL_ERROR("E19999", "node:%s Failed to allocate output %d", context.GetNodeName(), i);
GELOGE(FAILED, "[Invoke][AllocateOutput]node:%s Failed to allocate output %d", context.GetNodeName(), i);
return FAILED;
}
auto tensor = context.GetOutput(i);
@@ -92,14 +93,18 @@ Status HostCpuNodeTask::Execute(TaskContext &context) {
RunContext run_context;
auto host_kernel = hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node_);
if (host_kernel == nullptr) {
GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.",
REPORT_CALL_ERROR("E19999", "CreateKernel failed for node %s type %s is not supported by host kernel.",
node_->GetName().c_str(), node_->GetType().c_str());
GELOGE(UNSUPPORTED, "[Create][Kernel]node %s type %s is not supported by host kernel.",
node_->GetName().c_str(), node_->GetType().c_str());
return UNSUPPORTED;
}

Status compute_ret = host_kernel->Compute(context);
if (compute_ret != SUCCESS) {
GELOGE(compute_ret, "node %s type %s compute failed or not imply.",
REPORT_CALL_ERROR("E19999", "node %s type %s compute failed.",
node_->GetName().c_str(), node_->GetType().c_str());
GELOGE(compute_ret, "[Invoke][Compute]node %s type %s compute failed or not imply.",
node_->GetName().c_str(), node_->GetType().c_str());
return compute_ret;
}
@@ -131,7 +136,10 @@ Status HostCpuNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &no
task = MakeShared<HostCpuNodeTask>(node);
GE_CHECK_NOTNULL(task);
} else {
GELOGE(UNSUPPORTED, "node %s type %s is not support in HostCpuNodeExecutor now.", name.c_str(), type.c_str());
REPORT_INNER_ERROR("E19999", "Create NodeTask failed for node %s type %s.",
name.c_str(), type.c_str());
GELOGE(UNSUPPORTED, "[Create][NodeTask]node %s type %s is not support in HostCpuNodeExecutor now.",
name.c_str(), type.c_str());
return UNSUPPORTED;
}
return SUCCESS;


+ 4
- 2
ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc View File

@@ -34,7 +34,9 @@ Status AssignKernel::Compute(TaskContext& context) {
const auto value_tensor = context.GetInput(kAssignValueInputIndex);
GE_CHECK_NOTNULL(value_tensor);
if (value_tensor->GetSize() > ref_tensor->GetSize()) {
GELOGE(INTERNAL_ERROR, "[%s] value_input_size=%zu, but ref_input_size=%zu.",
REPORT_INNER_ERROR("E19999", "[%s] value_input_size=%zu bigger than ref_input_size=%zu. check invalid",
node_->GetName().c_str(), value_tensor->GetSize(), ref_tensor->GetSize());
GELOGE(INTERNAL_ERROR, "[Check][Size][%s] value_input_size=%zu, but ref_input_size=%zu.",
node_->GetName().c_str(), value_tensor->GetSize(), ref_tensor->GetSize());
return INTERNAL_ERROR;
}
@@ -46,7 +48,7 @@ Status AssignKernel::Compute(TaskContext& context) {
value_tensor->GetSize(), RT_MEMCPY_HOST_TO_HOST));
}
GE_CHK_STATUS_RET(context.SetOutput(kAssignRefOutputIndex, *ref_tensor),
"[%s] Failed to set output.", context.GetNodeName());
"[Set][Output] failed for[%s].", context.GetNodeName());

GELOGD("[%s] compute success.", node_->GetName().c_str());
return SUCCESS;


+ 2
- 1
ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc View File

@@ -30,7 +30,8 @@ namespace host_cpu {
Status DataKernel::Compute(TaskContext& context) {
auto input = context.MutableInput(kDataInputIndex);
GE_CHECK_NOTNULL(input);
GE_CHK_STATUS_RET(context.SetOutput(kDataOutputIndex, *input), "[%s] Failed to set output.", context.GetNodeName())
GE_CHK_STATUS_RET(context.SetOutput(kDataOutputIndex, *input),
"[Set][Output] failed for [%s].", context.GetNodeName())
GELOGD("[%s] compute success.", node_->GetName().c_str());
return SUCCESS;
}


+ 20
- 10
ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc View File

@@ -36,31 +36,41 @@ Status RandomUniformKernel::Compute(TaskContext& context) {
(void)AttrUtils::GetInt(node_->GetOpDesc(), "seed2", seed2);
DataType data_type = DT_FLOAT;
if (!AttrUtils::GetDataType(node_->GetOpDesc(), kAttrDtype, data_type)) {
GELOGE(PARAM_INVALID, "[%s] get attr dtype failed.", node_->GetName().c_str());
REPORT_CALL_ERROR("E19999", "GetDataType failed for [%s].", node_->GetName().c_str());
GELOGE(PARAM_INVALID, "[Get][DataType] failed for [%s].", node_->GetName().c_str());
return PARAM_INVALID;
}
switch (data_type) {
case DT_FLOAT16:
if (GenerateFP16(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) {
GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_FLOAT");
GELOGE(FAILED, "[Invoke][GenerateFP16]Generate random_distribution failed for %s, data_type=DT_FLOAT16",
node_->GetName().c_str());
return FAILED;
}
break;
case DT_FLOAT:
if (Generate<float>(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) {
GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_FLOAT");
GELOGE(FAILED, "[Invoke][Generate]Generate random_distribution failed for %s, data_type=DT_FLOAT",
node_->GetName().c_str());
return FAILED;
}
break;
case DT_DOUBLE:
if (Generate<double>(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) {
GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_DOUBLE");
GELOGE(FAILED, "[Invoke][Generate]Generate random_distribution failed for %s, data_type=DT_DOUBLE",
node_->GetName().c_str());
return FAILED;
}
break;
default:
GELOGE(UNSUPPORTED, "Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE, but data_type=%s",
TypeUtils::DataTypeToSerialString(data_type).c_str());
REPORT_INNER_ERROR("E19999", "[Check][DataType]Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE,"
"but data_type=%s, node:%s",
TypeUtils::DataTypeToSerialString(data_type).c_str(),
node_->GetName().c_str());
GELOGE(UNSUPPORTED, "[Check][DataType]Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE,"
"but data_type=%s, node:%s",
TypeUtils::DataTypeToSerialString(data_type).c_str(),
node_->GetName().c_str());
return UNSUPPORTED;
}

@@ -79,7 +89,7 @@ Status RandomUniformKernel::Generate(const ge::OpDescPtr &op_desc_ptr, int64_t s
auto tensor_size = data_num * sizeof(T);
TensorValue tensor;
GE_CHK_STATUS_RET(context.AllocateTensor(tensor_size, tensor, &attr),
"[%s] Failed to allocate output of size %zu",
"[Invoke][AllocateTensor][%s] Failed to allocate output of size %zu",
context.GetNodeName(),
tensor_size);

@@ -101,7 +111,7 @@ Status RandomUniformKernel::Generate(const ge::OpDescPtr &op_desc_ptr, int64_t s
*(buf + i) = distribution(gen);
}

GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[%s] Failed to set output.", context.GetNodeName());
GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[Set][Output] failed for [%s].", context.GetNodeName());
return SUCCESS;
}

@@ -115,7 +125,7 @@ Status RandomUniformKernel::GenerateFP16(const ge::OpDescPtr &op_desc_ptr, int64
auto tensor_size = data_num * sizeof(fp16_t);
TensorValue tensor;
GE_CHK_STATUS_RET(context.AllocateTensor(tensor_size, tensor, &attr),
"[%s] Failed to allocate output of size %zu",
"[Invoke][AllocateTensor][%s] Failed to allocate output of size %zu",
context.GetNodeName(),
tensor_size);

@@ -137,7 +147,7 @@ Status RandomUniformKernel::GenerateFP16(const ge::OpDescPtr &op_desc_ptr, int64
*(buf + i) = static_cast<fp16_t>(distribution(gen));
}

GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[%s] Failed to set output.", context.GetNodeName());
GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[Set][Output]failed for [%s].", context.GetNodeName());
return SUCCESS;
}



+ 3
- 2
ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc View File

@@ -25,11 +25,12 @@ namespace host_cpu {
Status VariableKernel::Compute(TaskContext& context) {
auto tensor = context.GetVariable(node_->GetName());
if (tensor == nullptr) {
GELOGE(PARAM_INVALID, "tensor is NULL.");
REPORT_INNER_ERROR("E19999", "Get Variable from task context for node:%s failed.", context.GetNodeName());
GELOGE(PARAM_INVALID, "[Check][Param]Get Variable from task context for node:%s failed.", context.GetNodeName());
return PARAM_INVALID;
}
// Constant & Variable Op has and only has one output
GE_CHK_STATUS_RET(context.SetOutput(0, *tensor), "[%s] Failed to set output.", context.GetNodeName());
GE_CHK_STATUS_RET(context.SetOutput(0, *tensor), "[Set][Output] failed for [%s].", context.GetNodeName());
GELOGD("[%s] compute success.", node_->GetName().c_str());
return SUCCESS;
}


+ 4
- 1
ge/hybrid/node_executor/host_cpu/kernel_factory.cc View File

@@ -34,7 +34,10 @@ std::shared_ptr<Kernel> KernelFactory::CreateKernel(const NodePtr &node) {
if (iter != kernel_creator_map_.end()) {
return iter->second(node);
}
GELOGE(FAILED, "Not supported, type = %s, name = %s", node->GetType().c_str(), node->GetName().c_str());
REPORT_INNER_ERROR("E19999", "Not supported because kernel_creator_map_ not contain type:%s, name = %s",
node->GetType().c_str(), node->GetName().c_str());
GELOGE(FAILED, "[Find][NodeType]Not supported because kernel_creator_map_ not contain type = %s, name = %s",
node->GetType().c_str(), node->GetName().c_str());
return nullptr;
}



+ 17
- 9
ge/hybrid/node_executor/node_executor.cc View File

@@ -45,8 +45,7 @@ Status NodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {

Status NodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function<void()> &callback) const {
HYBRID_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
"Failed to execute task. node = %s",
context.GetNodeItem().NodeName().c_str());
"[Execute][Task] failed. node = %s", context.GetNodeItem().NodeName().c_str());
return SUCCESS;
}

@@ -106,7 +105,10 @@ NodeExecutorManager::ExecutorType NodeExecutorManager::ResolveExecutorType(Node
const auto &lib_name = op_desc->GetOpKernelLibName();
auto it = engine_mapping_.find(lib_name);
if (it == engine_mapping_.end()) {
GELOGE(UNSUPPORTED, "KernelLib not supported. node = %s, lib_name = %s", node.GetName().c_str(), lib_name.c_str());
REPORT_INNER_ERROR("E19999", "Failed to get ExecutorType by lib_name:%s, node:%s",
lib_name.c_str(), node.GetName().c_str());
GELOGE(UNSUPPORTED, "[Find][ExecutorType]Failed to get ExecutorType by lib_name:%s, node:%s",
lib_name.c_str(), node.GetName().c_str());
return ExecutorType::RESERVED;
}

@@ -117,7 +119,10 @@ Status NodeExecutorManager::GetExecutor(Node &node, const NodeExecutor **executo
auto executor_type = ResolveExecutorType(node);
const auto it = executors_.find(executor_type);
if (it == executors_.end()) {
GELOGE(INTERNAL_ERROR, "Failed to get executor by type: %d.", static_cast<int>(executor_type));
REPORT_INNER_ERROR("E19999", "Failed to get executor by type: %d.",
static_cast<int>(executor_type));
GELOGE(INTERNAL_ERROR, "[Check][ExecutorType]Failed to get executor by type: %d.",
static_cast<int>(executor_type));
return INTERNAL_ERROR;
}

@@ -155,16 +160,16 @@ Status NodeExecutorManager::CalcOpRunningParam(Node &node) const {
GeShape output_shape = output_tensor.GetShape();
int64_t output_mem_size = 0;
GE_CHK_STATUS_RET(TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size),
"hccl calc tensor mem size failed.");
"[Calc][TensorMemSize] failed, node:%s.", node.GetName().c_str());
GE_CHK_STATUS_RET(CheckInt64AddOverflow(output_mem_size, MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1),
"[%s] Invalid output mem size: %ld",
"[Check][Overflow][%s] Invalid output mem size: %ld",
node.GetName().c_str(),
output_mem_size);
output_mem_size = ((output_mem_size +
MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1) / MEMORY_ALIGN_SIZE) * MEMORY_ALIGN_SIZE;
TensorUtils::SetSize(output_tensor, output_mem_size);
GE_CHK_STATUS_RET(op_desc->UpdateOutputDesc(static_cast<uint32_t>(i), output_tensor),
"hccl update output size failed.");
"[Update][OutputDesc] failed, node:%s.", node.GetName().c_str());
GELOGD("%s output desc[%zu], dim_size: %zu, mem_size: %ld.", node.GetName().c_str(), i,
output_tensor.GetShape().GetDimNum(), output_mem_size);
}
@@ -189,14 +194,17 @@ Status NodeExecutorManager::InitializeExecutors() {
GE_CHECK_NOTNULL(build_fn);
auto executor = std::unique_ptr<NodeExecutor>(build_fn());
if (executor == nullptr) {
GELOGE(INTERNAL_ERROR, "Failed to create executor for engine type = %d", static_cast<int>(engine_type));
REPORT_CALL_ERROR("E19999", "Create NodeExecutor failed for engine type = %d",
static_cast<int>(engine_type));
GELOGE(INTERNAL_ERROR, "[Create][NodeExecutor] failed for engine type = %d", static_cast<int>(engine_type));
return INTERNAL_ERROR;
}

GELOGD("Executor of engine type = %d was created successfully", static_cast<int>(engine_type));
auto ret = executor->Initialize();
if (ret != SUCCESS) {
GELOGE(ret, "Failed to initialize NodeExecutor of type = %d, clear executors", static_cast<int>(engine_type));
REPORT_CALL_ERROR("E19999", "Initialize NodeExecutor failed for type = %d", static_cast<int>(engine_type));
GELOGE(ret, "[Initialize][NodeExecutor] failed for type = %d", static_cast<int>(engine_type));
for (auto &executor_it : executors_) {
executor_it.second->Finalize();
}


+ 3
- 4
ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc View File

@@ -38,15 +38,14 @@ Status PartitionedCallNodeTask::Init(TaskContext &context) {

Status PartitionedCallNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
GE_CHK_STATUS_RET(subgraph_executor_->ExecuteAsync(context),
"[%s] Failed to set inputs", graph_item_->GetName().c_str());
"[Invoke][ExecuteAsync] failed for[%s]", graph_item_->GetName().c_str());

auto callback = [=]() {
Callback(done_callback);
};

GE_CHK_STATUS_RET(context.RegisterCallback(callback),
"[%s] Failed to register callback",
graph_item_->GetName().c_str());
"[Register][Callback] failed for [%s]", graph_item_->GetName().c_str());
GELOGD("[%s] Done executing subgraph successfully.", graph_item_->GetName().c_str());
return SUCCESS;
}
@@ -83,7 +82,7 @@ Status PartitionedCallNodeExecutor::LoadTask(const ge::hybrid::HybridModel &mode

Status PartitionedCallNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[PartitionedCallPrepareTask] Start");
GE_CHK_STATUS_RET(task.Init(context), "[%s] Failed to init task.", context.GetNodeName());
GE_CHK_STATUS_RET(task.Init(context), "[Init][Task] failed for [%s].", context.GetNodeName());
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[PartitionedCallPrepareTask] End");
return SUCCESS;
}


+ 67
- 21
ge/hybrid/node_executor/task_context.cc View File

@@ -63,17 +63,22 @@ std::unique_ptr<TaskContext> TaskContext::Create(NodeState *node_state,
node_item.output_start,
node_item.num_outputs);
if (node_item.input_start < 0 || node_item.output_start < 0) {
REPORT_INNER_ERROR("E19999", "NodeItem:%s(%s) not property initialized."
"input_start:%d or output_start:%d less than 0",
node_item.NodeName().c_str(), node_item.NodeType().c_str(),
node_item.input_start, node_item.output_start);
GELOGE(INTERNAL_ERROR,
"NodeItem not property initialized. input_start = %d, output_start = %d",
node_item.input_start,
node_item.output_start);
"[Check][Param]NodeItem:%s(%s) not property initialized. input_start = %d, output_start = %d",
node_item.NodeName().c_str(), node_item.NodeType().c_str(),
node_item.input_start, node_item.output_start);
return nullptr;
}

auto task_context = std::unique_ptr<TaskContext>(
new(std::nothrow)TaskContext(execution_context, node_state, subgraph_context));
if (task_context == nullptr) {
GELOGE(MEMALLOC_FAILED, "[%s] Failed to create instance of TaskContext.", node_item.NodeName().c_str());
REPORT_CALL_ERROR("E19999", "Create TaskContext failed for [%s].", node_item.NodeName().c_str());
GELOGE(MEMALLOC_FAILED, "[Create][TaskContext] failed for [%s].", node_item.NodeName().c_str());
return nullptr;
}

@@ -94,7 +99,12 @@ int TaskContext::NumOutputs() const {

TensorValue *TaskContext::MutableInput(int index) {
if (index < 0 || index >= node_item_->num_inputs) {
GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_inputs = %d", index, node_item_->num_inputs);
REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_inputs = %d, node:%s(%s)",
index, node_item_->num_inputs,
node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_inputs = %d, node:%s(%s)",
index, node_item_->num_inputs,
node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
return nullptr;
}

@@ -103,7 +113,12 @@ TensorValue *TaskContext::MutableInput(int index) {

const TensorValue *TaskContext::GetOutput(int index) const {
if (index < 0 || index >= node_item_->num_outputs) {
GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_outputs = %d", index, node_item_->num_outputs);
REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_outputs = %d, node:%s(%s)",
index, node_item_->num_outputs,
node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_outputs = %d, node:%s(%s)",
index, node_item_->num_outputs,
node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
return nullptr;
}

@@ -112,7 +127,12 @@ const TensorValue *TaskContext::GetOutput(int index) const {

TensorValue *TaskContext::MutableOutput(int index) {
if (index < 0 || index >= node_item_->num_outputs) {
GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_outputs = %d", index, node_item_->num_outputs);
REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_outputs = %d, node:%s(%s)",
index, node_item_->num_outputs,
node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_outputs = %d, node:%s(%s)",
index, node_item_->num_outputs,
node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
return nullptr;
}

@@ -125,7 +145,10 @@ std::size_t TaskContext::NumWorkspaces() const {

void *TaskContext::MutableWorkspace(int index) {
if (index < 0 || static_cast<size_t>(index) >= workspaces_.size()) {
GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_workspaces = %d", index, node_item_->num_outputs);
REPORT_INNER_ERROR("E19999", "Index:%d out of range, check invalid. number:%zu of workspaces_, node:%s(%s)",
index, workspaces_.size(), node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
GELOGE(PARAM_INVALID, "[Check][Param]Index:%d out of range. number:%zu of workspaces_, node:%s(%s)",
index, workspaces_.size(), node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
return nullptr;
}

@@ -134,7 +157,11 @@ void *TaskContext::MutableWorkspace(int index) {

const TensorValue *TaskContext::GetInput(int index) const {
if (index < 0 || index >= node_item_->num_inputs) {
GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_inputs = %d", index, node_item_->num_inputs);
REPORT_INNER_ERROR("E19999", "Index:%d out of range, check invalid. num_inputs:%d node:%s(%s)",
index, node_item_->num_inputs, node_item_->NodeName().c_str(),
node_item_->NodeType().c_str());
GELOGE(PARAM_INVALID, "[Check][Param]Index:%d out of range. num_inputs:%d node:%s(%s)",
index, node_item_->num_inputs, node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
return nullptr;
}

@@ -146,7 +173,10 @@ Status TaskContext::AllocateWorkspaces() {
for (auto size : workspace_sizes) {
void *workspace = execution_context_->allocator->Allocate(size);
if (workspace == nullptr) {
GELOGE(MEMALLOC_FAILED, "Failed to allocate workspace of size: %ld", size);
REPORT_CALL_ERROR("E19999", "node:%s(%s) Allocate workspace failed, size: %ld",
node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size);
GELOGE(MEMALLOC_FAILED, "[Allocate][workspace] failed for node:%s(%s), size: %ld",
node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size);
return MEMALLOC_FAILED;
}

@@ -162,7 +192,8 @@ Status TaskContext::RegisterCallback(const std::function<void()> &callback_fun)
}
auto ret = execution_context_->callback_manager->RegisterCallback(GetStream(), callback_fun);
if (ret != SUCCESS) {
GELOGE(ret, "[%s] Failed to register callback", GetNodeName());
REPORT_CALL_ERROR("E19999", "RegisterCallback failed for [%s]", GetNodeName());
GELOGE(ret, "[Register][Callback] failed for [%s]", GetNodeName());
execution_context_->callback_manager->Destroy();
return ret;
}
@@ -187,7 +218,8 @@ string TaskContext::TensorDesc2String(const GeTensorDesc &desc) {
Status TaskContext::AllocateTensor(const GeTensorDesc &tensor_desc, TensorValue &tensor, AllocationAttr *attr) {
int64_t size = 0;
if (ge::TensorUtils::GetSize(tensor_desc, size) != GRAPH_SUCCESS) {
GELOGE(INTERNAL_ERROR, "Failed to get tensor size");
REPORT_CALL_ERROR("E19999", "Get TensorSize failed, tensor:%s", tensor_desc.GetName().c_str());
GELOGE(INTERNAL_ERROR, "[Get][TensorSize] failed, tensor:%s", tensor_desc.GetName().c_str());
return INTERNAL_ERROR;
}

@@ -211,7 +243,12 @@ Status TaskContext::AllocateOutput(int index,
TensorDesc2String(tensor_desc).c_str());

if (index < 0 || index >= node_item_->num_outputs) {
GELOGE(PARAM_INVALID, "output index out of range. num_output = %d, index = %d", node_item_->num_outputs, index);
REPORT_INNER_ERROR("E19999", "%s(%s) output index out of range check invalid. num_output = %d, index = %d",
node_item_->NodeName().c_str(), node_item_->NodeType().c_str(),
node_item_->num_outputs, index);
GELOGE(PARAM_INVALID, "[Check][Param] %s(%s) output index out of range. num_output = %d, index = %d",
node_item_->NodeName().c_str(), node_item_->NodeType().c_str(),
node_item_->num_outputs, index);
return PARAM_INVALID;
}

@@ -289,7 +326,10 @@ Status TaskContext::AllocateOutputs(AllocationAttr *attr) {
Status TaskContext::AllocateTensor(size_t size, TensorValue &tensor, AllocationAttr *attr) {
auto buffer = TensorBuffer::Create(execution_context_->allocator, size, attr);
if (buffer == nullptr) {
GELOGE(MEMALLOC_FAILED, "Failed to allocate buffer of size: %zu", size);
REPORT_CALL_ERROR("E19999", "%s(%s) Allocate buffer failed, size: %zu",
node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size);
GELOGE(MEMALLOC_FAILED, "[Allocate][buffer] failed for %s(%s), size: %zu",
node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size);
return MEMALLOC_FAILED;
}

@@ -303,7 +343,12 @@ const NodeItem &TaskContext::GetNodeItem() const {

Status TaskContext::SetOutput(int index, const TensorValue &tensor) {
if (index < 0 || index >= node_item_->num_outputs) {
GELOGE(PARAM_INVALID, "output index out of range. num_output = %d, index = %d", node_item_->num_outputs, index);
REPORT_INNER_ERROR("E19999", "%s(%s) output index out of range check invalid. num_output = %d, index = %d",
node_item_->NodeName().c_str(), node_item_->NodeType().c_str(),
node_item_->num_outputs, index);
GELOGE(PARAM_INVALID, "[Check][Param]%s(%s) output index out of range. num_output = %d, index = %d",
node_item_->NodeName().c_str(), node_item_->NodeType().c_str(),
node_item_->num_outputs, index);
return PARAM_INVALID;
}

@@ -368,7 +413,8 @@ Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr
}

if (*buffer == nullptr) {
GELOGE(MEMALLOC_FAILED, "Failed to allocate workspace of size = %zu", size);
REPORT_CALL_ERROR("E19999", "Allocate Workspace failed, size = %zu", size);
GELOGE(MEMALLOC_FAILED, "[Allocate][Workspace] failed, size = %zu", size);
return MEMALLOC_FAILED;
}

@@ -400,11 +446,11 @@ Status TaskContext::PropagateOutputs() {
input_offset);

if (subgraph_context_->all_inputs_.size() <= static_cast<size_t>(input_offset)) {
GELOGE(INTERNAL_ERROR,
"[%s] input index out of range. index = %d, total input num = %zu",
GetNodeName(),
input_offset,
subgraph_context_->all_inputs_.size());
REPORT_INNER_ERROR("E19999",
"[%s] input index out of range check invalid. index = %d, total input num = %zu",
GetNodeName(), input_offset, subgraph_context_->all_inputs_.size());
GELOGE(INTERNAL_ERROR, "[Check][Size][%s] input index out of range. index = %d, total input num = %zu",
GetNodeName(), input_offset, subgraph_context_->all_inputs_.size());
return INTERNAL_ERROR;
}



Loading…
Cancel
Save