@@ -32,9 +32,10 @@ class HybridDavinciModel::Impl {
   }
   Status Init() {
-    GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().EnsureInitialized(), "Failed to initialize executors");
-    GE_CHK_STATUS_RET(model_.Init(), "Failed to init model.")
-    GE_CHK_STATUS_RET(executor_.Init(), "Failed to init model executor.")
+    GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().EnsureInitialized(),
+                      "[Initialize][NodeExecutorManager] failed");
+    GE_CHK_STATUS_RET(model_.Init(), "[Init][HybridModel] failed.")
+    GE_CHK_STATUS_RET(executor_.Init(), "[Init][HybridModelAsyncExecutor] failed.")
     return SUCCESS;
   }
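Note: a recurring change in this diff is rewriting bare failure strings into the "[Action][Target]" tag convention. For orientation, a minimal sketch of the check-and-return behavior behind GE_CHK_STATUS_RET (a hypothetical simplified macro, for illustration only; the real GE macro also supports printf-style varargs and richer reporting):

    #define CHK_STATUS_RET_SKETCH(expr, msg)                                                         \
      do {                                                                                           \
        const ge::Status _chk_ret = (expr);                                                          \
        if (_chk_ret != ge::SUCCESS) {                                                               \
          GELOGE(_chk_ret, "%s", (msg));  /* tagged message, e.g. "[Init][HybridModel] failed." */   \
          return _chk_ret;                /* short-circuit out of the calling function */            \
        }                                                                                            \
      } while (false)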
@@ -30,7 +30,7 @@ namespace ge {
 namespace hybrid {
 REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::COMPILED_SUBGRAPH, KnownNodeExecutor);
-Status KnownNodeTask:: ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
+Status KnownNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTaskExecuteAsync] Start");
   GELOGD("[%s] KnownNodeTask::ExecuteAsync in.", context.GetNodeName());
   if (davinci_model_->GetTaskList().empty()) {
@@ -56,7 +56,9 @@ Status KnownNodeTask:: ExecuteAsync(TaskContext &context, std::function<void()
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodertModelExecute] Start");
   rt_ret = rtModelExecute(davinci_model_->GetRtModelHandle(), context.GetStream(), 0);
   GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE,
-                  GELOGE(rt_ret, "rtModelExecute error, ret: 0x%X", rt_ret); return FAILED;);
+                  REPORT_CALL_ERROR("E19999", "rtModelExecute error, ret:0x%X", rt_ret);
+                  GELOGE(rt_ret, "[Invoke][rtModelExecute] error, ret:0x%X", rt_ret);
+                  return FAILED;);
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodertModelExecute] End");
   GE_CHK_STATUS_RET_NOLOG(context.RegisterCallback(done_callback));
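Note: the pattern introduced here, and repeated throughout this diff, pairs a user-facing error report (REPORT_CALL_ERROR / REPORT_INNER_ERROR with the generic code "E19999") with the developer-facing GELOGE log. A minimal sketch of that pairing, using the same calls as the hunk above (model_handle and stream stand in for the surrounding locals):

    rtError_t rt_ret = rtModelExecute(model_handle, stream, 0);
    if (rt_ret != RT_ERROR_NONE) {
      REPORT_CALL_ERROR("E19999", "rtModelExecute error, ret:0x%X", rt_ret);  // user-visible report
      GELOGE(rt_ret, "[Invoke][rtModelExecute] error, ret:0x%X", rt_ret);     // developer log
      return FAILED;
    }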
@@ -87,7 +89,7 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) {
   }
   GE_CHK_STATUS_RET(davinci_model_->UpdateKnownNodeArgs(inputs, outputs),
-                    "known node task update known node args failed.");
+                    "[Update][KnownNodeArgs] failed for %s.", context.GetNodeName());
   GELOGD("[%s] KnownNodeExecutor::UpdateArgs success, task_size = %zu", context.GetNodeName(),
          davinci_model_->GetTaskList().size());
   return SUCCESS;
@@ -95,15 +97,15 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) {
 Status KnownNodeTask::Init(TaskContext &context) {
   // allocate output mem
-  GE_CHK_STATUS_RET(context.AllocateOutputs(), "known node task allocate output failed.");
+  GE_CHK_STATUS_RET(context.AllocateOutputs(), "[Allocate][Outputs] failed for %s.", context.GetNodeName());
   // allocate mem base
   void *buffer = nullptr;
   if (davinci_model_->TotalMemSize() != 0) {
     RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
                            "[KnownNodeTask_AllocateWorkspace] Start");
-    GE_CHK_STATUS_RET(
-        context.AllocateWorkspace(davinci_model_->TotalMemSize(), &buffer, davinci_model_->GetRuntimeParam().mem_base),
-        "known node task allocate workspace failed.");
+    GE_CHK_STATUS_RET(context.AllocateWorkspace(davinci_model_->TotalMemSize(), &buffer,
+                                                davinci_model_->GetRuntimeParam().mem_base),
+                      "[Allocate][Workspace] failed for %s.", context.GetNodeName());
     RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
                            "[KnownNodeTask_AllocateWorkspace] End, size %zu", davinci_model_->TotalMemSize());
     // update mem base
@@ -112,8 +114,10 @@ Status KnownNodeTask::Init(TaskContext &context) {
            davinci_model_->GetRuntimeParam().mem_base, davinci_model_->GetRuntimeParam().mem_size);
   }
   GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(),
-                                                                    davinci_model_->Id(), davinci_model_->SubModelId()),
-                    "KnownNodeTask::Init destroy aicpu kernel failed.");
+                                                                    davinci_model_->Id(),
+                                                                    davinci_model_->SubModelId()),
+                    "[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u",
+                    davinci_model_->GetSessionId(), davinci_model_->Id(), davinci_model_->SubModelId());
   GELOGI("[%s] KnownNodeExecutor::Init success.", context.GetNodeName());
   return SUCCESS;
 }
@@ -121,7 +125,8 @@ Status KnownNodeTask::Init(TaskContext &context) {
 Status KnownNodeTask::InitDavinciModel(const HybridModel &model, TensorBuffer *weight_buffer) {
   GELOGD("[Init][DavinciModel] start");
   davinci_model_->InitRuntimeParams();
-  GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), "init variable mem failed");
+  GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(),
+                    "[Init][VariableMem] failed");
   int32_t device_id = 0;
   GE_CHK_RT_RET(rtGetDevice(&device_id));
   davinci_model_->SetDeviceId(static_cast<uint32_t>(device_id));
@@ -153,11 +158,13 @@ Status KnownNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) cons
   GELOGD("[%s] KnownNodeExecutor::PrepareTask in.", context.GetNodeName());
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] Start");
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorTaskInit] Start");
-  GE_CHK_STATUS_RET(task.Init(context), "known node init davinci model failed.");
+  GE_CHK_STATUS_RET(task.Init(context), "[Invoke][Init] %s known node init davinci model failed.",
+                    context.GetNodeName());
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorTaskInit] End");
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorUpdateArgs] Start");
-  GE_CHK_STATUS_RET(task.UpdateArgs(context), "known node task update args failed.");
+  GE_CHK_STATUS_RET(task.UpdateArgs(context), "[Invoke][UpdateArgs] %s known node task update args failed.",
+                    context.GetNodeName());
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorUpdateArgs] End");
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] End");
   GELOGD("[%s] KnownNodeExecutor::PrepareTask success.", context.GetNodeName());
@@ -188,7 +195,9 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node
   davinci_model->SetSubModelId(node->GetOpDesc()->GetId());
   GELOGD("KnownNodeExecutor::LoadTask node id %ld.", node->GetOpDesc()->GetId());
-  GE_CHK_STATUS_RET(davinci_model->Assign(ge_model), "KnownNodeExecutor::LoadTask davincimodel assign failed.");
+  GE_CHK_STATUS_RET(davinci_model->Assign(ge_model),
+                    "[Invoke][Assign] KnownNodeExecutor::LoadTask davinci model assign failed for node:%s.",
+                    node->GetName().c_str());
   auto known_node_task = MakeShared<KnownNodeTask>(davinci_model);
   GE_CHECK_NOTNULL(known_node_task);
@@ -201,8 +210,7 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node
 Status KnownNodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context,
                                       const std::function<void()> &callback) const {
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorExecuteTask] Start");
-  GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
-                    "Failed to execute task. node = %s",
+  GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), "[Invoke][ExecuteAsync] Failed to execute task. node = %s",
                     context.GetNodeItem().NodeName().c_str());
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorExecuteTask] End");
   return SUCCESS;
@@ -43,8 +43,7 @@ Status ControlOpNodeTask::ExecuteSubgraph(const GraphItem *subgraph,
   auto executor = MakeShared<SubgraphExecutor>(subgraph, execution_context);
   GE_CHECK_NOTNULL(executor);
   GE_CHK_STATUS_RET(executor->ExecuteAsync(task_context),
-                    "[%s] Failed to execute partitioned call.",
-                    subgraph->GetName().c_str());
+                    "[Invoke][ExecuteAsync][%s] Failed to execute partitioned call.", subgraph->GetName().c_str());
   auto callback = [executor, done_callback]() mutable {
     if (done_callback != nullptr) {
@@ -127,7 +126,7 @@ Status IfOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::functi
     auto cond_tensor = task_context.GetInput(kIfCondIndex);
     GE_CHECK_NOTNULL(cond_tensor);
     GE_CHK_STATUS_RET(ToBool(*cond_tensor, data_type, cond_val),
-                      "[%s] Failed to get cond value.",
+                      "[Invoke][ToBool][%s] Failed to get cond value.",
                       task_context.GetNodeName());
   } else {
     // true if num elements is non-zero
@@ -141,9 +140,7 @@ Status IfOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::functi
   auto subgraph = cond_val ? then_ : else_;
   GELOGD("[%s] Taking subgraph [%s] by cond = [%d]", task_context.GetNodeName(), subgraph->GetName().c_str(), cond_val);
   GE_CHK_STATUS_RET(ExecuteSubgraph(subgraph, task_context, done_callback),
-                    "[%s] Failed to execute subgraph. cond = %d",
-                    task_context.GetNodeName(),
-                    cond_val);
+                    "[Execute][Subgraph] failed for [%s]. cond = %d", task_context.GetNodeName(), cond_val);
   GELOGD("[%s] Done executing with cond = %d successfully.", task_context.GetNodeName(), cond_val);
   return SUCCESS;
@@ -201,8 +198,7 @@ Status CaseOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::func
   }
   GE_CHK_STATUS_RET(ExecuteSubgraph(subgraph, task_context, done_callback),
-                    "[%s] Failed to execute else-subgraph.",
-                    task_context.GetNodeName());
+                    "[Execute][Subgraph] failed for [%s].", task_context.GetNodeName());
   GELOGD("[%s] Done executing subgraph[%d] successfully.", task_context.GetNodeName(), branch_index);
   return SUCCESS;
@@ -228,18 +224,18 @@ Status WhileOpNodeTask::Init(const NodePtr &node, const HybridModel &model) {
 Status WhileOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::function<void()> &done_callback) const {
   if (task_context.NumInputs() != task_context.NumOutputs()) {
+    REPORT_INNER_ERROR("E19999",
+                       "[%s] Invalid while args. num_inputs = %d not equal num_outputs = %d",
+                       task_context.GetNodeName(), task_context.NumInputs(), task_context.NumOutputs());
     GELOGE(INTERNAL_ERROR,
-           "[%s] Invalid while args. num_inputs = %d, num_outputs = %d",
-           task_context.GetNodeName(),
-           task_context.NumInputs(),
-           task_context.NumOutputs());
+           "[Check][Param:task_context][%s] Invalid while args. num_inputs = %d, num_outputs = %d",
+           task_context.GetNodeName(), task_context.NumInputs(), task_context.NumOutputs());
     return INTERNAL_ERROR;
   }
   bool is_continue = false;
   GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue),
-                    "[%s] Failed to execute cond-subgraph",
-                    task_context.GetNodeName());
+                    "[Execute][Cond] failed for [%s]", task_context.GetNodeName());
   if (!is_continue) {
     for (int i = 0; i < task_context.NumInputs(); ++i) {
       auto input_tensor = task_context.GetInput(i);
@@ -269,9 +265,8 @@ Status WhileOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::fun
     ++iteration;
     GELOGD("[%s] Start to execute, iteration = %d", task_context.GetNodeName(), iteration);
     GE_CHK_STATUS_RET(ExecuteOneLoop(task_context, is_continue),
-                      "[%s] Failed to execute iteration %d.",
-                      task_context.GetNodeName(),
-                      iteration);
+                      "[Invoke][ExecuteOneLoop][%s] Failed to execute iteration %d.",
+                      task_context.GetNodeName(), iteration);
   }
   GELOGD("[%s] Quit from loop. current iteration = %d", task_context.GetNodeName(), iteration);
   if (done_callback) {
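Note: the hunks above instrument the while-op control flow: evaluate the cond subgraph once, then alternate body execution and cond re-evaluation until cond is false. A condensed sketch of that loop (method names taken from the surrounding code; error-message text abbreviated, so this is an illustration rather than the exact implementation):

    Status RunWhileSketch(TaskContext &task_context) const {
      bool is_continue = false;
      GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue), "[Execute][Cond] failed");
      int iteration = 0;
      while (is_continue) {
        ++iteration;
        // ExecuteOneLoop runs the body, moves outputs back to inputs, then re-runs cond.
        GE_CHK_STATUS_RET(ExecuteOneLoop(task_context, is_continue),
                          "[Invoke][ExecuteOneLoop] Failed to execute iteration %d.", iteration);
      }
      return SUCCESS;
    }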
@@ -299,24 +294,27 @@ Status WhileOpNodeTask::ExecuteCond(TaskContext &task_context, bool &is_continue
   auto executor = MakeShared<SubgraphExecutor>(cond_, execution_context, task_context.IsForceInferShape());
   GE_CHECK_NOTNULL(executor);
   GELOGD("[%s] Start to execute cond-subgraph.", task_context.GetNodeName());
-  GE_CHK_STATUS_RET(executor->ExecuteAsync(inputs, input_desc), "Failed to execute partitioned call.");
+  GE_CHK_STATUS_RET(executor->ExecuteAsync(inputs, input_desc),
+                    "[Invoke][ExecuteAsync] %s Failed to execute partitioned call.", task_context.GetNodeName());
   GELOGD("[%s] Done executing cond-subgraph successfully.", cond_->GetName().c_str());
   GE_CHK_STATUS_RET_NOLOG(task_context.RegisterCallback([executor]() mutable {
     executor.reset();
   }));
   // get cond output
-  GE_CHK_STATUS_RET(executor->Synchronize(), "[%s] Failed to sync cond-subgraph result.", cond_->GetName().c_str());
+  GE_CHK_STATUS_RET(executor->Synchronize(),
+                    "[Invoke][Synchronize][%s] Failed to sync cond-subgraph result.", cond_->GetName().c_str());
   std::vector<TensorValue> cond_outputs;
   std::vector<ConstGeTensorDescPtr> cond_output_desc_list;
   GE_CHK_STATUS_RET(executor->GetOutputs(cond_outputs, cond_output_desc_list),
-                    "[%s] Failed to get cond-output.",
-                    cond_->GetName().c_str());
+                    "[Invoke][GetOutputs][%s] Failed to get cond-output.", cond_->GetName().c_str());
   if (cond_outputs.size() != kCondOutputSize || cond_output_desc_list.size() != kCondOutputSize) {
+    REPORT_INNER_ERROR("E19999", "[%s] Number of cond outputs(%zu) or size of cond output desc(%zu) "
+                       "not equal %zu, check invalid", task_context.GetNodeName(), cond_outputs.size(),
+                       cond_output_desc_list.size(), kCondOutputSize);
     GELOGE(INTERNAL_ERROR,
-           "[%s] Number of cond outputs is invalid. number = %zu",
-           task_context.GetNodeName(),
-           cond_outputs.size());
+           "[Check][Size][%s] Number of cond outputs(%zu) or number of cond output desc(%zu) not equal %zu",
+           task_context.GetNodeName(), cond_outputs.size(), cond_output_desc_list.size(), kCondOutputSize);
     return INTERNAL_ERROR;
   }
@@ -325,8 +323,7 @@ Status WhileOpNodeTask::ExecuteCond(TaskContext &task_context, bool &is_continue
   if (shape.IsScalar()) {
     auto data_type = cond_tensor_desc->GetDataType();
     GE_CHK_STATUS_RET(ToBool(cond_outputs[0], data_type, is_continue),
-                      "[%s] Failed to get cond value.",
-                      task_context.GetNodeName());
+                      "[Invoke][ToBool][%s] Failed to get cond value.", task_context.GetNodeName());
   } else {
     // true if num elements is non-zero
     is_continue = shape.GetShapeSize() > 0;
@@ -367,17 +364,15 @@ Status WhileOpNodeTask::MoveOutputs2Inputs(TaskContext &task_context) {
 Status WhileOpNodeTask::ExecuteOneLoop(TaskContext &task_context, bool &is_continue) const {
   GELOGD("[%s] Start to execute body-subgraph.", task_context.GetNodeName());
   GE_CHK_STATUS_RET(ExecuteSubgraph(body_, task_context, nullptr),
-                    "[%s] Failed to execute cond-subgraph", task_context.GetNodeName());
+                    "[Execute][Subgraph] failed for [%s]", task_context.GetNodeName());
   GELOGD("[%s] Done executing body-subgraph successfully.", task_context.GetNodeName());
   // set outputs to inputs for next iteration
   GE_CHK_STATUS_RET(MoveOutputs2Inputs(task_context),
-                    "[%s] Failed to move outputs to inputs",
-                    task_context.GetNodeName());
+                    "[Move][Outputs2Inputs] failed for [%s]", task_context.GetNodeName());
   GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue),
-                    "[%s] Failed to execute cond-subgraph",
-                    task_context.GetNodeName());
+                    "[Invoke][ExecuteCond][%s] Failed to execute cond-subgraph", task_context.GetNodeName());
   if (!is_continue) {
     for (int i = 0; i < task_context.NumInputs(); ++i) {
@@ -404,12 +399,14 @@ Status ControlOpNodeExecutor::LoadTask(const HybridModel &model,
   } else if (node_type == WHILE || node_type == STATELESSWHILE) {
     node_task.reset(new(std::nothrow) WhileOpNodeTask());
   } else {
-    GELOGE(PARAM_INVALID, "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str());
+    REPORT_INNER_ERROR("E19999", "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str());
+    GELOGE(PARAM_INVALID, "[Check][NodeType][%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str());
     return PARAM_INVALID;
   }
   GE_CHECK_NOTNULL(node_task);
-  GE_CHK_STATUS_RET(node_task->Init(node, model), "[%s] Failed to init ControlOpNodeTask.", node->GetName().c_str());
+  GE_CHK_STATUS_RET(node_task->Init(node, model),
+                    "[Invoke][Init][%s] Failed to init ControlOpNodeTask.", node->GetName().c_str());
   task = std::move(node_task);
   return SUCCESS;
@@ -47,7 +47,9 @@ Status RefInputTask::UpdateArgs(TaskContext &) {
 Status RefInputTask::Execute(TaskContext &context) {
   auto iter = out_ref_input_index_.find(node_type_);
   if (iter == out_ref_input_index_.end()) {
-    GELOGE(UNSUPPORTED, "node %s type %s can not use RefInputTask.",
+    REPORT_INNER_ERROR("E19999", "node %s type %s can not use RefInputTask.",
+                       node_name_.c_str(), node_type_.c_str());
+    GELOGE(UNSUPPORTED, "[Find][Node]node %s type %s can not use RefInputTask.",
            node_name_.c_str(), node_type_.c_str());
     return UNSUPPORTED;
   }
@@ -65,7 +67,9 @@ Status RefInputTask::RefOneByOne(TaskContext &context) {
   int input_num = context.NumInputs();
   int output_num = context.NumOutputs();
   if (output_num > input_num) {
-    GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only %d inputs, can't ref one by one.",
+    REPORT_INNER_ERROR("E19999", "node %s type %s has %d outputs but only %d inputs, can't ref one by one.",
+                       node_name_.c_str(), node_type_.c_str(), output_num, input_num);
+    GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d outputs but only %d inputs, can't ref one by one.",
           node_name_.c_str(), node_type_.c_str(), output_num, input_num);
     return INTERNAL_ERROR;
   }
@@ -84,7 +88,9 @@ Status RefInputTask::RefByOrder(const std::vector<uint32_t> &ref_order, TaskCont
   GELOGI("node %s type %s ref input by order begin.", node_name_.c_str(), node_type_.c_str());
   int32_t output_num = context.NumOutputs();
   if (ref_order.size() != static_cast<size_t>(output_num)) {
-    GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only has %zu out ref index.",
+    REPORT_INNER_ERROR("E19999", "node %s type %s has %d outputs but only has %zu out ref index.",
+                       node_name_.c_str(), node_type_.c_str(), output_num, ref_order.size());
+    GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d outputs but only has %zu out ref index.",
           node_name_.c_str(), node_type_.c_str(), output_num, ref_order.size());
     return INTERNAL_ERROR;
   }
@@ -102,7 +108,7 @@ Status RefInputTask::RefByOrder(const std::vector<uint32_t> &ref_order, TaskCont
 Status RefInputTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[RefInputTaskExecuteAsync] Start");
-  GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s ref input task execute failed",
+  GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute]node:%s type:%s ref input task execute failed",
                     node_name_.c_str(), node_type_.c_str());
   if (done_callback != nullptr) {
     // host cpu no need register callback, call it directly.
@@ -126,20 +132,26 @@ Status DependInputShapeTask::Execute(TaskContext &context) {
   std::string node_type = node_->GetType();
   auto kernel = factory.Create(node_type);
   if (kernel == nullptr) {
-    GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.",
+    REPORT_CALL_ERROR("E19999", "Create kernel failed, node %s type %s is not supported by host kernel.",
+                      node_->GetName().c_str(), node_type.c_str());
+    GELOGE(UNSUPPORTED, "[Invoke][Create]node %s type %s is not supported by host kernel.",
           node_->GetName().c_str(), node_type.c_str());
     return UNSUPPORTED;
   }
   std::vector<GeTensorPtr> outputs;
   Status compute_ret = kernel->Compute(node_, outputs);
   if (compute_ret != SUCCESS) {
-    GELOGE(compute_ret, "node %s type %s compute failed or not imply.",
+    REPORT_CALL_ERROR("E19999", "node %s type %s compute failed.", node_->GetName().c_str(), node_type.c_str());
+    GELOGE(compute_ret, "[Invoke][Compute]node %s type %s compute failed or not imply.",
           node_->GetName().c_str(), node_type.c_str());
     return compute_ret;
   }
   int32_t output_num = context.NumOutputs();
   if (static_cast<size_t>(output_num) != outputs.size()) {
-    GELOGE(INTERNAL_ERROR, "node %s type %s has %d output, but kernel compute only has %zu output.",
+    REPORT_INNER_ERROR("E19999", "node %s type %s has %d output, "
+                       "but kernel compute only has %zu output. check invalid",
+                       node_->GetName().c_str(), node_type.c_str(), output_num, outputs.size());
+    GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d output, but kernel compute only has %zu output.",
           node_->GetName().c_str(), node_type.c_str(), output_num, outputs.size());
     return INTERNAL_ERROR;
   }
@@ -155,7 +167,11 @@ Status DependInputShapeTask::Execute(TaskContext &context) {
     auto tensor_value = context.MutableOutput(i);
     GE_CHECK_NOTNULL(tensor_value);
     if (tensor_data.GetSize() > tensor_value->GetSize()) {
-      GELOGE(INTERNAL_ERROR, "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.",
+      REPORT_INNER_ERROR("E19999", "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu. "
+                         "check invalid",
+                         node_->GetName().c_str(), node_type.c_str(), i,
+                         tensor_data.GetSize(), tensor_value->GetSize());
+      GELOGE(INTERNAL_ERROR, "[Check][Size]node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.",
             node_->GetName().c_str(), node_type.c_str(), i, tensor_data.GetSize(), tensor_value->GetSize());
       return INTERNAL_ERROR;
     }
@@ -180,7 +196,7 @@ Status DependInputShapeTask::Execute(TaskContext &context) {
 Status DependInputShapeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
                          "[DependInputShapeTaskExecuteAsync] Start");
-  GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s depend input shape task execute failed",
+  GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute]node:%s type:%s depend input shape task execute failed",
                     node_->GetName().c_str(), node_->GetType().c_str());
   if (done_callback != nullptr) {
     // host cpu no need register callback, call it directly.
@@ -213,7 +229,8 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model,
            node->GetName().c_str(), node_type.c_str());
     task = MakeShared<RefInputTask>(node);
     if (task == nullptr) {
-      GELOGE(MEMALLOC_FAILED, "create RefInputTask for node %s failed.", node->GetName().c_str());
+      REPORT_CALL_ERROR("E19999", "Create RefInputTask failed for node %s.", node->GetName().c_str());
+      GELOGE(MEMALLOC_FAILED, "[Create][RefInputTask] failed for node %s.", node->GetName().c_str());
       return MEMALLOC_FAILED;
     }
   } else if (DependInputShapeTask::IsBelong(node_type)) {
@@ -221,7 +238,9 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model,
            node->GetName().c_str(), node_type.c_str());
     task = MakeShared<DependInputShapeTask>(node);
     if (task == nullptr) {
-      GELOGE(MEMALLOC_FAILED, "create DependInputShapeTask for node %s type %s failed.",
+      REPORT_CALL_ERROR("E19999", "Create DependInputShapeTask failed for node %s type %s.",
+                        node->GetName().c_str(), node_type.c_str());
+      GELOGE(MEMALLOC_FAILED, "[Create][DependInputShapeTask] failed for node %s type %s.",
             node->GetName().c_str(), node_type.c_str());
       return MEMALLOC_FAILED;
     }
@@ -229,7 +248,8 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model,
     GELOGI("node %s type %s, use ConstantNodeTask.", node->GetName().c_str(), node_type.c_str());
     auto tensor = model.GetTensor(node);
     if (tensor == nullptr) {
-      GELOGE(INTERNAL_ERROR, "Failed to get tensor by name: %s", node->GetName().c_str());
+      REPORT_CALL_ERROR("E19999", "GetTensor failed for name: %s", node->GetName().c_str());
+      GELOGE(INTERNAL_ERROR, "[Get][Tensor] failed for name: %s", node->GetName().c_str());
       return INTERNAL_ERROR;
     }
@@ -251,7 +271,7 @@ Status ConstantNodeTask::UpdateArgs(TaskContext &context) {
 Status ConstantNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   GELOGD("[%s] Start execute.", context.GetNodeName());
-  GE_CHK_STATUS_RET(context.SetOutput(0, *tensor_), "[%s] Failed to set output.", context.GetNodeName());
+  GE_CHK_STATUS_RET(context.SetOutput(0, *tensor_), "[Set][Output] failed for [%s].", context.GetNodeName());
   if (done_callback) {
     GELOGD("[%s] Start invoke callback.", context.GetNodeName());
     done_callback();
@@ -43,13 +43,15 @@ REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::HCCL, HcclNode
 Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   GELOGI("[%s] HcclNodeTask::ExecuteAsync in.", context.GetNodeName());
   if (context.handle_ == nullptr) {
-    GELOGE(FAILED, "hccl handle is nullptr! ");
+    REPORT_INNER_ERROR("E19999", "%s invalid, hccl handle is nullptr!", context.GetNodeName());
+    GELOGE(FAILED, "[Check][Param:context] %s hccl handle is nullptr!", context.GetNodeName());
     return FAILED;
   }
   auto HcomExecEnqueueOperation = (HcclResult(*)(HcomOperation, std::function<void(HcclResult status)>))dlsym(
       context.handle_, "HcomExecEnqueueOperation");
   if (HcomExecEnqueueOperation == nullptr) {
-    GELOGE(FAILED, "Failed to invoke HcomExecEnqueueOperation hcom unknown node function.");
+    GELOGE(FAILED, "[Invoke][HcomExecEnqueueOperation] failed for %s hcom unknown node function.",
+           context.GetNodeName());
     if (dlclose(context.handle_) != 0) {
       GELOGW("Failed to close handle %s", dlerror());
     }
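Note: the HCCL executor binds Hcom entry points at runtime via dlsym rather than linking them directly. A self-contained sketch of that function-pointer pattern (the helper name is hypothetical; the real code casts inline to the exact Hcom signatures, as in the hunk above):

    #include <dlfcn.h>

    // Resolve `name` from an open shared-object handle, cast to the expected type.
    // Returns nullptr when the symbol is absent, matching the checks in this diff.
    template <typename Fn>
    Fn ResolveHcomSymbol(void *handle, const char *name) {
      return reinterpret_cast<Fn>(dlsym(handle, name));
    }

    // Usage, mirroring the hunk above:
    //   auto enqueue = ResolveHcomSymbol<HcclResult (*)(HcomOperation,
    //       std::function<void(HcclResult)>)>(context.handle_, "HcomExecEnqueueOperation");
    //   if (enqueue == nullptr) { /* log the failure, dlclose the handle, return FAILED */ }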
@@ -83,24 +85,35 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
   ge::DataType src_data_type = input_desc->GetDataType();
   auto iter = kConstOpHcclDataType.find(static_cast<int64_t>(src_data_type));
   if (iter == kConstOpHcclDataType.end()) {
-    GELOGE(PARAM_INVALID, "kConstOpHcclDataType find failed.");
+    REPORT_INNER_ERROR("E19999", "%s inputdesc0 datatype:%s not supported.",
+                       op_desc->GetName().c_str(),
+                       TypeUtils::DataTypeToSerialString(src_data_type).c_str());
+    GELOGE(PARAM_INVALID, "[Find][DataType]%s inputdesc0 datatype:%s not supported.",
+           op_desc->GetName().c_str(),
+           TypeUtils::DataTypeToSerialString(src_data_type).c_str());
     return PARAM_INVALID;
   }
   op_info.dataType = iter->second;
   HcclReduceOp op_type = HCCL_REDUCE_SUM;
   if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HCOMREDUCESCATTER ||
       op_desc->GetType() == HVDCALLBACKALLREDUCE || op_desc->GetType() == HCOMREDUCE) {
-    GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), "GetHcclOperationType failed");
+    GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type),
+                      "[Get][HcclOperationType] failed for %s type:%s", op_desc->GetName().c_str(),
+                      op_desc->GetType().c_str());
     op_info.opType = op_type;
   }
   int64_t root_id = 0;
   if (op_desc->GetType() == HCOMBROADCAST) {
-    GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id), "GetHcclRootId failed");
+    GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id),
+                      "[Get][HcclRootId] failed for %s type:%s", op_desc->GetName().c_str(),
+                      op_desc->GetType().c_str());
   }
   op_info.root = root_id;
   auto callback = [op_desc, done_callback](HcclResult status) {
     if (status != HCCL_SUCCESS) {
-      GELOGE(HCCL_E_INTERNAL, "node %s call HcomExecEnqueueOperation failed, ret: 0x%X",
+      REPORT_CALL_ERROR("E19999", "call HcomExecEnqueueOperation failed for node %s, ret: 0x%X",
+                        op_desc->GetName().c_str(), status);
+      GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueOperation] failed for node %s, ret: 0x%X",
             op_desc->GetName().c_str(), status);
     }
@@ -110,14 +123,18 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
   int32_t count = 0;
   GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomCount(op_desc, static_cast<HcclDataType>(op_info.dataType),
                                               op_desc->GetType() == HCOMALLGATHER, count),
-                    "GetHcomCount failed");
+                    "[Get][HcomCount] failed for %s type:%s", op_desc->GetName().c_str(),
+                    op_desc->GetType().c_str());
   GELOGI("[%s] HcclNodeTask::ExecuteAsync hccl_type %s, count %d, data_type %d, op_type %d, root %d.",
          context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root);
   op_info.count = count;
   HcclResult hccl_ret = HcomExecEnqueueOperation(op_info, callback);
   if (hccl_ret != HCCL_SUCCESS) {
-    GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret);
+    REPORT_CALL_ERROR("E19999", "Call HcomExecEnqueueOperation failed for node:%s(%s), ret: 0x%X",
+                      op_desc->GetName().c_str(), op_desc->GetType().c_str(), hccl_ret);
+    GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueOperation] failed for node:%s(%s), ret: 0x%X",
+           op_desc->GetName().c_str(), op_desc->GetType().c_str(), hccl_ret);
     return HCCL_E_INTERNAL;
   }
@@ -173,13 +190,23 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector<HcomRemoteAccess
       GELOGD("data is null, no need to do rdma read/write, node=%s", context.GetNodeName());
       return SUCCESS;
     } else {
-      GELOGE(FAILED, "Tensor data is nullptr.");
+      REPORT_INNER_ERROR("E19999", "Tensor data is nullptr, and kRdmaScatterTypes does not contain %s",
+                         context.GetNodeItem().NodeType().c_str());
+      GELOGE(FAILED, "[Find][NodeType]Tensor data is nullptr, and kRdmaScatterTypes does not contain %s",
+             context.GetNodeItem().NodeType().c_str());
       return FAILED;
     }
   }
   auto dims = remote_tensor.GetTensorDesc().GetShape().GetDims();
   if (dims.size() != kVarTableDims && dims.back() != kVarTableRowCnt) {
-    GELOGE(PARAM_INVALID, "Variable table shape check failed");
+    REPORT_INNER_ERROR("E19999", "Variable table shape check failed, number of shape dims:%zu not equal expect:%zu "
+                       "and shape dims back:%zu not equal expect:%zu, node:%s(%s)",
+                       dims.size(), kVarTableDims, dims.back(), kVarTableRowCnt,
+                       context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
+    GELOGE(PARAM_INVALID, "[Check][Param]Variable table shape check failed, "
+           "number of shape dims:%zu not equal expect:%zu and shape dims back:%zu not equal expect:%zu, node:%s(%s)",
+           dims.size(), kVarTableDims, dims.back(), kVarTableRowCnt,
+           context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
     return PARAM_INVALID;
   }
@@ -222,7 +249,11 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector<HcomRemoteAccess
     Tensor offset_tensor;
     GE_CHK_STATUS_RET(ctx->GetTensor(offset_index_.first, offset_index_.second, offset_tensor))
     if (static_cast<int64_t>(offset_tensor.GetSize() / GetSizeByDataType(data_type)) != row_num) {
-      GELOGE(PARAM_INVALID, "num of offset and remote addr mismatch, offset size=%zu, remote_addr size=%ld, dtype=%s",
+      REPORT_INNER_ERROR("E19999", "num of offset and remote addr mismatch, check invalid, "
+                         "offset size=%zu, remote_addr size=%ld, dtype=%s", offset_tensor.GetSize(), row_num,
+                         TypeUtils::DataTypeToSerialString(data_type).c_str());
+      GELOGE(PARAM_INVALID, "[Check][Size]num of offset and remote addr mismatch, "
+             "offset size=%zu, remote_addr size=%ld, dtype=%s",
             offset_tensor.GetSize(), row_num, TypeUtils::DataTypeToSerialString(data_type).c_str());
       return PARAM_INVALID;
     }
@@ -244,7 +275,9 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector<HcomRemoteAccess
       auto local_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(tv->MutableData()));
       auto device_len = tv->GetSize() / row_num;
       if (device_len <= 0 || device_len > data[kVarTableIdxLen]) {
-        GELOGE(FAILED, "Local embedding length is out of range, expect %ld, but %ld exactly.",
+        REPORT_INNER_ERROR("E19999", "Local embedding length is out of range, expect %ld, but %ld exactly.",
+                           data[kVarTableIdxLen], device_len);
+        GELOGE(FAILED, "[Check][Size]Local embedding length is out of range, expect %ld, but %ld exactly.",
               data[kVarTableIdxLen], device_len);
         return FAILED;
       }
@@ -267,7 +300,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
       (HcclResult(*)(const string &, const vector<HcomRemoteAccessAddrInfo> &,
                      std::function<void(HcclResult status)>))dlsym(context.handle_, "HcomExecEnqueueRemoteAccess");
   if (HcomExecEnqueueRemoteAccess == nullptr) {
-    GELOGE(FAILED, "Failed to invoke HcomExecEnqueueRemoteAccess hcom unknown node function.");
+    GELOGE(FAILED, "[Invoke][HcomExecEnqueueRemoteAccess] failed for node:%s(%s) hcom unknown node function.",
+           context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
     if (dlclose(context.handle_) != 0) {
       GELOGW("Failed to close handle %s", dlerror());
     }
@@ -283,7 +317,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
   TaskContext *p_ctx = &context;
   auto callback = [p_ctx, done_callback](HcclResult status) {
     if (status != HCCL_SUCCESS) {
-      GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", status);
+      GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueRemoteAccess] failed for node:%s(%s), ret: 0x%X",
+             p_ctx->GetNodeName(), p_ctx->GetNodeItem().NodeType().c_str(), status);
       p_ctx->SetStatus(FAILED);
     }
     done_callback();
@@ -296,7 +331,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
   }
   HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback);
   if (hccl_ret != HCCL_SUCCESS) {
-    GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret);
+    GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueRemoteAccess] failed for node:%s(%s), ret: 0x%X",
+           context.GetNodeName(), context.GetNodeItem().NodeType().c_str(), hccl_ret);
     return HCCL_E_INTERNAL;
   }
@@ -314,13 +350,17 @@ Status HcclNodeTask::Init(TaskContext &context) {
 Status HcclNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
   GELOGI("[%s] HcclNodeExecutor::PrepareTask in.", context.GetNodeName());
-  GE_CHK_STATUS_RET(task.Init(context), "hccl node load hccl so failed.");
+  GE_CHK_STATUS_RET(task.Init(context), "[Invoke][Init]hccl node %s(%s) load hccl so failed.",
+                    context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
   // allocate output mem, output mem or remote read will be calculated when node execute.
   if (kRdmaReadTypes.count(context.GetNodeItem().NodeType()) == 0) {
-    GE_CHK_STATUS_RET(context.AllocateOutputs(), "hccl node task allocate output failed.");
+    GE_CHK_STATUS_RET(context.AllocateOutputs(),
+                      "[Invoke][AllocateOutputs]hccl node %s(%s) task allocate output failed.",
+                      context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
   }
-  GE_CHK_STATUS_RET(task.UpdateArgs(context), "hccl node task update args failed.");
+  GE_CHK_STATUS_RET(task.UpdateArgs(context), "[Update][Args] failed for hccl node %s(%s).",
+                    context.GetNodeName(), context.GetNodeItem().NodeType().c_str());
   GELOGI("[%s] HcclNodeExecutor::PrepareTask success.", context.GetNodeName());
   return SUCCESS;
 }
@@ -341,8 +381,9 @@ Status HcclNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node,
 Status HcclNodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context,
                                      const std::function<void()> &callback) const {
   context.handle_ = handle_;
-  GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), "Failed to execute task. node = %s",
-                    context.GetNodeItem().NodeName().c_str());
+  GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
+                    "[Invoke][ExecuteAsync] failed to execute task. node:%s(%s)",
+                    context.GetNodeItem().NodeName().c_str(), context.GetNodeItem().NodeType().c_str());
   return SUCCESS;
 }
@@ -359,12 +400,13 @@ Status HcclNodeExecutor::Initialize() {
   GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str());
   handle_ = dlopen(canonical_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
   if (handle_ == nullptr) {
-    GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror());
+    REPORT_CALL_ERROR("E19999", "Open SoFile %s failed, error:%s! ", canonical_path.c_str(), dlerror());
+    GELOGE(GE_PLGMGR_SO_NOT_EXIST, "[Open][SoFile] %s failed, error:%s! ", canonical_path.c_str(), dlerror());
     return FAILED;
   }
   auto HcomExecInitialize = (HcclResult(*)())dlsym(handle_, "HcomExecInitialize");
   if (HcomExecInitialize == nullptr) {
-    GELOGE(FAILED, "Failed to invoke HcomExecInitialize hcom unknown node function.");
+    GELOGE(FAILED, "[Invoke][HcomExecInitialize] failed for hcom unknown node function.");
     return FAILED;
   }
   HcclResult hccl_ret = HcomExecInitialize();
@@ -373,7 +415,7 @@ Status HcclNodeExecutor::Initialize() {
   } else if (hccl_ret == HCCL_SUCCESS) {
     GELOGI("Hcom executor initialize success.");
   } else {
-    GELOGE(FAILED, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret);
+    GELOGE(FAILED, "[Call][HcomExecInitialize] failed, ret: 0x%X", hccl_ret);
     return FAILED;
   }
   return SUCCESS;
@@ -382,12 +424,12 @@ Status HcclNodeExecutor::Initialize() {
 Status HcclNodeExecutor::Finalize() {
   auto HcomExecFinalize = (HcclResult(*)())dlsym(handle_, "HcomExecFinalize");
   if (HcomExecFinalize == nullptr) {
-    GELOGE(FAILED, "Failed to invoke HcomExecFinalize hcom unknown node function.");
+    GELOGE(FAILED, "[Invoke][HcomExecFinalize] failed for hcom unknown node function.");
     return FAILED;
   }
   HcclResult hccl_ret = HcomExecFinalize();
   if (hccl_ret != HCCL_SUCCESS) {
-    GELOGE(FAILED, "Call HcomExecFinalize failed, ret: 0x%X", hccl_ret);
+    GELOGE(FAILED, "[Call][HcomExecFinalize] failed, ret: 0x%X", hccl_ret);
     return FAILED;
   }
   // dlclose file handle
@@ -33,7 +33,7 @@ Status HostNodeTaskBase::UpdateArgs(TaskContext &) {
 Status HostNodeTaskBase::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   GELOGD("[%s] Start execute.", context.GetNodeName());
-  GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s, task execute failed.",
+  GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute] failed for node:%s type:%s.",
                     node_->GetName().c_str(), node_->GetType().c_str())
   if (done_callback) {
     GELOGD("[%s] Start invoke callback.", context.GetNodeName());
@@ -70,7 +70,8 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) {
     AllocationAttr attr;
     attr.SetMemType(HOST_DDR);
     if (context.AllocateOutput(i, output_desc, nullptr, &attr) != SUCCESS) {
-      GELOGE(FAILED, "node:%s Failed to allocate output %d", context.GetNodeName(), i);
+      REPORT_CALL_ERROR("E19999", "node:%s Failed to allocate output %d", context.GetNodeName(), i);
+      GELOGE(FAILED, "[Invoke][AllocateOutput]node:%s Failed to allocate output %d", context.GetNodeName(), i);
       return FAILED;
     }
     auto tensor = context.GetOutput(i);
@@ -92,14 +93,18 @@ Status HostCpuNodeTask::Execute(TaskContext &context) { | |||||
RunContext run_context; | RunContext run_context; | ||||
auto host_kernel = hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node_); | auto host_kernel = hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node_); | ||||
if (host_kernel == nullptr) { | if (host_kernel == nullptr) { | ||||
GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.", | |||||
REPORT_CALL_ERROR("E19999", "CreateKernel failed for node %s type %s is not supported by host kernel.", | |||||
node_->GetName().c_str(), node_->GetType().c_str()); | |||||
GELOGE(UNSUPPORTED, "[Create][Kernel]node %s type %s is not supported by host kernel.", | |||||
node_->GetName().c_str(), node_->GetType().c_str()); | node_->GetName().c_str(), node_->GetType().c_str()); | ||||
return UNSUPPORTED; | return UNSUPPORTED; | ||||
} | } | ||||
Status compute_ret = host_kernel->Compute(context); | Status compute_ret = host_kernel->Compute(context); | ||||
if (compute_ret != SUCCESS) { | if (compute_ret != SUCCESS) { | ||||
GELOGE(compute_ret, "node %s type %s compute failed or not imply.", | |||||
REPORT_CALL_ERROR("E19999", "node %s type %s compute failed.", | |||||
node_->GetName().c_str(), node_->GetType().c_str()); | |||||
GELOGE(compute_ret, "[Invoke][Compute]node %s type %s compute failed or not imply.", | |||||
node_->GetName().c_str(), node_->GetType().c_str()); | node_->GetName().c_str(), node_->GetType().c_str()); | ||||
return compute_ret; | return compute_ret; | ||||
} | } | ||||
@@ -131,7 +136,10 @@ Status HostCpuNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &no | |||||
task = MakeShared<HostCpuNodeTask>(node); | task = MakeShared<HostCpuNodeTask>(node); | ||||
GE_CHECK_NOTNULL(task); | GE_CHECK_NOTNULL(task); | ||||
} else { | } else { | ||||
GELOGE(UNSUPPORTED, "node %s type %s is not support in HostCpuNodeExecutor now.", name.c_str(), type.c_str()); | |||||
REPORT_INNER_ERROR("E19999", "Create NodeTask failed for node %s type %s.", | |||||
name.c_str(), type.c_str()); | |||||
GELOGE(UNSUPPORTED, "[Create][NodeTask]node %s type %s is not support in HostCpuNodeExecutor now.", | |||||
name.c_str(), type.c_str()); | |||||
return UNSUPPORTED; | return UNSUPPORTED; | ||||
} | } | ||||
return SUCCESS; | return SUCCESS; | ||||
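The recurring shape in these hunks: each failure site now emits two records, a REPORT_CALL_ERROR / REPORT_INNER_ERROR carrying error code E19999 for the user-facing error store, and a GELOGE with a [Verb][Object] tag for the developer log. A minimal sketch of how such a pairing could be kept in sync behind one helper; the wrapper and its names are illustrative, not GE's actual API:

```cpp
#include <cstdio>
#include <string>

// Hypothetical stand-ins for the two logging channels used in the diff.
void ReportError(const char *code, const std::string &msg) {
  std::fprintf(stderr, "[REPORT %s] %s\n", code, msg.c_str());    // user-facing error store
}
void LogError(int status, const std::string &msg) {
  std::fprintf(stderr, "[GELOGE %d] %s\n", status, msg.c_str());  // developer log
}

// Emit both records from one message so they never drift apart.
void ReportAndLog(int status, const std::string &tag, const std::string &detail) {
  ReportError("E19999", detail);
  LogError(status, tag + detail);
}

// e.g. ReportAndLog(/*UNSUPPORTED*/ 5, "[Create][NodeTask]", "node Foo type Bar is not supported");
```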
@@ -34,7 +34,9 @@ Status AssignKernel::Compute(TaskContext& context) {
   const auto value_tensor = context.GetInput(kAssignValueInputIndex);
   GE_CHECK_NOTNULL(value_tensor);
   if (value_tensor->GetSize() > ref_tensor->GetSize()) {
-    GELOGE(INTERNAL_ERROR, "[%s] value_input_size=%zu, but ref_input_size=%zu.",
+    REPORT_INNER_ERROR("E19999", "[%s] value_input_size=%zu bigger than ref_input_size=%zu. check invalid",
+                       node_->GetName().c_str(), value_tensor->GetSize(), ref_tensor->GetSize());
+    GELOGE(INTERNAL_ERROR, "[Check][Size][%s] value_input_size=%zu, but ref_input_size=%zu.",
            node_->GetName().c_str(), value_tensor->GetSize(), ref_tensor->GetSize());
     return INTERNAL_ERROR;
   }
@@ -46,7 +48,7 @@ Status AssignKernel::Compute(TaskContext& context) {
                          value_tensor->GetSize(), RT_MEMCPY_HOST_TO_HOST));
   }
   GE_CHK_STATUS_RET(context.SetOutput(kAssignRefOutputIndex, *ref_tensor),
-                    "[%s] Failed to set output.", context.GetNodeName());
+                    "[Set][Output] failed for [%s].", context.GetNodeName());
   GELOGD("[%s] compute success.", node_->GetName().c_str());
   return SUCCESS;
@@ -30,7 +30,8 @@ namespace host_cpu {
 Status DataKernel::Compute(TaskContext& context) {
   auto input = context.MutableInput(kDataInputIndex);
   GE_CHECK_NOTNULL(input);
-  GE_CHK_STATUS_RET(context.SetOutput(kDataOutputIndex, *input), "[%s] Failed to set output.", context.GetNodeName())
+  GE_CHK_STATUS_RET(context.SetOutput(kDataOutputIndex, *input),
+                    "[Set][Output] failed for [%s].", context.GetNodeName())
   GELOGD("[%s] compute success.", node_->GetName().c_str());
   return SUCCESS;
 }
@@ -36,31 +36,41 @@ Status RandomUniformKernel::Compute(TaskContext& context) {
   (void)AttrUtils::GetInt(node_->GetOpDesc(), "seed2", seed2);
   DataType data_type = DT_FLOAT;
   if (!AttrUtils::GetDataType(node_->GetOpDesc(), kAttrDtype, data_type)) {
-    GELOGE(PARAM_INVALID, "[%s] get attr dtype failed.", node_->GetName().c_str());
+    REPORT_CALL_ERROR("E19999", "GetDataType failed for [%s].", node_->GetName().c_str());
+    GELOGE(PARAM_INVALID, "[Get][DataType] failed for [%s].", node_->GetName().c_str());
     return PARAM_INVALID;
   }
   switch (data_type) {
     case DT_FLOAT16:
       if (GenerateFP16(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) {
-        GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_FLOAT");
+        GELOGE(FAILED, "[Invoke][GenerateFP16]Generate random_distribution failed for %s, data_type=DT_FLOAT16",
+               node_->GetName().c_str());
        return FAILED;
       }
       break;
     case DT_FLOAT:
       if (Generate<float>(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) {
-        GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_FLOAT");
+        GELOGE(FAILED, "[Invoke][Generate]Generate random_distribution failed for %s, data_type=DT_FLOAT",
+               node_->GetName().c_str());
         return FAILED;
       }
       break;
     case DT_DOUBLE:
       if (Generate<double>(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) {
-        GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_DOUBLE");
+        GELOGE(FAILED, "[Invoke][Generate]Generate random_distribution failed for %s, data_type=DT_DOUBLE",
+               node_->GetName().c_str());
         return FAILED;
       }
       break;
     default:
-      GELOGE(UNSUPPORTED, "Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE, but data_type=%s",
-             TypeUtils::DataTypeToSerialString(data_type).c_str());
+      REPORT_INNER_ERROR("E19999", "[Check][DataType]Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE, "
+                         "but data_type=%s, node:%s",
+                         TypeUtils::DataTypeToSerialString(data_type).c_str(),
+                         node_->GetName().c_str());
+      GELOGE(UNSUPPORTED, "[Check][DataType]Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE, "
+             "but data_type=%s, node:%s",
+             TypeUtils::DataTypeToSerialString(data_type).c_str(),
+             node_->GetName().c_str());
       return UNSUPPORTED;
   }
@@ -79,7 +89,7 @@ Status RandomUniformKernel::Generate(const ge::OpDescPtr &op_desc_ptr, int64_t s
   auto tensor_size = data_num * sizeof(T);
   TensorValue tensor;
   GE_CHK_STATUS_RET(context.AllocateTensor(tensor_size, tensor, &attr),
-                    "[%s] Failed to allocate output of size %zu",
+                    "[Invoke][AllocateTensor][%s] Failed to allocate output of size %zu",
                     context.GetNodeName(),
                     tensor_size);
@@ -101,7 +111,7 @@ Status RandomUniformKernel::Generate(const ge::OpDescPtr &op_desc_ptr, int64_t s
     *(buf + i) = distribution(gen);
   }
-  GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[%s] Failed to set output.", context.GetNodeName());
+  GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[Set][Output] failed for [%s].", context.GetNodeName());
   return SUCCESS;
 }
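For context on the loop in the second hunk: Generate<T> fills the freshly allocated buffer with samples drawn from a C++ <random> distribution, seeded from the op's seed/seed2 attributes. A standalone sketch of that approach; the exact distribution bounds and seeding policy (random_device fallback when both seeds are zero) are assumptions, not GE's documented behavior:

```cpp
#include <cstdint>
#include <random>
#include <vector>

template <typename T>
std::vector<T> GenerateUniform(size_t data_num, int64_t seed, int64_t seed2) {
  // Assumed policy: nondeterministic seed when both attributes are 0.
  std::mt19937_64 gen((seed == 0 && seed2 == 0)
                          ? std::random_device{}()
                          : (static_cast<uint64_t>(seed) ^ static_cast<uint64_t>(seed2)));
  std::uniform_real_distribution<double> distribution(0.0, 1.0);
  std::vector<T> buf(data_num);
  for (size_t i = 0; i < data_num; ++i) {
    buf[i] = static_cast<T>(distribution(gen));  // mirrors `*(buf + i) = distribution(gen);`
  }
  return buf;
}
```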
@@ -115,7 +125,7 @@ Status RandomUniformKernel::GenerateFP16(const ge::OpDescPtr &op_desc_ptr, int64
   auto tensor_size = data_num * sizeof(fp16_t);
   TensorValue tensor;
   GE_CHK_STATUS_RET(context.AllocateTensor(tensor_size, tensor, &attr),
-                    "[%s] Failed to allocate output of size %zu",
+                    "[Invoke][AllocateTensor][%s] Failed to allocate output of size %zu",
                     context.GetNodeName(),
                     tensor_size);
@@ -137,7 +147,7 @@ Status RandomUniformKernel::GenerateFP16(const ge::OpDescPtr &op_desc_ptr, int64
     *(buf + i) = static_cast<fp16_t>(distribution(gen));
   }
-  GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[%s] Failed to set output.", context.GetNodeName());
+  GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[Set][Output] failed for [%s].", context.GetNodeName());
   return SUCCESS;
 }
@@ -25,11 +25,12 @@ namespace host_cpu {
 Status VariableKernel::Compute(TaskContext& context) {
   auto tensor = context.GetVariable(node_->GetName());
   if (tensor == nullptr) {
-    GELOGE(PARAM_INVALID, "tensor is NULL.");
+    REPORT_INNER_ERROR("E19999", "Get Variable from task context for node:%s failed.", context.GetNodeName());
+    GELOGE(PARAM_INVALID, "[Check][Param]Get Variable from task context for node:%s failed.", context.GetNodeName());
     return PARAM_INVALID;
   }
   // Constant & Variable Op has one and only one output
-  GE_CHK_STATUS_RET(context.SetOutput(0, *tensor), "[%s] Failed to set output.", context.GetNodeName());
+  GE_CHK_STATUS_RET(context.SetOutput(0, *tensor), "[Set][Output] failed for [%s].", context.GetNodeName());
   GELOGD("[%s] compute success.", node_->GetName().c_str());
   return SUCCESS;
 }
@@ -34,7 +34,10 @@ std::shared_ptr<Kernel> KernelFactory::CreateKernel(const NodePtr &node) {
   if (iter != kernel_creator_map_.end()) {
     return iter->second(node);
   }
-  GELOGE(FAILED, "Not supported, type = %s, name = %s", node->GetType().c_str(), node->GetName().c_str());
+  REPORT_INNER_ERROR("E19999", "Not supported because kernel_creator_map_ does not contain type:%s, name = %s",
+                     node->GetType().c_str(), node->GetName().c_str());
+  GELOGE(FAILED, "[Find][NodeType]Not supported because kernel_creator_map_ does not contain type = %s, name = %s",
+         node->GetType().c_str(), node->GetName().c_str());
   return nullptr;
 }
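CreateKernel is a registry lookup: node type maps to a creator functor, and an unregistered type yields nullptr so the caller can decide how fatal that is. A compact sketch of the same shape (names are illustrative; GE's KernelFactory differs in detail):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>

struct Kernel { virtual ~Kernel() = default; };

class KernelRegistry {
 public:
  using Creator = std::function<std::shared_ptr<Kernel>()>;

  void Register(const std::string &type, Creator creator) { creators_[type] = std::move(creator); }

  // Unknown type -> nullptr, matching CreateKernel's contract above.
  std::shared_ptr<Kernel> Create(const std::string &type) const {
    auto iter = creators_.find(type);
    return iter == creators_.end() ? nullptr : iter->second();
  }

 private:
  std::map<std::string, Creator> creators_;
};
```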
@@ -45,8 +45,7 @@ Status NodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
 Status NodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function<void()> &callback) const {
   HYBRID_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
-                        "Failed to execute task. node = %s",
-                        context.GetNodeItem().NodeName().c_str());
+                        "[Execute][Task] failed. node = %s", context.GetNodeItem().NodeName().c_str());
   return SUCCESS;
 }
@@ -106,7 +105,10 @@ NodeExecutorManager::ExecutorType NodeExecutorManager::ResolveExecutorType(Node
   const auto &lib_name = op_desc->GetOpKernelLibName();
   auto it = engine_mapping_.find(lib_name);
   if (it == engine_mapping_.end()) {
-    GELOGE(UNSUPPORTED, "KernelLib not supported. node = %s, lib_name = %s", node.GetName().c_str(), lib_name.c_str());
+    REPORT_INNER_ERROR("E19999", "Failed to get ExecutorType by lib_name:%s, node:%s",
+                       lib_name.c_str(), node.GetName().c_str());
+    GELOGE(UNSUPPORTED, "[Find][ExecutorType]Failed to get ExecutorType by lib_name:%s, node:%s",
+           lib_name.c_str(), node.GetName().c_str());
     return ExecutorType::RESERVED;
   }
@@ -117,7 +119,10 @@ Status NodeExecutorManager::GetExecutor(Node &node, const NodeExecutor **executo
   auto executor_type = ResolveExecutorType(node);
   const auto it = executors_.find(executor_type);
   if (it == executors_.end()) {
-    GELOGE(INTERNAL_ERROR, "Failed to get executor by type: %d.", static_cast<int>(executor_type));
+    REPORT_INNER_ERROR("E19999", "Failed to get executor by type: %d.",
+                       static_cast<int>(executor_type));
+    GELOGE(INTERNAL_ERROR, "[Check][ExecutorType]Failed to get executor by type: %d.",
+           static_cast<int>(executor_type));
     return INTERNAL_ERROR;
   }
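Executor dispatch is a two-step map lookup: the node's kernel-lib name resolves to an executor type, and the type resolves to a cached executor; each miss is now reported on both channels. A condensed sketch of the chain (enum values and map contents are illustrative):

```cpp
#include <map>
#include <string>

enum class ExecutorType { COMPILED_SUBGRAPH, HOST_CPU, RESERVED };
struct NodeExecutor { /* details elided */ };

// Step 1: lib_name -> executor type; a miss falls back to RESERVED,
// as ResolveExecutorType does above.
ExecutorType Resolve(const std::map<std::string, ExecutorType> &engine_mapping,
                     const std::string &lib_name) {
  auto it = engine_mapping.find(lib_name);
  return it == engine_mapping.end() ? ExecutorType::RESERVED : it->second;
}

// Step 2: executor type -> executor instance; a miss is an internal error.
const NodeExecutor *Find(const std::map<ExecutorType, NodeExecutor> &executors,
                         ExecutorType type) {
  auto it = executors.find(type);
  return it == executors.end() ? nullptr : &it->second;
}
```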
@@ -155,16 +160,16 @@ Status NodeExecutorManager::CalcOpRunningParam(Node &node) const {
     GeShape output_shape = output_tensor.GetShape();
     int64_t output_mem_size = 0;
     GE_CHK_STATUS_RET(TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size),
-                      "hccl calc tensor mem size failed.");
+                      "[Calc][TensorMemSize] failed, node:%s.", node.GetName().c_str());
     GE_CHK_STATUS_RET(CheckInt64AddOverflow(output_mem_size, MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1),
-                      "[%s] Invalid output mem size: %ld",
+                      "[Check][Overflow][%s] Invalid output mem size: %ld",
                       node.GetName().c_str(),
                       output_mem_size);
     output_mem_size = ((output_mem_size +
                         MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1) / MEMORY_ALIGN_SIZE) * MEMORY_ALIGN_SIZE;
     TensorUtils::SetSize(output_tensor, output_mem_size);
     GE_CHK_STATUS_RET(op_desc->UpdateOutputDesc(static_cast<uint32_t>(i), output_tensor),
-                      "hccl update output size failed.");
+                      "[Update][OutputDesc] failed, node:%s.", node.GetName().c_str());
     GELOGD("%s output desc[%zu], dim_size: %zu, mem_size: %ld.", node.GetName().c_str(), i,
            output_tensor.GetShape().GetDimNum(), output_mem_size);
   }
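The rounding in this hunk is the standard round-up-to-a-multiple idiom, with the overflow check guarding the `+ MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1` step. A worked sketch with made-up constants (GE defines its own MEMORY_ALIGN_RATIO / MEMORY_ALIGN_SIZE values):

```cpp
#include <cassert>
#include <cstdint>

constexpr int64_t kAlignRatio = 2;   // illustrative, not GE's value
constexpr int64_t kAlignSize = 32;   // illustrative, not GE's value

// Round `size` up to a multiple of kAlignSize after adding the same
// (ratio * align - 1) headroom as CalcOpRunningParam above.
int64_t AlignMemSize(int64_t size) {
  const int64_t pad = kAlignRatio * kAlignSize - 1;
  assert(size <= INT64_MAX - pad);  // the CheckInt64AddOverflow step
  return ((size + pad) / kAlignSize) * kAlignSize;
}

// AlignMemSize(100) == ((100 + 63) / 32) * 32 == 5 * 32 == 160
```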
@@ -189,14 +194,17 @@ Status NodeExecutorManager::InitializeExecutors() {
     GE_CHECK_NOTNULL(build_fn);
     auto executor = std::unique_ptr<NodeExecutor>(build_fn());
     if (executor == nullptr) {
-      GELOGE(INTERNAL_ERROR, "Failed to create executor for engine type = %d", static_cast<int>(engine_type));
+      REPORT_CALL_ERROR("E19999", "Create NodeExecutor failed for engine type = %d",
+                        static_cast<int>(engine_type));
+      GELOGE(INTERNAL_ERROR, "[Create][NodeExecutor] failed for engine type = %d", static_cast<int>(engine_type));
       return INTERNAL_ERROR;
     }
     GELOGD("Executor of engine type = %d was created successfully", static_cast<int>(engine_type));
     auto ret = executor->Initialize();
     if (ret != SUCCESS) {
-      GELOGE(ret, "Failed to initialize NodeExecutor of type = %d, clear executors", static_cast<int>(engine_type));
+      REPORT_CALL_ERROR("E19999", "Initialize NodeExecutor failed for type = %d", static_cast<int>(engine_type));
+      GELOGE(ret, "[Initialize][NodeExecutor] failed for type = %d", static_cast<int>(engine_type));
       for (auto &executor_it : executors_) {
         executor_it.second->Finalize();
       }
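Note the failure path: when one executor fails to initialize, every entry in executors_ is finalized before returning, so the manager never stays half-initialized. A generic sketch of that init-or-rollback loop (types are illustrative; it assumes Finalize is safe to call on executors that never initialized):

```cpp
#include <map>
#include <memory>

struct Executor {
  virtual ~Executor() = default;
  virtual bool Initialize() = 0;
  virtual void Finalize() = 0;
};

// Initialize all executors; on the first failure, finalize everything
// registered so far and report failure, mirroring InitializeExecutors.
bool InitAllOrRollback(std::map<int, std::unique_ptr<Executor>> &executors) {
  for (auto &entry : executors) {
    if (!entry.second->Initialize()) {
      for (auto &each : executors) {
        each.second->Finalize();
      }
      return false;
    }
  }
  return true;
}
```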
@@ -38,15 +38,14 @@ Status PartitionedCallNodeTask::Init(TaskContext &context) {
 Status PartitionedCallNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   GE_CHK_STATUS_RET(subgraph_executor_->ExecuteAsync(context),
-                    "[%s] Failed to set inputs", graph_item_->GetName().c_str());
+                    "[Invoke][ExecuteAsync] failed for [%s]", graph_item_->GetName().c_str());
   auto callback = [=]() {
     Callback(done_callback);
   };
   GE_CHK_STATUS_RET(context.RegisterCallback(callback),
-                    "[%s] Failed to register callback",
-                    graph_item_->GetName().c_str());
+                    "[Register][Callback] failed for [%s]", graph_item_->GetName().c_str());
   GELOGD("[%s] Done executing subgraph successfully.", graph_item_->GetName().c_str());
   return SUCCESS;
 }
@@ -83,7 +82,7 @@ Status PartitionedCallNodeExecutor::LoadTask(const ge::hybrid::HybridModel &mode
 Status PartitionedCallNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[PartitionedCallPrepareTask] Start");
-  GE_CHK_STATUS_RET(task.Init(context), "[%s] Failed to init task.", context.GetNodeName());
+  GE_CHK_STATUS_RET(task.Init(context), "[Init][Task] failed for [%s].", context.GetNodeName());
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[PartitionedCallPrepareTask] End");
   return SUCCESS;
 }
@@ -63,17 +63,22 @@ std::unique_ptr<TaskContext> TaskContext::Create(NodeState *node_state,
          node_item.output_start,
          node_item.num_outputs);
   if (node_item.input_start < 0 || node_item.output_start < 0) {
+    REPORT_INNER_ERROR("E19999", "NodeItem:%s(%s) not properly initialized. "
+                       "input_start:%d or output_start:%d less than 0",
+                       node_item.NodeName().c_str(), node_item.NodeType().c_str(),
+                       node_item.input_start, node_item.output_start);
     GELOGE(INTERNAL_ERROR,
-           "NodeItem not properly initialized. input_start = %d, output_start = %d",
-           node_item.input_start,
-           node_item.output_start);
+           "[Check][Param]NodeItem:%s(%s) not properly initialized. input_start = %d, output_start = %d",
+           node_item.NodeName().c_str(), node_item.NodeType().c_str(),
+           node_item.input_start, node_item.output_start);
     return nullptr;
   }
   auto task_context = std::unique_ptr<TaskContext>(
       new(std::nothrow)TaskContext(execution_context, node_state, subgraph_context));
   if (task_context == nullptr) {
-    GELOGE(MEMALLOC_FAILED, "[%s] Failed to create instance of TaskContext.", node_item.NodeName().c_str());
+    REPORT_CALL_ERROR("E19999", "Create TaskContext failed for [%s].", node_item.NodeName().c_str());
+    GELOGE(MEMALLOC_FAILED, "[Create][TaskContext] failed for [%s].", node_item.NodeName().c_str());
     return nullptr;
   }
@@ -94,7 +99,12 @@ int TaskContext::NumOutputs() const {
 TensorValue *TaskContext::MutableInput(int index) {
   if (index < 0 || index >= node_item_->num_inputs) {
-    GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_inputs = %d", index, node_item_->num_inputs);
+    REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_inputs = %d, node:%s(%s)",
+                       index, node_item_->num_inputs,
+                       node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
+    GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_inputs = %d, node:%s(%s)",
+           index, node_item_->num_inputs,
+           node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
     return nullptr;
   }
@@ -103,7 +113,12 @@ TensorValue *TaskContext::MutableInput(int index) {
 const TensorValue *TaskContext::GetOutput(int index) const {
   if (index < 0 || index >= node_item_->num_outputs) {
-    GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_outputs = %d", index, node_item_->num_outputs);
+    REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_outputs = %d, node:%s(%s)",
+                       index, node_item_->num_outputs,
+                       node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
+    GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_outputs = %d, node:%s(%s)",
+           index, node_item_->num_outputs,
+           node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
     return nullptr;
   }
@@ -112,7 +127,12 @@ const TensorValue *TaskContext::GetOutput(int index) const {
 TensorValue *TaskContext::MutableOutput(int index) {
   if (index < 0 || index >= node_item_->num_outputs) {
-    GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_outputs = %d", index, node_item_->num_outputs);
+    REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_outputs = %d, node:%s(%s)",
+                       index, node_item_->num_outputs,
+                       node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
+    GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_outputs = %d, node:%s(%s)",
+           index, node_item_->num_outputs,
+           node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
     return nullptr;
   }
@@ -125,7 +145,10 @@ std::size_t TaskContext::NumWorkspaces() const {
 void *TaskContext::MutableWorkspace(int index) {
   if (index < 0 || static_cast<size_t>(index) >= workspaces_.size()) {
-    GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_workspaces = %d", index, node_item_->num_outputs);
+    REPORT_INNER_ERROR("E19999", "Index:%d out of range, check invalid. number:%zu of workspaces_, node:%s(%s)",
+                       index, workspaces_.size(), node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
+    GELOGE(PARAM_INVALID, "[Check][Param]Index:%d out of range. number:%zu of workspaces_, node:%s(%s)",
+           index, workspaces_.size(), node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
     return nullptr;
   }
@@ -134,7 +157,11 @@ void *TaskContext::MutableWorkspace(int index) {
 const TensorValue *TaskContext::GetInput(int index) const {
   if (index < 0 || index >= node_item_->num_inputs) {
-    GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_inputs = %d", index, node_item_->num_inputs);
+    REPORT_INNER_ERROR("E19999", "Index:%d out of range, check invalid. num_inputs:%d node:%s(%s)",
+                       index, node_item_->num_inputs, node_item_->NodeName().c_str(),
+                       node_item_->NodeType().c_str());
+    GELOGE(PARAM_INVALID, "[Check][Param]Index:%d out of range. num_inputs:%d node:%s(%s)",
+           index, node_item_->num_inputs, node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
     return nullptr;
   }
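These five accessors repeat the same bounds check and four-line error block. If that duplication ever becomes a burden, one option (an editor's suggestion, not part of this change set) is a shared helper on TaskContext reusing the macros seen throughout the diff:

```cpp
// Sketch only: assumes TaskContext's node_item_ member and GE's logging macros.
bool TaskContext::CheckIndex(int index, int limit, const char *what) const {
  if (index >= 0 && index < limit) {
    return true;
  }
  REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, %s = %d, node:%s(%s)",
                     index, what, limit,
                     node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
  GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, %s = %d, node:%s(%s)",
         index, what, limit,
         node_item_->NodeName().c_str(), node_item_->NodeType().c_str());
  return false;
}

// MutableInput would then reduce to:
//   if (!CheckIndex(index, node_item_->num_inputs, "num_inputs")) { return nullptr; }
```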
@@ -146,7 +173,10 @@ Status TaskContext::AllocateWorkspaces() {
   for (auto size : workspace_sizes) {
     void *workspace = execution_context_->allocator->Allocate(size);
     if (workspace == nullptr) {
-      GELOGE(MEMALLOC_FAILED, "Failed to allocate workspace of size: %ld", size);
+      REPORT_CALL_ERROR("E19999", "node:%s(%s) Allocate workspace failed, size: %ld",
+                        node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size);
+      GELOGE(MEMALLOC_FAILED, "[Allocate][Workspace] failed for node:%s(%s), size: %ld",
+             node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size);
      return MEMALLOC_FAILED;
     }
@@ -162,7 +192,8 @@ Status TaskContext::RegisterCallback(const std::function<void()> &callback_fun)
   }
   auto ret = execution_context_->callback_manager->RegisterCallback(GetStream(), callback_fun);
   if (ret != SUCCESS) {
-    GELOGE(ret, "[%s] Failed to register callback", GetNodeName());
+    REPORT_CALL_ERROR("E19999", "RegisterCallback failed for [%s]", GetNodeName());
+    GELOGE(ret, "[Register][Callback] failed for [%s]", GetNodeName());
     execution_context_->callback_manager->Destroy();
     return ret;
   }
@@ -187,7 +218,8 @@ string TaskContext::TensorDesc2String(const GeTensorDesc &desc) {
 Status TaskContext::AllocateTensor(const GeTensorDesc &tensor_desc, TensorValue &tensor, AllocationAttr *attr) {
   int64_t size = 0;
   if (ge::TensorUtils::GetSize(tensor_desc, size) != GRAPH_SUCCESS) {
-    GELOGE(INTERNAL_ERROR, "Failed to get tensor size");
+    REPORT_CALL_ERROR("E19999", "Get TensorSize failed, tensor:%s", tensor_desc.GetName().c_str());
+    GELOGE(INTERNAL_ERROR, "[Get][TensorSize] failed, tensor:%s", tensor_desc.GetName().c_str());
     return INTERNAL_ERROR;
   }
@@ -211,7 +243,12 @@ Status TaskContext::AllocateOutput(int index,
          TensorDesc2String(tensor_desc).c_str());
   if (index < 0 || index >= node_item_->num_outputs) {
-    GELOGE(PARAM_INVALID, "output index out of range. num_output = %d, index = %d", node_item_->num_outputs, index);
+    REPORT_INNER_ERROR("E19999", "%s(%s) output index out of range, check invalid. num_output = %d, index = %d",
+                       node_item_->NodeName().c_str(), node_item_->NodeType().c_str(),
+                       node_item_->num_outputs, index);
+    GELOGE(PARAM_INVALID, "[Check][Param] %s(%s) output index out of range. num_output = %d, index = %d",
+           node_item_->NodeName().c_str(), node_item_->NodeType().c_str(),
+           node_item_->num_outputs, index);
     return PARAM_INVALID;
   }
@@ -289,7 +326,10 @@ Status TaskContext::AllocateOutputs(AllocationAttr *attr) {
 Status TaskContext::AllocateTensor(size_t size, TensorValue &tensor, AllocationAttr *attr) {
   auto buffer = TensorBuffer::Create(execution_context_->allocator, size, attr);
   if (buffer == nullptr) {
-    GELOGE(MEMALLOC_FAILED, "Failed to allocate buffer of size: %zu", size);
+    REPORT_CALL_ERROR("E19999", "%s(%s) Allocate buffer failed, size: %zu",
+                      node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size);
+    GELOGE(MEMALLOC_FAILED, "[Allocate][Buffer] failed for %s(%s), size: %zu",
+           node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size);
     return MEMALLOC_FAILED;
   }
@@ -303,7 +343,12 @@ const NodeItem &TaskContext::GetNodeItem() const {
 Status TaskContext::SetOutput(int index, const TensorValue &tensor) {
   if (index < 0 || index >= node_item_->num_outputs) {
-    GELOGE(PARAM_INVALID, "output index out of range. num_output = %d, index = %d", node_item_->num_outputs, index);
+    REPORT_INNER_ERROR("E19999", "%s(%s) output index out of range, check invalid. num_output = %d, index = %d",
+                       node_item_->NodeName().c_str(), node_item_->NodeType().c_str(),
+                       node_item_->num_outputs, index);
+    GELOGE(PARAM_INVALID, "[Check][Param]%s(%s) output index out of range. num_output = %d, index = %d",
+           node_item_->NodeName().c_str(), node_item_->NodeType().c_str(),
+           node_item_->num_outputs, index);
     return PARAM_INVALID;
   }
@@ -368,7 +413,8 @@ Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr
   }
   if (*buffer == nullptr) {
-    GELOGE(MEMALLOC_FAILED, "Failed to allocate workspace of size = %zu", size);
+    REPORT_CALL_ERROR("E19999", "Allocate Workspace failed, size = %zu", size);
+    GELOGE(MEMALLOC_FAILED, "[Allocate][Workspace] failed, size = %zu", size);
     return MEMALLOC_FAILED;
   }
@@ -400,11 +446,11 @@ Status TaskContext::PropagateOutputs() {
              input_offset);
     if (subgraph_context_->all_inputs_.size() <= static_cast<size_t>(input_offset)) {
-      GELOGE(INTERNAL_ERROR,
-             "[%s] input index out of range. index = %d, total input num = %zu",
-             GetNodeName(),
-             input_offset,
-             subgraph_context_->all_inputs_.size());
+      REPORT_INNER_ERROR("E19999",
+                         "[%s] input index out of range, check invalid. index = %d, total input num = %zu",
+                         GetNodeName(), input_offset, subgraph_context_->all_inputs_.size());
+      GELOGE(INTERNAL_ERROR, "[Check][Size][%s] input index out of range. index = %d, total input num = %zu",
+             GetNodeName(), input_offset, subgraph_context_->all_inputs_.size());
       return INTERNAL_ERROR;
     }
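PropagateOutputs writes each produced tensor into a flat per-subgraph input table at a precomputed offset, which is why the check above guards input_offset against all_inputs_.size(). A simplified sketch of that indexing scheme (names abbreviated; offsets are assumed to be assigned at graph-build time):

```cpp
#include <cstddef>
#include <vector>

struct Tensor { /* payload elided */ };

// Stand-in for SubgraphContext::all_inputs_: one flat vector in which each
// consumer input slot was assigned an offset when the graph was built.
bool PropagateOutput(std::vector<Tensor> &all_inputs, int input_offset, const Tensor &output) {
  if (input_offset < 0 || all_inputs.size() <= static_cast<size_t>(input_offset)) {
    return false;  // mirrors the INTERNAL_ERROR path in the hunk above
  }
  all_inputs[static_cast<size_t>(input_offset)] = output;  // hand the tensor to its consumer
  return true;
}
```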