|
|
@@ -42,10 +42,10 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do |
|
|
|
GELOGE(FAILED, "hccl handle is nullptr! "); |
|
|
|
return FAILED; |
|
|
|
} |
|
|
|
auto EnqueueHcomOpertion = (HcclResult(*)(HcomOpertion, std::function<void(HcclResult status)>))dlsym( |
|
|
|
context.handle_, "EnqueueHcomOpertion"); |
|
|
|
if (EnqueueHcomOpertion == nullptr) { |
|
|
|
GELOGE(FAILED, "Failed to invoke EnqueueHcomOpertion hcom unknown node function."); |
|
|
|
auto HcomExecEnqueueOperation = (HcclResult(*)(HcomOperation, std::function<void(HcclResult status)>))dlsym( |
|
|
|
context.handle_, "HcomExecEnqueueOperation"); |
|
|
|
if (HcomExecEnqueueOperation == nullptr) { |
|
|
|
GELOGE(FAILED, "Failed to invoke HcomExecEnqueueOperation hcom unknown node function."); |
|
|
|
if (dlclose(context.handle_) != 0) { |
|
|
|
GELOGW("Failed to close handle %s", dlerror()); |
|
|
|
} |
|
|
@@ -70,7 +70,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do |
|
|
|
const OpDescPtr op_desc = node_item.GetOpDesc(); |
|
|
|
GE_CHECK_NOTNULL(op_desc); |
|
|
|
|
|
|
|
HcomOpertion op_info; |
|
|
|
HcomOperation op_info; |
|
|
|
op_info.hcclType = op_desc->GetType(); |
|
|
|
op_info.inputPtr = inputs.empty() ? nullptr : inputs[0]; |
|
|
|
op_info.outputPtr = outputs.empty() ? nullptr : outputs[0]; |
|
|
@@ -96,7 +96,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do |
|
|
|
op_info.root = root_id; |
|
|
|
auto callback = [this, op_desc](HcclResult status) { |
|
|
|
if (status != HCCL_SUCCESS) { |
|
|
|
GELOGE(HCCL_E_INTERNAL, "node %s call EnqueueHcomOpertion failed, ret: 0x%X", op_desc->GetName().c_str(), status); |
|
|
|
GELOGE(HCCL_E_INTERNAL, "node %s call HcomExecEnqueueOperation failed, ret: 0x%X", op_desc->GetName().c_str(), status); |
|
|
|
} |
|
|
|
std::lock_guard<std::mutex> lock(this->hccl_mutex_); |
|
|
|
this->cond_.notify_all(); |
|
|
@@ -110,9 +110,9 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do |
|
|
|
context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root); |
|
|
|
op_info.count = count; |
|
|
|
|
|
|
|
HcclResult hccl_ret = EnqueueHcomOpertion(op_info, callback); |
|
|
|
HcclResult hccl_ret = HcomExecEnqueueOperation(op_info, callback); |
|
|
|
if (hccl_ret != HCCL_SUCCESS) { |
|
|
|
GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); |
|
|
|
GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); |
|
|
|
return HCCL_E_INTERNAL; |
|
|
|
} |
|
|
|
|
|
|
@@ -213,11 +213,11 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector<HcomRemoteAccess |
|
|
|
|
|
|
|
Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) { |
|
|
|
GELOGI("[%s] RdmaNodeTask::ExecuteAsync in.", context.GetNodeName()); |
|
|
|
auto EnqueueRemoteAccess = |
|
|
|
auto HcomExecEnqueueRemoteAccess = |
|
|
|
(HcclResult(*)(const string &, const vector<HcomRemoteAccessAddrInfo> &, |
|
|
|
std::function<void(HcclResult status)>))dlsym(context.handle_, "EnqueueRemoteAccess"); |
|
|
|
if (EnqueueRemoteAccess == nullptr) { |
|
|
|
GELOGE(FAILED, "Failed to invoke EnqueueRemoteAccess hcom unknown node function."); |
|
|
|
std::function<void(HcclResult status)>))dlsym(context.handle_, "HcomExecEnqueueRemoteAccess"); |
|
|
|
if (HcomExecEnqueueRemoteAccess == nullptr) { |
|
|
|
GELOGE(FAILED, "Failed to invoke HcomExecEnqueueRemoteAccess hcom unknown node function."); |
|
|
|
if (dlclose(context.handle_) != 0) { |
|
|
|
GELOGW("Failed to close handle %s", dlerror()); |
|
|
|
} |
|
|
@@ -228,15 +228,15 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do |
|
|
|
|
|
|
|
auto callback = [this](HcclResult status) { |
|
|
|
if (status != HCCL_SUCCESS) { |
|
|
|
GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", status); |
|
|
|
GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", status); |
|
|
|
} |
|
|
|
std::lock_guard<std::mutex> lock(this->hccl_mutex_); |
|
|
|
this->cond_.notify_all(); |
|
|
|
GELOGI("rdma callback success."); |
|
|
|
}; |
|
|
|
HcclResult hccl_ret = EnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); |
|
|
|
HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); |
|
|
|
if (hccl_ret != HCCL_SUCCESS) { |
|
|
|
GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); |
|
|
|
GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); |
|
|
|
return HCCL_E_INTERNAL; |
|
|
|
} |
|
|
|
|
|
|
@@ -307,32 +307,32 @@ Status HcclNodeExecutor::Initialize() { |
|
|
|
GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror()); |
|
|
|
return FAILED; |
|
|
|
} |
|
|
|
auto HcomExcutorInitialize = (HcclResult(*)())dlsym(handle_, "HcomExcutorInitialize"); |
|
|
|
if (HcomExcutorInitialize == nullptr) { |
|
|
|
GELOGE(FAILED, "Failed to invoke HcomExcutorInitialize hcom unknown node function."); |
|
|
|
auto HcomExecInitialize = (HcclResult(*)())dlsym(handle_, "HcomExecInitialize"); |
|
|
|
if (HcomExecInitialize == nullptr) { |
|
|
|
GELOGE(FAILED, "Failed to invoke HcomExecInitialize hcom unknown node function."); |
|
|
|
return FAILED; |
|
|
|
} |
|
|
|
HcclResult hccl_ret = HcomExcutorInitialize(); |
|
|
|
HcclResult hccl_ret = HcomExecInitialize(); |
|
|
|
if (hccl_ret == HCCL_E_PTR) { |
|
|
|
GELOGI("Hccl comm is null, hcom executor initialize is not required."); |
|
|
|
} else if (hccl_ret == HCCL_SUCCESS) { |
|
|
|
GELOGI("Hcom executor initialize success."); |
|
|
|
} else { |
|
|
|
GELOGE(FAILED, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); |
|
|
|
GELOGE(FAILED, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); |
|
|
|
return FAILED; |
|
|
|
} |
|
|
|
return SUCCESS; |
|
|
|
} |
|
|
|
|
|
|
|
Status HcclNodeExecutor::Finalize() { |
|
|
|
auto HcomExcutorFinalize = (HcclResult(*)())dlsym(handle_, "HcomExcutorFinalize"); |
|
|
|
if (HcomExcutorFinalize == nullptr) { |
|
|
|
GELOGE(FAILED, "Failed to invoke HcomExcutorFinalize hcom unknown node function."); |
|
|
|
auto HcomExecFinalize = (HcclResult(*)())dlsym(handle_, "HcomExecFinalize"); |
|
|
|
if (HcomExecFinalize == nullptr) { |
|
|
|
GELOGE(FAILED, "Failed to invoke HcomExecFinalize hcom unknown node function."); |
|
|
|
return FAILED; |
|
|
|
} |
|
|
|
HcclResult hccl_ret = HcomExcutorFinalize(); |
|
|
|
HcclResult hccl_ret = HcomExecFinalize(); |
|
|
|
if (hccl_ret != HCCL_SUCCESS) { |
|
|
|
GELOGE(FAILED, "Call HcomExcutorFinalize failed, ret: 0x%X", hccl_ret); |
|
|
|
GELOGE(FAILED, "Call HcomExecFinalize failed, ret: 0x%X", hccl_ret); |
|
|
|
return FAILED; |
|
|
|
} |
|
|
|
// dlclose file handle |
|
|
|