From 6a3dd3d66e4a513a1d1245fc43396b2f0f70b15c Mon Sep 17 00:00:00 2001 From: dajunli Date: Thu, 17 Dec 2020 19:28:06 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9hccl=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=E7=9B=B8=E5=85=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ge/graph/manager/memory_api.cc | 2 +- .../node_executor/hccl/hccl_node_executor.cc | 50 +++++++++---------- metadef | 2 +- parser | 2 +- third_party/fwkacllib/inc/hccl/hcom.h | 1 - 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/ge/graph/manager/memory_api.cc b/ge/graph/manager/memory_api.cc index 45e4bb65..0798eb51 100644 --- a/ge/graph/manager/memory_api.cc +++ b/ge/graph/manager/memory_api.cc @@ -63,7 +63,7 @@ Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t }); auto hcom_remote_mem_register = - (HcclResult(*)(const MemRegisterAddr *, uint32_t))dlsym(handle, "hcom_remote_access_mem_register"); + (HcclResult(*)(const MemRegisterAddr *, uint32_t))dlsym(handle, "HcomRegRemoteAccessMem"); if (hcom_remote_mem_register == nullptr) { GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function."); return FAILED; diff --git a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc index 704cab77..eebe2a81 100644 --- a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc +++ b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc @@ -42,10 +42,10 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do GELOGE(FAILED, "hccl handle is nullptr! "); return FAILED; } - auto EnqueueHcomOpertion = (HcclResult(*)(HcomOpertion, std::function))dlsym( - context.handle_, "EnqueueHcomOpertion"); - if (EnqueueHcomOpertion == nullptr) { - GELOGE(FAILED, "Failed to invoke EnqueueHcomOpertion hcom unknown node function."); + auto HcomExecEnqueueOperation = (HcclResult(*)(HcomOperation, std::function))dlsym( + context.handle_, "HcomExecEnqueueOperation"); + if (HcomExecEnqueueOperation == nullptr) { + GELOGE(FAILED, "Failed to invoke HcomExecEnqueueOperation hcom unknown node function."); if (dlclose(context.handle_) != 0) { GELOGW("Failed to close handle %s", dlerror()); } @@ -70,7 +70,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do const OpDescPtr op_desc = node_item.GetOpDesc(); GE_CHECK_NOTNULL(op_desc); - HcomOpertion op_info; + HcomOperation op_info; op_info.hcclType = op_desc->GetType(); op_info.inputPtr = inputs.empty() ? nullptr : inputs[0]; op_info.outputPtr = outputs.empty() ? nullptr : outputs[0]; @@ -96,7 +96,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do op_info.root = root_id; auto callback = [this, op_desc](HcclResult status) { if (status != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "node %s call EnqueueHcomOpertion failed, ret: 0x%X", op_desc->GetName().c_str(), status); + GELOGE(HCCL_E_INTERNAL, "node %s call HcomExecEnqueueOperation failed, ret: 0x%X", op_desc->GetName().c_str(), status); } std::lock_guard lock(this->hccl_mutex_); this->cond_.notify_all(); @@ -110,9 +110,9 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root); op_info.count = count; - HcclResult hccl_ret = EnqueueHcomOpertion(op_info, callback); + HcclResult hccl_ret = HcomExecEnqueueOperation(op_info, callback); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); return HCCL_E_INTERNAL; } @@ -213,11 +213,11 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector done_callback) { GELOGI("[%s] RdmaNodeTask::ExecuteAsync in.", context.GetNodeName()); - auto EnqueueRemoteAccess = + auto HcomExecEnqueueRemoteAccess = (HcclResult(*)(const string &, const vector &, - std::function))dlsym(context.handle_, "EnqueueRemoteAccess"); - if (EnqueueRemoteAccess == nullptr) { - GELOGE(FAILED, "Failed to invoke EnqueueRemoteAccess hcom unknown node function."); + std::function))dlsym(context.handle_, "HcomExecEnqueueRemoteAccess"); + if (HcomExecEnqueueRemoteAccess == nullptr) { + GELOGE(FAILED, "Failed to invoke HcomExecEnqueueRemoteAccess hcom unknown node function."); if (dlclose(context.handle_) != 0) { GELOGW("Failed to close handle %s", dlerror()); } @@ -228,15 +228,15 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function do auto callback = [this](HcclResult status) { if (status != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", status); + GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", status); } std::lock_guard lock(this->hccl_mutex_); this->cond_.notify_all(); GELOGI("rdma callback success."); }; - HcclResult hccl_ret = EnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); + HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); return HCCL_E_INTERNAL; } @@ -307,32 +307,32 @@ Status HcclNodeExecutor::Initialize() { GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror()); return FAILED; } - auto HcomExcutorInitialize = (HcclResult(*)())dlsym(handle_, "HcomExcutorInitialize"); - if (HcomExcutorInitialize == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExcutorInitialize hcom unknown node function."); + auto HcomExecInitialize = (HcclResult(*)())dlsym(handle_, "HcomExecInitialize"); + if (HcomExecInitialize == nullptr) { + GELOGE(FAILED, "Failed to invoke HcomExecInitialize hcom unknown node function."); return FAILED; } - HcclResult hccl_ret = HcomExcutorInitialize(); + HcclResult hccl_ret = HcomExecInitialize(); if (hccl_ret == HCCL_E_PTR) { GELOGI("Hccl comm is null, hcom executor initialize is not required."); } else if (hccl_ret == HCCL_SUCCESS) { GELOGI("Hcom executor initialize success."); } else { - GELOGE(FAILED, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(FAILED, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); return FAILED; } return SUCCESS; } Status HcclNodeExecutor::Finalize() { - auto HcomExcutorFinalize = (HcclResult(*)())dlsym(handle_, "HcomExcutorFinalize"); - if (HcomExcutorFinalize == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExcutorFinalize hcom unknown node function."); + auto HcomExecFinalize = (HcclResult(*)())dlsym(handle_, "HcomExecFinalize"); + if (HcomExecFinalize == nullptr) { + GELOGE(FAILED, "Failed to invoke HcomExecFinalize hcom unknown node function."); return FAILED; } - HcclResult hccl_ret = HcomExcutorFinalize(); + HcclResult hccl_ret = HcomExecFinalize(); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(FAILED, "Call HcomExcutorFinalize failed, ret: 0x%X", hccl_ret); + GELOGE(FAILED, "Call HcomExecFinalize failed, ret: 0x%X", hccl_ret); return FAILED; } // dlclose file handle diff --git a/metadef b/metadef index 129b50b4..dba83744 160000 --- a/metadef +++ b/metadef @@ -1 +1 @@ -Subproject commit 129b50b41f79d0dfeb9fe8987b1c19c9ac51eb8b +Subproject commit dba83744a3ffe3d5f89496e69bb65c50f800c299 diff --git a/parser b/parser index e9f7d019..ce574894 160000 --- a/parser +++ b/parser @@ -1 +1 @@ -Subproject commit e9f7d0197aba57eb5247cb1e029c10e393631c89 +Subproject commit ce574894f13cd94749d1a3964a13e8c97c20434a diff --git a/third_party/fwkacllib/inc/hccl/hcom.h b/third_party/fwkacllib/inc/hccl/hcom.h index ba60cc96..e491d43f 100644 --- a/third_party/fwkacllib/inc/hccl/hcom.h +++ b/third_party/fwkacllib/inc/hccl/hcom.h @@ -251,6 +251,5 @@ HcclResult HcomExecEnqueueRemoteAccess(const std::string& remoteAccessType, #ifdef __cplusplus } - #endif // __cplusplus #endif // HCOM_H_