@@ -62,7 +62,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::LoadFro | |||
char *data = new (std::nothrow) char[len]; | |||
if (data == nullptr) { | |||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Load model From file failed, bad memory allocation occur. (need:%u)", len); | |||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Load][ModelFromFile]Failed, " | |||
"bad memory allocation occur(need %u), file %s", len, model_path); | |||
REPORT_CALL_ERROR("E19999", "Load model from file %s failed, " | |||
@@ -90,33 +89,45 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::ParseMo | |||
GE_CHECK_NOTNULL(model.model_data); | |||
// Model length too small | |||
GE_CHK_BOOL_RET_STATUS(model.model_len >= sizeof(ModelFileHeader), ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, | |||
"Invalid model. Model data size %u must be greater than or equal to %zu.", model.model_len, | |||
sizeof(ModelFileHeader)); | |||
GE_CHK_BOOL_EXEC(model.model_len >= sizeof(ModelFileHeader), | |||
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}), | |||
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"})); | |||
GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, | |||
"[Check][Param] Invalid model. Model data size %u must be greater than or equal to %zu.", | |||
model.model_len, sizeof(ModelFileHeader)); | |||
return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;); | |||
// Get file header | |||
auto file_header = reinterpret_cast<ModelFileHeader *>(model.model_data); | |||
// Determine whether the file length and magic number match | |||
GE_CHK_BOOL_RET_STATUS( | |||
file_header->length == model.model_len - sizeof(ModelFileHeader) && file_header->magic == MODEL_FILE_MAGIC_NUM, | |||
ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, | |||
"Invalid model. file_header->length[%u] + sizeof(ModelFileHeader)[%zu] != model->model_len[%u] || " | |||
"MODEL_FILE_MAGIC_NUM[%u] != file_header->magic[%u]", | |||
file_header->length, sizeof(ModelFileHeader), model.model_len, MODEL_FILE_MAGIC_NUM, file_header->magic); | |||
GE_CHK_BOOL_EXEC(file_header->length == model.model_len - sizeof(ModelFileHeader) && | |||
file_header->magic == MODEL_FILE_MAGIC_NUM, | |||
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}), | |||
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"})); | |||
GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, | |||
"[Check][Param] Invalid model, file_header->length[%u] + sizeof(ModelFileHeader)[%zu] != " | |||
"model->model_len[%u] || MODEL_FILE_MAGIC_NUM[%u] != file_header->magic[%u]", | |||
file_header->length, sizeof(ModelFileHeader), model.model_len, | |||
MODEL_FILE_MAGIC_NUM, file_header->magic); | |||
return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;); | |||
Status res = SUCCESS; | |||
// Get data address | |||
uint8_t *data = reinterpret_cast<uint8_t *>(model.model_data) + sizeof(ModelFileHeader); | |||
if (file_header->is_encrypt == ModelEncryptType::UNENCRYPTED) { // Unencrypted model | |||
GE_CHK_BOOL_RET_STATUS(model.key.empty(), ACL_ERROR_GE_PARAM_INVALID, | |||
"Invalid param. model is unencrypted, but key is not empty."); | |||
if (!model.key.empty()) { | |||
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}), | |||
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"})); | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, | |||
"[Check][Param] Invalid param, model is unencrypted, but key is not empty."); | |||
return ACL_ERROR_GE_PARAM_INVALID; | |||
} | |||
model_data = data; | |||
model_len = file_header->length; | |||
GELOGD("Model_len is %u, model_file_head_len is %zu.", model_len, sizeof(ModelFileHeader)); | |||
} else { | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param]Invalid, model encrypt type not supported"); | |||
REPORT_CALL_ERROR("E19999","Invalid model, encrypt type not supported"); | |||
REPORT_INPUT_ERROR("E10003", std::vector<std::string>({"parameter", "value", "reason"}), | |||
std::vector<std::string>({"om", model.om_name.c_str(), "invalid om file"})); | |||
res = ACL_ERROR_GE_PARAM_INVALID; | |||
} | |||
@@ -33,12 +33,12 @@ Status GraphLoader::UnloadModel(uint32_t model_id) { | |||
Status ret = model_manager->Stop(model_id); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "UnloadModel: Stop failed. model id:%u", model_id); | |||
GELOGE(ret, "[Stop][Model] failed. model id:%u", model_id); | |||
} | |||
ret = model_manager->Unload(model_id); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "UnloadModel: Unload failed. model id:%u", model_id); | |||
GELOGE(ret, "[Unload][Model] failed. model id:%u", model_id); | |||
return ret; | |||
} | |||
GELOGI("UnLoad model success, model id:%u.", model_id); | |||
@@ -50,14 +50,13 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge | |||
GELOGI("Load model online begin."); | |||
rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", | |||
GetContext().DeviceId(), rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||
return RT_FAILED; | |||
} | |||
if (ge_root_model_ptr == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Check param ge_root_model_ptr nullptr, check invalid"); | |||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[LoadGraph] GE load graph model_ptr is nullptr."); | |||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[LoadGraph][Check][Param] GE load graph model_ptr is nullptr."); | |||
return GE_GRAPH_PARAM_NULLPTR; | |||
} | |||
@@ -65,12 +64,12 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge | |||
GE_CHECK_NOTNULL(model_manager); | |||
Status ret = model_manager->LoadModelOnline(model_id, ge_root_model_ptr, listener); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); | |||
GELOGE(ret, "[Load][Model] Online failed. ret = %u, model_id:%u", ret, model_id); | |||
rt_ret = rtDeviceReset(GetContext().DeviceId()); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | |||
GetContext().DeviceId(), rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||
} | |||
return ret; | |||
} | |||
@@ -81,31 +80,31 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | |||
GetContext().DeviceId(), rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||
} | |||
return SUCCESS; | |||
} | |||
ret = model_manager->Start(model_id); | |||
if (ret != SUCCESS) { | |||
if (model_manager->Unload(model_id) != SUCCESS) { | |||
GELOGE(ret, "LoadModel: Unload failed while trying to unload after a failed start."); | |||
GELOGE(ret, "[Unload][Model] failed while trying to unload after a failed start, model_id:%u.", model_id); | |||
} | |||
rt_ret = rtDeviceReset(GetContext().DeviceId()); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | |||
GetContext().DeviceId(), rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||
} | |||
GELOGE(ret, "LoadModel: Start failed."); | |||
GELOGE(ret, "[Start][Model] failed, model_id:%u.", model_id); | |||
return ret; | |||
} | |||
rt_ret = rtDeviceReset(GetContext().DeviceId()); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | |||
GetContext().DeviceId(), rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||
return RT_FAILED; | |||
} | |||
GELOGI("Load model online success, model_id:%u.", model_id); | |||
@@ -118,7 +117,7 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) { | |||
GE_CHECK_NOTNULL(model_manager); | |||
Status ret = model_manager->GetMaxUsedMemory(model_id, max_size); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "GetMaxUsedMemory: GetMaxUsedMemory failed."); | |||
GELOGE(ret, "[Call][GetMaxUsedMemory] failed, model_id:%u.", model_id); | |||
return ret; | |||
} | |||
return SUCCESS; | |||
@@ -127,21 +126,20 @@ Status GraphLoader::GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size) { | |||
Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string &key_path, int32_t priority, | |||
ModelData &model_data) { | |||
if (!CheckInputPathValid(path)) { | |||
GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); | |||
GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "[Check][Param] model path is invalid:%s", path.c_str()); | |||
return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID; | |||
} | |||
GELOGI("Load model begin, model path is: %s", path.c_str()); | |||
if (!key_path.empty() && !CheckInputPathValid(key_path)) { | |||
REPORT_INNER_ERROR("E19999", "Param key_path:%s empty or invalid", | |||
key_path.c_str()); | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); | |||
REPORT_INNER_ERROR("E19999", "Param key_path:%s empty or invalid", key_path.c_str()); | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[Check][Param] decrypt_key path is invalid:%s", key_path.c_str()); | |||
return ACL_ERROR_GE_PARAM_INVALID; | |||
} | |||
Status ret = ModelParserBase::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); | |||
GELOGE(ret, "[Call][LoadFromFile] failed. ret = %u, path:%s, key path:%s", ret, path.c_str(), key_path.c_str()); | |||
if (model_data.model_data != nullptr) { | |||
delete[] static_cast<char *>(model_data.model_data); | |||
model_data.model_data = nullptr; | |||
@@ -156,18 +154,19 @@ Status GraphLoader::CommandHandle(const Command &command) { | |||
GE_CHECK_NOTNULL(model_manager); | |||
Status ret = model_manager->HandleCommand(command); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "CommandHandle: Command Handle failed."); | |||
GELOGE(ret, "[Handle][Command] failed, module_index:%lu.", command.module_index); | |||
return ret; | |||
} | |||
} catch (std::bad_alloc &) { | |||
REPORT_INNER_ERROR("E19999", "Bad memory allocation occur"); | |||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Command handle failed, bad memory allocation occur !"); | |||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Handle][Command] failed, " | |||
"bad memory allocation occur, module_index:%lu.", command.module_index); | |||
return ACL_ERROR_GE_MEMORY_ALLOCATION; | |||
} catch (...) { | |||
REPORT_INNER_ERROR("E19999", "Some exceptions occur"); | |||
GELOGE(FAILED, "Command handle failed, some exceptions occur !"); | |||
GELOGE(FAILED, "[Handle][Command] failed, some exceptions occur, module_index:%lu.", command.module_index); | |||
return FAILED; | |||
} | |||
@@ -184,7 +183,7 @@ Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model | |||
Status ret = model_manager->LoadModelOffline( | |||
model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Load model failed, model_id:%u.", model_id); | |||
GELOGE(ret, "[Load][Model] failed, model_id:%u.", model_id); | |||
return ret; | |||
} | |||
GELOGI("Load model success, model_id:%u.", model_id); | |||
@@ -210,7 +209,7 @@ Status GraphLoader::LoadModelWithQ(uint32_t &model_id, const ModelData &model_da | |||
GE_CHECK_NOTNULL(model_manager); | |||
Status ret = model_manager->LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Load model with queue failed, model_id:%u.", model_id); | |||
GELOGE(ret, "[Load][Model] with queue failed, model_id:%u.", model_id); | |||
return ret; | |||
} | |||
@@ -237,7 +236,7 @@ Status GraphLoader::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asyn | |||
Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, | |||
input_data, input_desc, output_data, output_desc); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Execute model failed, model_id:%u.", model_id); | |||
GELOGE(ret, "[Execute][Model] failed, model_id:%u.", model_id); | |||
return ret; | |||
} | |||
@@ -250,7 +249,7 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) { | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X", | |||
GetContext().DeviceId(), rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtSetDevice] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||
return RT_FAILED; | |||
} | |||
size_t total_mem = 0; | |||
@@ -258,14 +257,14 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) { | |||
rt_ret = rtMemGetInfo(&free_mem, &total_mem); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtMemGetInfo failed, ret:0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtMemGetInfo] failed, ret:0x%X", rt_ret); | |||
return RT_FAILED; | |||
} | |||
rt_ret = rtDeviceReset(GetContext().DeviceId()); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", | |||
GetContext().DeviceId(), rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtDeviceReset] failed, device_id:%u, ret:0x%X", GetContext().DeviceId(), rt_ret); | |||
return RT_FAILED; | |||
} | |||
// Add small page memory size | |||
@@ -280,7 +279,8 @@ Status GraphLoader::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id, u | |||
GE_CHECK_NOTNULL(model_manager); | |||
Status ret = model_manager->DestroyAicpuKernel(session_id, model_id, sub_model_id); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Destroy aicpu kernel failed."); | |||
GELOGE(ret, "[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u.", | |||
session_id, model_id, sub_model_id); | |||
return ret; | |||
} | |||
return SUCCESS; | |||
@@ -291,7 +291,7 @@ Status GraphLoader::DestroyAicpuSessionForInfer(uint32_t model_id) { | |||
GE_CHECK_NOTNULL(model_manager); | |||
Status ret = model_manager->DestroyAicpuSessionForInfer(model_id); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Destroy aicpu serrion for infer failed."); | |||
GELOGE(ret, "[Call][DestroyAicpuSessionForInfer] failed, model_id:%u.", model_id); | |||
return ret; | |||
} | |||
return SUCCESS; | |||
@@ -310,7 +310,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { | |||
std::lock_guard<std::mutex> lock(exeception_infos_mutex_); | |||
auto instance = ModelManager::GetInstance(); | |||
if (instance == nullptr) { | |||
GELOGE(FAILED, "Instance is nullptr"); | |||
GELOGE(FAILED, "[Get][Instance] failed, as ret is nullptr"); | |||
return; | |||
} | |||
instance->AddExceptionInfo(*rt_exception_info); | |||
@@ -26,10 +26,10 @@ | |||
#define VALIDATE_MEM_RANGE(OP, SIZE, OFFSET) \ | |||
do { \ | |||
if (SIZE <= static_cast<uint64_t>(OFFSET)) { \ | |||
REPORT_INNER_ERROR("E19999", \ | |||
"Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \ | |||
REPORT_INNER_ERROR("E19999", "Node:%s(%s) offset:%ld out of range size:%lu, check invalid", \ | |||
OP->GetName().c_str(), OP->GetType().c_str(), OFFSET, SIZE); \ | |||
GELOGE(OUT_OF_MEMORY, "Node: %s, memory out of range[%lu: %ld]", OP->GetName().c_str(), SIZE, OFFSET); \ | |||
GELOGE(OUT_OF_MEMORY, "[Check][Param]Node: %s, memory out of range[%lu: %ld]", \ | |||
OP->GetName().c_str(), SIZE, OFFSET); \ | |||
return {}; \ | |||
} \ | |||
} while (0) | |||
@@ -312,8 +312,9 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co | |||
REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s), check invalid", | |||
ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
GELOGE(PARAM_INVALID, "Fusion: check input size failed, op: %s, input v_memory_type size: %zu input numbers: %zu", | |||
op_desc->GetName().c_str(), v_memory_type.size(), inputs_size); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s, memory_type.size:%zu != input_desc.size:%zu, op:%s(%s)", | |||
ATTR_NAME_INPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), inputs_size, | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return v_input_data_addr; | |||
} | |||
for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { | |||
@@ -392,8 +393,7 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc | |||
case RT_MEMORY_RDMA_HBM: | |||
if (offset < 0) { | |||
REPORT_INNER_ERROR("E19999", "Param offset:%ld < 0, check invalid", offset); | |||
GELOGE(PARAM_INVALID, "rdma var addr is invalid, addr=%p", | |||
reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(offset))); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Param offset:%ld cannot be negative", offset); | |||
return PARAM_INVALID; | |||
} | |||
var_addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(offset)); | |||
@@ -403,9 +403,9 @@ Status ModelUtils::GetVarAddr(const RuntimeParam &model_param, const ConstOpDesc | |||
var_addr = model_param.var_base + offset - model_param.logic_var_base; | |||
break; | |||
default: | |||
REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid", | |||
mem_type, offset); | |||
GELOGE(PARAM_INVALID, "unsupported memory type %u", mem_type); | |||
REPORT_INNER_ERROR("E19999", "Get mem_type:%d for offset:%ld is unsupported, check invalid", mem_type, offset); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Get mem_type:%d for offset:%ld is unsupported, check invalid", | |||
mem_type, offset); | |||
return PARAM_INVALID; | |||
} | |||
GE_CHECK_NOTNULL(var_addr); | |||
@@ -433,9 +433,9 @@ vector<void *> ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C | |||
REPORT_INNER_ERROR("E19999", "Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s), check invalid", | |||
ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
GELOGE(PARAM_INVALID, | |||
"Fusion: check output size failed, op: %s, output v_memory_type size: %lu output numbers: %zu", | |||
op_desc->GetName().c_str(), v_memory_type.size(), outputs_size); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s, memory_type.size:%zu != output_desc.size:%zu, op:%s(%s)", | |||
ATTR_NAME_OUTPUT_MEM_TYPE_LIST.c_str(), v_memory_type.size(), outputs_size, | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return v_output_data_addr; | |||
} | |||
for (size_t i = 0; i < outputs_size; ++i) { | |||
@@ -594,7 +594,7 @@ Status ModelUtils::GetRtAddress(const RuntimeParam ¶m, uintptr_t logic_addr, | |||
} else if (logic_addr != 0) { | |||
mem_addr = nullptr; | |||
REPORT_INNER_ERROR("E19999", "Check param logic addr:0x%lx abnormal", logic_addr); | |||
GELOGE(PARAM_INVALID, "The logic addr:0x%lx is abnormal", logic_addr); | |||
GELOGE(PARAM_INVALID, "[Check][Param] The logic addr:0x%lx is abnormal", logic_addr); | |||
return PARAM_INVALID; | |||
} | |||
@@ -24,7 +24,7 @@ namespace ge { | |||
void TbeHandleInfo::used_inc(uint32_t num) { | |||
if (used_ > std::numeric_limits<uint32_t>::max() - num) { | |||
REPORT_INNER_ERROR("E19999", "Used:%u reach numeric max", used_); | |||
GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric max.", used_); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] Used[%u] reach numeric max.", used_); | |||
return; | |||
} | |||
@@ -34,7 +34,7 @@ void TbeHandleInfo::used_inc(uint32_t num) { | |||
void TbeHandleInfo::used_dec(uint32_t num) { | |||
if (used_ < std::numeric_limits<uint32_t>::min() + num) { | |||
REPORT_INNER_ERROR("E19999", "Used:%u reach numeric min", used_); | |||
GELOGE(INTERNAL_ERROR, "Used[%u] reach numeric min.", used_); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] Used[%u] reach numeric min.", used_); | |||
return; | |||
} | |||
@@ -107,9 +107,8 @@ void TBEHandleStore::ReferTBEHandle(const std::string &name) { | |||
std::lock_guard<std::mutex> lock(mutex_); | |||
auto it = kernels_.find(name); | |||
if (it == kernels_.end()) { | |||
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", | |||
name.c_str()); | |||
GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", name.c_str()); | |||
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", name.c_str()); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] Kernel[%s] not found in stored.", name.c_str()); | |||
return; | |||
} | |||
@@ -128,9 +127,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map<std::string, uint32_t> &names | |||
for (auto &item : names) { | |||
auto it = kernels_.find(item.first); | |||
if (it == kernels_.end()) { | |||
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", | |||
item.first.c_str()); | |||
GELOGE(INTERNAL_ERROR, "Kernel[%s] not found in stored.", item.first.c_str()); | |||
REPORT_INNER_ERROR("E19999", "Kernel:%s not found in stored check invalid", item.first.c_str()); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] Kernel[%s] not found in stored.", item.first.c_str()); | |||
continue; | |||
} | |||
@@ -142,7 +140,8 @@ void TBEHandleStore::EraseTBEHandle(const std::map<std::string, uint32_t> &names | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_INNER_ERROR("E19999", "Call rtDevBinaryUnRegister failed for Kernel:%s fail, ret:0x%X", | |||
item.first.c_str(), rt_ret); | |||
GELOGE(INTERNAL_ERROR, "Kernel[%s] UnRegister handle fail:%u.", item.first.c_str(), rt_ret); | |||
GELOGE(INTERNAL_ERROR, "[Call][RtDevBinaryUnRegister] Kernel[%s] UnRegister handle fail:%u.", | |||
item.first.c_str(), rt_ret); | |||
} | |||
kernels_.erase(it); | |||
} | |||
@@ -43,7 +43,7 @@ class TsMemMall { | |||
for (auto it : mem_store_size_) { | |||
rtError_t ret = rtFree(it.second); | |||
if (ret != RT_ERROR_NONE) { | |||
GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret); | |||
GELOGE(RT_FAILED, "[Call][RtFree] failed, ret:0x%X", ret); | |||
} | |||
} | |||
mem_store_size_.clear(); | |||
@@ -52,7 +52,7 @@ class TsMemMall { | |||
void *Acquire(int64_t offset, uint64_t size) { | |||
if (size == 0) { | |||
GELOGE(RT_FAILED, "Acquire mem block failed, size: %lu", size); | |||
GELOGE(RT_FAILED, "[Check][Param] Acquire mem block failed, size:%lu", size); | |||
return nullptr; | |||
} | |||
@@ -71,7 +71,7 @@ class TsMemMall { | |||
void *addr = nullptr; | |||
rtError_t rt_ret = rtMalloc(&addr, bytes, mem_type_); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtMalloc] failed, size:%lu, ret:0x%X", bytes, rt_ret); | |||
return nullptr; | |||
} | |||
@@ -94,7 +94,7 @@ class TsMemMall { | |||
mem_store_addr_.erase(it); | |||
rtError_t ret = rtFree(addr); | |||
if (ret != RT_ERROR_NONE) { | |||
GELOGE(RT_FAILED, "Call rtFree failed, ret: 0x%X", ret); | |||
GELOGE(RT_FAILED, "[Call][RtFree] failed, ret:0x%X", ret); | |||
} | |||
} | |||
@@ -38,8 +38,13 @@ Status ZeroCopyOffset::InitInputDataInfo(int64_t output_size, void *virtual_addr | |||
op_name_ = op_desc->GetName(); | |||
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); | |||
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); | |||
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID, | |||
"basic_offset_size should be equal to relative_offset_size"); | |||
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), | |||
REPORT_INNER_ERROR("E19999", "basic_offset_size:%zu not equal to relative_offset_size:%zu, " | |||
"check invalid", zero_copy_basic_offset_.size(), | |||
zero_copy_relative_offset_.size()); | |||
return PARAM_INVALID, | |||
"[Check][Param] basic_offset_size:%zu should be equal to relative_offset_size:%zu", | |||
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); | |||
GELOGD("[ZCPY] zero_copy_basic_offset size is %zu", zero_copy_basic_offset_.size()); | |||
int64_t virtual_addr_offset = op_desc->GetOutputOffset().at(kDataIndex); | |||
@@ -78,7 +83,8 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector<int64_t> &input_size_list | |||
if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, size) != GRAPH_SUCCESS) { | |||
REPORT_INNER_ERROR("E19999", "Get input TensorSize in op:%s(%s) failed, input_index:%zu", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); | |||
GELOGE(FAILED, "GetTensorSizeInBytes failed!"); | |||
GELOGE(FAILED, "[Get][InputTensorSize] in op:%s(%s) failed, input_index:%zu", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), idx); | |||
return FAILED; | |||
} | |||
@@ -88,8 +94,13 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector<int64_t> &input_size_list | |||
op_name_ = op_desc->GetName(); | |||
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_); | |||
(void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_); | |||
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID, | |||
"basic_offset_size should be equal to relative_offset_size"); | |||
GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), | |||
REPORT_INNER_ERROR("E19999", "basic_offset_size:%zu not equal to relative_offset_size:%zu, " | |||
"check invalid", | |||
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); | |||
return PARAM_INVALID, | |||
"[Check][Param] basic_offset_size:%zu should be equal to relative_offset_size:%zu", | |||
zero_copy_basic_offset_.size(), zero_copy_relative_offset_.size()); | |||
int64_t virtual_addr_offset = op_desc->GetInputOffset().at(idx); | |||
IsL2Fusion(zero_copy_basic_offset_, virtual_addr_offset, fusion_flag); | |||
@@ -194,7 +205,8 @@ void ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *ou | |||
for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) { | |||
auto args_addrs = outside_addrs_[out_count].find(outside_addr); | |||
if (args_addrs != outside_addrs_[out_count].end()) { | |||
GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), "Input args invalid."); | |||
GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), | |||
"[Set][TaskArgsOffset] failed, Input args invalid, offset:%zu.", offset); | |||
void *args_val = static_cast<uint8_t *>(args) + offset; | |||
args_addrs->second.push_back(args_val); | |||
GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val, | |||
@@ -36,9 +36,9 @@ ZeroCopyTask::~ZeroCopyTask() { args_addr_ = nullptr; } | |||
*/ | |||
Status ZeroCopyTask::SetTaskArgsOffset(uintptr_t addr, size_t offset) { | |||
if (offset + sizeof(uintptr_t) > args_size_) { | |||
REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid", | |||
offset, args_size_); | |||
GELOGE(FAILED, "[ZCPY] %s set task args failed, args size: %zu, offset: %zu", name_.c_str(), args_size_, offset); | |||
REPORT_INNER_ERROR("E19999", "Param offset:%zu + 8 > args_size_:%zu, check invalid", offset, args_size_); | |||
GELOGE(FAILED, "[Check][Param] [ZCPY] %s set task args failed, args size:%zu, offset:%zu", | |||
name_.c_str(), args_size_, offset); | |||
return FAILED; // unexpected error, need fix. | |||
} | |||
@@ -118,9 +118,8 @@ Status ZeroCopyTask::DistributeParam(bool async_mode, rtStream_t stream) { | |||
} | |||
if (rt_err != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret: 0x%X", | |||
args_size_, rt_err); | |||
GELOGE(RT_FAILED, "[ZCPY] %s distribute task param failed, error=0x%x", name_.c_str(), rt_err); | |||
REPORT_CALL_ERROR("E19999", "Call rtMemcpyAsync or rtMemcpy failed, size:%zu, ret:0x%X", args_size_, rt_err); | |||
GELOGE(RT_FAILED, "[Distribute][TaskParam] for %s failed, error = 0x%x", name_.c_str(), rt_err); | |||
return RT_ERROR_TO_GE_STATUS(rt_err); | |||
} | |||
@@ -112,7 +112,7 @@ Status CachingAllocator::Initialize(uint32_t device_id) { | |||
auto bin_ptr = new (std::nothrow) BlockBin(BlockComparator); | |||
if (bin_ptr == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "New BlockBin fail, device_id:%u", device_id); | |||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc BlockBin failed."); | |||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "[Alloc][BlockBin] failed, device_id:%u", device_id); | |||
return ACL_ERROR_GE_MEMORY_ALLOCATION; | |||
} | |||
free_block_bins_[i] = bin_ptr; | |||
@@ -147,9 +147,8 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device | |||
ptr = block->ptr; | |||
} | |||
if (ptr == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u", | |||
size, device_id); | |||
GELOGE(FAILED, "Malloc failed device id = %u, size= %zu", device_id, size); | |||
REPORT_INNER_ERROR("E19999", "FindFreeBlock fail, size:%zu, device_id:%u", size, device_id); | |||
GELOGE(FAILED, "[Check][Param] FindFreeBlock failed device id = %u, size= %zu", device_id, size); | |||
} | |||
return ptr; | |||
} | |||
@@ -157,18 +156,16 @@ uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device | |||
Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) { | |||
GELOGI("Free device id = %u", device_id); | |||
if (ptr == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid", | |||
device_id); | |||
GELOGE(PARAM_INVALID, "Invalid memory pointer"); | |||
REPORT_INNER_ERROR("E19999", "Param ptr is nullptr, device_id:%u, check invalid", device_id); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer, device_id:%u", device_id); | |||
return ge::PARAM_INVALID; | |||
} | |||
std::lock_guard<std::recursive_mutex> lock(mutex_); | |||
auto it = allocated_blocks_.find(ptr); | |||
if (it == allocated_blocks_.end()) { | |||
REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid", | |||
device_id); | |||
GELOGE(PARAM_INVALID, "Invalid memory pointer: %p", ptr); | |||
REPORT_INNER_ERROR("E19999", "Param ptr not allocated before, device_id:%u, check invalid", device_id); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Param ptr not allocated before, device_id:%u", device_id); | |||
return ge::PARAM_INVALID; | |||
} | |||
Block *block = it->second; | |||
@@ -225,9 +222,8 @@ Block *CachingAllocator::FindFreeBlock(size_t size, uint8_t *org_ptr, uint32_t d | |||
Block key(device_id, size, org_ptr); | |||
BlockBin *bin = GetBlockBin(size); | |||
if (bin == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", | |||
size, device_id); | |||
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); | |||
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", size, device_id); | |||
GELOGE(ge::FAILED, "[Get][BlockBin] failed, size:%zu, device_id:%u", size, device_id); | |||
return nullptr; | |||
} | |||
std::lock_guard<std::recursive_mutex> lock(mutex_); | |||
@@ -258,9 +254,8 @@ Block *CachingAllocator::SplitBlock(Block *block, size_t size, BlockBin &bin, ui | |||
Block *remaining = block; | |||
Block *new_block = new (std::nothrow) Block(device_id, size, &bin, block->ptr); | |||
if (new_block == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", | |||
size, device_id); | |||
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); | |||
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", size, device_id); | |||
GELOGE(ge::FAILED, "[Alloc][Block] failed, size:%zu, device_id:%u", size, device_id); | |||
return block; | |||
} | |||
new_block->prev = remaining->prev; | |||
@@ -285,7 +280,7 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) { | |||
size_t free_cached_memory_size = FreeCachedBlocks(); | |||
memory_addr = memory_allocator_->MallocMemory(purpose, memory_size, device_id); | |||
if (memory_addr == nullptr) { | |||
GELOGE(ge::FAILED, "TryExtendCache failed, no enough memory for size = %zu, device_id = %u", memory_size, | |||
GELOGE(ge::FAILED, "[Malloc][Memory] failed, no enough memory for size = %zu, device_id = %u", memory_size, | |||
device_id); | |||
return ge::FAILED; | |||
} | |||
@@ -304,16 +299,14 @@ Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) { | |||
Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size, uint32_t device_id) { | |||
BlockBin *bin = GetBlockBin(size); | |||
if (bin == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", | |||
size, device_id); | |||
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size); | |||
REPORT_INNER_ERROR("E19999", "GetBlockBin fail, size:%zu, device_id:%u", size, device_id); | |||
GELOGE(ge::FAILED, "[Get][BlockBin] failed, size:%zu, device_id:%u", size, device_id); | |||
return ge::FAILED; | |||
} | |||
Block *block = new (std::nothrow) Block(device_id, size, bin, nullptr); | |||
if (block == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", | |||
size, device_id); | |||
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size); | |||
REPORT_CALL_ERROR("E19999", "New Block fail, size:%zu, device_id:%u", size, device_id); | |||
GELOGE(ge::FAILED, "[Alloc][Block] failed, size:%zu, device_id:%u", size, device_id); | |||
return ge::FAILED; | |||
} | |||
@@ -33,7 +33,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) { | |||
if (compute_graph_ == nullptr) { | |||
std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph(); | |||
if (graph == nullptr) { | |||
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!"); | |||
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[Get][Graph] failed, compute_graph by graphNode is NULL!"); | |||
return; | |||
} | |||
@@ -45,7 +45,7 @@ GraphContext::GraphContext(const GraphNodePtr &graph_node) { | |||
Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { | |||
if (graph_node == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param graph_node is nullptr, check invalid"); | |||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "graphNode is NULL!"); | |||
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[Check][Param] graphNode is NULL!"); | |||
return GE_GRAPH_PARAM_NULLPTR; | |||
} | |||
@@ -56,7 +56,7 @@ Status GraphContext::SetComputeGraph(const GraphNodePtr &graph_node) { | |||
std::shared_ptr<const ge::Graph> graph = graph_node->GetGraph(); | |||
if (graph == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param graph in graph_node is nullptr, check invalid"); | |||
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "compute_graph by graphNode is NULL!"); | |||
GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[Get][Graph] failed, compute_graph by graphNode is NULL!"); | |||
return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL; | |||
} | |||
@@ -73,14 +73,15 @@ Status GraphContext::Finalize() const { return SUCCESS; } | |||
Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTensor &returned_tensor) { | |||
if (var_data_name.empty()) { | |||
REPORT_INNER_ERROR("E19999", "Param var_data_name is empty, check invalid"); | |||
GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "Variable data name is empty!"); | |||
GELOGE(GE_GRAPH_EMPTY_STRING_NAME, "[Check][Param] Variable data name is empty!"); | |||
return GE_GRAPH_EMPTY_STRING_NAME; | |||
} | |||
if (GetVarNodeTensorTable().empty()) { | |||
REPORT_INNER_ERROR("E19999", "VarNodeTensorTable is empty, var_data_name:%s, check invalid", | |||
var_data_name.c_str()); | |||
GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, "VarNodeTensorTable is empty!"); | |||
GELOGE(GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE, "[Check][Param] VarNodeTensorTable is empty, var_data_name:%s", | |||
var_data_name.c_str()); | |||
return GE_GRAPH_EMPTY_VARIABLE_TENSOR_TABLE; | |||
} | |||
for (auto &var_record : GetVarNodeTensorTable()) { | |||
@@ -88,9 +89,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso | |||
returned_tensor.SetTensorDesc(var_record.second.GetTensorDesc()); | |||
auto ret = returned_tensor.SetData(var_record.second.GetData()); | |||
if (ret != SUCCESS) { | |||
REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s", | |||
var_data_name.c_str()); | |||
GELOGE(ret, "Set Tensor data failed!"); | |||
REPORT_INNER_ERROR("E19999", "SetData to tensor fail, var_data_name:%s", var_data_name.c_str()); | |||
GELOGE(ret, "[Set][Data] to Tensor failed, var_data_name:%s", var_data_name.c_str()); | |||
return ret; | |||
} | |||
@@ -100,7 +100,8 @@ Status GraphContext::GetVariableTensor(const std::string &var_data_name, GeTenso | |||
REPORT_INNER_ERROR("E19999", "VarRecord with data_name:%s does not exist, check invalid", | |||
var_data_name.c_str()); | |||
GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "VarRecord with data_name %s does NOT exist!", var_data_name.c_str()); | |||
GELOGE(GE_GRAPH_VARIABLE_DOES_NOT_EXIST, "[Check][Param] VarRecord with data_name %s does NOT exist!", | |||
var_data_name.c_str()); | |||
return GE_GRAPH_VARIABLE_DOES_NOT_EXIST; | |||
} | |||
@@ -46,7 +46,7 @@ GraphNode::GraphNode(GraphId graph_id) | |||
sem_(1) { | |||
graph_run_async_listener_ = MakeShared<RunAsyncListener>(); | |||
if (graph_run_async_listener_ == nullptr) { | |||
GELOGE(MEMALLOC_FAILED, "Make shared failed"); | |||
GELOGE(MEMALLOC_FAILED, "[New][RunAsyncListener] failed"); | |||
} | |||
} | |||
@@ -82,7 +82,8 @@ SubGraphInfo::~SubGraphInfo() { | |||
rt_ret = rtFreeHost(buffer_addr); | |||
buffer_addr = nullptr; | |||
if (rt_ret != RT_ERROR_NONE) { | |||
GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); | |||
GELOGE(rt_ret, "[Call][RtFreeHost] subgraph free buffer failed, modelId = %u", | |||
model_id_info_.model_id); | |||
} | |||
} | |||
} | |||
@@ -94,8 +95,8 @@ Status SubGraphInfo::FreeInOutBuffer() { | |||
rtError_t rt_ret; | |||
rt_ret = rtFreeHost(*iter); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail"); | |||
GELOGE(rt_ret, "[GraphManager] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); | |||
REPORT_CALL_ERROR("E19999", "Call rtFreeHost fail, ret:%d", rt_ret); | |||
GELOGE(rt_ret, "[Call][RtFreeHost] subgraph free buffer failed, modelId = %u", model_id_info_.model_id); | |||
buffer_addr_.erase(buffer_addr_.begin(), iter); | |||
return GE_GRAPH_FREE_FAILED; | |||
} | |||
@@ -131,7 +132,7 @@ Status GraphModelListener::OnComputeDone(uint32_t model_id, uint32_t task_id, ui | |||
uint32_t GraphModelListener::GetResultCode() const { | |||
if (!is_finished_) { | |||
REPORT_CALL_ERROR("E19999", "Model not run finish"); | |||
GELOGE(INTERNAL_ERROR, "[GraphManager] model not run finish."); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] model not run finish."); | |||
return INTERNAL_ERROR; | |||
} | |||
return result_code_; | |||
@@ -170,7 +171,9 @@ bool HasCalcOp(const ComputeGraphPtr &graph) { | |||
for (const auto &node : graph->GetAllNodes()) { | |||
OpDescPtr op_desc = node->GetOpDesc(); | |||
GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(FAILED, "Node GetOpDesc is nullptr"); return false); | |||
GE_IF_BOOL_EXEC(op_desc == nullptr, | |||
REPORT_INNER_ERROR("E19999", "GetOpDesc failed, Node GetOpDesc is nullptr"); | |||
GELOGE(FAILED, "[Get][OpDesc] failed, Node GetOpDesc is nullptr"); return false); | |||
if (calc_op_type.find(op_desc->GetType()) != calc_op_type.end()) { | |||
return true; | |||
} | |||
@@ -50,9 +50,7 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, size_t memory_size | |||
if (rtMalloc(reinterpret_cast<void **>(&memory_addr), memory_size, memory_type_) != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtMalloc fail, purpose:%s, size:%zu, device_id:%u", | |||
purpose.c_str(), memory_size, device_id); | |||
GELOGE(ge::INTERNAL_ERROR, | |||
"MemoryAllocator::MallocMemory device_id = %u," | |||
" size= %lu", | |||
GELOGE(ge::INTERNAL_ERROR, "[Malloc][Memory] failed, device_id = %u, size= %lu", | |||
device_id, memory_size); | |||
return nullptr; | |||
@@ -68,7 +66,7 @@ Status MemoryAllocator::FreeMemory(uint8_t *memory_addr, uint32_t device_id) con | |||
auto rtRet = rtFree(memory_addr); | |||
if (rtRet != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtFree fail, device_id:%u", device_id); | |||
GELOGE(rtRet, "MemoryAllocator::MallocMemory device_id = %u", device_id); | |||
GELOGE(rtRet, "[Call][RtFree] failed, device_id = %u", device_id); | |||
return RT_ERROR_TO_GE_STATUS(rtRet); | |||
} | |||
memory_addr = nullptr; | |||
@@ -88,10 +86,8 @@ uint8_t *MemoryAllocator::MallocMemory(const string &purpose, const string &memo | |||
if (memory_addr == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "Malloc Memory fail, purpose:%s, memory_key:%s, memory_size:%zu, device_id:%u", | |||
purpose.c_str(), memory_key.c_str(), memory_size, device_id); | |||
GELOGE(ge::INTERNAL_ERROR, | |||
"MemoryAllocator::MallocMemory failed," | |||
" memory_key[%s], size = %lu.", | |||
memory_key.c_str(), memory_size); | |||
GELOGE(ge::INTERNAL_ERROR, "[Malloc][Memory] failed, memory_key[%s], size = %lu, device_id:%u.", | |||
memory_key.c_str(), memory_size, device_id); | |||
return nullptr; | |||
} | |||
@@ -126,10 +122,8 @@ Status MemoryAllocator::FreeMemory(const string &memory_key, uint32_t device_id) | |||
if (FreeMemory(it->second.memory_addr_, device_id) != ge::SUCCESS) { | |||
REPORT_CALL_ERROR("E19999", "Free Memory fail, memory_key:%s, device_id:%u", | |||
memory_key.c_str(), device_id); | |||
GELOGE(ge::INTERNAL_ERROR, | |||
"MemoryAllocator::FreeMemory rtFree failed," | |||
" memory_key[%s]", | |||
memory_key.c_str()); | |||
GELOGE(ge::INTERNAL_ERROR, "[Free][Memory] failed, memory_key[%s], device_id:%u", | |||
memory_key.c_str(), device_id); | |||
return ge::INTERNAL_ERROR; | |||
} | |||
@@ -40,7 +40,8 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens | |||
if (dev_ptr == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param dev_ptr is nullptr, var_name:%s, session_id:%lu, " | |||
"check invalid", var_name.c_str(), session_id_); | |||
GELOGE(FAILED, "[GetVarAddr] dev_ptr is null!"); | |||
GELOGE(FAILED, "[Check][Param] Param dev_ptr is nullptr, var_name:%s, session_id:%lu", | |||
var_name.c_str(), session_id_); | |||
return FAILED; | |||
} | |||
std::string var_key = VarKey(var_name, tensor_desc); | |||
@@ -51,7 +52,8 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens | |||
REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, " | |||
"check invalid", var_key.c_str(), var_name.c_str(), | |||
session_id_); | |||
GELOGE(FAILED, "VarResource::GetVarAddr failed, var_key %s", var_key.c_str()); | |||
GELOGE(FAILED, "[Check][Param] var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu", | |||
var_key.c_str(), var_name.c_str(), session_id_); | |||
return FAILED; | |||
} | |||
@@ -109,7 +111,8 @@ ge::Status VarResource::SaveVarAddr(const std::string &var_name, const ge::GeTen | |||
REPORT_INNER_ERROR("E19999", "var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu, " | |||
"check invalid", var_key.c_str(), var_name.c_str(), | |||
session_id_); | |||
GELOGE(FAILED, "VarResource::SaveVarAddr, var_key %s save addr conflict", var_key.c_str()); | |||
GELOGE(FAILED, "[Check][Param] var_key:%s conflict in var_addr_mgr_map_, var_name:%s, session_id:%lu", | |||
var_key.c_str(), var_name.c_str(), session_id_); | |||
return FAILED; | |||
} | |||
@@ -145,14 +148,15 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O | |||
if (op_desc == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param op_desc is nullptr, var_name:%s, session_id:%lu, check invalid", | |||
var_name.c_str(), session_id_); | |||
GELOGE(FAILED, "[RenewCurVarDesc] renew var desc fail! input opdesc is null!"); | |||
GELOGE(FAILED, "[Check][Param] input opdesc is nullptr, var_name:%s, session_id:%lu", | |||
var_name.c_str(), session_id_); | |||
return FAILED; | |||
} | |||
ge::GeTensorDesc curr_desc; | |||
ge::Status ret = GetCurVarDesc(var_name, curr_desc); | |||
if (ret != SUCCESS) { | |||
GELOGE(FAILED, "[RenewCurVarDesc] Get var desc fail!"); | |||
GELOGE(FAILED, "[Get][CurVarDesc] fail, var_name:%s, session_id:%lu", var_name.c_str(), session_id_); | |||
return FAILED; | |||
} | |||
std::string key = VarKey(var_name, curr_desc); | |||
@@ -164,7 +168,8 @@ ge::Status VarResource::RenewCurVarDesc(const std::string &var_name, const ge::O | |||
REPORT_INNER_ERROR("E19999", "var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s), " | |||
"check invalid", key.c_str(), var_name.c_str(), | |||
session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
GELOGE(FAILED, "[RenewCurVarDesc] can't find ele with key [%s]", key.c_str()); | |||
GELOGE(FAILED, "[Check][Param] var_key:%s can't find in var_addr_mgr_map_, var_name:%s, session_id:%lu, op:%s(%s)", | |||
key.c_str(), var_name.c_str(), session_id_, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return FAILED; | |||
} | |||
auto val = iter->second; | |||
@@ -285,14 +290,15 @@ Status HbmMemResource::AssignVarMem(const std::string &var_name, uint64_t size, | |||
if (total_size_ < var_mem_size_) { | |||
REPORT_INNER_ERROR("E19999", "VarMemMaxSize:%lu < var_mem_size_:%lu, var_size:%lu, var_name:%s, check invalid" | |||
"", total_size_, var_mem_size_, size, var_name.c_str()); | |||
GELOGE(PARAM_INVALID, "total_size_: %lu is smaller than var_mem_size_: %lu", total_size_, var_mem_size_); | |||
GELOGE(PARAM_INVALID, "[Check][Param] total_size_:%lu is smaller than var_mem_size_:%lu, var_name:%s", | |||
total_size_, var_mem_size_, var_name.c_str()); | |||
return PARAM_INVALID; | |||
} | |||
uint64_t free_size = total_size_ - var_mem_size_; | |||
if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) { | |||
REPORT_INNER_ERROR("E19999", "free_size:%lu not enough, var_align_size:%lu, var_name:%s, check invalid", | |||
free_size, size, var_name.c_str()); | |||
GELOGE(PARAM_INVALID, "Out of memory : current var size[%lu] exceeds total var size[%lu]", | |||
GELOGE(PARAM_INVALID, "[Check][Param] Out of memory: current var size[%lu] exceeds total var size[%lu]", | |||
size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_); | |||
return PARAM_INVALID; | |||
} | |||
@@ -317,7 +323,7 @@ Status RdmaMemResource::AssignVarMem(const std::string &var_name, uint64_t size, | |||
if (buffer == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "malloc rdma memory fail, var_size:%lu, var_name:%s", | |||
size, var_name.c_str()); | |||
GELOGE(MEMALLOC_FAILED, "Failed to malloc rdma memory for node %s, size = %lu", var_name.c_str(), size); | |||
GELOGE(MEMALLOC_FAILED, "[Malloc][RdmaMemory] for node %s failed, size = %lu", var_name.c_str(), size); | |||
return MEMALLOC_FAILED; | |||
} | |||
address = static_cast<size_t>(reinterpret_cast<uintptr_t>(buffer)); | |||
@@ -468,7 +474,8 @@ int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { | |||
if (mem_resource == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Find no mem_resource in map, memory_type:%d, session_id:%lu", | |||
memory_type, session_id_); | |||
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); | |||
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%d, session_id:%lu", | |||
memory_type, session_id_); | |||
return 0; | |||
} | |||
return mem_resource->GetVarMemSize(); | |||
@@ -483,7 +490,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { | |||
if (mem_resource == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", | |||
memory_type, session_id_); | |||
GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); | |||
GELOGE(ge::INTERNAL_ERROR, "[Alloc][MemResource] failed, memory_type:%u, session_id:%lu", | |||
memory_type, session_id_); | |||
return ge::INTERNAL_ERROR; | |||
} else { | |||
mem_resource_map_[memory_type] = mem_resource; | |||
@@ -495,7 +503,8 @@ Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { | |||
if (mem_resource == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", | |||
memory_type, session_id_); | |||
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); | |||
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%u, session_id:%lu", | |||
memory_type, session_id_); | |||
return FAILED; | |||
} | |||
mem_resource->UpdateVarMemSize(mem_size); | |||
@@ -515,7 +524,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen | |||
if (result != ge::SUCCESS) { | |||
REPORT_CALL_ERROR("E19999", "Get size from tensor fail, var_name:%s, memory_type:%d, session_id:%lu", | |||
var_name.c_str(), memory_type, session_id_); | |||
GELOGE(result, "get size from TensorDesc failed"); | |||
GELOGE(result, "[Get][Size] from tensor fail, var_name:%s, memory_type:%u, session_id:%lu", | |||
var_name.c_str(), memory_type, session_id_); | |||
return result; | |||
} | |||
@@ -526,7 +536,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen | |||
if (mem_resource == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "memory_type:%d invalid or New MemResource fail, session_id:%lu", | |||
memory_type, session_id_); | |||
GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); | |||
GELOGE(ge::INTERNAL_ERROR, "[Alloc][MemResource] failed, memory_type:%u, session_id:%lu.", | |||
memory_type, session_id_); | |||
return ge::INTERNAL_ERROR; | |||
} else { | |||
mem_resource_map_[memory_type] = mem_resource; | |||
@@ -538,7 +549,8 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen | |||
if (mem_resource == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "MemResource is invalid, memory_type:%d, session_id:%lu", | |||
memory_type, session_id_); | |||
GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid, memory_type = %u.", memory_type); | |||
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] MemResource is invalid, memory_type:%u, session_id:%lu.", | |||
memory_type, session_id_); | |||
return ge::INTERNAL_ERROR; | |||
} | |||
@@ -567,14 +579,15 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen | |||
if (can_not_reuse_old_memory) { | |||
result = mem_resource->AssignVarMem(var_name, tensor_desc_size, session_id_, mem_offset); | |||
if (result != SUCCESS) { | |||
GELOGE(ge::INTERNAL_ERROR, "AssignVarMem by offset failed."); | |||
GELOGE(ge::INTERNAL_ERROR, "[Assign][VarMem] by offset failed, session_id:%lu.", session_id_); | |||
return ge::INTERNAL_ERROR; | |||
} | |||
result = var_resource_->SaveVarAddr( | |||
var_name, tensor_desc, reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(mem_offset)), memory_type); | |||
if (result != SUCCESS) { | |||
GELOGE(ge::INTERNAL_ERROR, "AssignVarMem by offset failed."); | |||
GELOGE(ge::INTERNAL_ERROR, "[Save][VarAddr] by offset failed, memory type:%u, session_id:%lu.", | |||
memory_type, session_id_); | |||
return ge::INTERNAL_ERROR; | |||
} | |||
} | |||
@@ -681,7 +694,8 @@ ge::Status VarManager::RenewCurVarDesc(const std::string &var_name, ge::OpDescPt | |||
REPORT_INNER_ERROR("E19999", "VarManager has not been init, op:%s(%s), session_id:%lu, check invalid", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), | |||
session_id_); | |||
GELOGE(ge::INTERNAL_ERROR, "VarManager has not been init."); | |||
GELOGE(ge::INTERNAL_ERROR, "[Check][Param] VarManager has not been init, op:%s(%s), session_id:%lu", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), session_id_); | |||
return ge::INTERNAL_ERROR; | |||
} | |||
return var_resource_->RenewCurVarDesc(var_name, std::move(op_desc)); | |||
@@ -729,10 +743,8 @@ ge::Status VarManager::MallocVarMemory(size_t memory_size) { | |||
const string purpose("variables and constant op memory in training network."); | |||
var_mem_base = MemManager::Instance().MemInstance(RT_MEMORY_HBM).MallocMemory(purpose, memory_key, var_memory_size); | |||
if (var_mem_base == nullptr) { | |||
GELOGE(ge::INTERNAL_ERROR, | |||
"VarManager::MallocVarMemory failed " | |||
"session_id = %s", | |||
memory_key.c_str()); | |||
GELOGE(ge::INTERNAL_ERROR, "[Malloc][VarMemory] failed, size:%zu, session_id:%s", | |||
var_memory_size, memory_key.c_str()); | |||
return ge::INTERNAL_ERROR; | |||
} | |||
return SUCCESS; | |||
@@ -812,7 +824,7 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) { | |||
string graph_memory_manager_malloc_max_size = it->second; | |||
ge::Status ret = ParseMemoryMallocSize(graph_memory_manager_malloc_max_size, graph_mem_max_size_); | |||
if (ret != SUCCESS) { | |||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse graph memory manager malloc max size failed."); | |||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Call][ParseMemoryMallocSize] failed, session id:%lu.", session_id_); | |||
return ge::GE_GRAPH_OPTIONS_INVALID; | |||
} | |||
GELOGI("The max size for graph mem is set to %zu", graph_mem_max_size_); | |||
@@ -825,7 +837,7 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) { | |||
string memory_var_manager_malloc_size = it->second; | |||
ge::Status ret = ParseMemoryMallocSize(memory_var_manager_malloc_size, var_mem_max_size_); | |||
if (ret != SUCCESS) { | |||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse memory var manager malloc size failed."); | |||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Call][ParseMemoryMallocSize] failed, session id:%lu.", session_id_); | |||
return ge::GE_GRAPH_OPTIONS_INVALID; | |||
} | |||
} | |||
@@ -834,8 +846,8 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) { | |||
if (var_mem_logic_base_ > kMaxMemorySize) { | |||
REPORT_INNER_ERROR("E19999", "var_login_base:%zu can not exeed limit:%zu, session_id:%lu, check invalid", | |||
var_mem_logic_base_, kMaxMemorySize, session_id_); | |||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kMemoryVarLogicBase : %zu can not exceed max memory size : %zu.", | |||
var_mem_logic_base_, kMaxMemorySize); | |||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Check][Param] kMemoryVarLogicBase:%zu can not exceed " | |||
"max memory size:%zu, session_id:%lu.", var_mem_logic_base_, kMaxMemorySize, session_id_); | |||
return ge::GE_GRAPH_OPTIONS_INVALID; | |||
} | |||
@@ -843,8 +855,8 @@ Status VarManager::SetMemoryMallocSize(const map<string, string> &options) { | |||
if (use_max_mem_size_ > kMaxMemorySize) { | |||
REPORT_INNER_ERROR("E19999", "all mem_use size:%zu can not exeed limit:%zu, session_id:%lu, check invalid", | |||
use_max_mem_size_, kMaxMemorySize, session_id_); | |||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "kUseMaxMemorySize : %zu can not exceed max memory size : %zu.", | |||
use_max_mem_size_, kMaxMemorySize); | |||
GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "[Check][Param] kUseMaxMemorySize:%zu can not exceed " | |||
"max memory size:%zu, session_id:%lu.", use_max_mem_size_, kMaxMemorySize, session_id_); | |||
return ge::GE_GRAPH_OPTIONS_INVALID; | |||
} | |||
GELOGI("Set memory malloc size successfully"); | |||
@@ -855,7 +867,7 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { | |||
if (memory_size.empty()) { | |||
REPORT_INNER_ERROR("E19999", "Param memory_size is empty, session_id:%lu, check invalid", | |||
session_id_); | |||
GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input is empty."); | |||
GELOGE(GE_GRAPH_OPTIONS_INVALID, "[Check][Param] Memory malloc size input is empty, session_id:%lu.", session_id_); | |||
return GE_GRAPH_OPTIONS_INVALID; | |||
} | |||
// split string by '*' | |||
@@ -882,7 +894,9 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { | |||
if (!isdigit(c)) { | |||
REPORT_INNER_ERROR("E19999", "Param memory_size:%s contains non digit, session_id:%lu, check invalid", | |||
memory_size.c_str(), session_id_); | |||
GELOGE(GE_GRAPH_OPTIONS_INVALID, "Memory malloc size input contains non digit."); | |||
GELOGE(GE_GRAPH_OPTIONS_INVALID, | |||
"[Check][Param] Memory malloc size:%s input contains non digit, session_id:%lu.", | |||
memory_size.c_str(), session_id_); | |||
return GE_GRAPH_OPTIONS_INVALID; | |||
} | |||
} | |||
@@ -891,13 +905,15 @@ Status VarManager::ParseMemoryMallocSize(string &memory_size, size_t &result) { | |||
REPORT_INNER_ERROR("E19999", "Param memory_size:%s will overflow after multi all, session_id:%lu, " | |||
"check invalid", memory_size.c_str(), | |||
session_id_); | |||
GELOGE(FAILED, "Input memory size is out of range."); | |||
GELOGE(FAILED, "[Check][Param] Param memory_size:%s will overflow after multi all, session_id:%lu", | |||
memory_size.c_str(), session_id_); | |||
return FAILED); | |||
if ((num > kMaxMemorySize) || (result * static_cast<size_t>(num) > kMaxMemorySize)) { | |||
REPORT_INNER_ERROR("E19999", "Param memory_size:%s after multi will exceed limit:%lu, session_id:%lu, " | |||
"check invalid", memory_size.c_str(), kMaxMemorySize, | |||
session_id_); | |||
GELOGE(FAILED, "Input memory size can not exceed max memory size : %zu.", kMaxMemorySize); | |||
GELOGE(FAILED, "[Check][Param] Input memory size can not exceed max memory size:%zu, session_id:%lu.", | |||
kMaxMemorySize, session_id_); | |||
return FAILED; | |||
} | |||
result *= static_cast<size_t>(num); | |||
@@ -1001,10 +1017,7 @@ VarManager *VarManagerPool::GetVarManager(uint64_t session_id) { | |||
VarManager *var_manager = new (std::nothrow) VarManager(session_id); | |||
if (var_manager == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "New VarManager fail, session_id:%lu", session_id); | |||
GELOGE(INTERNAL_ERROR, | |||
"VarManager::Instance find session by " | |||
"session_id[%lu] failed.", | |||
session_id); | |||
GELOGE(INTERNAL_ERROR, "[New][VarManager] fail, session_id:%lu", session_id); | |||
static VarManager new_var_manager(0); | |||
return &new_var_manager; | |||
} | |||
@@ -34,8 +34,8 @@ uint8_t *HostMemAllocator::Malloc(size_t size) { | |||
std::lock_guard<std::mutex> lock(mutex_); | |||
std::shared_ptr<AlignedPtr> aligned_ptr = MakeShared<AlignedPtr>(size); | |||
if (aligned_ptr == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "New AlignedPtr fail"); | |||
GELOGE(INTERNAL_ERROR, "make shared_ptr for AlignedPtr failed"); | |||
REPORT_INNER_ERROR("E19999", "New AlignedPtr fail, size:%zu", size); | |||
GELOGE(INTERNAL_ERROR, "[Call][MakeShared] for AlignedPtr failed, size:%zu", size); | |||
return nullptr; | |||
} | |||
allocated_blocks_[aligned_ptr->Get()] = { size, aligned_ptr }; | |||
@@ -46,7 +46,7 @@ uint8_t *HostMemAllocator::Malloc(size_t size) { | |||
Status HostMemAllocator::Free(const void *memory_addr) { | |||
if (memory_addr == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, check invalid"); | |||
GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); | |||
GELOGE(GE_GRAPH_FREE_FAILED, "[Check][Param] Invalid memory pointer"); | |||
return GE_GRAPH_FREE_FAILED; | |||
} | |||
@@ -54,7 +54,7 @@ Status HostMemAllocator::Free(const void *memory_addr) { | |||
auto it = allocated_blocks_.find(memory_addr); | |||
if (it == allocated_blocks_.end()) { | |||
REPORT_INNER_ERROR("E19999", "Memory_addr is not alloc before, check invalid"); | |||
GELOGE(PARAM_INVALID, "Invalid memory pointer"); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer:%p", memory_addr); | |||
return PARAM_INVALID; | |||
} | |||
it->second.second.reset(); | |||
@@ -39,9 +39,8 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) { | |||
rtMallocHostSharedMemoryOut output_para; | |||
rtError_t rt_ret = rtMallocHostSharedMemory(&input_para, &output_para); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X", | |||
rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api(rtMallocHostSharedMemory) failed, devid:[%u].", device_id); | |||
REPORT_CALL_ERROR("E19999", "Call rtMallocHostSharedMemory fail, ret:0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtMallocHostSharedMemory] failed, devid:[%u].", device_id); | |||
return GE_GRAPH_MEMORY_ALLOC_FAILED; | |||
} | |||
mem_info.fd = output_para.fd; | |||
@@ -60,9 +59,8 @@ Status SharedMemAllocator::DeAllocate(SharedMemInfo &mem_info) { | |||
mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address}; | |||
rtError_t rt_ret = rtFreeHostSharedMemory(&free_para); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X", | |||
rt_ret); | |||
GELOGE(RT_FAILED, "Call rt api(rtFreeHostSharedMemory) failed, ret: 0x%X.", rt_ret); | |||
REPORT_CALL_ERROR("E19999", "Call rtFreeHostSharedMemory fail, ret:0x%X", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtFreeHostSharedMemory] failed, ret:0x%X.", rt_ret); | |||
return RT_FAILED; | |||
} | |||
return ge::SUCCESS; | |||
@@ -78,7 +76,7 @@ Status HostMemManager::Initialize() { | |||
allocator_ = std::unique_ptr<SharedMemAllocator>(new (std::nothrow) SharedMemAllocator()); | |||
if (allocator_ == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "New SharedMemAllocator fail"); | |||
GELOGE(GE_GRAPH_MALLOC_FAILED, "Shared memory allocator init failed!"); | |||
GELOGE(GE_GRAPH_MALLOC_FAILED, "[New][SharedMemAllocator] failed!"); | |||
return GE_GRAPH_MALLOC_FAILED; | |||
} | |||
return SUCCESS; | |||
@@ -98,9 +96,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { | |||
std::lock_guard<std::recursive_mutex> lock(mutex_); | |||
auto iter = var_memory_base_map_.find(mem_info.op_name); | |||
if (iter != var_memory_base_map_.end()) { | |||
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", | |||
mem_info.op_name.c_str()); | |||
GELOGE(FAILED, "Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); | |||
REPORT_INNER_ERROR("E19999", "Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); | |||
GELOGE(FAILED, "[Check][Param] Host shared memory for op %s has been malloced", mem_info.op_name.c_str()); | |||
return FAILED; | |||
} | |||
mem_info.shm_name = OpNameToShmName(mem_info.op_name); | |||
@@ -113,9 +110,8 @@ Status HostMemManager::MallocSharedMemory(SharedMemInfo &mem_info) { | |||
Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_addr, uint64_t &data_size) { | |||
std::lock_guard<std::recursive_mutex> lock(mutex_); | |||
if (var_memory_base_map_.find(op_name) == var_memory_base_map_.end()) { | |||
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", | |||
op_name.c_str()); | |||
GELOGE(INTERNAL_ERROR, "Find host base base_addr failed,node name:%s!", op_name.c_str()); | |||
REPORT_INNER_ERROR("E19999", "MemInfo.op_name:%s can't find in var_memory_base_map_", op_name.c_str()); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] Find host base base_addr failed, node name:%s!", op_name.c_str()); | |||
return INTERNAL_ERROR; | |||
} | |||
base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_memory_base_map_[op_name].device_address)); | |||
@@ -50,9 +50,8 @@ Status RdmaRemoteRegister(const std::vector<HostVarInfo> &var_info, rtMemType_t | |||
path.append(file_name); | |||
string canonical_path = RealPath(path.c_str()); | |||
if (canonical_path.empty()) { | |||
REPORT_INNER_ERROR("E19999", "canonical_path:%s is empty, check invalid", | |||
canonical_path.c_str()); | |||
GELOGE(FAILED, "Failed to get realpath of %s", path.c_str()); | |||
REPORT_INNER_ERROR("E19999", "canonical_path:%s is empty, check invalid", canonical_path.c_str()); | |||
GELOGE(FAILED, "[Call][RealPath] Failed to get realpath of %s", path.c_str()); | |||
return FAILED; | |||
} | |||
GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str()); | |||
@@ -69,15 +68,14 @@ Status RdmaRemoteRegister(const std::vector<HostVarInfo> &var_info, rtMemType_t | |||
if (hcom_remote_mem_register == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "Symbol HcomRegRemoteAccessMem can't find in %s, check invalid", | |||
canonical_path.c_str()); | |||
GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function."); | |||
GELOGE(FAILED, "[Check][Param] Symbol HcomRegRemoteAccessMem can't find in %s", canonical_path.c_str()); | |||
return FAILED; | |||
} | |||
HcclResult hccl_ret = hcom_remote_mem_register(reg_addrs.get(), table_len); | |||
if (hccl_ret != HCCL_SUCCESS) { | |||
REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,", | |||
hccl_ret); | |||
GELOGE(HCCL_E_INTERNAL, "Rdma mem register failed, ret: 0x%X", hccl_ret); | |||
REPORT_CALL_ERROR("E19999", "Call hcom_remote_mem_register failed, ret:%d,", hccl_ret); | |||
GELOGE(HCCL_E_INTERNAL, "[Call][HcomRemoteMemRegister] Rdma mem register failed, ret:0x%X", hccl_ret); | |||
return HCCL_E_INTERNAL; | |||
} | |||
return SUCCESS; | |||
@@ -88,14 +86,14 @@ Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uin | |||
uint32_t type_size = 0; | |||
bool result = TypeUtils::GetDataTypeLength(tensor_info.data_type, type_size); | |||
if (!result) { | |||
GELOGE(GRAPH_FAILED, "GetDataTypeLength failed, data_type=(%s).", | |||
GELOGE(GRAPH_FAILED, "[Get][DataTypeLength] failed, data_type=(%s).", | |||
TypeUtils::DataTypeToSerialString(tensor_info.data_type).c_str()); | |||
return GRAPH_FAILED; | |||
} | |||
memory_size = type_size; | |||
for (auto dim : tensor_info.dims) { | |||
if (dim <= 0) { | |||
GELOGE(GRAPH_FAILED, "Tensor dims should be positive"); | |||
GELOGE(GRAPH_FAILED, "[Check][Param] Tensor dims should be positive"); | |||
return GRAPH_FAILED; | |||
} | |||
memory_size *= dim; | |||
@@ -103,7 +101,7 @@ Status MallocSharedMemory(const TensorInfo &tensor_info, uint64_t &dev_addr, uin | |||
SharedMemInfo mem_info(tensor_info.var_name, memory_size); | |||
Status ret = HostMemManager::Instance().MallocSharedMemory(mem_info); | |||
if (ret != SUCCESS) { | |||
GELOGE(GRAPH_FAILED, "MallocSharedMemory failed op name [%s]", tensor_info.var_name.c_str()); | |||
GELOGE(GRAPH_FAILED, "[Malloc][SharedMemory] failed, op name [%s]", tensor_info.var_name.c_str()); | |||
return GRAPH_FAILED; | |||
} | |||
dev_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(mem_info.device_address)); | |||
@@ -45,7 +45,7 @@ Status EventManager::Init(size_t event_num) { | |||
void EventManager::Release() noexcept { | |||
for (size_t i = 0; i < this->event_list_.size(); ++i) { | |||
rtError_t rt_ret = rtEventDestroy(this->event_list_[i]); | |||
RETURN_IF_COND_NOT_MET(rt_ret == RT_ERROR_NONE, "Destroy event failed, idx is %zu, ret is 0x%x.", i, rt_ret); | |||
RETURN_IF_COND_NOT_MET(rt_ret == RT_ERROR_NONE, "[Destroy][Event] failed, idx is %zu, ret is 0x%x.", i, rt_ret); | |||
} | |||
this->event_list_.clear(); | |||
@@ -82,8 +82,8 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) { | |||
auto device_id = GetContext().DeviceId(); | |||
GELOGD("Init Rdma Memory with size [%zu] for devid:[%u]", mem_size, device_id); | |||
if (rdma_base_addr_ != nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); | |||
GELOGE(GE_MULTI_INIT, "Rdma pool has been malloced"); | |||
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is not nullptr, devid:%u, check invalid", device_id); | |||
GELOGE(GE_MULTI_INIT, "[Check][Param] Rdma pool has been malloced, devid:%u", device_id); | |||
return GE_MULTI_INIT; | |||
} | |||
const std::string purpose = "Memory for rdma pool."; | |||
@@ -95,15 +95,15 @@ Status RdmaPoolAllocator::InitMemory(size_t mem_size) { | |||
rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id); | |||
if (rdma_base_addr_ == nullptr) { | |||
GELOGE(GE_GRAPH_MALLOC_FAILED, "Rdma pool memory malloc failed"); | |||
GELOGE(GE_GRAPH_MALLOC_FAILED, "[Malloc][Memory] failed, size:%zu, device_id:%u", mem_size, device_id); | |||
return GE_GRAPH_MALLOC_FAILED; | |||
} | |||
rdma_mem_size_ = mem_size; | |||
// Init with a base block. | |||
auto *base_block = new (std::nothrow) Block(device_id, mem_size, rdma_base_addr_); | |||
if (base_block == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "New Block failed, device_id:%u", device_id); | |||
GELOGE(GE_GRAPH_MALLOC_FAILED, "Block malloc failed"); | |||
REPORT_CALL_ERROR("E19999", "New Block failed, size:%zu, device_id:%u", mem_size, device_id); | |||
GELOGE(GE_GRAPH_MALLOC_FAILED, "[New][Block] failed, size:%zu, device_id:%u", mem_size, device_id); | |||
return GE_GRAPH_MALLOC_FAILED; | |||
} | |||
block_bin_.insert(base_block); | |||
@@ -123,7 +123,7 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { | |||
if (block->ptr == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Rdmapool memory address is nullptr, device_id:%u, check invalid", | |||
device_id); | |||
GELOGE(INTERNAL_ERROR, "Rdmapool memory address is nullptr."); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] Rdmapool memory address is nullptr, device_id:%u", device_id); | |||
return nullptr; | |||
} | |||
allocated_blocks_.emplace(block->ptr, block); | |||
@@ -155,9 +155,8 @@ uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) { | |||
Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { | |||
GELOGI("Free rdma memory, device id = %u", device_id); | |||
if (memory_addr == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid", | |||
device_id); | |||
GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer"); | |||
REPORT_INNER_ERROR("E19999", "Param memory_addr is nullptr, device_id:%u, check invalid", device_id); | |||
GELOGE(GE_GRAPH_FREE_FAILED, "[Check][Param] Invalid memory pointer, device id:%u", device_id); | |||
return GE_GRAPH_FREE_FAILED; | |||
} | |||
@@ -166,7 +165,7 @@ Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) { | |||
if (it == allocated_blocks_.end()) { | |||
REPORT_INNER_ERROR("E19999", "Param memory_addr is not allocated before, device_id:%u, " | |||
"check invalid", device_id); | |||
GELOGE(PARAM_INVALID, "Invalid memory pointer"); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Invalid memory pointer, device id:%u", device_id); | |||
return PARAM_INVALID; | |||
} | |||
@@ -209,7 +208,7 @@ void RdmaPoolAllocator::MergeBlocks(Block *dst, Block *src) { | |||
Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) { | |||
if (rdma_base_addr_ == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param rdma_base_addr_ is nullptr, check invalid"); | |||
GELOGE(INTERNAL_ERROR, "Rdma base addr is nullptr."); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] Rdma base addr is nullptr."); | |||
return INTERNAL_ERROR; | |||
} | |||
base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(rdma_base_addr_)); | |||
@@ -37,7 +37,8 @@ class RtContextSwitchGuard { | |||
if (ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtCtxGetCurrent failed, device_id:%u, ret:0x%X,", | |||
device_id, ret); | |||
GELOGE(RT_FAILED, "Failed to get current context from rt, error-code %d", ret); | |||
GELOGE(RT_FAILED, "[Call][RtCtxGetCurrent] Failed to get current context, device_id:%u, ret:0x%X", | |||
device_id, ret); | |||
return; | |||
} | |||
@@ -45,15 +46,14 @@ class RtContextSwitchGuard { | |||
if (ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtCtxCreate failed, device_id:%u, ret:0x%X,", | |||
device_id, ret); | |||
GELOGE(RT_FAILED, "Failed to create new context for device %u, error-code %d", device_id, ret); | |||
GELOGE(RT_FAILED, "[Call][RtCtxCreate] Failed to create new context for device:%u, ret:%d", device_id, ret); | |||
return; | |||
} | |||
ret = rtCtxSetCurrent(current_); | |||
if (ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X,", | |||
device_id, ret); | |||
GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id); | |||
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, device_id:%u, ret:0x%X", device_id, ret); | |||
GELOGE(RT_FAILED, "[Call][RtCtxSetCurrent] failed, device_id:%u, ret:0x%X", device_id, ret); | |||
return; | |||
} | |||
GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_); | |||
@@ -80,7 +80,7 @@ int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) { | |||
if (var_size <= 0) { | |||
REPORT_INNER_ERROR("E19999", "Data type:%s in desc, it's size:%ld < 0, check invalid", | |||
TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str(), var_size); | |||
GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s", | |||
GELOGE(PARAM_INVALID, "[Calc][VarDataSize] by data type %s failed.", | |||
TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); | |||
return -1; | |||
} | |||
@@ -99,7 +99,8 @@ Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_res | |||
if (ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), | |||
var->GetType().c_str(), trans_result.length, ret); | |||
GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", trans_result.length); | |||
GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, op:%s(%s), size:%lu, ret:0x%X,", var->GetName().c_str(), | |||
var->GetType().c_str(), trans_result.length, ret); | |||
return RT_FAILED; | |||
} | |||
return SUCCESS; | |||
@@ -111,21 +112,17 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt | |||
GE_CHECK_NOTNULL(var); | |||
auto ret = VarManager::Instance(session_id)->GetVarAddr(var->GetName(), input_desc, &var_logic); | |||
if (ret != SUCCESS) { | |||
GELOGE(INTERNAL_ERROR, | |||
"Failed to copy var %s from device, can not find it" | |||
" from var manager %u", | |||
var->GetName().c_str(), ret); | |||
GELOGE(INTERNAL_ERROR, "[Get][VarAddr] failed, node:%s, session_id:%lu, ret:%d", | |||
var->GetName().c_str(), session_id, ret); | |||
return INTERNAL_ERROR; | |||
} | |||
uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); | |||
if (var_addr == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu,", | |||
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, op:%s(%s), session_id:%lu", | |||
RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); | |||
GELOGE(INTERNAL_ERROR, | |||
"Failed to copy var %s from device, cant not get " | |||
"var addr from logic addr %p", | |||
var->GetName().c_str(), var_logic); | |||
GELOGE(INTERNAL_ERROR, "[Get][VarMemoryAddr] failed, mem_type:%d, op:%s(%s), session_id:%lu", | |||
RT_MEMORY_HBM, var->GetName().c_str(), var->GetType().c_str(), session_id); | |||
return INTERNAL_ERROR; | |||
} | |||
@@ -136,9 +133,10 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt | |||
std::unique_ptr<uint8_t[]> var_host(new(std::nothrow) uint8_t[var_size_bytes]); | |||
if (var_host == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu,", | |||
REPORT_CALL_ERROR("E19999", "New host memory failed, size:%ld, op:%s(%s), session_id:%lu", | |||
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); | |||
GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %ld", var_size_bytes); | |||
GELOGE(OUT_OF_MEMORY, "[New][Memory] for rt-host failed, size:%ld, op:%s(%s), session_id:%lu", | |||
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id); | |||
return OUT_OF_MEMORY; | |||
} | |||
@@ -147,10 +145,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt | |||
if (ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", | |||
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); | |||
GELOGE(RT_FAILED, | |||
"Failed to copy var memory from device, var %s, size %ld," | |||
" rt-error-code %u", | |||
var->GetName().c_str(), var_size_bytes, ret); | |||
GELOGE(RT_FAILED, "[Call][RtMemcpy] failed, size:%ld, op:%s(%s), session_id:%lu, ret:0x%X", | |||
var_size_bytes, var->GetName().c_str(), var->GetType().c_str(), session_id, ret); | |||
return RT_FAILED; | |||
} | |||
@@ -197,9 +193,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats | |||
formats::ShapeToString(src_shape).c_str(), | |||
formats::ShapeToString(dst_shape).c_str(), | |||
TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); | |||
GELOGE(INTERNAL_ERROR, | |||
"Failed to trans format from %s to %s, shape %s to %s, " | |||
"data type %s error code %u", | |||
GELOGE(INTERNAL_ERROR, "[Trans][Format] from %s to %s, shape %s to %s failed, data type %s error code %u", | |||
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(), | |||
formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(), | |||
TypeUtils::DataTypeToSerialString(data_type).c_str(), ret); | |||
@@ -221,7 +215,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats | |||
TypeUtils::DataTypeToSerialString(src_data_type).c_str(), | |||
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), | |||
formats::ShapeToString(input_shape).c_str(), src_data_size, ret); | |||
GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %ld, error code %u", | |||
GELOGE(INTERNAL_ERROR, "[Trans][DataType] from %s to %s failed, input shape %s, data size %ld, error code %u", | |||
TypeUtils::DataTypeToSerialString(src_data_type).c_str(), | |||
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), | |||
src_data_size, ret); | |||
@@ -230,7 +224,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats | |||
} else { | |||
REPORT_INNER_ERROR("E19999", "Trans var data failed, the trans type %s does not supported, check invalid", | |||
trans_info.node_type.c_str()); | |||
GELOGE(UNSUPPORTED, "Failed to trans var data, the trans type %s does not supported", | |||
GELOGE(UNSUPPORTED, "[Trans][VarData] failed, the trans type %s does not supported", | |||
trans_info.node_type.c_str()); | |||
return UNSUPPORTED; | |||
} | |||
@@ -255,10 +249,8 @@ Status ReAssignVarAddr(uint64_t session_id, | |||
uint8_t *var_logic = nullptr; | |||
Status ret = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &var_logic); | |||
if (ret != SUCCESS) { | |||
GELOGE(INTERNAL_ERROR, | |||
"Failed to get var %s device addr, can not find it" | |||
" from var manager %u", | |||
var_name.c_str(), ret); | |||
GELOGE(INTERNAL_ERROR, "[Get][VarAddr] failed, var name:%s, session_id:%lu, ret:%u", | |||
var_name.c_str(), session_id, ret); | |||
return INTERNAL_ERROR; | |||
} | |||
@@ -266,7 +258,8 @@ Status ReAssignVarAddr(uint64_t session_id, | |||
if (var_addr == nullptr) { | |||
REPORT_CALL_ERROR("E19999", "Get variable memory addr failed, mem_type:%d, var_name:%s, session_id:%lu,", | |||
RT_MEMORY_HBM, var_name.c_str(), session_id); | |||
GELOGE(INTERNAL_ERROR, "Failed to convert var %s logic addr to real addr", var_name.c_str()); | |||
GELOGE(INTERNAL_ERROR, "[Get][VarMemoryAddr] failed, mem_type:%d, var_name:%s, session_id:%lu", | |||
RT_MEMORY_HBM, var_name.c_str(), session_id); | |||
return INTERNAL_ERROR; | |||
} | |||
*var_device = var_addr; | |||
@@ -293,9 +286,8 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t | |||
// Sync var data from device | |||
std::unique_ptr<uint8_t[]> var_data; | |||
if (trans_road.empty()) { | |||
REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid", | |||
session_id); | |||
GELOGE(INTERNAL_ERROR, "Failed to get trans_road, trans_road is empty."); | |||
REPORT_INNER_ERROR("E19999", "Param trans_road is empty, session_id:%lu, check invalid", session_id); | |||
GELOGE(INTERNAL_ERROR, "[Check][Param] trans_road is empty, session_id:%lu", session_id); | |||
return INTERNAL_ERROR; | |||
} | |||
const GeTensorDesc &input_desc = trans_road.begin()->input; | |||
@@ -307,7 +299,7 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t | |||
formats::TransResult trans_result{}; | |||
ret = TransVarOnHost(var_data.get(), trans_road, trans_result); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Failed to trans var data on host, error code %u", ret); | |||
GELOGE(ret, "[Call][TransVarOnHost] failed, session_id:%lu, ret:%u", session_id, ret); | |||
return ret; | |||
} | |||
@@ -319,14 +311,15 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t | |||
/// TensorDesc needs to be removed. This change is large and needs to be performed step by step. | |||
ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Failed to re-assign memory on device, size %zu", trans_result.length); | |||
GELOGE(ret, "[Call][ReAssignVarAddr] failed, session id:%lu, op:%s, ret:%u", | |||
session_id, var->GetName().c_str(), ret); | |||
return ret; | |||
} | |||
// sync new data to device | |||
ret = CopyVarToDevice(var, trans_result, var_device); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "Failed to send var data to device"); | |||
GELOGE(ret, "[Call][CopyVarToDevice] failed, var:%s, ret:%u", var->GetName().c_str(), ret); | |||
return ret; | |||
} | |||
@@ -350,7 +343,10 @@ Status TransTensor(uint8_t *var_data, const NodePtr &var_src, const NodePtr &var | |||
TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), | |||
TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), | |||
src_data_shape_size, ret); | |||
GELOGE(INTERNAL_ERROR, "trans var data on host failed"); | |||
GELOGE(INTERNAL_ERROR, "[Trans][DataType] from %s to %s failed, data size %ld, ret:%u", | |||
TypeUtils::DataTypeToSerialString(src_data_datatype).c_str(), | |||
TypeUtils::DataTypeToSerialString(dst_data_datatype).c_str(), | |||
src_data_shape_size, ret); | |||
return ret; | |||
}); | |||
return SUCCESS; | |||
@@ -366,9 +362,11 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src, | |||
/// need copy value from var_fp32 to var_fp16. | |||
/// [opdesc of var_src and var_dst are checked before passed in, no need to check if they are nullptr] | |||
GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, | |||
REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is empty, session_id:%lu, device_id:%u, " | |||
REPORT_INNER_ERROR("E19999", "Param var_src or var_dst is nullptr, session_id:%lu, device_id:%u, " | |||
"check invalid", session_id, device_id); | |||
GELOGE(FAILED, "node var is nullptr"); return FAILED); | |||
GELOGE(FAILED, "[Check][Param] Param var_src or var_dst is nullptr, session_id:%lu, device_id:%u", | |||
session_id, device_id); | |||
return FAILED); | |||
// src_node output_desc (fp32) | |||
GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0); | |||
auto src_data_type = output_desc.GetDataType(); | |||
@@ -390,31 +388,45 @@ Status CopyTensorFromSrcVarNode(const NodePtr &var_src, | |||
RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id); | |||
// copy from src_node | |||
auto ret = CopyVarFromDevice(session_id, var_src, var_src_data, output_desc); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "Copy Var From Device failed"); return ret); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||
GELOGE(FAILED, "[Call][CopyVarFromDevice] failed, session id:%lu, var_src:%s", | |||
session_id, var_src->GetName().c_str()); | |||
return ret); | |||
// trans dtype | |||
formats::TransResult trans_result{}; | |||
ret = TransTensor(var_src_data.get(), var_src, var_dst, trans_result); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "trans var data on host failed"); return ret); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||
GELOGE(INTERNAL_ERROR, "[Trans][Tensor] failed, var_src:%s, var_dst:%s", | |||
var_src->GetName().c_str(), var_dst->GetName().c_str()); | |||
return ret); | |||
// reset src value. | |||
void *var_device = nullptr; | |||
ret = ReAssignVarAddr(session_id, var_dst->GetName(), dst_tensor_desc, &var_device); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "assign mem failed"); return ret); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||
GELOGE(INTERNAL_ERROR, "[Call][ReAssignVarAddr] failed, session id:%lu, var_dst:%s", | |||
session_id, var_dst->GetName().c_str()); | |||
return ret); | |||
// copy to device | |||
ret = CopyVarToDevice(var_dst, trans_result, var_device); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Failed to send var data to device"); return ret); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||
GELOGE(ret, "[Call][CopyVarToDevice] failed, var_dst:%s, ret:%u", | |||
var_dst->GetName().c_str(), ret); | |||
return ret); | |||
return SUCCESS; | |||
} | |||
} // namespace | |||
Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, | |||
uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id) { | |||
GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "dst addr is null. "); | |||
GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "[Check][Param] dst addr is nullptr."); | |||
uint8_t *src_host_addr = nullptr; | |||
int64_t src_addr_size = 0; | |||
GE_MAKE_GUARD_RTMEM(src_host_addr); | |||
GE_CHK_STATUS_RET(SyncTensorToHost(var_name, src_tensor_desc, &src_host_addr, src_addr_size, session_id)); | |||
GELOGI("src_addr_size: %ld, dst_addr_size: %ld", src_addr_size, dst_addr_size); | |||
GE_CHK_BOOL_RET_STATUS(src_addr_size == dst_addr_size, FAILED, "var data size is not equal broadcast "); | |||
GE_CHK_BOOL_RET_STATUS(src_addr_size == dst_addr_size, FAILED, | |||
"[Check][Param] src_addr_size:%ld not equal to dst_addr_size:%ld", | |||
src_addr_size, dst_addr_size); | |||
GE_CHK_RT_RET(rtMemcpy(dst_addr, dst_addr_size, src_host_addr, src_addr_size, RT_MEMCPY_HOST_TO_DEVICE)); | |||
return SUCCESS; | |||
@@ -422,7 +434,7 @@ Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge | |||
Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name, | |||
const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) { | |||
GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "src addr is null. "); | |||
GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "[Check][Param] src addr is nullptr. "); | |||
uint8_t *host_addr = nullptr; | |||
GE_MAKE_GUARD_RTMEM(host_addr); | |||
GE_CHK_RT_RET(rtMallocHost(reinterpret_cast<void **>(&host_addr), src_addr_size)); | |||
@@ -436,7 +448,7 @@ Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_a | |||
Status TransVarDataUtils::SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, | |||
uint8_t **host_addr, int64_t &src_tensor_size, uint64_t session_id) { | |||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "get size from TensorDesc failed"); | |||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "[Get][Size] from TensorDesc failed"); | |||
uint8_t *src_addr = nullptr; | |||
GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, src_tensor_desc, &src_addr)); | |||
@@ -493,7 +505,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||
if (rt_ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtCtxSetCurrent failed, session_id:%lu, graph_id:%u, ret:0x%X,", | |||
session_id, graph_id, rt_ret); | |||
GELOGE(RT_FAILED, "Failed to set context, error_code is: 0x%X.", rt_ret); | |||
GELOGE(RT_FAILED, "[Call][RtCtxSetCurrent] failed, session_id:%lu, graph_id:%u, ret:0x%X,", | |||
session_id, graph_id, rt_ret); | |||
return RT_ERROR_TO_GE_STATUS(rt_ret); | |||
} | |||
uint32_t allocated_graph_id = 0; | |||
@@ -501,8 +514,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||
if (ret != SUCCESS) { | |||
REPORT_CALL_ERROR("E19999", "Get allocated GraphId failed, session_id:%lu, graph_id:%u, ret:0x%X,", | |||
session_id, graph_id, ret); | |||
GELOGE(INTERNAL_ERROR, "var has not been allocated, node:%s, graph_id:%u.", node->GetName().c_str(), | |||
graph_id); | |||
GELOGE(INTERNAL_ERROR, "[Get][AllocatedGraphId] failed, node:%s, graph_id:%u.", | |||
node->GetName().c_str(), graph_id); | |||
return INTERNAL_ERROR; | |||
} | |||
uint32_t changed_graph_id = 0; | |||
@@ -518,7 +531,8 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||
} | |||
ret = TransVarData(node, *trans_road, session_id); | |||
if (ret != SUCCESS) { | |||
GELOGE(INTERNAL_ERROR, "TransVarData failed, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id); | |||
GELOGE(INTERNAL_ERROR, "[Trans][VarData] failed, node:%s, graph_id:%u, session_id:%lu.", | |||
node->GetName().c_str(), graph_id, session_id); | |||
return INTERNAL_ERROR; | |||
} | |||
VarManager::Instance(session_id)->RemoveChangedGraphId(node->GetName()); | |||
@@ -527,7 +541,7 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||
}, | |||
node, session_id, context, graph_id, ErrorManager::GetInstance().GetErrorManagerContext()); | |||
if (!f.valid()) { | |||
GELOGE(FAILED, "Future is invalid"); | |||
GELOGE(FAILED, "[Check][Param] Future is invalid, session id:%lu, graph id:%u", session_id, graph_id); | |||
return FAILED; | |||
} | |||
vector_future.push_back(std::move(f)); | |||
@@ -537,7 +551,7 @@ Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, | |||
for (size_t i = 0; i < vector_future.size(); ++i) { | |||
ret_status = vector_future[i].get(); | |||
if (ret_status != SUCCESS) { | |||
GELOGE(ret_status, "TransAllVarData:: trans %zu vardata failed", i); | |||
GELOGE(ret_status, "[Check][Param] trans %zu vardata failed", i); | |||
return ret_status; | |||
} | |||
} | |||
@@ -550,7 +564,8 @@ Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint | |||
if (compute_graph == nullptr) { | |||
REPORT_INNER_ERROR("E19999", "Param compute_graph is nullptr, session_id:%lu, device_id:%u, check invalid", | |||
session_id, device_id); | |||
GELOGE(FAILED, "compute_graph is nullptr"); | |||
GELOGE(FAILED, "[Check][Param] compute_graph is nullptr, session_id:%lu, device_id:%u", | |||
session_id, device_id); | |||
return FAILED; | |||
} | |||
@@ -568,7 +583,10 @@ Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint | |||
GELOGI("current_var_node__: [%s] copy_from_var_node__: [%s].", node->GetName().c_str(), | |||
src_node->GetName().c_str()); | |||
auto ret = CopyTensorFromSrcVarNode(src_node, node, session_id, device_id); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "copy tensor failed!"); return FAILED); | |||
GE_IF_BOOL_EXEC(ret != SUCCESS, | |||
GELOGE(FAILED, "[Copy][Tensor] failed, src_node:%s, node:%s, session_id:%lu, device_id:%u", | |||
src_node->GetName().c_str(), node->GetName().c_str(), session_id, device_id); | |||
return FAILED); | |||
// only copy once | |||
(void) ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value | |||
} | |||
@@ -63,17 +63,15 @@ Status Debug::DumpDevMem(const char *file, const void *addr, int64_t size) { | |||
uint8_t *host_addr = nullptr; | |||
rtError_t ret = rtMallocHost(reinterpret_cast<void **>(&host_addr), size); | |||
if (ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret: 0x%X", | |||
size, ret); | |||
GELOGE(FAILED, "Call rt api rtMallocHost failed, ret: 0x%X", ret); | |||
REPORT_CALL_ERROR("E19999", "Call rtMallocHost failed, size:%zu, ret:0x%X", size, ret); | |||
GELOGE(FAILED, "[Call][RtMallocHost] failed, size:%zu, ret:0x%X", size, ret); | |||
return FAILED; | |||
} | |||
GE_MAKE_GUARD_RTMEM(host_addr); | |||
ret = rtMemcpy(host_addr, size, addr, size, RT_MEMCPY_DEVICE_TO_HOST); | |||
if (ret != RT_ERROR_NONE) { | |||
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret: 0x%X", | |||
size, ret); | |||
GELOGE(FAILED, "Call rt api rtMemcpy failed, ret: 0x%X", ret); | |||
REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, size:%zu, ret:0x%X", size, ret); | |||
GELOGE(FAILED, "[Call][RtMemcpy] failed, size:%zu, ret:0x%X", size, ret); | |||
return FAILED; | |||
} | |||
@@ -28,7 +28,8 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) { | |||
GE_CHECK_NOTNULL(op_desc); | |||
if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid."); | |||
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] failed, op:%s(%s).", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
GELOGI("GetHcclDataType start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
@@ -40,10 +41,10 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||
if (op_desc->GetType() == HCOMRECEIVE) { | |||
bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type); | |||
if (ret == false) { | |||
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", | |||
HCOM_ATTR_DATA_TYPE.c_str(), | |||
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", HCOM_ATTR_DATA_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
GELOGE(PARAM_INVALID, "op:HcomReceive, op desc no attr: dtype."); | |||
GELOGE(PARAM_INVALID, "[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_DATA_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
} else { | |||
@@ -55,13 +56,11 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||
auto iter = kConstOpHcclDataType.find(static_cast<int64_t>(src_data_type)); | |||
if (iter == kConstOpHcclDataType.end()) { | |||
REPORT_INNER_ERROR("E19999", "Attr:%s in op:%s(%s), value data_type:%s, not support in kConstOpHcclDataType now, " | |||
"check invalid", HCOM_ATTR_DATA_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), | |||
ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); | |||
GELOGE(PARAM_INVALID, | |||
"HcomOmeUtil:: Node: %s Optype: %s HcomDataType cann't support! Current Davinci Data Type : %s", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), | |||
ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); | |||
"check invalid", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(), | |||
op_desc->GetType().c_str(), ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in op:%s(%s), value data_type:%s, " | |||
"not support in kConstOpHcclDataType now", HCOM_ATTR_DATA_TYPE.c_str(), op_desc->GetName().c_str(), | |||
op_desc->GetType().c_str(), ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str()); | |||
return PARAM_INVALID; | |||
} | |||
@@ -73,7 +72,7 @@ Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc, | |||
Status HcomOmeUtil::GetHcclTypeSize(HcclDataType data_type, int32_t &size) { | |||
auto iter = kConstOpHcclDataTypeSize.find(data_type); | |||
GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID, | |||
"HcomOmeUtil::HcomDataTypeSize , No DataTypeSize!"); | |||
"[Check][Param] param data_type:%d not find", data_type); | |||
size = iter->second; | |||
return SUCCESS; | |||
@@ -83,21 +82,22 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType | |||
int &count) { | |||
GE_CHECK_NOTNULL(op_desc); | |||
if (!IsHCOMOp(op_desc->GetType())) { | |||
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Hcom operator."); | |||
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op, check invalid", op_desc->GetName().c_str(), | |||
op_desc->GetType().c_str()); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Op:%s(%s) is not hcom op", op_desc->GetName().c_str(), | |||
op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
int64_t total_size = 0; | |||
int64_t align_size = 512; | |||
int32_t size = 0; | |||
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "GetHcomCount: GetHcclTypeSize fail!"); | |||
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "[Get][HcclTypeSize] fail, datatype:%d", data_type); | |||
if (op_desc->GetType() == HCOMRECEIVE) { | |||
for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { | |||
int64_t output_size = 0; | |||
GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); | |||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), output_size), | |||
"Get size from TensorDesc failed, op: %s, output index: %zu.", op_desc->GetName().c_str(), i); | |||
"[Get][Size] from TensorDesc failed, op:%s, output index:%zu.", op_desc->GetName().c_str(), i); | |||
output_size = (output_size + align_size - 1) / align_size * align_size; | |||
total_size += output_size; | |||
} | |||
@@ -107,42 +107,48 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, HcclDataType | |||
int64_t block_size = 0; | |||
GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); | |||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), | |||
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); | |||
// dynamic shape hccl op get size from output tensor desc | |||
if (op_desc->HasAttr(ATTR_NAME_IS_UNKNOWN_SHAPE)) { | |||
GE_CHECK_NOTNULL(op_desc->GetOutputDescPtr(i)); | |||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetOutputDescPtr(i), input_size), | |||
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); | |||
} | |||
GE_IF_BOOL_EXEC( | |||
op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0; | |||
GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_RANK_SIZE, rank_size), PARAM_INVALID, | |||
"get HCOM_ATTR_RANK_SIZE failed"); | |||
GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "rank size is zero"); | |||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); GE_CHK_STATUS_RET( | |||
ge::CheckInt64Uint32MulOverflow(shape_size, size), "Product of shape size and size beyond INT64_MAX"); | |||
"[Get][Attr] %s in op:%s(%s) failed", HCOM_ATTR_RANK_SIZE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "[Check][Param] rank size is zero"); | |||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | |||
GE_CHK_STATUS_RET(ge::CheckInt64Uint32MulOverflow(shape_size, size), | |||
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX, op:%s(%s)", | |||
shape_size, size, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
block_size = (shape_size * size) / rank_size; | |||
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); | |||
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), | |||
"[Check][Param] Total size:%ld is beyond the INT64_MAX, op:%s(%s)", | |||
total_size, op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
total_size = total_size + block_size; continue;); | |||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | |||
GELOGD("hcom util node %s inputsize %ld, shapesize %ld, datasize %d.", | |||
op_desc->GetName().c_str(), input_size, shape_size, size); | |||
GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), | |||
"Product of shape size and size beyond INT64_MAX"); | |||
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX", shape_size, size); | |||
GE_IF_BOOL_EXEC(is_allgather, block_size = shape_size * size;); | |||
GE_IF_BOOL_EXEC(!is_allgather, block_size = (input_size + align_size - 1) / align_size * align_size;); | |||
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); | |||
GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), | |||
"[Check][Param] Total size:%ld is beyond the INT64_MAX", total_size); | |||
total_size = total_size + block_size; | |||
} | |||
} | |||
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero"); | |||
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "[Check][Param] Size is zero"); | |||
count = static_cast<int>(total_size / size); | |||
GE_CHK_BOOL_EXEC(total_size % size == 0, return PARAM_INVALID, "total_size:%ld is not divisiable by size:%d.", | |||
total_size, size); | |||
GE_CHK_BOOL_EXEC(total_size % size == 0, return PARAM_INVALID, | |||
"[Check][Param] total_size:%ld is not divisiable by size:%d.", total_size, size); | |||
return SUCCESS; | |||
} | |||
@@ -153,32 +159,34 @@ Status HcomOmeUtil::GetHorovodCount(const ge::ConstOpDescPtr &op_desc, | |||
if (!IsHorovodOp(op_desc->GetType())) { | |||
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not horovod op, check invalid", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Horovod operator."); | |||
GELOGE(PARAM_INVALID, "[Call][IsHorovodOp] failed, Op:%s(%s) is not horovod op", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
int64_t align_size = 512; | |||
int32_t size = 0; | |||
for (size_t i = 0; i < op_desc->GetInputsSize(); i++) { | |||
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<HcclDataType>(kernel_hccl_infos[i].dataType), size), | |||
"GetHorovodCount: GetHcclTypeSize fail!"); | |||
"[Call][GetHcclTypeSize] fail, op:%s(%s)", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
int64_t input_size = 0; | |||
int64_t block_size = 0; | |||
GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); | |||
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), | |||
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); | |||
"[Get][Size] from TensorDesc failed, op:%s, input index:%zu", op_desc->GetName().c_str(), i); | |||
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); | |||
GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), | |||
"Product of shape size and size beyond INT64_MAX"); | |||
"[Check][Param] Product of shape size:%ld and size:%d beyond INT64_MAX", shape_size, size); | |||
if (kernel_hccl_infos[0].hccl_type == HVDCALLBACKALLGATHER) { | |||
block_size = shape_size * size; | |||
} else { | |||
block_size = (input_size + align_size - 1) / align_size * align_size; | |||
} | |||
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero"); | |||
GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID, "block_size:%ld is not divisiable by size:%d.", | |||
block_size, size); | |||
GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "[Check][Param] Size is zero"); | |||
GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID, | |||
"[Check][Param] block_size:%ld is not divisiable by size:%d.", block_size, size); | |||
kernel_hccl_infos[i].count = static_cast<int>(block_size / size); | |||
} | |||
@@ -191,7 +199,8 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, | |||
Status ret; | |||
ret = CheckKernelHcclInfo(op_desc, kernel_hccl_infos); | |||
if (ret != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid."); | |||
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] failed, the number of GETaskKernelHcclInfo is invalid, op:%s(%s).", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
@@ -200,7 +209,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, | |||
ret = GetHcomCount(op_desc, static_cast<HcclDataType>(kernel_hccl_infos[0].dataType), | |||
kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count); | |||
if (ret != SUCCESS) { | |||
GELOGE(ret, "HcomOmeUtil:: Node: %s Optype: %s get the Hcom operator hccl count fail.", | |||
GELOGE(ret, "[Call][GetHcomCount] Node:%s Optype:%s get the Hcom operator hccl count fail.", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
@@ -210,7 +219,7 @@ Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc, | |||
if (IsHorovodOp(op_desc->GetType())) { | |||
ret = GetHorovodCount(op_desc, kernel_hccl_infos); | |||
if (ret != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s get the Horovod hccl operator count fail.", | |||
GELOGE(PARAM_INVALID, "[Call][GetHorovodCount] Node:%s Optype:%s get the Horovod hccl operator count fail.", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
@@ -225,11 +234,10 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl | |||
if (IsHCOMOp(op_desc->GetType())) { | |||
std::string hcom_op_type; | |||
GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), | |||
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", | |||
HCOM_ATTR_REDUCE_TYPE.c_str(), | |||
REPORT_INNER_ERROR("E19999", "Get Attr:%s in op:%s(%s) fail", HCOM_ATTR_REDUCE_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID, | |||
"HcomOmeUtil:: Node: %s Optype: %s Get HCOM_ATTR_REDUCE_TYPE fail, not support!", | |||
"[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_REDUCE_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
if (hcom_op_type == "min") { | |||
@@ -244,7 +252,9 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl | |||
REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now, " | |||
"check invalid", HCOM_ATTR_REDUCE_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), hcom_op_type.c_str()); | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str()); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in Op:%s(%s), hcom_op_type value:%s is not support now", | |||
HCOM_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), | |||
op_desc->GetType().c_str(), hcom_op_type.c_str()); | |||
return PARAM_INVALID; | |||
} | |||
} | |||
@@ -256,7 +266,7 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl | |||
ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID, | |||
"HcomOmeUtil:: Node: %s Optype: %s Get ATTR_HOROVOD_ATTR_REDUCE_TYPE fail, not support!", | |||
"[Get][Attr] %s in op:%s(%s) fail", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
auto iter = kHorovodRedOpToHcclRedOp.find(static_cast<HorovodReduceOp>(horovod_op_type)); | |||
@@ -264,8 +274,8 @@ Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, Hccl | |||
REPORT_INNER_ERROR("E19999", "Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now, " | |||
"check invalid", ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomOpType cann't support! Current HcomOpType : %ld", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Attr:%s in Op:%s(%s), horovod_op_type value:%ld is not support now", | |||
ATTR_HOROVOD_ATTR_REDUCE_TYPE.c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type); | |||
return PARAM_INVALID; | |||
} | |||
op_type = iter->second; | |||
@@ -281,7 +291,7 @@ Status HcomOmeUtil::GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &ro | |||
HCOM_ATTR_ROOT_RANK.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID, | |||
"HcomOmeUtil::Node %s Optype: %s Get HCOM_ATTR_ROOT_INDEX fail, not support!", | |||
"[Get][Attr] %s in op:%s(%s) fail", HCOM_ATTR_ROOT_RANK.c_str(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return SUCCESS; | |||
@@ -296,7 +306,7 @@ Status HcomOmeUtil::GetAllRootId(const ge::ConstOpDescPtr &op_desc, | |||
int64_t root_id = 0; | |||
Status dmrt = GetHcclRootId(op_desc, root_id); | |||
if (dmrt != SUCCESS) { | |||
GELOGE(FAILED, "davinci_model: GetHcomRootId fail! domi error: %u", dmrt); | |||
GELOGE(FAILED, "[Get][HcclRootId] fail! domi error: %u", dmrt); | |||
return FAILED; | |||
} | |||
@@ -324,7 +334,8 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc, | |||
REPORT_INNER_ERROR("E19999", "Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1, " | |||
"check invalid", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Hcom scenario, the number of GETaskKernelHcclInfo is invalid."); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Op:%s(%s) is not hcom op or param kernel_hccl_infos.size:%zu != 1", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size()); | |||
return PARAM_INVALID; | |||
} | |||
@@ -337,7 +348,9 @@ Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc, | |||
"in op:%s(%s), check invalid", | |||
kernel_hccl_infos.size(), op_desc->GetInputsSize(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Horovod scenario, the number of GETaskKernelHcclInfo is invalid."); | |||
GELOGE(PARAM_INVALID, "Param kernel_hccl_infos.size:%zu is empty or not equal to " | |||
"input_desc size:%zu in op:%s(%s)", kernel_hccl_infos.size(), op_desc->GetInputsSize(), | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
} | |||
@@ -360,7 +373,7 @@ Status HcomOmeUtil::GetHorovodInputs(const ge::ConstOpDescPtr &op_desc, | |||
} | |||
if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) { | |||
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s the number of GETaskKernelHcclInfo is invalid.", | |||
GELOGE(PARAM_INVALID, "[Check][KernelHcclInfo] Node:%s Optype:%s the number of GETaskKernelHcclInfo is invalid.", | |||
op_desc->GetName().c_str(), op_desc->GetType().c_str()); | |||
return PARAM_INVALID; | |||
} | |||
@@ -54,7 +54,7 @@ void VarAccelerateCtrl::SetVarChanged(const std::string &var_name) { | |||
void VarAccelerateCtrl::AddGraph(uint32_t graph_id, const ComputeGraphPtr &compute_graph) { | |||
std::lock_guard<std::mutex> lock(mutex_); | |||
if (compute_graph == nullptr) { | |||
GELOGE(PARAM_INVALID, "Failed to add graph %u, the compute graph is null", graph_id); | |||
GELOGE(PARAM_INVALID, "[Check][Param] Failed to add graph %u, the compute graph is null", graph_id); | |||
return; | |||
} | |||
auto &var_names = graph_ids_to_var_names_[graph_id]; | |||